320 files changed, 12134 insertions, 9691 deletions
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
new file mode 100644
index 00000000000..d3e17447321
--- /dev/null
+++ b/Documentation/console/console.txt
@@ -0,0 +1,144 @@
+Console Drivers
+===============
+
+The Linux kernel has two general types of console drivers. The first type is
+assigned by the kernel to all the virtual consoles during the boot process.
+This type will be called 'system driver', and only one system driver is allowed
+to exist. The system driver is persistent and it can never be unloaded, though
+it may become inactive.
+
+The second type has to be explicitly loaded and unloaded. This will be called
+'modular driver' by this document. Multiple modular drivers can coexist at
+any time, with each driver sharing the console with other drivers including
+the system driver. However, modular drivers cannot take over the console
+that is currently occupied by another modular driver. (Exception: drivers that
+call take_over_console() will succeed in the takeover regardless of the type
+of driver occupying the consoles.) They can only take over the console that is
+occupied by the system driver. By the same token, if the modular driver is
+released by the console, the system driver will take over.
+
+Modular drivers, from the programmer's point of view, have to call:
+
+	 take_over_console() - load and bind driver to console layer
+	 give_up_console() - unbind and unload driver
+
+In newer kernels, the following are also available:
+
+	 register_con_driver()
+	 unregister_con_driver()
+
+If sysfs is enabled, the contents of /sys/class/vtconsole can be
+examined. This shows the console backends currently registered by the
+system, which are named vtcon<n> where <n> is an integer from 0 to 15. Thus:
+
+	ls /sys/class/vtconsole
+	.  ..  vtcon0  vtcon1
+
+Each directory in /sys/class/vtconsole has 3 files:
+
+	ls /sys/class/vtconsole/vtcon0
+	.  ..  bind  name  uevent
+
+What do these files signify?
+
+	1. bind - this is a read/write file. It shows the status of the driver if
+	   read, or acts to bind or unbind the driver to the virtual consoles
+	   when written to. The possible values are:
+
+	   0 - means the driver is not bound and, if echoed, commands the driver
+	       to unbind
+
+	   1 - means the driver is bound and, if echoed, commands the driver to
+	       bind
+
+	2. name - read-only file. Shows the name of the driver in this format:
+
+	   cat /sys/class/vtconsole/vtcon0/name
+	   (S) VGA+
+
+	   '(S)' stands for a (S)ystem driver, i.e., it cannot be directly
+	   commanded to bind or unbind
+
+	   'VGA+' is the name of the driver
+
+	   cat /sys/class/vtconsole/vtcon1/name
+	   (M) frame buffer device
+
+	   In this case, '(M)' stands for a (M)odular driver, one that can be
+	   directly commanded to bind or unbind.
+
+	3. uevent - ignore this file
+
+When unbinding, the modular driver is detached first, and then the system
+driver takes over the consoles vacated by the driver. Binding, on the other
+hand, will bind the driver to the consoles that are currently occupied by a
+system driver.
+
+NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
+
+Device Drivers -> Character devices -> Support for binding and unbinding
+console drivers
+
+NOTE2: If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
+unbinding will not succeed. An example of an application that sets the console
+to KD_GRAPHICS is X.
+
+How useful is this feature? This is very useful for console driver
+developers. By unbinding the driver from the console layer, one can unload the
+driver, make changes, recompile, reload and rebind the driver without any need
+for rebooting the kernel. For regular users who may want to switch from
+framebuffer console to VGA console and vice versa, this feature also makes
+this possible. (NOTE: Please read fbcon.txt under Documentation/fb
+for more details.)
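+
+For illustration, a skeleton of the modular-driver lifecycle might look like
+the sketch below. The 'mycon' driver and its hooks are hypothetical, most
+consw methods are omitted, and the exact signatures should be checked against
+the target kernel:
+
+	#include <linux/console.h>
+	#include <linux/module.h>
+	#include <linux/vt_kern.h>
+
+	static const char *mycon_startup(void)
+	{
+		/* allocate driver-global resources here */
+		return "mycon";
+	}
+
+	static void mycon_init(struct vc_data *vc, int init)
+	{
+		/* per-console initialization */
+	}
+
+	static void mycon_deinit(struct vc_data *vc)
+	{
+		/* release everything allocated in mycon_init() */
+	}
+
+	static const struct consw mycon_ops = {
+		.owner       = THIS_MODULE,
+		.con_startup = mycon_startup,
+		.con_init    = mycon_init,
+		.con_deinit  = mycon_deinit,
+		/* ... the remaining consw hooks go here ... */
+	};
+
+	static int __init mycon_load(void)
+	{
+		/* bind to all virtual consoles, becoming the default driver */
+		return take_over_console(&mycon_ops, 0, MAX_NR_CONSOLES - 1, 1);
+	}
+
+	static void __exit mycon_unload(void)
+	{
+		give_up_console(&mycon_ops);	/* driver must be fully unbound */
+	}
+
+	module_init(mycon_load);
+	module_exit(mycon_unload);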
+
+Notes for developers:
+=====================
+
+take_over_console() is now broken up into:
+
+	 register_con_driver()
+	 bind_con_driver() - private function
+
+give_up_console() is a wrapper to unregister_con_driver(), and a driver must
+be fully unbound for this call to succeed. con_is_bound() will check if the
+driver is bound or not.
+
+Guidelines for console driver writers:
+=====================================
+
+In order for binding to and unbinding from the console to work properly,
+console drivers must follow these guidelines:
+
+1. All drivers, except system drivers, must call either register_con_driver()
+   or take_over_console(). register_con_driver() will just add the driver to
+   the console's internal list. It won't take over the
+   console. take_over_console(), as its name implies, will also take over (or
+   bind to) the console.
+
+2. All resources allocated during con->con_init() must be released in
+   con->con_deinit().
+
+3. All resources allocated in con->con_startup() must be released when the
+   driver, which was previously bound, becomes unbound. The console layer
+   does not have a complementary call to con->con_startup(), so it's up to the
+   driver to check when it's legal to release these resources. Calling
+   con_is_bound() in con->con_deinit() will help. If the call returns
+   false, then it's safe to release the resources. This balance has to be
+   ensured because con->con_startup() can be called again when a request to
+   rebind the driver to the console arrives. (See the sketch at the end of
+   these guidelines.)
+
+4. Upon exit of the driver, ensure that the driver is totally unbound. If the
+   condition is satisfied, then the driver must call unregister_con_driver()
+   or give_up_console().
+
+5. unregister_con_driver() can also be called on conditions which make it
+   impossible for the driver to service console requests. This can happen
+   with the framebuffer console if it suddenly loses all of its drivers.
+
+The current crop of console drivers should still work correctly, but binding
+and unbinding them may cause problems. With minimal fixes, these drivers can
+be made to work correctly.
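+
+As a sketch of guideline 3 (again with hypothetical names), con_deinit() can
+use con_is_bound() to decide when the con_startup() resources may be freed:
+
+	static void mycon_deinit(struct vc_data *vc)
+	{
+		/* first undo mycon_init() for this console ... */
+
+		/*
+		 * If the driver is no longer bound anywhere, the
+		 * resources allocated in mycon_startup() can go too.
+		 * They must be reallocated if a rebind request calls
+		 * mycon_startup() again.
+		 */
+		if (!con_is_bound(&mycon_ops))
+			mycon_release_startup_resources();	/* hypothetical */
+	}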
+
+==========================
+Antonino Daplas <adaplas@pol.net>
+
diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt
index 08dce0f631b..f373df12ed4
--- a/Documentation/fb/fbcon.txt
+++ b/Documentation/fb/fbcon.txt
@@ -135,10 +135,10 @@ C. Boot options
 	The angle can be changed anytime afterwards by 'echoing' the same
 	numbers to any one of the 2 attributes found in
-	/sys/class/graphics/fb{x}
+	/sys/class/graphics/fbcon
-	con_rotate     - rotate the display of the active console
-	con_rotate_all - rotate the display of all consoles
+	rotate     - rotate the display of the active console
+	rotate_all - rotate the display of all consoles
 	Console rotation will only become available if Console Rotation
 	Support is compiled in your kernel.
@@ -148,5 +148,177 @@ C. Boot options
 	Actually, the underlying fb driver is totally ignorant of console
 	rotation.
----
+C. Attaching, Detaching and Unloading
+
+Before going on to how to attach, detach and unload the framebuffer console,
+an illustration of the dependencies may help.
+
+The console layer, as with most subsystems, needs a driver that interfaces with
+the hardware. Thus, in a VGA console:
+
+console ---> VGA driver ---> hardware.
+
+Assuming the VGA driver can be unloaded, one must first unbind the VGA driver
+from the console layer before unloading the driver. The VGA driver cannot be
+unloaded if it is still bound to the console layer. (See
+Documentation/console/console.txt for more information.)
+
+This is more complicated in the case of the framebuffer console (fbcon),
+because fbcon is an intermediate layer between the console and the drivers:
+
+console ---> fbcon ---> fbdev drivers ---> hardware
+
+The fbdev drivers cannot be unloaded if they are bound to fbcon, and fbcon
+cannot be unloaded if it's bound to the console layer.
+
+So to unload the fbdev drivers, one must first unbind fbcon from the console,
+then unbind the fbdev drivers from fbcon. Fortunately, unbinding fbcon from
+the console layer will automatically unbind framebuffer drivers from
+fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from
+fbcon.
+
+So, how do we unbind fbcon from the console? Part of the answer is in
+Documentation/console/console.txt. To summarize:
+
+Echo a value to the bind file that represents the framebuffer console
+driver. So, assuming vtcon1 represents fbcon:
+
+echo 1 > /sys/class/vtconsole/vtcon1/bind - attach framebuffer console to
+                                            console layer
+echo 0 > /sys/class/vtconsole/vtcon1/bind - detach framebuffer console from
+                                            console layer
+
+If fbcon is detached from the console layer, your boot console driver (which is
+usually VGA text mode) will take over. A few drivers (rivafb and i810fb) will
+restore VGA text mode for you. With the rest, before detaching fbcon, you
+must take a few additional steps to make sure that your VGA text mode is
+restored properly. The following is one of several methods you can use:
+
+1. Download or install vbetool. This utility is included with most
+   distributions nowadays, and is usually part of the suspend/resume tool.
+
+2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set
+   to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers.
+
+3. Boot into text mode and as root run:
+
+	vbetool vbestate save > <vga state file>
+
+   The above command saves the register contents of your graphics
+   hardware to <vga state file>. You need to do this step only once, as
+   the state file can be reused.
+
+4. If fbcon is compiled as a module, load fbcon by doing:
+
+	modprobe fbcon
+
+5. Now to detach fbcon:
+
+	vbetool vbestate restore < <vga state file> && \
+	echo 0 > /sys/class/vtconsole/vtcon1/bind
+
+6. That's it, you're back to VGA mode. And if you compiled fbcon as a module,
+   you can unload it by 'rmmod fbcon'.
+
+7. To reattach fbcon:
+
+	echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+8. Once fbcon is unbound, all drivers registered to the system will also
+become unbound. This means that fbcon and individual framebuffer drivers
+can be unloaded or reloaded at will. Reloading the drivers or fbcon will
+automatically bind the console, fbcon and the drivers together. Unloading
+all the drivers without unloading fbcon will make it impossible for the
+console to bind fbcon.
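+
+The same attach/detach operation can also be driven from a program instead of
+the shell. A minimal C sketch, assuming vtcon1 represents fbcon as in the
+steps above:
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int fd = open("/sys/class/vtconsole/vtcon1/bind", O_WRONLY);
+
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* write "0" to detach the driver, "1" to reattach it */
+		if (write(fd, "0", 1) != 1)
+			perror("write");
+		close(fd);
+		return 0;
+	}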
+
+Notes for vesafb users:
+=======================
+
+Unfortunately, if your bootline includes a vga=xxx parameter that sets the
+hardware in graphics mode, such as when loading vesafb, vgacon will not load.
+Instead, vgacon will replace the default boot console with dummycon, and you
+won't get any display after detaching fbcon. Your machine is still alive, so
+you can reattach vesafb. However, to reattach vesafb, you need to do one of
+the following:
+
+Variation 1:
+
+    a. Before detaching fbcon, do
+
+       vbetool vbemode save > <vesa state file> # do once for each vesafb mode,
+						# the file can be reused
+
+    b. Detach fbcon as in step 5.
+
+    c. Attach fbcon
+
+       vbetool vbestate restore < <vesa state file> && \
+       echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Variation 2:
+
+    a. Before detaching fbcon, do:
+
+	echo <ID> > /sys/class/tty/console/bind
+
+	vbetool vbemode get
+
+    b. Take note of the mode number.
+
+    c. Detach fbcon as in step 5.
+
+    d. Attach fbcon:
+
+	vbetool vbemode set <mode number> && \
+	echo 1 > /sys/class/vtconsole/vtcon1/bind
+
+Samples:
+========
+
+Here are two sample bash scripts that you can use to bind or unbind the
+framebuffer console driver if you are on an x86 box:
+
+---------------------------------------------------------------------------
+#!/bin/bash
+# Unbind fbcon
+
+# Change this to where your actual vgastate file is located
+# Or use VGASTATE=$1 to indicate the state file at runtime
+VGASTATE=/tmp/vgastate
+
+# path to vbetool
+VBETOOL=/usr/local/bin
+
+
+for (( i = 0; i < 16; i++))
+do
+  if test -x /sys/class/vtconsole/vtcon$i; then
+      if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+	   = 1 ]; then
+	  if test -x $VBETOOL/vbetool; then
+	      echo Unbinding vtcon$i
+	      $VBETOOL/vbetool vbestate restore < $VGASTATE
+	      echo 0 > /sys/class/vtconsole/vtcon$i/bind
+	  fi
+      fi
+  fi
+done
+
+---------------------------------------------------------------------------
+#!/bin/bash
+# Bind fbcon
+
+for (( i = 0; i < 16; i++))
+do
+  if test -x /sys/class/vtconsole/vtcon$i; then
+      if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \
+	   = 1 ]; then
+	    echo Binding vtcon$i
+	    echo 1 > /sys/class/vtconsole/vtcon$i/bind
+	fi
+  fi
done
+---------------------------------------------------------------------------
+
+--
+Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index afb1335c05d..4aecc9bdb27
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -113,6 +113,14 @@
 noquota
 grpquota
 usrquota
+bh		(*)	ext3 associates buffer heads to data pages to
+nobh			(a) cache disk block mapping information
+			(b) link pages into transaction to provide
+			    ordering guarantees.
+			"bh" option forces use of buffer heads.
+			"nobh" option tries to avoid associating buffer
+			heads (supported only for "writeback" mode).
+
 Specification
 =============
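As an aside, the nobh behavior described in the ext3 entry above can be
requested at mount time. A C sketch using mount(2), with a placeholder device
and mount point (nobh is honored only together with data=writeback):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* equivalent to: mount -o data=writeback,nobh /dev/sdb1 /mnt */
		if (mount("/dev/sdb1", "/mnt", "ext3", 0,
			  "data=writeback,nobh") != 0)
			perror("mount");
		return 0;
	}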
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bca6f389da6..2e352a605fc
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -61,6 +61,7 @@ parameter is applicable:
 	MTD	MTD support is enabled.
 	NET	Appropriate network support is enabled.
 	NUMA	NUMA support is enabled.
+	GENERIC_TIME	The generic timeofday code is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
 	PARIDE	The ParIDE subsystem is enabled.
@@ -179,6 +180,11 @@ running once the system is up.
 			override platform specific driver.
 			See also Documentation/acpi-hotkey.txt.
+	acpi_pm_good	[IA-32,X86-64]
+			Override the pmtimer bug detection: force the kernel
+			to assume that this machine's pmtimer latches its value
+			and always returns good values.
+
 	enable_timer_pin_1 [i386,x86-64]
			Enable PIN 1 of APIC timer
			Can be useful to work around chipset bugs
@@ -341,10 +347,11 @@ running once the system is up.
			Value can be changed at runtime via /selinux/checkreqprot.
-	clock=		[BUGS=IA-32,HW] gettimeofday timesource override.
-			Forces specified timesource (if avaliable) to be used
-			when calculating gettimeofday(). If specicified
-			timesource is not avalible, it defaults to PIT.
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces specified clocksource (if available) to be used
+			when calculating gettimeofday(). If specified
+			clocksource is not available, it defaults to PIT.
			Format: { pit | tsc | cyclone | pmtmr }
	disable_8254_timer
@@ -1617,6 +1624,10 @@ running once the system is up.
	time		Show timing data prefixed to each printk message line
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the clocksource
+			with the name specified.
+
	tipar.timeout=	[HW,PPT]
			Set communications timeout in tenths of a second
			(default 15).
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 3bbe157b45e..61c0fad2fe2
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -241,25 +241,30 @@
 The security class "key" has been added to SELinux so that mandatory access
 controls can be applied to keys created within various contexts. This support
 is preliminary, and is likely to change quite significantly in the near future.
 Currently, all of the basic permissions explained above are provided in SELinux
-as well; SE Linux is simply invoked after all basic permission checks have been
+as well; SELinux is simply invoked after all basic permission checks have been
 performed.
-Each key is labeled with the same context as the task to which it belongs.
-Typically, this is the same task that was running when the key was created.
-The default keyrings are handled differently, but in a way that is very
-intuitive:
+The value of the file /proc/self/attr/keycreate influences the labeling of
+newly-created keys. If the contents of that file correspond to an SELinux
+security context, then the key will be assigned that context. Otherwise, the
+key will be assigned the current context of the task that invoked the key
+creation request. Tasks must be granted explicit permission to assign a
+particular context to newly-created keys, using the "create" permission in the
+key security class. (A sketch of this procedure appears at the end of this
+section.)
-
- (*) The user and user session keyrings that are created when the user logs in
-     are currently labeled with the context of the login manager.
-
- (*) The keyrings associated with new threads are each labeled with the context
-     of their associated thread, and both session and process keyrings are
-     handled similarly.
+The default keyrings associated with users will be labeled with the default
+context of the user if and only if the login programs have been instrumented to
+properly initialize keycreate during the login process. Otherwise, they will
+be labeled with the context of the login program itself.
 Note, however, that the default keyrings associated with the root user are
 labeled with the default kernel context, since they are created early in the
 boot process, before root has a chance to log in.
+The keyrings associated with new threads are each labeled with the context of
+their associated thread, and both session and process keyrings are handled
+similarly.
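+
+A sketch of that procedure, assuming the keyutils library (-lkeyutils) for
+add_key() and a policy that grants "create" permission for the chosen context;
+the context string and key names are examples only:
+
+	#include <keyutils.h>
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <string.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		const char *ctx = "system_u:object_r:example_key_t:s0";
+		key_serial_t key;
+		int fd;
+
+		/* ask that keys we create from now on be labeled with ctx */
+		fd = open("/proc/self/attr/keycreate", O_WRONLY);
+		if (fd < 0 || write(fd, ctx, strlen(ctx)) < 0)
+			perror("keycreate");
+		if (fd >= 0)
+			close(fd);
+
+		/* this key should receive the requested context */
+		key = add_key("user", "example", "payload", 7,
+			      KEY_SPEC_SESSION_KEYRING);
+		if (key < 0)
+			perror("add_key");
+		return 0;
+	}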
+
================
NEW PROCFS FILES
@@ -270,9 +275,17 @@ about the status of the key service:
 (*) /proc/keys
-     This lists all the keys on the system, giving information about their
-     type, description and permissions. The payload of the key is not available
-     this way:
+     This lists the keys that are currently viewable by the task reading the
+     file, giving information about their type, description and permissions.
+     It is not possible to view the payload of the key this way, though some
+     information about it may be given.
+
+     The only keys included in the list are those that grant View permission to
+     the reading process, whether or not it possesses them. Note that LSM
+     security checks are still performed, and may further filter out keys that
+     the current process is not authorised to view.
+
+     The contents of the file look like this:
	SERIAL   FLAGS  USAGE EXPY PERM     UID   GID   TYPE      DESCRIPTION: SUMMARY
	00000001 I----- 39    perm 1f3f0000 0     0     keyring   _uid_ses.0: 1/4
@@ -300,7 +313,7 @@ about the status of the key service:
 (*) /proc/key-users
      This file lists the tracking data for each user that has at least one key
-     on the system. Such data includes quota information and statistics:
+     on the system.  Such data includes quota information and statistics:
	[root@andromeda root]# cat /proc/key-users
	0:     46 45/45 1/100 13/10000
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 03a13c462cf..0668f9dc9d2
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -200,6 +200,17 @@ All md devices contain:
	This can be written only while the array is being assembled, not after
	it is started.
+  layout
+     The "layout" for the array for the particular level. This is
+     simply a number that is interpreted differently by different
+     levels. It can be written while assembling an array.
+
+  resync_start
+     The point at which resync should start. If no resync is needed,
+     this will be a very large number. At array creation it will
+     default to 0, though starting the array as 'clean' will
+     set it much larger.
+
   new_dev
	This file can be written but not read. The value written should
	be a block device number as major:minor. e.g. 8:0
@@ -207,6 +218,54 @@ All md devices contain:
	available. It will then appear at md/dev-XXX (depending on the
	name of the device) and further configuration is then possible.
+  safe_mode_delay
+     When an md array has seen no write requests for a certain period
+     of time, it will be marked as 'clean'. When another write
+     request arrives, the array is marked as 'dirty' before the write
+     commences. This is known as 'safe_mode'.
+     The 'certain period' is controlled by this file, which stores the
+     period as a number of seconds. The default is 200msec (0.200).
+     Writing a value of 0 disables safemode.
+
+  array_state
+     This file contains a single word which describes the current
+     state of the array. In many cases, the state can be set by
+     writing the word for the desired state; however, some states
+     cannot be explicitly set, and some transitions are not allowed.
+     (A C sketch of driving this interface follows the list of states.)
+
+     clear
+         No devices, no size, no level
+         Writing is equivalent to STOP_ARRAY ioctl
+     inactive
+         May have some settings, but array is not active
+         all IO results in error
+         When written, doesn't tear down array, but just stops it
+     suspended (not supported yet)
+         All IO requests will block. The array can be reconfigured.
+         Writing this, if accepted, will block until array is quiescent
+     readonly
+         no resync can happen. no superblocks get written.
+         write requests fail
+     read-auto
+         like readonly, but behaves like 'clean' on a write request.
+
+     clean - no pending writes, but otherwise active.
+         When written to inactive array, starts without resync
+         If a write request arrives then
+           if metadata is known, mark 'dirty' and switch to 'active'.
+           if not known, block and switch to write-pending
+         If written to an active array that has pending writes, then fails.
+     active
+         fully active: IO and resync can be happening.
+         When written to inactive array, starts with resync
+
+     write-pending
+         clean, but writes are blocked waiting for 'active' to be written.
+
+     active-idle
+         like active, but no writes have been seen for a while (safe_mode_delay).
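+
+     A C sketch of driving array_state (the md0 path and the chosen target
+     state are examples only; disallowed transitions are rejected with an
+     error):
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char buf[32];
+		ssize_t n;
+		int fd;
+
+		fd = open("/sys/block/md0/md/array_state", O_RDONLY);
+		if (fd < 0)
+			return 1;
+		n = read(fd, buf, sizeof(buf) - 1);
+		close(fd);
+		if (n > 0) {
+			buf[n] = '\0';
+			printf("current state: %s", buf);
+		}
+
+		/* request a transition, e.g. mark the array clean */
+		fd = open("/sys/block/md0/md/array_state", O_WRONLY);
+		if (fd < 0)
+			return 1;
+		if (write(fd, "clean", 5) < 0)
+			perror("write");
+		close(fd);
+		return 0;
+	}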
+
+  sync_speed_min
+  sync_speed_max
+     These are similar to /proc/sys/dev/raid/speed_limit_{min,max}
@@ -250,10 +309,18 @@ Each directory contains:
	      faulty - device has been kicked from active use due to
		       a detected fault
	      in_sync - device is a fully in-sync member of the array
+	      writemostly - device will only be subject to read
+		       requests if there are no other options.
+		       This applies only to raid1 arrays.
	      spare - device is working, but not a full member.
		      This includes spares that are in the process
		      of being recovered to
	This list may grow in future.
+	This can be written to.
+	Writing "faulty" simulates a failure on the device.
+	Writing "remove" removes the device from the array.
+	Writing "writemostly" sets the writemostly flag.
+	Writing "-writemostly" clears the writemostly flag.
	errors
		An approximate count of read errors that have been detected on
diff --git a/Documentation/tty.txt b/Documentation/tty.txt
index 8ff7bc2a081..dab56604745
--- a/Documentation/tty.txt
+++ b/Documentation/tty.txt
@@ -80,13 +80,6 @@
 receive_buf()	-	Hand buffers of bytes from the driver to the ldisc
			for processing. Semantics currently rather
			mysterious 8(
-receive_room()	-	Can be called by the driver layer at any time when
-			the ldisc is opened. The ldisc must be able to
-			handle the reported amount of data at that instant.
-			Synchronization between active receive_buf and
-			receive_room calls is down to the driver not the
-			ldisc. Must not sleep.
-
 write_wakeup()	-	May be called at any point between open and close.
			The TTY_DO_WRITE_WAKEUP flag indicates if a call
			is needed but always races versus calls.
Thus the diff --git a/MAINTAINERS b/MAINTAINERS index 4dcd2f1f14d..28c0a967692 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1118,6 +1118,11 @@ L: lm-sensors@lm-sensors.org W: http://www.lm-sensors.nu/ S: Maintained +HARDWARE RANDOM NUMBER GENERATOR CORE +P: Michael Buesch +M: mb@bu3sch.de +S: Maintained + HARD DRIVE ACTIVE PROTECTION SYSTEM (HDAPS) DRIVER P: Robert Love M: rlove@rlove.org @@ -1436,6 +1441,11 @@ P: Tigran Aivazian M: tigran@veritas.com S: Maintained +INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT +P: Deepak Saxena +M: dsaxena@plexity.net +S: Maintained + INTEL PRO/100 ETHERNET SUPPORT P: John Ronciak M: john.ronciak@intel.com @@ -2725,6 +2735,11 @@ P: Christoph Hellwig M: hch@infradead.org S: Maintained +TI OMAP RANDOM NUMBER GENERATOR SUPPORT +P: Deepak Saxena +M: dsaxena@plexity.net +S: Maintained + TI PARALLEL LINK CABLE DRIVER P: Romain Lievin M: roms@lpg.ticalc.org diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index ba788cfdc3c..9fc0eeb4f0a 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -112,7 +112,7 @@ op_axp_create_files(struct super_block * sb, struct dentry * root) for (i = 0; i < model->num_counters; ++i) { struct dentry *dir; - char buf[3]; + char buf[4]; snprintf(buf, sizeof buf, "%d", i); dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/arm/common/locomo.c b/arch/arm/common/locomo.c index a7dc1370695..0dafba3a701 100644 --- a/arch/arm/common/locomo.c +++ b/arch/arm/common/locomo.c @@ -629,21 +629,6 @@ static int locomo_resume(struct platform_device *dev) #endif -#define LCM_ALC_EN 0x8000 - -void frontlight_set(struct locomo *lchip, int duty, int vr, int bpwf) -{ - unsigned long flags; - - spin_lock_irqsave(&lchip->lock, flags); - locomo_writel(bpwf, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); - udelay(100); - locomo_writel(duty, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); - locomo_writel(bpwf | LCM_ALC_EN, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); - spin_unlock_irqrestore(&lchip->lock, flags); -} - - /** * locomo_probe - probe for a single LoCoMo chip. * @phys_addr: physical address of device. @@ -698,14 +683,10 @@ __locomo_probe(struct device *me, struct resource *mem, int irq) , lchip->base + LOCOMO_GPD); locomo_writel(0, lchip->base + LOCOMO_GIE); - /* FrontLight */ + /* Frontlight */ locomo_writel(0, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); locomo_writel(0, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); - /* Same constants can be used for collie and poodle - (depending on CONFIG options in original sharp code)? */ - frontlight_set(lchip, 163, 0, 148); - /* Longtime timer */ locomo_writel(0, lchip->base + LOCOMO_LTINT); /* SPI */ @@ -1063,6 +1044,30 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int } /* + * Frontlight control + */ + +static struct locomo *locomo_chip_driver(struct locomo_dev *ldev); + +void locomo_frontlight_set(struct locomo_dev *dev, int duty, int vr, int bpwf) +{ + unsigned long flags; + struct locomo *lchip = locomo_chip_driver(dev); + + if (vr) + locomo_gpio_write(dev, LOCOMO_GPIO_FL_VR, 1); + else + locomo_gpio_write(dev, LOCOMO_GPIO_FL_VR, 0); + + spin_lock_irqsave(&lchip->lock, flags); + locomo_writel(bpwf, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); + udelay(100); + locomo_writel(duty, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); + locomo_writel(bpwf | LOCOMO_ALC_EN, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); + spin_unlock_irqrestore(&lchip->lock, flags); +} + +/* * LoCoMo "Register Access Bus." 
 *
 * We model this as a regular bus type, and hang devices directly
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 1596101cfaf..374fb50608a
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -14,6 +14,10 @@ config X86_32
	  486, 586, Pentiums, and various instruction-set-compatible chips by
	  AMD, Cyrix, and others.
+config GENERIC_TIME
+	bool
+	default y
+
 config SEMAPHORE_SLEEPERS
	bool
	default y
@@ -1046,13 +1050,23 @@ config SCx200
	tristate "NatSemi SCx200 support"
	depends on !X86_VOYAGER
	help
-	  This provides basic support for the National Semiconductor SCx200
-	  processor.  Right now this is just a driver for the GPIO pins.
+	  This provides basic support for National Semiconductor's
+	  (now AMD's) Geode processors. The driver probes for the
+	  PCI-IDs of several on-chip devices, so it's a good dependency
+	  for other scx200_* drivers.
-	  If you don't know what to do here, say N.
+	  If compiled as a module, the driver is named scx200.
-	  This support is also available as a module.  If compiled as a
-	  module, it will be called scx200.
+config SCx200HR_TIMER
+	tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
+	depends on SCx200 && GENERIC_TIME
+	default y
+	help
+	  This driver provides a clocksource built upon the on-chip
+	  27MHz high-resolution timer. It is also a workaround for
+	  NSC Geode SC-1100's buggy TSC, which loses time when the
+	  processor goes idle (as is done by the scheduler). The
+	  other workaround is the idle=poll boot option.
 source "drivers/pcmcia/Kconfig"
diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S
index c9343c3a808..8c2a6faeeae
--- a/arch/i386/boot/video.S
+++ b/arch/i386/boot/video.S
@@ -1929,7 +1929,7 @@ skip10:	movb	%ah, %al
	ret
 store_edid:
-#ifdef CONFIG_FB_FIRMWARE_EDID
+#ifdef CONFIG_FIRMWARE_EDID
	pushw	%es			# just save all registers
	pushw	%ax
	pushw	%bx
@@ -1947,6 +1947,22 @@ store_edid:
	rep
	stosl
+	pushw	%es			# save ES
+	xorw	%di, %di		# Report Capability
+	pushw	%di
+	popw	%es			# ES:DI must be 0:0
+	movw	$0x4f15, %ax
+	xorw	%bx, %bx
+	xorw	%cx, %cx
+	int	$0x10
+	popw	%es			# restore ES
+
+	cmpb	$0x00, %ah		# call successful
+	jne	no_edid
+
+	cmpb	$0x4f, %al		# function supported
+	jne	no_edid
+
	movw	$0x4f15, %ax		# do VBE/DDC
	movw	$0x01, %bx
	movw	$0x00, %cx
@@ -1954,6 +1970,7 @@ store_edid:
	movw	$0x140, %di
	int	$0x10
+no_edid:
	popw	%di			# restore all registers
	popw	%dx
	popw	%cx
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 96fb8a020af..0fac85df64f
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.lds
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
		pci-dma.o i386_ksyms.o i387.o bootflag.o \
-		quirks.o i8237.o topology.o alternative.o
+		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
 obj-y				+= cpu/
-obj-y				+= timers/
 obj-y				+= acpi/
 obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca.o
@@ -37,6 +36,7 @@ obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o
 obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 EXTRA_AFLAGS   := -traditional
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
new file mode 100644
index 00000000000..c6737c35815
--- /dev/null
+++ b/arch/i386/kernel/hpet.c
@@ -0,0 +1,67 @@
+#include <linux/clocksource.h>
+#include <linux/errno.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+
+#include <asm/hpet.h>
+#include <asm/io.h>
+
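+/*
+ * Worked example of the mult/shift conversion computed in
+ * init_hpet_clocksource() below, assuming a 14.31818 MHz HPET,
+ * i.e. hpet_period = 69841279 femtoseconds per cycle (the numbers
+ * are illustrative only):
+ *
+ *	mult = (69841279 << 22) / 1000000 = 292935555
+ *	ns   = (cycles * mult) >> 22
+ *
+ * so a single cycle maps to 292935555 >> 22 ~= 69.84 ns, matching
+ * the 69.841279 ns HPET period.
+ */
+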
+#define HPET_MASK	CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT	22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC	1000000
+
+static void *hpet_ptr;
+
+static cycle_t read_hpet(void)
+{
+	return (cycle_t)readl(hpet_ptr);
+}
+
+static struct clocksource clocksource_hpet = {
+	.name		= "hpet",
+	.rating		= 250,
+	.read		= read_hpet,
+	.mask		= HPET_MASK,
+	.mult		= 0, /* set below */
+	.shift		= HPET_SHIFT,
+	.is_continuous	= 1,
+};
+
+static int __init init_hpet_clocksource(void)
+{
+	unsigned long hpet_period;
+	void __iomem* hpet_base;
+	u64 tmp;
+
+	if (!hpet_address)
+		return -ENODEV;
+
+	/* calculate the hpet address: */
+	hpet_base =
+		(void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+	hpet_ptr = hpet_base + HPET_COUNTER;
+
+	/* calculate the frequency: */
+	hpet_period = readl(hpet_base + HPET_PERIOD);
+
+	/*
+	 * hpet period is in femtoseconds per cycle
+	 * so we need to convert this to ns/cyc units
+	 * approximated by mult/2^shift
+	 *
+	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+	 *  (fsec/cyc << shift)/1000000 = mult
+	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+	 */
+	tmp = (u64)hpet_period << HPET_SHIFT;
+	do_div(tmp, FSEC_PER_NSEC);
+	clocksource_hpet.mult = (u32)tmp;
+
+	return clocksource_register(&clocksource_hpet);
+}
+
+module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
new file mode 100644
index 00000000000..477b24daff5
--- /dev/null
+++ b/arch/i386/kernel/i8253.c
@@ -0,0 +1,118 @@
+/*
+ * i8253.c  8253/PIT functions
+ *
+ */
+#include <linux/clocksource.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/smp.h>
+#include <asm/delay.h>
+#include <asm/i8253.h>
+#include <asm/io.h>
+
+#include "io_ports.h"
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+void setup_pit_timer(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+	outb_p(0x34,PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
+	udelay(10);
+	outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
+	udelay(10);
+	outb(LATCH >> 8 , PIT_CH0);	/* MSB */
+	spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Since the PIT overflows every tick, it's not very useful
+ * to read by itself.  So use jiffies to emulate a free
+ * running counter:
+ */
+static cycle_t pit_read(void)
+{
+	unsigned long flags;
+	int count;
+	u32 jifs;
+	static int old_count;
+	static u32 old_jifs;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+	/*
+	 * Although our caller may have the read side of xtime_lock,
+	 * this is now a seqlock, and we are cheating in this routine
+	 * by having side effects on state that we cannot undo if
+	 * there is a collision on the seqlock and our caller has to
+	 * retry.  (Namely, old_jifs and old_count.)  So we must treat
+	 * jiffies as volatile despite the lock.  We read jiffies
+	 * before latching the timer count to guarantee that although
+	 * the jiffies value might be older than the count (that is,
+	 * the counter may underflow between the last point where
+	 * jiffies was incremented and the point where we latch the
+	 * count), it cannot be newer.
+	 */
+	jifs = jiffies;
+	outb_p(0x00, PIT_MODE);	/* latch the count ASAP */
+	count = inb_p(PIT_CH0);	/* read the latched count */
+	count |= inb_p(PIT_CH0) << 8;
+
+	/* VIA686a test code...
reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ + if (count > old_count && jifs == old_jifs) { + count = old_count; + } + old_count = count; + old_jifs = jifs; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = (LATCH - 1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return clocksource_register(&clocksource_pit); +} +module_init(init_pit_clocksource); diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 395a9a6dff8..727e419ad78 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -57,34 +57,85 @@ static __always_inline void set_jmp_op(void *from, void *to) /* * returns non-zero if opcodes can be boosted. */ -static __always_inline int can_boost(kprobe_opcode_t opcode) +static __always_inline int can_boost(kprobe_opcode_t *opcodes) { - switch (opcode & 0xf0 ) { +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not be boost. 
+ */ + static const unsigned long twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ + W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ + W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ + W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ + W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ + W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ + W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ + W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, twobyte_is_boostable); + } + + switch (opcode & 0xf0) { + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); case 0x70: return 0; /* can't boost conditional jump */ - case 0x90: - /* can't boost call and pushf */ - return opcode != 0x9a && opcode != 0x9c; case 0xc0: - /* can't boost undefined opcodes and soft-interruptions */ - return (0xc1 < opcode && opcode < 0xc6) || - (0xc7 < opcode && opcode < 0xcc) || opcode == 0xcf; + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; case 0xd0: /* can boost AA* and XLAT */ return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); case 0xe0: - /* can boost in/out and (may be) jmps */ - return (0xe3 < opcode && opcode != 0xe8); + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ /* clear and set flags can be boost */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* currently, can't boost 2 bytes opcodes */ - return opcode != 0x0f; + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* can't boost CS override and call */ + return (opcode != 0x2e && opcode != 0x9a); } } - /* * returns non-zero if opcode modifies the interrupt flag. 
*/ @@ -109,7 +160,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); p->opcode = *p->addr; - if (can_boost(p->opcode)) { + if (can_boost(p->addr)) { p->ainsn.boostable = 0; } else { p->ainsn.boostable = -1; @@ -208,7 +259,9 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) struct kprobe_ctlblk *kcb; #ifdef CONFIG_PREEMPT unsigned pre_preempt_count = preempt_count(); -#endif /* CONFIG_PREEMPT */ +#else + unsigned pre_preempt_count = 1; +#endif addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); @@ -285,22 +338,14 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) /* handler has already set things up, so skip ss setup */ return 1; - if (p->ainsn.boostable == 1 && -#ifdef CONFIG_PREEMPT - !(pre_preempt_count) && /* - * This enables booster when the direct - * execution path aren't preempted. - */ -#endif /* CONFIG_PREEMPT */ - !p->post_handler && !p->break_handler ) { +ss_probe: + if (pre_preempt_count && p->ainsn.boostable == 1 && !p->post_handler){ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return 1; } - -ss_probe: prepare_singlestep(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; return 1; diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index 5f5b075f860..0caf14652ba 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } return 0; } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 6bef9273733..4a65040cc62 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1575,6 +1575,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } static __init int add_pcspkr(void) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9d307475985..5f43d041012 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -82,13 +82,6 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include <asm/i8253.h> - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the @@ -118,99 +111,19 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. 
- */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } @@ -218,16 +131,6 @@ static int set_rtc_mmss(unsigned long nowtime) int timer_ack; -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -242,11 +145,21 @@ EXPORT_SYMBOL(profile_pc); #endif /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. 
-arca + */ + write_seqlock(&xtime_lock); + #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -279,27 +192,6 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs) irq = inb_p( 0x61 ); /* read the current state */ outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); write_sequnlock(&xtime_lock); @@ -380,7 +272,6 @@ void notify_arch_cmos_timer(void) static long clock_cmos_diff, sleep_start; -static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* @@ -389,10 +280,6 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state) clock_cmos_diff = -get_cmos_time(); clock_cmos_diff += get_seconds(); sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); return 0; } @@ -415,10 +302,6 @@ static int timer_resume(struct sys_device *dev) jiffies_64 += sleep_length; wall_jiffies += sleep_length; write_sequnlock_irqrestore(&xtime_lock, flags); - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; touch_softlockup_watchdog(); return 0; } @@ -460,9 +343,6 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } #endif @@ -484,8 +364,5 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile deleted file mode 100644 index 8fa12be658d..00000000000 --- a/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c deleted file mode 100644 index 8163fe0cf1f..00000000000 --- a/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/module.h> - -#include <asm/io.h> -#include <asm/timer.h> -#include <asm/hpet.h> - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). 
- * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. 
- */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c deleted file mode 100644 index 7e39ed8e33f..00000000000 --- a/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <asm/timer.h> - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c deleted file mode 100644 index 13892a65c94..00000000000 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. 
- * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/i8253.h> - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? 
*/ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! 
*/ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c deleted file mode 100644 index 17a6fe7166e..00000000000 --- a/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" -#include <asm/hpet.h> - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
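- * Illustrative numbers (editor's example, not from this patch): for a
- * 2 GHz CPU, cpu_khz = 2000000, so cyc2ns_scale = (10^6 << 10) /
- * (2 * 10^6) = 512 and cycles_2_ns(c) = (c * 512) >> 10 = c / 2,
- * i.e. 0.5 ns per cycle, as expected.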
- */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. 
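- * (Editor's example: tsc_quotient is 2^32 / (clocks per usec), so for
- * a 2 GHz part it is about 2147483, and (1000 * 2^32) / 2147483 is
- * about 2000000 kHz -- "Detected 2000.000 MHz processor.")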
- */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c deleted file mode 100644 index 4ea2f414dbb..00000000000 --- a/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include <linux/init.h> -#include <asm/timer.h> - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c deleted file mode 100644 index b9b6bd56b9b..00000000000 --- a/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/timex.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> -#include <asm/i8253.h> - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. 
Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) 
- */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c deleted file mode 100644 index 144e94a0493..00000000000 --- a/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <asm/types.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> - -#include <linux/timex.h> -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static int pmtmr_need_workaround __read_mostly = 1; - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - if (pmtmr_need_workaround) { - u32 v1, v2, v3; - - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; - } - - return inl(pmtmr_ioport) & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. 
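- * ("Way too fast" means outside a +/-5% window: the code below rejects
- * a delta under 19/20 or over 21/20 of PMTMR_EXPECTED_RATE -- editor's
- * note.)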
- */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. 
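- * (Editor's check of the arithmetic: 286/1024 = 0.279297 versus
- * 1/3.579545 = 0.279365, the quoted ~0.024% error; and the worst case
- * 35796 * 286 = 10237656 is far below 2^32.)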
- */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -#ifdef CONFIG_PCI -/* - * PIIX4 Errata: - * - * The power management timer may return improper results when read. - * Although the timer value settles properly after incrementing, - * while incrementing there is a 3 ns window every 69.8 ns where the - * timer value is indeterminate (a 4.2% chance that the data will be - * incorrect when read). As a result, the ACPI free running count up - * timer specification is violated due to erroneous reads. - */ -static int __init pmtmr_bug_check(void) -{ - static struct pci_device_id gray_list[] __initdata = { - /* these chipsets may have bug. 
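- * (editor's note: currently just the Intel 82801DB/ICH4 below; the
- * PIIX4 erratum quoted above describes the suspected failure mode)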
*/ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_0) }, - { }, - }; - struct pci_dev *dev; - int pmtmr_has_bug = 0; - u8 rev; - - if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) - return 0; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82371AB_3, NULL); - if (dev) { - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - /* the bug has been fixed in PIIX4M */ - if (rev < 3) { - printk(KERN_WARNING "* Found PM-Timer Bug on this " - "chipset. Due to workarounds for a bug,\n" - "* this time source is slow. Consider trying " - "other time sources (clock=)\n"); - pmtmr_has_bug = 1; - } - pci_dev_put(dev); - } - - if (pci_dev_present(gray_list)) { - printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due" - " to workarounds for a bug,\n" - "* this time source is slow. If you are sure your timer" - " does not have\n" - "* this bug, please use \"pmtmr_good\" to disable the " - "workaround\n"); - pmtmr_has_bug = 1; - } - - if (!pmtmr_has_bug) - pmtmr_need_workaround = 0; - - return 0; -} -device_initcall(pmtmr_bug_check); -#endif - -static int __init pmtr_good_setup(char *__str) -{ - pmtmr_need_workaround = 0; - return 1; -} -__setup("pmtmr_good", pmtr_good_setup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c deleted file mode 100644 index f1187ddb0d0..00000000000 --- a/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. 
- */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/cpufreq.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -/* processor.h for distable_tsc flag */ -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" - -#include <asm/hpet.h> -#include <asm/i8253.h> - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ - detect_lost_ticks = 1; - return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. 
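- * (Editor's example: with fast_gettimeoffset_quotient = 2^32 / 2000
- * for a 2 GHz CPU, a delta of 4000 cycles makes edx come out as
- * roughly 4000 / 2000 = 2 usecs, modulo rounding in the quotient.)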
- */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. 
-arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) - && detect_lost_ticks) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
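- * (Editor's note: the notifier below rescales loops_per_jiffy with
- * cpufreq_scale(lpj_ref, ref_freq, new_freq), i.e. lpj_ref * new / ref,
- * and on UP also cpu_khz; fast_gettimeoffset_quotient is scaled with
- * the arguments swapped, since it varies inversely with the clock.)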
- */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - goto end; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - local_irq_disable(); - init_cpu_khz(); - local_irq_enable(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2 && detect_lost_ticks) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. 
- * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c new file mode 100644 index 00000000000..7e0d8dab207 --- /dev/null +++ b/arch/i386/kernel/tsc.c @@ -0,0 +1,478 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/cpufreq.h> +#include <linux/jiffies.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/delay.h> +#include <asm/tsc.h> +#include <asm/delay.h> +#include <asm/io.h> + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. 
So we need
+ * an extra value to store the TSC freq
+ */
+unsigned int tsc_khz;
+
+int tsc_disable __cpuinitdata = 0;
+
+#ifdef CONFIG_X86_TSC
+static int __init tsc_setup(char *str)
+{
+	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+				"cannot disable TSC.\n");
+	return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+static int __init tsc_setup(char *str)
+{
+	tsc_disable = 1;
+
+	return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+	return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ *	ns = cycles / (freq / ns_per_sec)
+ *	ns = cycles * (ns_per_sec / freq)
+ *	ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *	ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ *	ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *	ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+static unsigned long cyc2ns_scale __read_mostly;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+	unsigned long long this_offset;
+
+	/*
+	 * In the NUMA case we don't use the TSC as they are not
+	 * synchronized across all CPUs.
+	 */
+#ifndef CONFIG_NUMA
+	if (!cpu_khz || check_tsc_unstable())
+#endif
+		/* no locking but a rare wrong value is not a big deal */
+		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+	/* read the Time Stamp Counter: */
+	rdtscll(this_offset);
+
+	/* return the value in ns */
+	return cycles_2_ns(this_offset);
+}
+
+static unsigned long calculate_cpu_khz(void)
+{
+	unsigned long long start, end;
+	unsigned long count;
+	u64 delta64;
+	int i;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	/* run 3 times to ensure the cache is warm */
+	for (i = 0; i < 3; i++) {
+		mach_prepare_counter();
+		rdtscll(start);
+		mach_countup(&count);
+		rdtscll(end);
+	}
+	/*
+	 * Error: ECTCNEVERSET
+	 * The CTC wasn't reliable: we got a hit on the very first read,
+	 * or the CPU was so fast/slow that the quotient wouldn't fit in
+	 * 32 bits.
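+ * (Editor's note on the math below: delta64 is the number of TSC
+ * cycles counted while the CTC ticks off CALIBRATE_TIME_MSEC
+ * milliseconds, so delta64 / CALIBRATE_TIME_MSEC is by definition the
+ * CPU clock in kHz; CALIBRATE_TIME_MSEC/2 is added first so that
+ * do_div() rounds to nearest rather than truncating.)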
+ */ + if (count <= 1) + goto err; + + delta64 = end - start; + + /* cpu freq too fast: */ + if (delta64 > (1ULL<<32)) + goto err; + + /* cpu freq too slow: */ + if (delta64 <= CALIBRATE_TIME_MSEC) + goto err; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + local_irq_restore(flags); + return (unsigned long)delta64; +err: + local_irq_restore(flags); + return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void tsc_init(void) +{ + if (!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz); + use_tsc_delay(); +} + +#ifdef CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + cpufreq_get(cpu); + + cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. 
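+ * (Editor's example: cpufreq_scale() computes old * new / ref, so a
+ * transition from 2000 MHz to 1000 MHz halves loops_per_jiffy; cpu_khz
+ * is only rescaled when a single CPU is online, and for non-CONST_LOOPS
+ * chips mark_tsc_unstable() below retires the TSC as a sched_clock and
+ * clocksource candidate.)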
+ */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + if (!freq->old){ + ref_freq = freq->new; + goto end; + } + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } +end: + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + mark_tsc_unstable(); + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ +static struct timer_list verify_tsc_freq_timer; + +/* XXX - Probably should add locking */ +static void verify_tsc_freq(unsigned long unused) +{ + static u64 last_tsc; + static unsigned long last_jiffies; + + u64 now_tsc, interval_tsc; + unsigned long now_jiffies, interval_jiffies; + + + if (check_tsc_unstable()) + 
return; + + rdtscll(now_tsc); + now_jiffies = jiffies; + + if (!last_jiffies) { + goto out; + } + + interval_jiffies = now_jiffies - last_jiffies; + interval_tsc = now_tsc - last_tsc; + interval_tsc *= HZ; + do_div(interval_tsc, cpu_khz*1000); + + if (interval_tsc < (interval_jiffies * 3 / 4)) { + printk("TSC appears to be running slowly. " + "Marking it as unstable\n"); + mark_tsc_unstable(); + return; + } + +out: + last_tsc = now_tsc; + last_jiffies = now_jiffies; + /* set us up to go off on the next interval: */ + mod_timer(&verify_tsc_freq_timer, + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + /* check blacklist */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) /* mark unstable if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) + clocksource_tsc.rating = 50; + + init_timer(&verify_tsc_freq_timer); + verify_tsc_freq_timer.function = verify_tsc_freq; + verify_tsc_freq_timer.expires = + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); + add_timer(&verify_tsc_freq_timer); + + return clocksource_register(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c index c49a6acbee5..3c0714c4b66 100644 --- a/arch/i386/lib/delay.c +++ b/arch/i386/lib/delay.c @@ -10,43 +10,92 @@ * we have to worry about. 
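 * (Editor's note on the scheme below: __delay() now dispatches through
 * the delay_fn pointer, which tsc_init() switches from the BogoMIPS
 * delay_loop to delay_tsc via use_tsc_delay(); __const_udelay() keeps
 * the high 32 bits of usecs * (2^32/10^6) * loops_per_jiffy * HZ,
 * i.e. usecs * lpj * HZ / 10^6 loops.)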
*/ +#include <linux/module.h> #include <linux/config.h> #include <linux/sched.h> #include <linux/delay.h> -#include <linux/module.h> + #include <asm/processor.h> #include <asm/delay.h> #include <asm/timer.h> #ifdef CONFIG_SMP -#include <asm/smp.h> +# include <asm/smp.h> #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ + int d0; + + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ + delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ + if (delay_fn == delay_tsc) { + rdtscl(*timer_val); + return 0; + } + return -1; +} void __delay(unsigned long loops) { - cur_timer->delay(loops); + delay_fn(loops); } inline void __const_udelay(unsigned long xloops) { int d0; + xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); - __delay(++xloops); + :"1" (xloops), "0" + (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + + __delay(++xloops); } void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } void __ndelay(unsigned long nsecs) { - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } EXPORT_SYMBOL(__delay); diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index bd6fe96cc16..6ee7faaf2c1 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -30,6 +30,40 @@ extern void die(const char *,struct pt_regs *,long); +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + + /* * Unlock any spinlocks which will prevent us from getting the * message out @@ -324,7 +358,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) return; - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* @@ -334,7 +368,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; 
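	/*
	 * (Editor's note: notify_page_fault() replaces notify_die() on this
	 * path so that, without CONFIG_KPROBES, the stub above reduces to a
	 * constant NOTIFY_DONE and the fault fast path never walks a
	 * notifier chain.)
	 */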
} - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index ec0fd3cfa77..fa8a37bcb39 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -281,9 +281,9 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root) for (i = 0; i < model->num_counters; ++i) { struct dentry * dir; - char buf[2]; + char buf[4]; - snprintf(buf, 2, "%d", i); + snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c index 1eec0868f4b..ed1512a175a 100644 --- a/arch/i386/pci/pcbios.c +++ b/arch/i386/pci/pcbios.c @@ -371,8 +371,7 @@ void __devinit pcibios_sort(void) list_for_each(ln, &pci_devices) { d = pci_dev_g(ln); if (d->bus->number == bus && d->devfn == devfn) { - list_del(&d->global_list); - list_add_tail(&d->global_list, &sorted_devices); + list_move_tail(&d->global_list, &sorted_devices); if (d == dev) found = 1; break; @@ -390,8 +389,7 @@ void __devinit pcibios_sort(void) if (!found) { printk(KERN_WARNING "PCI: Device %s not found by BIOS\n", pci_name(dev)); - list_del(&dev->global_list); - list_add_tail(&dev->global_list, &sorted_devices); + list_move_tail(&dev->global_list, &sorted_devices); } } list_splice(&sorted_devices, &pci_devices); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index d98ec49570b..14ef7cceb20 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -19,6 +19,40 @@ extern void die (char *, struct pt_regs *, long); +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&notify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + /* * Return TRUE if ADDRESS points at a page in the kernel's mapped segment * (inside region 5, on ia64) and that page is present.
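This register/notify/unregister boilerplate is repeated for each architecture touched by this series. For orientation, a minimal sketch of a chain consumer (hypothetical names, not part of this patch; the callback follows the usual notifier_block convention, and such a consumer can only exist on CONFIG_KPROBES builds since the registration functions are compiled out otherwise):

    #include <linux/notifier.h>
    #include <asm/kdebug.h>

    static int example_pf_callback(struct notifier_block *self,
                                   unsigned long val, void *data)
    {
            struct die_args *args = data;

            /* val and args mirror what notify_page_fault() passes above */
            if (val == DIE_PAGE_FAULT)
                    printk(KERN_DEBUG "page fault, error code %ld\n",
                           args->err);
            return NOTIFY_DONE;     /* NOTIFY_STOP would claim the fault */
    }

    static struct notifier_block example_pf_nb = {
            .notifier_call = example_pf_callback,
    };

    /* register_page_fault_notifier(&example_pf_nb) at init time,
     * unregister_page_fault_notifier(&example_pf_nb) at exit time. */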
@@ -84,7 +118,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * This is to handle the kprobes on user space access instructions */ - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, code, TRAP_BRKPT, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, code, TRAP_BRKPT, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index d6d582a5abb..a226668f20c 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -94,8 +94,7 @@ pmd_t *get_pointer_table (void) PD_MARKBITS(dp) = mask & ~tmp; if (!PD_MARKBITS(dp)) { /* move to end of list */ - list_del(dp); - list_add_tail(dp, &ptable_list); + list_move_tail(dp, &ptable_list); } return (pmd_t *) (page_address(PD_PAGE(dp)) + off); } @@ -123,8 +122,7 @@ int free_pointer_table (pmd_t *ptable) * move this descriptor to the front of the list, since * it has one or more free tables. */ - list_del(dp); - list_add(dp, &ptable_list); + list_move(dp, &ptable_list); } return 0; } diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index f04a1d25f1a..97c7bfde8ae 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -119,8 +119,7 @@ static inline int refill(void) if(hole->end == prev->start) { hole->size += prev->size; hole->end = prev->end; - list_del(&(prev->list)); - list_add(&(prev->list), &hole_cache); + list_move(&(prev->list), &hole_cache); ret++; } @@ -182,8 +181,7 @@ static inline unsigned long get_baddr(int len, unsigned long align) #endif return hole->end; } else if(hole->size == newlen) { - list_del(&(hole->list)); - list_add(&(hole->list), &hole_cache); + list_move(&(hole->list), &hole_cache); dvma_entry_use(hole->start) = newlen; #ifdef DVMA_DEBUG dvma_allocs++; diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index c31e4cff64e..65eb55400d7 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -38,7 +38,7 @@ static int op_mips_create_files(struct super_block * sb, struct dentry * root) for (i = 0; i < model->num_counters; ++i) { struct dentry *dir; - char buf[3]; + char buf[4]; snprintf(buf, sizeof buf, "%d", i); dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d20907561f4..7dd5dab789a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */ u64 tb_to_xs; unsigned tb_to_us; -#define TICKLEN_SCALE (SHIFT_SCALE - 10) +#define TICKLEN_SCALE TICK_LENGTH_SHIFT u64 last_tick_len; /* units are ns / 2^TICKLEN_SCALE */ u64 ticklen_to_xs; /* 0.64 fraction */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fdbba4206d5..a0a9e1e0061 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -40,6 +40,40 @@ #include <asm/kdebug.h> #include <asm/siginfo.h> +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&notify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + 
return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + /* * Check whether the instruction at regs->nip is a store using * an update addressing form which will update r1. @@ -142,7 +176,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ - if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code, + if (notify_page_fault(DIE_PAGE_FAULT, "page_fault", regs, error_code, 11, SIGSEGV) == NOTIFY_STOP) return 0; diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 27ad56bd227..fd0bbbe7a4d 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -94,7 +94,7 @@ static int op_powerpc_create_files(struct super_block *sb, struct dentry *root) for (i = 0; i < model->num_counters; ++i) { struct dentry *dir; - char buf[3]; + char buf[4]; snprintf(buf, sizeof buf, "%d", i); dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/sh/oprofile/op_model_sh7750.c b/arch/sh/oprofile/op_model_sh7750.c index 5ec9ddcc4b0..c265185b22a 100644 --- a/arch/sh/oprofile/op_model_sh7750.c +++ b/arch/sh/oprofile/op_model_sh7750.c @@ -198,7 +198,7 @@ static int sh7750_perf_counter_create_files(struct super_block *sb, struct dentr for (i = 0; i < NR_CNTRS; i++) { struct dentry *dir; - char buf[3]; + char buf[4]; snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 6e002aacb96..1605967cce9 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -31,6 +31,40 @@ #include <asm/kdebug.h> #include <asm/mmu_context.h> +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&notify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + /* * To debug kernel to catch accesses to certain virtual/physical addresses. * Mode = 0 selects physical watchpoints, mode = 1 selects virtual watchpoints.
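A note on the recurring buf[2]/buf[3] to buf[4] hunks in the oprofile code above: snprintf() NUL-terminates within the size it is given, so an undersized buffer silently truncates the counter directory name rather than overflowing. A worked illustration (standalone sketch, standard snprintf semantics):

    char buf[3];
    snprintf(buf, sizeof(buf), "%d", 100);  /* buf holds "10": a counter-100
                                               directory would collide with
                                               counter 10's */

Four bytes cover any three-digit counter index plus the terminator.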
@@ -263,7 +297,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) fault_code = get_thread_fault_code(); - if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, + if (notify_page_fault(DIE_PAGE_FAULT, "page_fault", regs, fault_code, 0, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 32327bb37af..2aa565c136e 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -1929,6 +1929,7 @@ skip10: movb %ah, %al ret store_edid: +#ifdef CONFIG_FIRMWARE_EDID pushw %es # just save all registers pushw %ax pushw %bx @@ -1946,6 +1947,22 @@ store_edid: rep stosl + pushw %es # save ES + xorw %di, %di # Report Capability + pushw %di + popw %es # ES:DI must be 0:0 + movw $0x4f15, %ax + xorw %bx, %bx + xorw %cx, %cx + int $0x10 + popw %es # restore ES + + cmpb $0x00, %ah # call successful + jne no_edid + + cmpb $0x4f, %al # function supported + jne no_edid + movw $0x4f15, %ax # do VBE/DDC movw $0x01, %bx movw $0x00, %cx @@ -1953,12 +1970,14 @@ store_edid: movw $0x140, %di int $0x10 +no_edid: popw %di # restore all registers popw %dx popw %cx popw %bx popw %ax popw %es +#endif ret # VIDEO_SELECT-only variables diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index bf421ed2680..7554458dc9c 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -27,7 +27,7 @@ /* The I/O port the PMTMR resides at. * The location is detected during setup_arch(), * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; +u32 pmtmr_ioport __read_mostly; /* value of the Power timer at last timer interrupt */ static u32 offset_delay; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fb850b52b4d..143c6503153 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -109,6 +109,7 @@ struct sys_desc_table_struct { }; struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); struct e820map e820; extern int root_mountflags; diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 55250593d8c..0803d3858af 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -41,6 +41,41 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&notify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; @@ -348,7 +383,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* @@ -358,7 +393,7 @@ asmlinkage void __kprobes do_page_fault(struct 
pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; diff --git a/drivers/Makefile b/drivers/Makefile index 3c5170310bd..fc2d744a4e4 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -74,4 +74,5 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_GENERIC_TIME) += clocksource/ obj-$(CONFIG_DMA_ENGINE) += dma/ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 3b97a5eae9e..a5f4f2aa007 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -369,6 +369,11 @@ static void acpi_processor_idle(void) t2 = inl(acpi_fadt.xpm_tmr_blk.address); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); + +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C2, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); @@ -409,6 +414,10 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifdef CONFIG_GENERIC_TIME + /* TSC halts in C3, so notify users */ + mark_tsc_unstable(); +#endif /* Re-enable interrupts */ local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c index 520679ce53a..826093ef4c7 100644 --- a/drivers/base/power/resume.c +++ b/drivers/base/power/resume.c @@ -53,8 +53,7 @@ void dpm_resume(void) struct device * dev = to_device(entry); get_device(dev); - list_del_init(entry); - list_add_tail(entry, &dpm_active); + list_move_tail(entry, &dpm_active); up(&dpm_list_sem); if (!dev->power.prev_state.event) @@ -101,8 +100,7 @@ void dpm_power_up(void) struct device * dev = to_device(entry); get_device(dev); - list_del_init(entry); - list_add_tail(entry, &dpm_active); + list_move_tail(entry, &dpm_active); resume_device(dev); put_device(dev); } diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c index 1a1fe43a305..69509e02f70 100644 --- a/drivers/base/power/suspend.c +++ b/drivers/base/power/suspend.c @@ -116,12 +116,10 @@ int device_suspend(pm_message_t state) /* Check if the device got removed */ if (!list_empty(&dev->power.entry)) { /* Move it to the dpm_off or dpm_off_irq list */ - if (!error) { - list_del(&dev->power.entry); - list_add(&dev->power.entry, &dpm_off); - } else if (error == -EAGAIN) { - list_del(&dev->power.entry); - list_add(&dev->power.entry, &dpm_off_irq); + if (!error) + list_move(&dev->power.entry, &dpm_off); + else if (error == -EAGAIN) { + list_move(&dev->power.entry, &dpm_off_irq); error = 0; } } @@ -139,8 +137,7 @@ int device_suspend(pm_message_t state) */ while (!list_empty(&dpm_off_irq)) { struct list_head * entry = dpm_off_irq.next; - list_del(entry); - list_add(entry, &dpm_off); + list_move(entry, &dpm_off); } dpm_resume(); } diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c index a71a240611e..ed8dca84ff6 100644 --- a/drivers/bluetooth/dtl1_cs.c +++ b/drivers/bluetooth/dtl1_cs.c @@ -423,6 +423,9 @@ static int dtl1_hci_send_frame(struct sk_buff *skb) nsh.len = skb->len; s = bt_skb_alloc(NSHL + skb->len + 1, GFP_ATOMIC); + if (!s) + return -ENOMEM; + skb_reserve(s, NSHL); memcpy(skb_put(s, skb->len), skb->data, skb->len); if (skb->len & 0x0001) diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 63f28d169b3..3610c572955 100644 --- a/drivers/char/Kconfig +++ 
b/drivers/char/Kconfig @@ -62,6 +62,23 @@ config HW_CONSOLE depends on VT && !S390 && !UML default y +config VT_HW_CONSOLE_BINDING + bool "Support for binding and unbinding console drivers" + depends on HW_CONSOLE + default n + ---help--- + The virtual terminal is the device that interacts with the physical + terminal through console drivers. On these systems, at least one + console driver is loaded. In other configurations, additional console + drivers may be enabled, such as the framebuffer console. If more than + 1 console driver is enabled, setting this to 'y' will allow you to + select the console driver that will serve as the backend for the + virtual terminals. + + See <file:Documentation/console/console.txt> for more + information. For framebuffer console users, please refer to + <file:Documentation/fb/fbcon.txt>. + config SERIAL_NONSTANDARD bool "Non-standard serial port support" ---help--- @@ -670,20 +687,7 @@ config NWFLASH If you're not sure, say N. -config HW_RANDOM - tristate "Intel/AMD/VIA HW Random Number Generator support" - depends on (X86 || IA64) && PCI - ---help--- - This driver provides kernel-side support for the Random Number - Generator hardware found on Intel i8xx-based motherboards, - AMD 76x-based motherboards, and Via Nehemiah CPUs. - - Provides a character driver, used to read() entropy data. - - To compile this driver as a module, choose M here: the - module will be called hw_random. - - If unsure, say N. +source "drivers/char/hw_random/Kconfig" config NVRAM tristate "/dev/nvram support" diff --git a/drivers/char/Makefile b/drivers/char/Makefile index fb919bfb282..524105597ea 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -75,7 +75,7 @@ endif obj-$(CONFIG_TOSHIBA) += toshiba.o obj-$(CONFIG_I8K) += i8k.o obj-$(CONFIG_DS1620) += ds1620.o -obj-$(CONFIG_HW_RANDOM) += hw_random.o +obj-$(CONFIG_HW_RANDOM) += hw_random/ obj-$(CONFIG_FTAPE) += ftape/ obj-$(CONFIG_COBALT_LCD) += lcd.o obj-$(CONFIG_PPDEV) += ppdev.o diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index ac626418b32..d69f2ad9a67 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -117,12 +117,12 @@ __setup("hcheck_reboot", hangcheck_parse_reboot); __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86) || defined(CONFIG_S390) +#if defined(CONFIG_X86_64) || defined(CONFIG_S390) # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) # define TIMER_FREQ ((unsigned long long)local_cpu_data->itc_freq) -#elif defined(CONFIG_PPC64) +#else # define TIMER_FREQ (HZ*loops_per_jiffy) #endif diff --git a/drivers/char/hw_random.c b/drivers/char/hw_random.c deleted file mode 100644 index 29dc87e5902..00000000000 --- a/drivers/char/hw_random.c +++ /dev/null @@ -1,698 +0,0 @@ -/* - Added support for the AMD Geode LX RNG - (c) Copyright 2004-2005 Advanced Micro Devices, Inc. - - derived from - - Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) - (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> - - derived from - - Hardware driver for the AMD 768 Random Number Generator (RNG) - (c) Copyright 2001 Red Hat Inc <alan@redhat.com> - - derived from - - Hardware driver for Intel i810 Random Number Generator (RNG) - Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> - Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> - - Please read Documentation/hw_random.txt for details on use. 
- - ---------------------------------------------------------- - This software may be used and distributed according to the terms - of the GNU General Public License, incorporated herein by reference. - - */ - - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/spinlock.h> -#include <linux/random.h> -#include <linux/miscdevice.h> -#include <linux/smp_lock.h> -#include <linux/mm.h> -#include <linux/delay.h> - -#ifdef __i386__ -#include <asm/msr.h> -#include <asm/cpufeature.h> -#endif - -#include <asm/io.h> -#include <asm/uaccess.h> - - -/* - * core module and version information - */ -#define RNG_VERSION "1.0.0" -#define RNG_MODULE_NAME "hw_random" -#define RNG_DRIVER_NAME RNG_MODULE_NAME " hardware driver " RNG_VERSION -#define PFX RNG_MODULE_NAME ": " - - -/* - * debugging macros - */ - -/* pr_debug() collapses to a no-op if DEBUG is not defined */ -#define DPRINTK(fmt, args...) pr_debug(PFX "%s: " fmt, __FUNCTION__ , ## args) - - -#undef RNG_NDEBUG /* define to enable lightweight runtime checks */ -#ifdef RNG_NDEBUG -#define assert(expr) \ - if(!(expr)) { \ - printk(KERN_DEBUG PFX "Assertion failed! %s,%s,%s," \ - "line=%d\n", #expr, __FILE__, __FUNCTION__, __LINE__); \ - } -#else -#define assert(expr) -#endif - -#define RNG_MISCDEV_MINOR 183 /* official */ - -static int rng_dev_open (struct inode *inode, struct file *filp); -static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, - loff_t * offp); - -static int __init intel_init (struct pci_dev *dev); -static void intel_cleanup(void); -static unsigned int intel_data_present (void); -static u32 intel_data_read (void); - -static int __init amd_init (struct pci_dev *dev); -static void amd_cleanup(void); -static unsigned int amd_data_present (void); -static u32 amd_data_read (void); - -#ifdef __i386__ -static int __init via_init(struct pci_dev *dev); -static void via_cleanup(void); -static unsigned int via_data_present (void); -static u32 via_data_read (void); -#endif - -static int __init geode_init(struct pci_dev *dev); -static void geode_cleanup(void); -static unsigned int geode_data_present (void); -static u32 geode_data_read (void); - -struct rng_operations { - int (*init) (struct pci_dev *dev); - void (*cleanup) (void); - unsigned int (*data_present) (void); - u32 (*data_read) (void); - unsigned int n_bytes; /* number of bytes per ->data_read */ -}; -static struct rng_operations *rng_ops; - -static struct file_operations rng_chrdev_ops = { - .owner = THIS_MODULE, - .open = rng_dev_open, - .read = rng_dev_read, -}; - - -static struct miscdevice rng_miscdev = { - RNG_MISCDEV_MINOR, - RNG_MODULE_NAME, - &rng_chrdev_ops, -}; - -enum { - rng_hw_none, - rng_hw_intel, - rng_hw_amd, -#ifdef __i386__ - rng_hw_via, -#endif - rng_hw_geode, -}; - -static struct rng_operations rng_vendor_ops[] = { - /* rng_hw_none */ - { }, - - /* rng_hw_intel */ - { intel_init, intel_cleanup, intel_data_present, - intel_data_read, 1 }, - - /* rng_hw_amd */ - { amd_init, amd_cleanup, amd_data_present, amd_data_read, 4 }, - -#ifdef __i386__ - /* rng_hw_via */ - { via_init, via_cleanup, via_data_present, via_data_read, 1 }, -#endif - - /* rng_hw_geode */ - { geode_init, geode_cleanup, geode_data_present, geode_data_read, 4 } -}; - -/* - * Data for PCI driver interface - * - * This data only exists for exporting the supported - * PCI ids via MODULE_DEVICE_TABLE. 
We do not actually - * register a pci_driver, because someone else might one day - * want to register another driver on the same PCI id. - */ -static struct pci_device_id rng_pci_tbl[] = { - { 0x1022, 0x7443, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_amd }, - { 0x1022, 0x746b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_amd }, - - { 0x8086, 0x2418, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - { 0x8086, 0x2428, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - { 0x8086, 0x2430, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - { 0x8086, 0x2448, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, - - { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LX_AES, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_geode }, - - { 0, }, /* terminate list */ -}; -MODULE_DEVICE_TABLE (pci, rng_pci_tbl); - - -/*********************************************************************** - * - * Intel RNG operations - * - */ - -/* - * RNG registers (offsets from rng_mem) - */ -#define INTEL_RNG_HW_STATUS 0 -#define INTEL_RNG_PRESENT 0x40 -#define INTEL_RNG_ENABLED 0x01 -#define INTEL_RNG_STATUS 1 -#define INTEL_RNG_DATA_PRESENT 0x01 -#define INTEL_RNG_DATA 2 - -/* - * Magic address at which Intel PCI bridges locate the RNG - */ -#define INTEL_RNG_ADDR 0xFFBC015F -#define INTEL_RNG_ADDR_LEN 3 - -/* token to our ioremap'd RNG register area */ -static void __iomem *rng_mem; - -static inline u8 intel_hwstatus (void) -{ - assert (rng_mem != NULL); - return readb (rng_mem + INTEL_RNG_HW_STATUS); -} - -static inline u8 intel_hwstatus_set (u8 hw_status) -{ - assert (rng_mem != NULL); - writeb (hw_status, rng_mem + INTEL_RNG_HW_STATUS); - return intel_hwstatus (); -} - -static unsigned int intel_data_present(void) -{ - assert (rng_mem != NULL); - - return (readb (rng_mem + INTEL_RNG_STATUS) & INTEL_RNG_DATA_PRESENT) ? 
- 1 : 0; -} - -static u32 intel_data_read(void) -{ - assert (rng_mem != NULL); - - return readb (rng_mem + INTEL_RNG_DATA); -} - -static int __init intel_init (struct pci_dev *dev) -{ - int rc; - u8 hw_status; - - DPRINTK ("ENTER\n"); - - rng_mem = ioremap (INTEL_RNG_ADDR, INTEL_RNG_ADDR_LEN); - if (rng_mem == NULL) { - printk (KERN_ERR PFX "cannot ioremap RNG Memory\n"); - rc = -EBUSY; - goto err_out; - } - - /* Check for Intel 82802 */ - hw_status = intel_hwstatus (); - if ((hw_status & INTEL_RNG_PRESENT) == 0) { - printk (KERN_ERR PFX "RNG not detected\n"); - rc = -ENODEV; - goto err_out_free_map; - } - - /* turn RNG h/w on, if it's off */ - if ((hw_status & INTEL_RNG_ENABLED) == 0) - hw_status = intel_hwstatus_set (hw_status | INTEL_RNG_ENABLED); - if ((hw_status & INTEL_RNG_ENABLED) == 0) { - printk (KERN_ERR PFX "cannot enable RNG, aborting\n"); - rc = -EIO; - goto err_out_free_map; - } - - DPRINTK ("EXIT, returning 0\n"); - return 0; - -err_out_free_map: - iounmap (rng_mem); - rng_mem = NULL; -err_out: - DPRINTK ("EXIT, returning %d\n", rc); - return rc; -} - -static void intel_cleanup(void) -{ - u8 hw_status; - - hw_status = intel_hwstatus (); - if (hw_status & INTEL_RNG_ENABLED) - intel_hwstatus_set (hw_status & ~INTEL_RNG_ENABLED); - else - printk(KERN_WARNING PFX "unusual: RNG already disabled\n"); - iounmap(rng_mem); - rng_mem = NULL; -} - -/*********************************************************************** - * - * AMD RNG operations - * - */ - -static u32 pmbase; /* PMxx I/O base */ -static struct pci_dev *amd_dev; - -static unsigned int amd_data_present (void) -{ - return inl(pmbase + 0xF4) & 1; -} - - -static u32 amd_data_read (void) -{ - return inl(pmbase + 0xF0); -} - -static int __init amd_init (struct pci_dev *dev) -{ - int rc; - u8 rnen; - - DPRINTK ("ENTER\n"); - - pci_read_config_dword(dev, 0x58, &pmbase); - - pmbase &= 0x0000FF00; - - if (pmbase == 0) - { - printk (KERN_ERR PFX "power management base not set\n"); - rc = -EIO; - goto err_out; - } - - pci_read_config_byte(dev, 0x40, &rnen); - rnen |= (1 << 7); /* RNG on */ - pci_write_config_byte(dev, 0x40, rnen); - - pci_read_config_byte(dev, 0x41, &rnen); - rnen |= (1 << 7); /* PMIO enable */ - pci_write_config_byte(dev, 0x41, rnen); - - pr_info( PFX "AMD768 system management I/O registers at 0x%X.\n", - pmbase); - - amd_dev = dev; - - DPRINTK ("EXIT, returning 0\n"); - return 0; - -err_out: - DPRINTK ("EXIT, returning %d\n", rc); - return rc; -} - -static void amd_cleanup(void) -{ - u8 rnen; - - pci_read_config_byte(amd_dev, 0x40, &rnen); - rnen &= ~(1 << 7); /* RNG off */ - pci_write_config_byte(amd_dev, 0x40, rnen); - - /* FIXME: twiddle pmio, also? */ -} - -#ifdef __i386__ -/*********************************************************************** - * - * VIA RNG operations - * - */ - -enum { - VIA_STRFILT_CNT_SHIFT = 16, - VIA_STRFILT_FAIL = (1 << 15), - VIA_STRFILT_ENABLE = (1 << 14), - VIA_RAWBITS_ENABLE = (1 << 13), - VIA_RNG_ENABLE = (1 << 6), - VIA_XSTORE_CNT_MASK = 0x0F, - - VIA_RNG_CHUNK_8 = 0x00, /* 64 rand bits, 64 stored bits */ - VIA_RNG_CHUNK_4 = 0x01, /* 32 rand bits, 32 stored bits */ - VIA_RNG_CHUNK_4_MASK = 0xFFFFFFFF, - VIA_RNG_CHUNK_2 = 0x02, /* 16 rand bits, 32 stored bits */ - VIA_RNG_CHUNK_2_MASK = 0xFFFF, - VIA_RNG_CHUNK_1 = 0x03, /* 8 rand bits, 32 stored bits */ - VIA_RNG_CHUNK_1_MASK = 0xFF, -}; - -static u32 via_rng_datum; - -/* - * Investigate using the 'rep' prefix to obtain 32 bits of random data - * in one insn. The upside is potentially better performance. 
The - * downside is that the instruction becomes no longer atomic. Due to - * this, just like familiar issues with /dev/random itself, the worst - * case of a 'rep xstore' could potentially pause a cpu for an - * unreasonably long time. In practice, this condition would likely - * only occur when the hardware is failing. (or so we hope :)) - * - * Another possible performance boost may come from simply buffering - * until we have 4 bytes, thus returning a u32 at a time, - * instead of the current u8-at-a-time. - */ - -static inline u32 xstore(u32 *addr, u32 edx_in) -{ - u32 eax_out; - - asm(".byte 0x0F,0xA7,0xC0 /* xstore %%edi (addr=%0) */" - :"=m"(*addr), "=a"(eax_out) - :"D"(addr), "d"(edx_in)); - - return eax_out; -} - -static unsigned int via_data_present(void) -{ - u32 bytes_out; - - /* We choose the recommended 1-byte-per-instruction RNG rate, - * for greater randomness at the expense of speed. Larger - * values 2, 4, or 8 bytes-per-instruction yield greater - * speed at lesser randomness. - * - * If you change this to another VIA_CHUNK_n, you must also - * change the ->n_bytes values in rng_vendor_ops[] tables. - * VIA_CHUNK_8 requires further code changes. - * - * A copy of MSR_VIA_RNG is placed in eax_out when xstore - * completes. - */ - via_rng_datum = 0; /* paranoia, not really necessary */ - bytes_out = xstore(&via_rng_datum, VIA_RNG_CHUNK_1) & VIA_XSTORE_CNT_MASK; - if (bytes_out == 0) - return 0; - - return 1; -} - -static u32 via_data_read(void) -{ - return via_rng_datum; -} - -static int __init via_init(struct pci_dev *dev) -{ - u32 lo, hi, old_lo; - - /* Control the RNG via MSR. Tread lightly and pay very close - * close attention to values written, as the reserved fields - * are documented to be "undefined and unpredictable"; but it - * does not say to write them as zero, so I make a guess that - * we restore the values we find in the register. 
- */ - rdmsr(MSR_VIA_RNG, lo, hi); - - old_lo = lo; - lo &= ~(0x7f << VIA_STRFILT_CNT_SHIFT); - lo &= ~VIA_XSTORE_CNT_MASK; - lo &= ~(VIA_STRFILT_ENABLE | VIA_STRFILT_FAIL | VIA_RAWBITS_ENABLE); - lo |= VIA_RNG_ENABLE; - - if (lo != old_lo) - wrmsr(MSR_VIA_RNG, lo, hi); - - /* perhaps-unnecessary sanity check; remove after testing if - unneeded */ - rdmsr(MSR_VIA_RNG, lo, hi); - if ((lo & VIA_RNG_ENABLE) == 0) { - printk(KERN_ERR PFX "cannot enable VIA C3 RNG, aborting\n"); - return -ENODEV; - } - - return 0; -} - -static void via_cleanup(void) -{ - /* do nothing */ -} -#endif - -/*********************************************************************** - * - * AMD Geode RNG operations - * - */ - -static void __iomem *geode_rng_base = NULL; - -#define GEODE_RNG_DATA_REG 0x50 -#define GEODE_RNG_STATUS_REG 0x54 - -static u32 geode_data_read(void) -{ - u32 val; - - assert(geode_rng_base != NULL); - val = readl(geode_rng_base + GEODE_RNG_DATA_REG); - return val; -} - -static unsigned int geode_data_present(void) -{ - u32 val; - - assert(geode_rng_base != NULL); - val = readl(geode_rng_base + GEODE_RNG_STATUS_REG); - return val; -} - -static void geode_cleanup(void) -{ - iounmap(geode_rng_base); - geode_rng_base = NULL; -} - -static int geode_init(struct pci_dev *dev) -{ - unsigned long rng_base = pci_resource_start(dev, 0); - - if (rng_base == 0) - return 1; - - geode_rng_base = ioremap(rng_base, 0x58); - - if (geode_rng_base == NULL) { - printk(KERN_ERR PFX "Cannot ioremap RNG memory\n"); - return -EBUSY; - } - - return 0; -} - -/*********************************************************************** - * - * /dev/hwrandom character device handling (major 10, minor 183) - * - */ - -static int rng_dev_open (struct inode *inode, struct file *filp) -{ - /* enforce read-only access to this chrdev */ - if ((filp->f_mode & FMODE_READ) == 0) - return -EINVAL; - if (filp->f_mode & FMODE_WRITE) - return -EINVAL; - - return 0; -} - - -static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, - loff_t * offp) -{ - static DEFINE_SPINLOCK(rng_lock); - unsigned int have_data; - u32 data = 0; - ssize_t ret = 0; - - while (size) { - spin_lock(&rng_lock); - - have_data = 0; - if (rng_ops->data_present()) { - data = rng_ops->data_read(); - have_data = rng_ops->n_bytes; - } - - spin_unlock (&rng_lock); - - while (have_data && size) { - if (put_user((u8)data, buf++)) { - ret = ret ? : -EFAULT; - break; - } - size--; - ret++; - have_data--; - data>>=8; - } - - if (filp->f_flags & O_NONBLOCK) - return ret ? : -EAGAIN; - - if(need_resched()) - schedule_timeout_interruptible(1); - else - udelay(200); /* FIXME: We could poll for 250uS ?? */ - - if (signal_pending (current)) - return ret ? 
: -ERESTARTSYS; - } - return ret; -} - - - -/* - * rng_init_one - look for and attempt to init a single RNG - */ -static int __init rng_init_one (struct pci_dev *dev) -{ - int rc; - - DPRINTK ("ENTER\n"); - - assert(rng_ops != NULL); - - rc = rng_ops->init(dev); - if (rc) - goto err_out; - - rc = misc_register (&rng_miscdev); - if (rc) { - printk (KERN_ERR PFX "misc device register failed\n"); - goto err_out_cleanup_hw; - } - - DPRINTK ("EXIT, returning 0\n"); - return 0; - -err_out_cleanup_hw: - rng_ops->cleanup(); -err_out: - DPRINTK ("EXIT, returning %d\n", rc); - return rc; -} - - - -MODULE_AUTHOR("The Linux Kernel team"); -MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); -MODULE_LICENSE("GPL"); - - -/* - * rng_init - initialize RNG module - */ -static int __init rng_init (void) -{ - int rc; - struct pci_dev *pdev = NULL; - const struct pci_device_id *ent; - - DPRINTK ("ENTER\n"); - - /* Probe for Intel, AMD, Geode RNGs */ - for_each_pci_dev(pdev) { - ent = pci_match_id(rng_pci_tbl, pdev); - if (ent) { - rng_ops = &rng_vendor_ops[ent->driver_data]; - goto match; - } - } - -#ifdef __i386__ - /* Probe for VIA RNG */ - if (cpu_has_xstore) { - rng_ops = &rng_vendor_ops[rng_hw_via]; - pdev = NULL; - goto match; - } -#endif - - DPRINTK ("EXIT, returning -ENODEV\n"); - return -ENODEV; - -match: - rc = rng_init_one (pdev); - if (rc) - return rc; - - pr_info( RNG_DRIVER_NAME " loaded\n"); - - DPRINTK ("EXIT, returning 0\n"); - return 0; -} - - -/* - * rng_init - shutdown RNG module - */ -static void __exit rng_cleanup (void) -{ - DPRINTK ("ENTER\n"); - - misc_deregister (&rng_miscdev); - - if (rng_ops->cleanup) - rng_ops->cleanup(); - - DPRINTK ("EXIT\n"); -} - - -module_init (rng_init); -module_exit (rng_cleanup); diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig new file mode 100644 index 00000000000..9f7635f7517 --- /dev/null +++ b/drivers/char/hw_random/Kconfig @@ -0,0 +1,90 @@ +# +# Hardware Random Number Generator (RNG) configuration +# + +config HW_RANDOM + bool "Hardware Random Number Generator Core support" + default y + ---help--- + Hardware Random Number Generator Core infrastructure. + + If unsure, say Y. + +config HW_RANDOM_INTEL + tristate "Intel HW Random Number Generator support" + depends on HW_RANDOM && (X86 || IA64) && PCI + default y + ---help--- + This driver provides kernel-side support for the Random Number + Generator hardware found on Intel i8xx-based motherboards. + + To compile this driver as a module, choose M here: the + module will be called intel-rng. + + If unsure, say Y. + +config HW_RANDOM_AMD + tristate "AMD HW Random Number Generator support" + depends on HW_RANDOM && X86 && PCI + default y + ---help--- + This driver provides kernel-side support for the Random Number + Generator hardware found on AMD 76x-based motherboards. + + To compile this driver as a module, choose M here: the + module will be called amd-rng. + + If unsure, say Y. + +config HW_RANDOM_GEODE + tristate "AMD Geode HW Random Number Generator support" + depends on HW_RANDOM && X86 && PCI + default y + ---help--- + This driver provides kernel-side support for the Random Number + Generator hardware found on the AMD Geode LX. + + To compile this driver as a module, choose M here: the + module will be called geode-rng. + + If unsure, say Y. 
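With the old monolithic option split up this way, the core and each back-end are selected independently. An illustrative .config fragment (option names as defined in this file; the VIA, IXP4xx and OMAP entries follow below) that builds the core in and the x86 back-ends as modules:

    CONFIG_HW_RANDOM=y
    CONFIG_HW_RANDOM_INTEL=m
    CONFIG_HW_RANDOM_AMD=m
    CONFIG_HW_RANDOM_GEODE=m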
+ +config HW_RANDOM_VIA + tristate "VIA HW Random Number Generator support" + depends on HW_RANDOM && X86_32 + default y + ---help--- + This driver provides kernel-side support for the Random Number + Generator hardware found on VIA based motherboards. + + To compile this driver as a module, choose M here: the + module will be called via-rng. + + If unsure, say Y. + +config HW_RANDOM_IXP4XX + tristate "Intel IXP4xx NPU HW Random Number Generator support" + depends on HW_RANDOM && ARCH_IXP4XX + default y + ---help--- + This driver provides kernel-side support for the Random + Number Generator hardware found on the Intel IXP4xx NPU. + + To compile this driver as a module, choose M here: the + module will be called ixp4xx-rng. + + If unsure, say Y. + +config HW_RANDOM_OMAP + tristate "OMAP Random Number Generator support" + depends on HW_RANDOM && (ARCH_OMAP16XX || ARCH_OMAP24XX) + default y + ---help--- + This driver provides kernel-side support for the Random Number + Generator hardware found on OMAP16xx and OMAP24xx multimedia + processors. + + To compile this driver as a module, choose M here: the + module will be called omap-rng. + + If unsure, say Y. diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile new file mode 100644 index 00000000000..e263ae96f94 --- /dev/null +++ b/drivers/char/hw_random/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for HW Random Number Generator (RNG) device drivers. +# + +obj-$(CONFIG_HW_RANDOM) += core.o +obj-$(CONFIG_HW_RANDOM_INTEL) += intel-rng.o +obj-$(CONFIG_HW_RANDOM_AMD) += amd-rng.o +obj-$(CONFIG_HW_RANDOM_GEODE) += geode-rng.o +obj-$(CONFIG_HW_RANDOM_VIA) += via-rng.o +obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o +obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c new file mode 100644 index 00000000000..71e4e0f3fd5 --- /dev/null +++ b/drivers/char/hw_random/amd-rng.c @@ -0,0 +1,152 @@ +/* + * RNG driver for AMD RNGs + * + * Copyright 2005 (c) MontaVista Software, Inc. + * + * with the majority of the code coming from: + * + * Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) + * (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> + * + * derived from + * + * Hardware driver for the AMD 768 Random Number Generator (RNG) + * (c) Copyright 2001 Red Hat Inc <alan@redhat.com> + * + * derived from + * + * Hardware driver for Intel i810 Random Number Generator (RNG) + * Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> + * Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/hw_random.h> +#include <asm/io.h> + + +#define PFX KBUILD_MODNAME ": " + + +/* + * Data for PCI driver interface + * + * This data only exists for exporting the supported + * PCI ids via MODULE_DEVICE_TABLE. We do not actually + * register a pci_driver, because someone else might one day + * want to register another driver on the same PCI id. 
+ */ +static const struct pci_device_id pci_tbl[] = { + { 0x1022, 0x7443, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x1022, 0x746b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0, }, /* terminate list */ +}; +MODULE_DEVICE_TABLE(pci, pci_tbl); + +static struct pci_dev *amd_pdev; + + +static int amd_rng_data_present(struct hwrng *rng) +{ + u32 pmbase = (u32)rng->priv; + + return !!(inl(pmbase + 0xF4) & 1); +} + +static int amd_rng_data_read(struct hwrng *rng, u32 *data) +{ + u32 pmbase = (u32)rng->priv; + + *data = inl(pmbase + 0xF0); + + return 4; +} + +static int amd_rng_init(struct hwrng *rng) +{ + u8 rnen; + + pci_read_config_byte(amd_pdev, 0x40, &rnen); + rnen |= (1 << 7); /* RNG on */ + pci_write_config_byte(amd_pdev, 0x40, rnen); + + pci_read_config_byte(amd_pdev, 0x41, &rnen); + rnen |= (1 << 7); /* PMIO enable */ + pci_write_config_byte(amd_pdev, 0x41, rnen); + + return 0; +} + +static void amd_rng_cleanup(struct hwrng *rng) +{ + u8 rnen; + + pci_read_config_byte(amd_pdev, 0x40, &rnen); + rnen &= ~(1 << 7); /* RNG off */ + pci_write_config_byte(amd_pdev, 0x40, rnen); +} + + +static struct hwrng amd_rng = { + .name = "amd", + .init = amd_rng_init, + .cleanup = amd_rng_cleanup, + .data_present = amd_rng_data_present, + .data_read = amd_rng_data_read, +}; + + +static int __init mod_init(void) +{ + int err = -ENODEV; + struct pci_dev *pdev = NULL; + const struct pci_device_id *ent; + u32 pmbase; + + for_each_pci_dev(pdev) { + ent = pci_match_id(pci_tbl, pdev); + if (ent) + goto found; + } + /* Device not found. */ + goto out; + +found: + err = pci_read_config_dword(pdev, 0x58, &pmbase); + if (err) + goto out; + err = -EIO; + pmbase &= 0x0000FF00; + if (pmbase == 0) + goto out; + amd_rng.priv = (unsigned long)pmbase; + amd_pdev = pdev; + + printk(KERN_INFO "AMD768 RNG detected\n"); + err = hwrng_register(&amd_rng); + if (err) { + printk(KERN_ERR PFX "RNG registering failed (%d)\n", + err); + goto out; + } +out: + return err; +} + +static void __exit mod_exit(void) +{ + hwrng_unregister(&amd_rng); +} + +subsys_initcall(mod_init); +module_exit(mod_exit); + +MODULE_AUTHOR("The Linux Kernel team"); +MODULE_DESCRIPTION("H/W RNG driver for AMD chipsets"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c new file mode 100644 index 00000000000..88b026639f1 --- /dev/null +++ b/drivers/char/hw_random/core.c @@ -0,0 +1,354 @@ +/* + Added support for the AMD Geode LX RNG + (c) Copyright 2004-2005 Advanced Micro Devices, Inc. + + derived from + + Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) + (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> + + derived from + + Hardware driver for the AMD 768 Random Number Generator (RNG) + (c) Copyright 2001 Red Hat Inc <alan@redhat.com> + + derived from + + Hardware driver for Intel i810 Random Number Generator (RNG) + Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> + Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> + + Added generic RNG API + Copyright 2006 Michael Buesch <mbuesch@freenet.de> + Copyright 2005 (c) MontaVista Software, Inc. + + Please read Documentation/hw_random.txt for details on use. + + ---------------------------------------------------------- + This software may be used and distributed according to the terms + of the GNU General Public License, incorporated herein by reference. 
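Before the implementation, the provider-side contract in brief: hwrng_register() below insists only on .name and .data_read, the .init/.cleanup/.data_present hooks are optional, and data_read() returns how many bytes of *data are valid. A hypothetical minimal back-end (sketch only, not part of the patch):

    #include <linux/hw_random.h>

    static int toy_rng_data_read(struct hwrng *rng, u32 *data)
    {
            *data = 0x12345678;     /* stand-in for a real hardware read */
            return 4;               /* all four bytes of *data are valid */
    }

    static struct hwrng toy_rng = {
            .name      = "toy",
            .data_read = toy_rng_data_read,
    };

    /* hwrng_register(&toy_rng) on load,
     * hwrng_unregister(&toy_rng) on unload. */

The first successful registration becomes current_rng and creates the misc device plus the rng_current/rng_available attributes; assuming standard misc-class placement those should appear under /sys/class/misc/hw_random/, though the patch does not spell the path out.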
+ + */ + + +#include <linux/device.h> +#include <linux/hw_random.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/delay.h> +#include <asm/uaccess.h> + + +#define RNG_MODULE_NAME "hw_random" +#define PFX RNG_MODULE_NAME ": " +#define RNG_MISCDEV_MINOR 183 /* official */ + + +static struct hwrng *current_rng; +static LIST_HEAD(rng_list); +static DEFINE_MUTEX(rng_mutex); + + +static inline int hwrng_init(struct hwrng *rng) +{ + if (!rng->init) + return 0; + return rng->init(rng); +} + +static inline void hwrng_cleanup(struct hwrng *rng) +{ + if (rng && rng->cleanup) + rng->cleanup(rng); +} + +static inline int hwrng_data_present(struct hwrng *rng) +{ + if (!rng->data_present) + return 1; + return rng->data_present(rng); +} + +static inline int hwrng_data_read(struct hwrng *rng, u32 *data) +{ + return rng->data_read(rng, data); +} + + +static int rng_dev_open(struct inode *inode, struct file *filp) +{ + /* enforce read-only access to this chrdev */ + if ((filp->f_mode & FMODE_READ) == 0) + return -EINVAL; + if (filp->f_mode & FMODE_WRITE) + return -EINVAL; + return 0; +} + +static ssize_t rng_dev_read(struct file *filp, char __user *buf, + size_t size, loff_t *offp) +{ + u32 data; + ssize_t ret = 0; + int i, err = 0; + int data_present; + int bytes_read; + + while (size) { + err = -ERESTARTSYS; + if (mutex_lock_interruptible(&rng_mutex)) + goto out; + if (!current_rng) { + mutex_unlock(&rng_mutex); + err = -ENODEV; + goto out; + } + if (filp->f_flags & O_NONBLOCK) { + data_present = hwrng_data_present(current_rng); + } else { + /* Some RNG require some time between data_reads to gather + * new entropy. Poll it. + */ + for (i = 0; i < 20; i++) { + data_present = hwrng_data_present(current_rng); + if (data_present) + break; + udelay(10); + } + } + bytes_read = 0; + if (data_present) + bytes_read = hwrng_data_read(current_rng, &data); + mutex_unlock(&rng_mutex); + + err = -EAGAIN; + if (!bytes_read && (filp->f_flags & O_NONBLOCK)) + goto out; + + err = -EFAULT; + while (bytes_read && size) { + if (put_user((u8)data, buf++)) + goto out; + size--; + ret++; + bytes_read--; + data >>= 8; + } + + if (need_resched()) + schedule_timeout_interruptible(1); + err = -ERESTARTSYS; + if (signal_pending(current)) + goto out; + } +out: + return ret ? : err; +} + + +static struct file_operations rng_chrdev_ops = { + .owner = THIS_MODULE, + .open = rng_dev_open, + .read = rng_dev_read, +}; + +static struct miscdevice rng_miscdev = { + .minor = RNG_MISCDEV_MINOR, + .name = RNG_MODULE_NAME, + .fops = &rng_chrdev_ops, +}; + + +static ssize_t hwrng_attr_current_store(struct class_device *class, + const char *buf, size_t len) +{ + int err; + struct hwrng *rng; + + err = mutex_lock_interruptible(&rng_mutex); + if (err) + return -ERESTARTSYS; + err = -ENODEV; + list_for_each_entry(rng, &rng_list, list) { + if (strcmp(rng->name, buf) == 0) { + if (rng == current_rng) { + err = 0; + break; + } + err = hwrng_init(rng); + if (err) + break; + hwrng_cleanup(current_rng); + current_rng = rng; + err = 0; + break; + } + } + mutex_unlock(&rng_mutex); + + return err ? 
: len; +} + +static ssize_t hwrng_attr_current_show(struct class_device *class, + char *buf) +{ + int err; + ssize_t ret; + const char *name = "none"; + + err = mutex_lock_interruptible(&rng_mutex); + if (err) + return -ERESTARTSYS; + if (current_rng) + name = current_rng->name; + ret = snprintf(buf, PAGE_SIZE, "%s\n", name); + mutex_unlock(&rng_mutex); + + return ret; +} + +static ssize_t hwrng_attr_available_show(struct class_device *class, + char *buf) +{ + int err; + ssize_t ret = 0; + struct hwrng *rng; + + err = mutex_lock_interruptible(&rng_mutex); + if (err) + return -ERESTARTSYS; + buf[0] = '\0'; + list_for_each_entry(rng, &rng_list, list) { + strncat(buf, rng->name, PAGE_SIZE - ret - 1); + ret += strlen(rng->name); + strncat(buf, " ", PAGE_SIZE - ret - 1); + ret++; + } + strncat(buf, "\n", PAGE_SIZE - ret - 1); + ret++; + mutex_unlock(&rng_mutex); + + return ret; +} + +static CLASS_DEVICE_ATTR(rng_current, S_IRUGO | S_IWUSR, + hwrng_attr_current_show, + hwrng_attr_current_store); +static CLASS_DEVICE_ATTR(rng_available, S_IRUGO, + hwrng_attr_available_show, + NULL); + + +static void unregister_miscdev(void) +{ + class_device_remove_file(rng_miscdev.class, + &class_device_attr_rng_available); + class_device_remove_file(rng_miscdev.class, + &class_device_attr_rng_current); + misc_deregister(&rng_miscdev); +} + +static int register_miscdev(void) +{ + int err; + + err = misc_register(&rng_miscdev); + if (err) + goto out; + err = class_device_create_file(rng_miscdev.class, + &class_device_attr_rng_current); + if (err) + goto err_misc_dereg; + err = class_device_create_file(rng_miscdev.class, + &class_device_attr_rng_available); + if (err) + goto err_remove_current; +out: + return err; + +err_remove_current: + class_device_remove_file(rng_miscdev.class, + &class_device_attr_rng_current); +err_misc_dereg: + misc_deregister(&rng_miscdev); + goto out; +} + +int hwrng_register(struct hwrng *rng) +{ + int must_register_misc; + int err = -EINVAL; + struct hwrng *old_rng, *tmp; + + if (rng->name == NULL || + rng->data_read == NULL) + goto out; + + mutex_lock(&rng_mutex); + + /* Must not register two RNGs with the same name. 
*/ + err = -EEXIST; + list_for_each_entry(tmp, &rng_list, list) { + if (strcmp(tmp->name, rng->name) == 0) + goto out_unlock; + } + + must_register_misc = (current_rng == NULL); + old_rng = current_rng; + if (!old_rng) { + err = hwrng_init(rng); + if (err) + goto out_unlock; + current_rng = rng; + } + err = 0; + if (must_register_misc) { + err = register_miscdev(); + if (err) { + if (!old_rng) { + hwrng_cleanup(rng); + current_rng = NULL; + } + goto out_unlock; + } + } + INIT_LIST_HEAD(&rng->list); + list_add_tail(&rng->list, &rng_list); +out_unlock: + mutex_unlock(&rng_mutex); +out: + return err; +} +EXPORT_SYMBOL_GPL(hwrng_register); + +void hwrng_unregister(struct hwrng *rng) +{ + int err; + + mutex_lock(&rng_mutex); + + list_del(&rng->list); + if (current_rng == rng) { + hwrng_cleanup(rng); + if (list_empty(&rng_list)) { + current_rng = NULL; + } else { + current_rng = list_entry(rng_list.prev, struct hwrng, list); + err = hwrng_init(current_rng); + if (err) + current_rng = NULL; + } + } + if (list_empty(&rng_list)) + unregister_miscdev(); + + mutex_unlock(&rng_mutex); +} +EXPORT_SYMBOL_GPL(hwrng_unregister); + + +MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/geode-rng.c b/drivers/char/hw_random/geode-rng.c new file mode 100644 index 00000000000..be61f22ee7b --- /dev/null +++ b/drivers/char/hw_random/geode-rng.c @@ -0,0 +1,128 @@ +/* + * RNG driver for AMD Geode RNGs + * + * Copyright 2005 (c) MontaVista Software, Inc. + * + * with the majority of the code coming from: + * + * Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) + * (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> + * + * derived from + * + * Hardware driver for the AMD 768 Random Number Generator (RNG) + * (c) Copyright 2001 Red Hat Inc <alan@redhat.com> + * + * derived from + * + * Hardware driver for Intel i810 Random Number Generator (RNG) + * Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> + * Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/hw_random.h> +#include <asm/io.h> + + +#define PFX KBUILD_MODNAME ": " + +#define GEODE_RNG_DATA_REG 0x50 +#define GEODE_RNG_STATUS_REG 0x54 + +/* + * Data for PCI driver interface + * + * This data only exists for exporting the supported + * PCI ids via MODULE_DEVICE_TABLE. We do not actually + * register a pci_driver, because someone else might one day + * want to register another driver on the same PCI id. 
+ */ +static const struct pci_device_id pci_tbl[] = { + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LX_AES, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0, }, /* terminate list */ +}; +MODULE_DEVICE_TABLE(pci, pci_tbl); + + +static int geode_rng_data_read(struct hwrng *rng, u32 *data) +{ + void __iomem *mem = (void __iomem *)rng->priv; + + *data = readl(mem + GEODE_RNG_DATA_REG); + + return 4; +} + +static int geode_rng_data_present(struct hwrng *rng) +{ + void __iomem *mem = (void __iomem *)rng->priv; + + return !!(readl(mem + GEODE_RNG_STATUS_REG)); +} + + +static struct hwrng geode_rng = { + .name = "geode", + .data_present = geode_rng_data_present, + .data_read = geode_rng_data_read, +}; + + +static int __init mod_init(void) +{ + int err = -ENODEV; + struct pci_dev *pdev = NULL; + const struct pci_device_id *ent; + void __iomem *mem; + unsigned long rng_base; + + for_each_pci_dev(pdev) { + ent = pci_match_id(pci_tbl, pdev); + if (ent) + goto found; + } + /* Device not found. */ + goto out; + +found: + rng_base = pci_resource_start(pdev, 0); + if (rng_base == 0) + goto out; + err = -ENOMEM; + mem = ioremap(rng_base, 0x58); + if (!mem) + goto out; + geode_rng.priv = (unsigned long)mem; + + printk(KERN_INFO "AMD Geode RNG detected\n"); + err = hwrng_register(&geode_rng); + if (err) { + printk(KERN_ERR PFX "RNG registering failed (%d)\n", + err); + goto err_unmap; + } +out: + return err; + +err_unmap: + iounmap(mem); + goto out; +} + +static void __exit mod_exit(void) +{ + void __iomem *mem = (void __iomem *)geode_rng.priv; + + hwrng_unregister(&geode_rng); + iounmap(mem); +} + +subsys_initcall(mod_init); +module_exit(mod_exit); + +MODULE_DESCRIPTION("H/W RNG driver for AMD Geode LX CPUs"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/intel-rng.c b/drivers/char/hw_random/intel-rng.c new file mode 100644 index 00000000000..6594bd5645f --- /dev/null +++ b/drivers/char/hw_random/intel-rng.c @@ -0,0 +1,189 @@ +/* + * RNG driver for Intel RNGs + * + * Copyright 2005 (c) MontaVista Software, Inc. + * + * with the majority of the code coming from: + * + * Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) + * (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> + * + * derived from + * + * Hardware driver for the AMD 768 Random Number Generator (RNG) + * (c) Copyright 2001 Red Hat Inc <alan@redhat.com> + * + * derived from + * + * Hardware driver for Intel i810 Random Number Generator (RNG) + * Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> + * Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/hw_random.h> +#include <asm/io.h> + + +#define PFX KBUILD_MODNAME ": " + +/* + * RNG registers + */ +#define INTEL_RNG_HW_STATUS 0 +#define INTEL_RNG_PRESENT 0x40 +#define INTEL_RNG_ENABLED 0x01 +#define INTEL_RNG_STATUS 1 +#define INTEL_RNG_DATA_PRESENT 0x01 +#define INTEL_RNG_DATA 2 + +/* + * Magic address at which Intel PCI bridges locate the RNG + */ +#define INTEL_RNG_ADDR 0xFFBC015F +#define INTEL_RNG_ADDR_LEN 3 + +/* + * Data for PCI driver interface + * + * This data only exists for exporting the supported + * PCI ids via MODULE_DEVICE_TABLE. We do not actually + * register a pci_driver, because someone else might one day + * want to register another driver on the same PCI id. 
+ */ +static const struct pci_device_id pci_tbl[] = { + { 0x8086, 0x2418, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x8086, 0x2428, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x8086, 0x2430, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x8086, 0x2448, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, }, + { 0, }, /* terminate list */ +}; +MODULE_DEVICE_TABLE(pci, pci_tbl); + + +static inline u8 hwstatus_get(void __iomem *mem) +{ + return readb(mem + INTEL_RNG_HW_STATUS); +} + +static inline u8 hwstatus_set(void __iomem *mem, + u8 hw_status) +{ + writeb(hw_status, mem + INTEL_RNG_HW_STATUS); + return hwstatus_get(mem); +} + +static int intel_rng_data_present(struct hwrng *rng) +{ + void __iomem *mem = (void __iomem *)rng->priv; + + return !!(readb(mem + INTEL_RNG_STATUS) & INTEL_RNG_DATA_PRESENT); +} + +static int intel_rng_data_read(struct hwrng *rng, u32 *data) +{ + void __iomem *mem = (void __iomem *)rng->priv; + + *data = readb(mem + INTEL_RNG_DATA); + + return 1; +} + +static int intel_rng_init(struct hwrng *rng) +{ + void __iomem *mem = (void __iomem *)rng->priv; + u8 hw_status; + int err = -EIO; + + hw_status = hwstatus_get(mem); + /* turn RNG h/w on, if it's off */ + if ((hw_status & INTEL_RNG_ENABLED) == 0) + hw_status = hwstatus_set(mem, hw_status | INTEL_RNG_ENABLED); + if ((hw_status & INTEL_RNG_ENABLED) == 0) { + printk(KERN_ERR PFX "cannot enable RNG, aborting\n"); + goto out; + } + err = 0; +out: + return err; +} + +static void intel_rng_cleanup(struct hwrng *rng) +{ + void __iomem *mem = (void __iomem *)rng->priv; + u8 hw_status; + + hw_status = hwstatus_get(mem); + if (hw_status & INTEL_RNG_ENABLED) + hwstatus_set(mem, hw_status & ~INTEL_RNG_ENABLED); + else + printk(KERN_WARNING PFX "unusual: RNG already disabled\n"); +} + + +static struct hwrng intel_rng = { + .name = "intel", + .init = intel_rng_init, + .cleanup = intel_rng_cleanup, + .data_present = intel_rng_data_present, + .data_read = intel_rng_data_read, +}; + + +static int __init mod_init(void) +{ + int err = -ENODEV; + void __iomem *mem; + u8 hw_status; + + if (!pci_dev_present(pci_tbl)) + goto out; /* Device not found. */ + + err = -ENOMEM; + mem = ioremap(INTEL_RNG_ADDR, INTEL_RNG_ADDR_LEN); + if (!mem) + goto out; + intel_rng.priv = (unsigned long)mem; + + /* Check for Intel 82802 */ + err = -ENODEV; + hw_status = hwstatus_get(mem); + if ((hw_status & INTEL_RNG_PRESENT) == 0) + goto err_unmap; + + printk(KERN_INFO "Intel 82802 RNG detected\n"); + err = hwrng_register(&intel_rng); + if (err) { + printk(KERN_ERR PFX "RNG registering failed (%d)\n", + err); + goto err_unmap; + } +out: + return err; + +err_unmap: + iounmap(mem); + goto out; +} + +static void __exit mod_exit(void) +{ + void __iomem *mem = (void __iomem *)intel_rng.priv; + + hwrng_unregister(&intel_rng); + iounmap(mem); +} + +subsys_initcall(mod_init); +module_exit(mod_exit); + +MODULE_DESCRIPTION("H/W RNG driver for Intel chipsets"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/ixp4xx-rng.c b/drivers/char/hw_random/ixp4xx-rng.c new file mode 100644 index 00000000000..ef71022423c --- /dev/null +++ b/drivers/char/hw_random/ixp4xx-rng.c @@ -0,0 +1,73 @@ +/* + * drivers/char/hw_random/ixp4xx-rng.c + * + * RNG driver for Intel IXP4xx family of NPUs + * + * Author: Deepak Saxena <dsaxena@plexity.net> + * + * Copyright 2005 (c) MontaVista Software, Inc. 
+ * + * Fixes by Michael Buesch + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/hw_random.h> + +#include <asm/io.h> +#include <asm/hardware.h> + + +static int ixp4xx_rng_data_read(struct hwrng *rng, u32 *buffer) +{ + void __iomem * rng_base = (void __iomem *)rng->priv; + + *buffer = __raw_readl(rng_base); + + return 4; +} + +static struct hwrng ixp4xx_rng_ops = { + .name = "ixp4xx", + .data_read = ixp4xx_rng_data_read, +}; + +static int __init ixp4xx_rng_init(void) +{ + void __iomem * rng_base; + int err; + + rng_base = ioremap(0x70002100, 4); + if (!rng_base) + return -ENOMEM; + ixp4xx_rng_ops.priv = (unsigned long)rng_base; + err = hwrng_register(&ixp4xx_rng_ops); + if (err) + iounmap(rng_base); + + return err; +} + +static void __exit ixp4xx_rng_exit(void) +{ + void __iomem * rng_base = (void __iomem *)ixp4xx_rng_ops.priv; + + hwrng_unregister(&ixp4xx_rng_ops); + iounmap(rng_base); +} + +subsys_initcall(ixp4xx_rng_init); +module_exit(ixp4xx_rng_exit); + +MODULE_AUTHOR("Deepak Saxena <dsaxena@plexity.net>"); +MODULE_DESCRIPTION("H/W Random Number Generator (RNG) driver for IXP4xx"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c new file mode 100644 index 00000000000..819516b35a7 --- /dev/null +++ b/drivers/char/hw_random/omap-rng.c @@ -0,0 +1,208 @@ +/* + * driver/char/hw_random/omap-rng.c + * + * RNG driver for TI OMAP CPU family + * + * Author: Deepak Saxena <dsaxena@plexity.net> + * + * Copyright 2005 (c) MontaVista Software, Inc. + * + * Mostly based on original driver: + * + * Copyright (C) 2005 Nokia Corporation + * Author: Juha Yrj��<juha.yrjola@nokia.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + * + * TODO: + * + * - Make status updated be interrupt driven so we don't poll + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/err.h> +#include <linux/device.h> +#include <linux/hw_random.h> + +#include <asm/io.h> +#include <asm/hardware/clock.h> + +#define RNG_OUT_REG 0x00 /* Output register */ +#define RNG_STAT_REG 0x04 /* Status register + [0] = STAT_BUSY */ +#define RNG_ALARM_REG 0x24 /* Alarm register + [7:0] = ALARM_COUNTER */ +#define RNG_CONFIG_REG 0x28 /* Configuration register + [11:6] = RESET_COUNT + [5:3] = RING2_DELAY + [2:0] = RING1_DELAY */ +#define RNG_REV_REG 0x3c /* Revision register + [7:0] = REV_NB */ +#define RNG_MASK_REG 0x40 /* Mask and reset register + [2] = IT_EN + [1] = SOFTRESET + [0] = AUTOIDLE */ +#define RNG_SYSSTATUS 0x44 /* System status + [0] = RESETDONE */ + +static void __iomem *rng_base; +static struct clk *rng_ick; +static struct device *rng_dev; + +static u32 omap_rng_read_reg(int reg) +{ + return __raw_readl(rng_base + reg); +} + +static void omap_rng_write_reg(int reg, u32 val) +{ + __raw_writel(val, rng_base + reg); +} + +/* REVISIT: Does the status bit really work on 16xx? */ +static int omap_rng_data_present(struct hwrng *rng) +{ + return omap_rng_read_reg(RNG_STAT_REG) ? 
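+		/* nonzero status = STAT_BUSY, i.e. no data available yet */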
0 : 1; +} + +static int omap_rng_data_read(struct hwrng *rng, u32 *data) +{ + *data = omap_rng_read_reg(RNG_OUT_REG); + + return 4; +} + +static struct hwrng omap_rng_ops = { + .name = "omap", + .data_present = omap_rng_data_present, + .data_read = omap_rng_data_read, +}; + +static int __init omap_rng_probe(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct resource *res, *mem; + int ret; + + /* + * A bit ugly, and it will never actually happen but there can + * be only one RNG and this catches any bork + */ + BUG_ON(rng_dev); + + if (cpu_is_omap24xx()) { + rng_ick = clk_get(NULL, "rng_ick"); + if (IS_ERR(rng_ick)) { + dev_err(dev, "Could not get rng_ick\n"); + ret = PTR_ERR(rng_ick); + return ret; + } + else { + clk_use(rng_ick); + } + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) + return -ENOENT; + + mem = request_mem_region(res->start, res->end - res->start + 1, + pdev->name); + if (mem == NULL) + return -EBUSY; + + dev_set_drvdata(dev, mem); + rng_base = (u32 __iomem *)io_p2v(res->start); + + ret = hwrng_register(&omap_rng_ops); + if (ret) { + release_resource(mem); + rng_base = NULL; + return ret; + } + + dev_info(dev, "OMAP Random Number Generator ver. %02x\n", + omap_rng_read_reg(RNG_REV_REG)); + omap_rng_write_reg(RNG_MASK_REG, 0x1); + + rng_dev = dev; + + return 0; +} + +static int __exit omap_rng_remove(struct device *dev) +{ + struct resource *mem = dev_get_drvdata(dev); + + hwrng_unregister(&omap_rng_ops); + + omap_rng_write_reg(RNG_MASK_REG, 0x0); + + if (cpu_is_omap24xx()) { + clk_unuse(rng_ick); + clk_put(rng_ick); + } + + release_resource(mem); + rng_base = NULL; + + return 0; +} + +#ifdef CONFIG_PM + +static int omap_rng_suspend(struct device *dev, pm_message_t message, u32 level) +{ + omap_rng_write_reg(RNG_MASK_REG, 0x0); + + return 0; +} + +static int omap_rng_resume(struct device *dev, pm_message_t message, u32 level) +{ + omap_rng_write_reg(RNG_MASK_REG, 0x1); + + return 1; +} + +#else + +#define omap_rng_suspend NULL +#define omap_rng_resume NULL + +#endif + + +static struct device_driver omap_rng_driver = { + .name = "omap_rng", + .bus = &platform_bus_type, + .probe = omap_rng_probe, + .remove = __exit_p(omap_rng_remove), + .suspend = omap_rng_suspend, + .resume = omap_rng_resume +}; + +static int __init omap_rng_init(void) +{ + if (!cpu_is_omap16xx() && !cpu_is_omap24xx()) + return -ENODEV; + + return driver_register(&omap_rng_driver); +} + +static void __exit omap_rng_exit(void) +{ + driver_unregister(&omap_rng_driver); +} + +module_init(omap_rng_init); +module_exit(omap_rng_exit); + +MODULE_AUTHOR("Deepak Saxena (and others)"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c new file mode 100644 index 00000000000..0e786b617bb --- /dev/null +++ b/drivers/char/hw_random/via-rng.c @@ -0,0 +1,183 @@ +/* + * RNG driver for VIA RNGs + * + * Copyright 2005 (c) MontaVista Software, Inc. 
+ * + * with the majority of the code coming from: + * + * Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) + * (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> + * + * derived from + * + * Hardware driver for the AMD 768 Random Number Generator (RNG) + * (c) Copyright 2001 Red Hat Inc <alan@redhat.com> + * + * derived from + * + * Hardware driver for Intel i810 Random Number Generator (RNG) + * Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com> + * Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/hw_random.h> +#include <asm/io.h> +#include <asm/msr.h> +#include <asm/cpufeature.h> + + +#define PFX KBUILD_MODNAME ": " + + +enum { + VIA_STRFILT_CNT_SHIFT = 16, + VIA_STRFILT_FAIL = (1 << 15), + VIA_STRFILT_ENABLE = (1 << 14), + VIA_RAWBITS_ENABLE = (1 << 13), + VIA_RNG_ENABLE = (1 << 6), + VIA_XSTORE_CNT_MASK = 0x0F, + + VIA_RNG_CHUNK_8 = 0x00, /* 64 rand bits, 64 stored bits */ + VIA_RNG_CHUNK_4 = 0x01, /* 32 rand bits, 32 stored bits */ + VIA_RNG_CHUNK_4_MASK = 0xFFFFFFFF, + VIA_RNG_CHUNK_2 = 0x02, /* 16 rand bits, 32 stored bits */ + VIA_RNG_CHUNK_2_MASK = 0xFFFF, + VIA_RNG_CHUNK_1 = 0x03, /* 8 rand bits, 32 stored bits */ + VIA_RNG_CHUNK_1_MASK = 0xFF, +}; + +/* + * Investigate using the 'rep' prefix to obtain 32 bits of random data + * in one insn. The upside is potentially better performance. The + * downside is that the instruction becomes no longer atomic. Due to + * this, just like familiar issues with /dev/random itself, the worst + * case of a 'rep xstore' could potentially pause a cpu for an + * unreasonably long time. In practice, this condition would likely + * only occur when the hardware is failing. (or so we hope :)) + * + * Another possible performance boost may come from simply buffering + * until we have 4 bytes, thus returning a u32 at a time, + * instead of the current u8-at-a-time. + */ + +static inline u32 xstore(u32 *addr, u32 edx_in) +{ + u32 eax_out; + + asm(".byte 0x0F,0xA7,0xC0 /* xstore %%edi (addr=%0) */" + :"=m"(*addr), "=a"(eax_out) + :"D"(addr), "d"(edx_in)); + + return eax_out; +} + +static int via_rng_data_present(struct hwrng *rng) +{ + u32 bytes_out; + u32 *via_rng_datum = (u32 *)(&rng->priv); + + /* We choose the recommended 1-byte-per-instruction RNG rate, + * for greater randomness at the expense of speed. Larger + * values 2, 4, or 8 bytes-per-instruction yield greater + * speed at lesser randomness. + * + * If you change this to another VIA_CHUNK_n, you must also + * change the ->n_bytes values in rng_vendor_ops[] tables. + * VIA_CHUNK_8 requires further code changes. + * + * A copy of MSR_VIA_RNG is placed in eax_out when xstore + * completes. + */ + + *via_rng_datum = 0; /* paranoia, not really necessary */ + bytes_out = xstore(via_rng_datum, VIA_RNG_CHUNK_1); + bytes_out &= VIA_XSTORE_CNT_MASK; + if (bytes_out == 0) + return 0; + return 1; +} + +static int via_rng_data_read(struct hwrng *rng, u32 *data) +{ + u32 via_rng_datum = (u32)rng->priv; + + *data = via_rng_datum; + + return 1; +} + +static int via_rng_init(struct hwrng *rng) +{ + u32 lo, hi, old_lo; + + /* Control the RNG via MSR. 
Tread lightly and pay very close + * close attention to values written, as the reserved fields + * are documented to be "undefined and unpredictable"; but it + * does not say to write them as zero, so I make a guess that + * we restore the values we find in the register. + */ + rdmsr(MSR_VIA_RNG, lo, hi); + + old_lo = lo; + lo &= ~(0x7f << VIA_STRFILT_CNT_SHIFT); + lo &= ~VIA_XSTORE_CNT_MASK; + lo &= ~(VIA_STRFILT_ENABLE | VIA_STRFILT_FAIL | VIA_RAWBITS_ENABLE); + lo |= VIA_RNG_ENABLE; + + if (lo != old_lo) + wrmsr(MSR_VIA_RNG, lo, hi); + + /* perhaps-unnecessary sanity check; remove after testing if + unneeded */ + rdmsr(MSR_VIA_RNG, lo, hi); + if ((lo & VIA_RNG_ENABLE) == 0) { + printk(KERN_ERR PFX "cannot enable VIA C3 RNG, aborting\n"); + return -ENODEV; + } + + return 0; +} + + +static struct hwrng via_rng = { + .name = "via", + .init = via_rng_init, + .data_present = via_rng_data_present, + .data_read = via_rng_data_read, +}; + + +static int __init mod_init(void) +{ + int err; + + if (!cpu_has_xstore) + return -ENODEV; + printk(KERN_INFO "VIA RNG detected\n"); + err = hwrng_register(&via_rng); + if (err) { + printk(KERN_ERR PFX "RNG registering failed (%d)\n", + err); + goto out; + } +out: + return err; +} + +static void __exit mod_exit(void) +{ + hwrng_unregister(&via_rng); +} + +subsys_initcall(mod_init); +module_exit(mod_exit); + +MODULE_DESCRIPTION("H/W RNG driver for VIA chipsets"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 9f2f8fdec69..23028559dbc 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -936,11 +936,8 @@ int ipmi_set_gets_events(ipmi_user_t user, int val) if (val) { /* Deliver any queued events. */ - list_for_each_entry_safe(msg, msg2, &intf->waiting_events, - link) { - list_del(&msg->link); - list_add_tail(&msg->link, &msgs); - } + list_for_each_entry_safe(msg, msg2, &intf->waiting_events, link) + list_move_tail(&msg->link, &msgs); intf->waiting_events_count = 0; } diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index edd996f6fb8..13e3126c1de 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -151,6 +151,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 1] = "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ "\r\000/"; /* 0x60 - 0x6f */ static int sysrq_down; +static int sysrq_alt_use; #endif static int sysrq_alt; @@ -1143,7 +1144,7 @@ static void kbd_keycode(unsigned int keycode, int down, kbd = kbd_table + fg_console; if (keycode == KEY_LEFTALT || keycode == KEY_RIGHTALT) - sysrq_alt = down; + sysrq_alt = down ? 
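+		/* record which Alt key is down so its release can be matched */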
keycode : 0; #ifdef CONFIG_SPARC if (keycode == KEY_STOP) sparc_l1_a_state = down; @@ -1163,9 +1164,14 @@ static void kbd_keycode(unsigned int keycode, int down, #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { - sysrq_down = down; + if (!sysrq_down) { + sysrq_down = down; + sysrq_alt_use = sysrq_alt; + } return; } + if (sysrq_down && !down && keycode == sysrq_alt_use) + sysrq_down = 0; if (sysrq_down && down && !rep) { handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); return; diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 6c94879e0b9..714d95ff2f1 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -98,7 +98,22 @@ #include <asm/system.h> #include <asm/uaccess.h> +#define MAX_NR_CON_DRIVER 16 +#define CON_DRIVER_FLAG_MODULE 1 +#define CON_DRIVER_FLAG_INIT 2 + +struct con_driver { + const struct consw *con; + const char *desc; + struct class_device *class_dev; + int node; + int first; + int last; + int flag; +}; + +static struct con_driver registered_con_driver[MAX_NR_CON_DRIVER]; const struct consw *conswitchp; /* A bitmap for codes <32. A bit of 1 indicates that the code @@ -2557,7 +2572,7 @@ static int __init con_init(void) { const char *display_desc = NULL; struct vc_data *vc; - unsigned int currcons = 0; + unsigned int currcons = 0, i; acquire_console_sem(); @@ -2569,6 +2584,22 @@ static int __init con_init(void) return 0; } + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + struct con_driver *con_driver = ®istered_con_driver[i]; + + if (con_driver->con == NULL) { + con_driver->con = conswitchp; + con_driver->desc = display_desc; + con_driver->flag = CON_DRIVER_FLAG_INIT; + con_driver->first = 0; + con_driver->last = MAX_NR_CONSOLES - 1; + break; + } + } + + for (i = 0; i < MAX_NR_CONSOLES; i++) + con_driver_map[i] = conswitchp; + init_timer(&console_timer); console_timer.function = blank_screen_t; if (blankinterval) { @@ -2656,38 +2687,53 @@ int __init vty_init(void) } #ifndef VT_SINGLE_DRIVER +#include <linux/device.h> -/* - * If we support more console drivers, this function is used - * when a driver wants to take over some existing consoles - * and become default driver for newly opened ones. 
- */ +static struct class *vtconsole_class; -int take_over_console(const struct consw *csw, int first, int last, int deflt) +static int bind_con_driver(const struct consw *csw, int first, int last, + int deflt) { - int i, j = -1; - const char *desc; - struct module *owner; + struct module *owner = csw->owner; + const char *desc = NULL; + struct con_driver *con_driver; + int i, j = -1, k = -1, retval = -ENODEV; - owner = csw->owner; if (!try_module_get(owner)) return -ENODEV; acquire_console_sem(); - desc = csw->con_startup(); - if (!desc) { - release_console_sem(); - module_put(owner); - return -ENODEV; + /* check if driver is registered */ + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + con_driver = ®istered_con_driver[i]; + + if (con_driver->con == csw) { + desc = con_driver->desc; + retval = 0; + break; + } + } + + if (retval) + goto err; + + if (!(con_driver->flag & CON_DRIVER_FLAG_INIT)) { + csw->con_startup(); + con_driver->flag |= CON_DRIVER_FLAG_INIT; } + if (deflt) { if (conswitchp) module_put(conswitchp->owner); + __module_get(owner); conswitchp = csw; } + first = max(first, con_driver->first); + last = min(last, con_driver->last); + for (i = first; i <= last; i++) { int old_was_color; struct vc_data *vc = vc_cons[i].d; @@ -2701,15 +2747,17 @@ int take_over_console(const struct consw *csw, int first, int last, int deflt) continue; j = i; - if (CON_IS_VISIBLE(vc)) + + if (CON_IS_VISIBLE(vc)) { + k = i; save_screen(vc); + } + old_was_color = vc->vc_can_do_color; vc->vc_sw->con_deinit(vc); vc->vc_origin = (unsigned long)vc->vc_screenbuf; - vc->vc_visible_origin = vc->vc_origin; - vc->vc_scr_end = vc->vc_origin + vc->vc_screenbuf_size; - vc->vc_pos = vc->vc_origin + vc->vc_size_row * vc->vc_y + 2 * vc->vc_x; visual_init(vc, i, 0); + set_origin(vc); update_attr(vc); /* If the console changed between mono <-> color, then @@ -2718,36 +2766,506 @@ int take_over_console(const struct consw *csw, int first, int last, int deflt) */ if (old_was_color != vc->vc_can_do_color) clear_buffer_attributes(vc); - - if (CON_IS_VISIBLE(vc)) - update_screen(vc); } + printk("Console: switching "); if (!deflt) printk("consoles %d-%d ", first+1, last+1); - if (j >= 0) + if (j >= 0) { + struct vc_data *vc = vc_cons[j].d; + printk("to %s %s %dx%d\n", - vc_cons[j].d->vc_can_do_color ? "colour" : "mono", - desc, vc_cons[j].d->vc_cols, vc_cons[j].d->vc_rows); - else + vc->vc_can_do_color ? 
"colour" : "mono", + desc, vc->vc_cols, vc->vc_rows); + + if (k >= 0) { + vc = vc_cons[k].d; + update_screen(vc); + } + } else printk("to %s\n", desc); + retval = 0; +err: release_console_sem(); + module_put(owner); + return retval; +}; + +#ifdef CONFIG_VT_HW_CONSOLE_BINDING +static int con_is_graphics(const struct consw *csw, int first, int last) +{ + int i, retval = 0; + + for (i = first; i <= last; i++) { + struct vc_data *vc = vc_cons[i].d; + + if (vc && vc->vc_mode == KD_GRAPHICS) { + retval = 1; + break; + } + } + + return retval; +} + +static int unbind_con_driver(const struct consw *csw, int first, int last, + int deflt) +{ + struct module *owner = csw->owner; + const struct consw *defcsw = NULL; + struct con_driver *con_driver = NULL, *con_back = NULL; + int i, retval = -ENODEV; + + if (!try_module_get(owner)) + return -ENODEV; + + acquire_console_sem(); + + /* check if driver is registered and if it is unbindable */ + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + con_driver = ®istered_con_driver[i]; + + if (con_driver->con == csw && + con_driver->flag & CON_DRIVER_FLAG_MODULE) { + retval = 0; + break; + } + } + + if (retval) { + release_console_sem(); + goto err; + } + + retval = -ENODEV; + + /* check if backup driver exists */ + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + con_back = ®istered_con_driver[i]; + + if (con_back->con && + !(con_back->flag & CON_DRIVER_FLAG_MODULE)) { + defcsw = con_back->con; + retval = 0; + break; + } + } + + if (retval) { + release_console_sem(); + goto err; + } + + if (!con_is_bound(csw)) { + release_console_sem(); + goto err; + } + + first = max(first, con_driver->first); + last = min(last, con_driver->last); + + for (i = first; i <= last; i++) { + if (con_driver_map[i] == csw) { + module_put(csw->owner); + con_driver_map[i] = NULL; + } + } + + if (!con_is_bound(defcsw)) { + const struct consw *defconsw = conswitchp; + + defcsw->con_startup(); + con_back->flag |= CON_DRIVER_FLAG_INIT; + /* + * vgacon may change the default driver to point + * to dummycon, we restore it here... 
+ */ + conswitchp = defconsw; + } + + if (!con_is_bound(csw)) + con_driver->flag &= ~CON_DRIVER_FLAG_INIT; + release_console_sem(); + /* ignore return value, binding should not fail */ + bind_con_driver(defcsw, first, last, deflt); +err: module_put(owner); + return retval; + +} + +static int vt_bind(struct con_driver *con) +{ + const struct consw *defcsw = NULL, *csw = NULL; + int i, more = 1, first = -1, last = -1, deflt = 0; + + if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE) || + con_is_graphics(con->con, con->first, con->last)) + goto err; + + csw = con->con; + + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + struct con_driver *con = ®istered_con_driver[i]; + + if (con->con && !(con->flag & CON_DRIVER_FLAG_MODULE)) { + defcsw = con->con; + break; + } + } + + if (!defcsw) + goto err; + + while (more) { + more = 0; + + for (i = con->first; i <= con->last; i++) { + if (con_driver_map[i] == defcsw) { + if (first == -1) + first = i; + last = i; + more = 1; + } else if (first != -1) + break; + } + + if (first == 0 && last == MAX_NR_CONSOLES -1) + deflt = 1; + + if (first != -1) + bind_con_driver(csw, first, last, deflt); + + first = -1; + last = -1; + deflt = 0; + } + +err: return 0; } -void give_up_console(const struct consw *csw) +static int vt_unbind(struct con_driver *con) +{ + const struct consw *csw = NULL; + int i, more = 1, first = -1, last = -1, deflt = 0; + + if (!con->con || !(con->flag & CON_DRIVER_FLAG_MODULE) || + con_is_graphics(con->con, con->first, con->last)) + goto err; + + csw = con->con; + + while (more) { + more = 0; + + for (i = con->first; i <= con->last; i++) { + if (con_driver_map[i] == csw) { + if (first == -1) + first = i; + last = i; + more = 1; + } else if (first != -1) + break; + } + + if (first == 0 && last == MAX_NR_CONSOLES -1) + deflt = 1; + + if (first != -1) + unbind_con_driver(csw, first, last, deflt); + + first = -1; + last = -1; + deflt = 0; + } + +err: + return 0; +} +#else +static inline int vt_bind(struct con_driver *con) +{ + return 0; +} +static inline int vt_unbind(struct con_driver *con) +{ + return 0; +} +#endif /* CONFIG_VT_HW_CONSOLE_BINDING */ + +static ssize_t store_bind(struct class_device *class_device, + const char *buf, size_t count) +{ + struct con_driver *con = class_get_devdata(class_device); + int bind = simple_strtoul(buf, NULL, 0); + + if (bind) + vt_bind(con); + else + vt_unbind(con); + + return count; +} + +static ssize_t show_bind(struct class_device *class_device, char *buf) +{ + struct con_driver *con = class_get_devdata(class_device); + int bind = con_is_bound(con->con); + + return snprintf(buf, PAGE_SIZE, "%i\n", bind); +} + +static ssize_t show_name(struct class_device *class_device, char *buf) +{ + struct con_driver *con = class_get_devdata(class_device); + + return snprintf(buf, PAGE_SIZE, "%s %s\n", + (con->flag & CON_DRIVER_FLAG_MODULE) ? 
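+		/* modular drivers can be bound/unbound from sysfs */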
"(M)" : "(S)", + con->desc); + +} + +static struct class_device_attribute class_device_attrs[] = { + __ATTR(bind, S_IRUGO|S_IWUSR, show_bind, store_bind), + __ATTR(name, S_IRUGO, show_name, NULL), +}; + +static int vtconsole_init_class_device(struct con_driver *con) +{ + int i; + + class_set_devdata(con->class_dev, con); + for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++) + class_device_create_file(con->class_dev, + &class_device_attrs[i]); + + return 0; +} + +static void vtconsole_deinit_class_device(struct con_driver *con) { int i; - for(i = 0; i < MAX_NR_CONSOLES; i++) + for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++) + class_device_remove_file(con->class_dev, + &class_device_attrs[i]); +} + +/** + * con_is_bound - checks if driver is bound to the console + * @csw: console driver + * + * RETURNS: zero if unbound, nonzero if bound + * + * Drivers can call this and if zero, they should release + * all resources allocated on con_startup() + */ +int con_is_bound(const struct consw *csw) +{ + int i, bound = 0; + + for (i = 0; i < MAX_NR_CONSOLES; i++) { if (con_driver_map[i] == csw) { - module_put(csw->owner); - con_driver_map[i] = NULL; + bound = 1; + break; + } + } + + return bound; +} +EXPORT_SYMBOL(con_is_bound); + +/** + * register_con_driver - register console driver to console layer + * @csw: console driver + * @first: the first console to take over, minimum value is 0 + * @last: the last console to take over, maximum value is MAX_NR_CONSOLES -1 + * + * DESCRIPTION: This function registers a console driver which can later + * bind to a range of consoles specified by @first and @last. It will + * also initialize the console driver by calling con_startup(). + */ +int register_con_driver(const struct consw *csw, int first, int last) +{ + struct module *owner = csw->owner; + struct con_driver *con_driver; + const char *desc; + int i, retval = 0; + + if (!try_module_get(owner)) + return -ENODEV; + + acquire_console_sem(); + + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + con_driver = ®istered_con_driver[i]; + + /* already registered */ + if (con_driver->con == csw) + retval = -EINVAL; + } + + if (retval) + goto err; + + desc = csw->con_startup(); + + if (!desc) + goto err; + + retval = -EINVAL; + + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + con_driver = ®istered_con_driver[i]; + + if (con_driver->con == NULL) { + con_driver->con = csw; + con_driver->desc = desc; + con_driver->node = i; + con_driver->flag = CON_DRIVER_FLAG_MODULE | + CON_DRIVER_FLAG_INIT; + con_driver->first = first; + con_driver->last = last; + retval = 0; + break; + } + } + + if (retval) + goto err; + + con_driver->class_dev = class_device_create(vtconsole_class, NULL, + MKDEV(0, con_driver->node), + NULL, "vtcon%i", + con_driver->node); + + if (IS_ERR(con_driver->class_dev)) { + printk(KERN_WARNING "Unable to create class_device for %s; " + "errno = %ld\n", con_driver->desc, + PTR_ERR(con_driver->class_dev)); + con_driver->class_dev = NULL; + } else { + vtconsole_init_class_device(con_driver); + } +err: + release_console_sem(); + module_put(owner); + return retval; +} +EXPORT_SYMBOL(register_con_driver); + +/** + * unregister_con_driver - unregister console driver from console layer + * @csw: console driver + * + * DESCRIPTION: All drivers that registers to the console layer must + * call this function upon exit, or if the console driver is in a state + * where it won't be able to handle console services, such as the + * framebuffer console without loaded framebuffer drivers. 
+ * + * The driver must unbind first prior to unregistration. + */ +int unregister_con_driver(const struct consw *csw) +{ + int i, retval = -ENODEV; + + acquire_console_sem(); + + /* cannot unregister a bound driver */ + if (con_is_bound(csw)) + goto err; + + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + struct con_driver *con_driver = ®istered_con_driver[i]; + + if (con_driver->con == csw && + con_driver->flag & CON_DRIVER_FLAG_MODULE) { + vtconsole_deinit_class_device(con_driver); + class_device_destroy(vtconsole_class, + MKDEV(0, con_driver->node)); + con_driver->con = NULL; + con_driver->desc = NULL; + con_driver->class_dev = NULL; + con_driver->node = 0; + con_driver->flag = 0; + con_driver->first = 0; + con_driver->last = 0; + retval = 0; + break; + } + } +err: + release_console_sem(); + return retval; +} +EXPORT_SYMBOL(unregister_con_driver); + +/* + * If we support more console drivers, this function is used + * when a driver wants to take over some existing consoles + * and become default driver for newly opened ones. + * + * take_over_console is basically a register followed by unbind + */ +int take_over_console(const struct consw *csw, int first, int last, int deflt) +{ + int err; + + err = register_con_driver(csw, first, last); + + if (!err) + bind_con_driver(csw, first, last, deflt); + + return err; +} + +/* + * give_up_console is a wrapper to unregister_con_driver. It will only + * work if driver is fully unbound. + */ +void give_up_console(const struct consw *csw) +{ + unregister_con_driver(csw); +} + +static int __init vtconsole_class_init(void) +{ + int i; + + vtconsole_class = class_create(THIS_MODULE, "vtconsole"); + if (IS_ERR(vtconsole_class)) { + printk(KERN_WARNING "Unable to create vt console class; " + "errno = %ld\n", PTR_ERR(vtconsole_class)); + vtconsole_class = NULL; + } + + /* Add system drivers to sysfs */ + for (i = 0; i < MAX_NR_CON_DRIVER; i++) { + struct con_driver *con = ®istered_con_driver[i]; + + if (con->con && !con->class_dev) { + con->class_dev = + class_device_create(vtconsole_class, NULL, + MKDEV(0, con->node), NULL, + "vtcon%i", con->node); + + if (IS_ERR(con->class_dev)) { + printk(KERN_WARNING "Unable to create " + "class_device for %s; errno = %ld\n", + con->desc, PTR_ERR(con->class_dev)); + con->class_dev = NULL; + } else { + vtconsole_init_class_device(con); + } } + } + + return 0; } +postcore_initcall(vtconsole_class_init); #endif diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile new file mode 100644 index 00000000000..a5222547022 --- /dev/null +++ b/drivers/clocksource/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o +obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o +obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c new file mode 100644 index 00000000000..7ad3be8c0f4 --- /dev/null +++ b/drivers/clocksource/acpi_pm.c @@ -0,0 +1,177 @@ +/* + * linux/drivers/clocksource/acpi_pm.c + * + * This file contains the ACPI PM based clocksource. + * + * This code was largely moved from the i386 timer_pm.c file + * which was (C) Dominik Brodowski <linux@brodo.de> 2003 + * and contained the following comments: + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. 
+ */ + +#include <linux/clocksource.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <asm/io.h> + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +/* + * The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c + */ +u32 pmtmr_ioport __read_mostly; + +#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */ + +static inline u32 read_pmtmr(void) +{ + /* mask the output to 24 bits */ + return inl(pmtmr_ioport) & ACPI_PM_MASK; +} + +static cycle_t acpi_pm_read_verified(void) +{ + u32 v1 = 0, v2 = 0, v3 = 0; + + /* + * It has been reported that because of various broken + * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM clock + * source is not latched, you must read it multiple + * times to ensure a safe value is read: + */ + do { + v1 = read_pmtmr(); + v2 = read_pmtmr(); + v3 = read_pmtmr(); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + return (cycle_t)v2; +} + +static cycle_t acpi_pm_read(void) +{ + return (cycle_t)read_pmtmr(); +} + +static struct clocksource clocksource_acpi_pm = { + .name = "acpi_pm", + .rating = 200, + .read = acpi_pm_read, + .mask = (cycle_t)ACPI_PM_MASK, + .mult = 0, /*to be caluclated*/ + .shift = 22, + .is_continuous = 1, +}; + + +#ifdef CONFIG_PCI +static int acpi_pm_good; +static int __init acpi_pm_good_setup(char *__str) +{ + acpi_pm_good = 1; + return 1; +} +__setup("acpi_pm_good", acpi_pm_good_setup); + +static inline void acpi_pm_need_workaround(void) +{ + clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.rating = 110; +} + +/* + * PIIX4 Errata: + * + * The power management timer may return improper results when read. + * Although the timer value settles properly after incrementing, + * while incrementing there is a 3 ns window every 69.8 ns where the + * timer value is indeterminate (a 4.2% chance that the data will be + * incorrect when read). As a result, the ACPI free running count up + * timer specification is violated due to erroneous reads. + */ +static void __devinit acpi_pm_check_blacklist(struct pci_dev *dev) +{ + u8 rev; + + if (acpi_pm_good) + return; + + pci_read_config_byte(dev, PCI_REVISION_ID, &rev); + /* the bug has been fixed in PIIX4M */ + if (rev < 3) { + printk(KERN_WARNING "* Found PM-Timer Bug on the chipset." + " Due to workarounds for a bug,\n" + "* this clock source is slow. Consider trying" + " other clock sources\n"); + + acpi_pm_need_workaround(); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, + acpi_pm_check_blacklist); + +static void __devinit acpi_pm_check_graylist(struct pci_dev *dev) +{ + if (acpi_pm_good) + return; + + printk(KERN_WARNING "* The chipset may have PM-Timer Bug. Due to" + " workarounds for a bug,\n" + "* this clock source is slow. 
If you are sure your timer" + " does not have\n" + "* this bug, please use \"acpi_pm_good\" to disable the" + " workaround\n"); + + acpi_pm_need_workaround(); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + acpi_pm_check_graylist); +#endif + + +static int __init init_acpi_pm_clocksource(void) +{ + u32 value1, value2; + unsigned int i; + + if (!pmtmr_ioport) + return -ENODEV; + + clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, + clocksource_acpi_pm.shift); + + /* "verify" this timing source: */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) + continue; + if (value2 > value1) + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results:" + " 0x%#x, 0x%#x - aborting.\n", value1, value2); + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result:" + " 0x%#x - aborting.\n", value1); + return -ENODEV; + +pm_good: + return clocksource_register(&clocksource_acpi_pm); +} + +module_init(init_acpi_pm_clocksource); diff --git a/drivers/clocksource/cyclone.c b/drivers/clocksource/cyclone.c new file mode 100644 index 00000000000..bf4d3d50d1c --- /dev/null +++ b/drivers/clocksource/cyclone.c @@ -0,0 +1,119 @@ +#include <linux/clocksource.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/timex.h> +#include <linux/init.h> + +#include <asm/pgtable.h> +#include <asm/io.h> + +#include "mach_timer.h" + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ +#define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +#define CYCLONE_MPCS_OFFSET 0x51A8 /* offset to select register */ +#define CYCLONE_MPMC_OFFSET 0x51D0 /* offset to count register */ +#define CYCLONE_TIMER_FREQ 99780000 /* 100Mhz, but not really */ +#define CYCLONE_TIMER_MASK CLOCKSOURCE_MASK(32) /* 32 bit mask */ + +int use_cyclone = 0; +static void __iomem *cyclone_ptr; + +static cycle_t read_cyclone(void) +{ + return (cycle_t)readl(cyclone_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 250, + .read = read_cyclone, + .mask = CYCLONE_TIMER_MASK, + .mult = 10, + .shift = 0, + .is_continuous = 1, +}; + +static int __init init_cyclone_clocksource(void) +{ + unsigned long base; /* saved value from CBAR */ + unsigned long offset; + u32 __iomem* volatile cyclone_timer; /* Cyclone MPMC0 register */ + u32 __iomem* reg; + int i; + + /* make sure we're on a summit box: */ + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address: */ + offset = CYCLONE_CBAR_ADDR; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + /* even on 64bit systems, this is only 32bits: */ + base = readl(reg); + if (!base) { + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC: */ + offset = base + CYCLONE_PMCC_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS: */ + offset = base + CYCLONE_MPCS_OFFSET; + reg = ioremap_nocache(offset, sizeof(reg)); + if (!reg) { + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + 
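+ /* same one-bit enable as the PMCC write above */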
writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer: */ + offset = base + CYCLONE_MPMC_OFFSET; + cyclone_timer = ioremap_nocache(offset, sizeof(u64)); + if (!cyclone_timer) { + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /* quick test to make sure its ticking: */ + for (i = 0; i < 3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + + while (stall--) + barrier(); + + if (readl(cyclone_timer) == old) { + printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + return -ENODEV; + } + } + cyclone_ptr = cyclone_timer; + + /* sort out mult/shift values: */ + clocksource_cyclone.shift = 22; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + + return clocksource_register(&clocksource_cyclone); +} + +module_init(init_cyclone_clocksource); diff --git a/drivers/clocksource/scx200_hrt.c b/drivers/clocksource/scx200_hrt.c new file mode 100644 index 00000000000..d418b829721 --- /dev/null +++ b/drivers/clocksource/scx200_hrt.c @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2006 Jim Cromie + * + * This is a clocksource driver for the Geode SCx200's 1 or 27 MHz + * high-resolution timer. The Geode SC-1100 (at least) has a buggy + * time stamp counter (TSC), which loses time unless 'idle=poll' is + * given as a boot-arg. In its absence, the Generic Timekeeping code + * will detect and de-rate the bad TSC, allowing this timer to take + * over timekeeping duties. + * + * Based on work by John Stultz, and Ted Phelps (in a 2.6.12-rc6 patch) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ */
+
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/scx200.h>
+
+#define NAME "scx200_hrt"
+
+static int mhz27;
+module_param(mhz27, int, 0); /* load time only */
+MODULE_PARM_DESC(mhz27, "count at 27.0 MHz (default is 1.0 MHz)");
+
+static int ppm;
+module_param(ppm, int, 0); /* load time only */
+MODULE_PARM_DESC(ppm, "+-adjust to actual XO freq (ppm)");
+
+/* HiRes Timer configuration register address */
+#define SCx200_TMCNFG_OFFSET (SCx200_TIMER_OFFSET + 5)
+
+/* and config settings */
+#define HR_TMEN (1 << 0) /* timer interrupt enable */
+#define HR_TMCLKSEL (1 << 1) /* 1|0 counts at 27|1 MHz */
+#define HR_TM27MPD (1 << 2) /* 1 turns off input clock (power-down) */
+
+/* The base timer frequency, * 27 if selected */
+#define HRT_FREQ 1000000
+
+static cycle_t read_hrt(void)
+{
+ /* Read the timer value */
+ return (cycle_t) inl(scx200_cb_base + SCx200_TIMER_OFFSET);
+}
+
+#define HRT_SHIFT_1 22
+#define HRT_SHIFT_27 26
+
+static struct clocksource cs_hrt = {
+ .name = "scx200_hrt",
+ .rating = 250,
+ .read = read_hrt,
+ .mask = CLOCKSOURCE_MASK(32),
+ .is_continuous = 1,
+ /* mult, shift are set based on mhz27 flag */
+};
+
+static int __init init_hrt_clocksource(void)
+{
+ /* Make sure scx200 has initialized the configuration block */
+ if (!scx200_cb_present())
+ return -ENODEV;
+
+ /* Reserve the timer's ISA io-region for ourselves */
+ if (!request_region(scx200_cb_base + SCx200_TIMER_OFFSET,
+ SCx200_TIMER_SIZE,
+ "NatSemi SCx200 High-Resolution Timer")) {
+ printk(KERN_WARNING NAME ": unable to lock timer region\n");
+ return -ENODEV;
+ }
+
+ /* write timer config; the conditional must be parenthesized so
+ HR_TMEN is always set ("a | b ? c : d" parses as "(a|b) ? c : d") */
+ outb(HR_TMEN | (mhz27 ? HR_TMCLKSEL : 0),
+ scx200_cb_base + SCx200_TMCNFG_OFFSET);
+
+ if (mhz27) {
+ cs_hrt.shift = HRT_SHIFT_27;
+ cs_hrt.mult = clocksource_hz2mult((HRT_FREQ + ppm) * 27,
+ cs_hrt.shift);
+ } else {
+ cs_hrt.shift = HRT_SHIFT_1;
+ cs_hrt.mult = clocksource_hz2mult(HRT_FREQ + ppm,
+ cs_hrt.shift);
+ }
+ printk(KERN_INFO "enabling scx200 high-res timer (%s MHz +%d ppm)\n",
+ mhz27 ?
"27":"1", ppm); + + return clocksource_register(&cs_hrt); +} + +module_init(init_hrt_clocksource); + +MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>"); +MODULE_DESCRIPTION("clocksource on SCx200 HiRes Timer"); +MODULE_LICENSE("GPL"); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0fdf7fbd649..2801d14a5e4 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -824,10 +824,9 @@ static int __init ioat_init_module(void) { /* it's currently unsafe to unload this module */ /* if forced, worst case is that rmmod hangs */ - if (THIS_MODULE != NULL) - THIS_MODULE->unsafe = 1; + __unsafe(THIS_MODULE); - return pci_module_init(&ioat_pci_drv); + pci_module_init(&ioat_pci_drv); } module_init(ioat_init_module); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 622a55c72f0..d2428cef159 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -959,7 +959,7 @@ static void ide_check_pm_state(ide_drive_t *drive, struct request *rq) printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); SELECT_DRIVE(drive); HWIF(drive)->OUTB(8, HWIF(drive)->io_ports[IDE_CONTROL_OFFSET]); - rc = ide_wait_not_busy(HWIF(drive), 10000); + rc = ide_wait_not_busy(HWIF(drive), 100000); if (rc) printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); } diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 16a143133f9..7ddb1182873 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -485,7 +485,7 @@ static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) unsigned long flags; u8 err = 0; - local_irq_set(flags); + local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -567,7 +567,7 @@ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) status.all = stat; error.all = 0; - local_irq_set(flags); + local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); diff --git a/drivers/ide/ide-timing.h b/drivers/ide/ide-timing.h index 2fcfac6e967..c0864b1e922 100644 --- a/drivers/ide/ide-timing.h +++ b/drivers/ide/ide-timing.h @@ -220,6 +220,12 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing return -EINVAL; /* + * Copy the timing from the table. + */ + + *t = *s; + +/* * If the drive is an EIDE drive, it can tell us it needs extended * PIO/MWDMA cycle timing. */ @@ -247,7 +253,7 @@ static int ide_timing_compute(ide_drive_t *drive, short speed, struct ide_timing * Convert the timing to bus clock counts. */ - ide_timing_quantize(s, t, T, UT); + ide_timing_quantize(t, t, T, UT); /* * Even in DMA/UDMA modes we still use PIO access for IDENTIFY, S.M.A.R.T diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c index 7ce5bf78368..22d17548ecd 100644 --- a/drivers/ide/pci/pdc202xx_old.c +++ b/drivers/ide/pci/pdc202xx_old.c @@ -370,7 +370,6 @@ chipset_is_set: if (!(speed)) { /* restore original pci-config space */ pci_write_config_dword(dev, drive_pci, drive_conf); - hwif->tuneproc(drive, 5); return 0; } @@ -415,8 +414,6 @@ static void pdc202xx_old_ide_dma_start(ide_drive_t *drive) if (drive->addressing == 1) { struct request *rq = HWGROUP(drive)->rq; ide_hwif_t *hwif = HWIF(drive); -// struct pci_dev *dev = hwif->pci_dev; -// unsgned long high_16 = pci_resource_start(dev, 4); unsigned long high_16 = hwif->dma_master; unsigned long atapi_reg = high_16 + (hwif->channel ? 
0x24 : 0x20); u32 word_count = 0; @@ -436,7 +433,6 @@ static int pdc202xx_old_ide_dma_end(ide_drive_t *drive) { if (drive->addressing == 1) { ide_hwif_t *hwif = HWIF(drive); -// unsigned long high_16 = pci_resource_start(hwif->pci_dev, 4); unsigned long high_16 = hwif->dma_master; unsigned long atapi_reg = high_16 + (hwif->channel ? 0x24 : 0x20); u8 clock = 0; @@ -453,8 +449,6 @@ static int pdc202xx_old_ide_dma_end(ide_drive_t *drive) static int pdc202xx_old_ide_dma_test_irq(ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); -// struct pci_dev *dev = hwif->pci_dev; -// unsigned long high_16 = pci_resource_start(dev, 4); unsigned long high_16 = hwif->dma_master; u8 dma_stat = hwif->INB(hwif->dma_status); u8 sc1d = hwif->INB((high_16 + 0x001d)); @@ -492,12 +486,7 @@ static int pdc202xx_ide_dma_timeout(ide_drive_t *drive) static void pdc202xx_reset_host (ide_hwif_t *hwif) { -#ifdef CONFIG_BLK_DEV_IDEDMA -// unsigned long high_16 = hwif->dma_base - (8*(hwif->channel)); unsigned long high_16 = hwif->dma_master; -#else /* !CONFIG_BLK_DEV_IDEDMA */ - unsigned long high_16 = pci_resource_start(hwif->pci_dev, 4); -#endif /* CONFIG_BLK_DEV_IDEDMA */ u8 udma_speed_flag = hwif->INB(high_16|0x001f); hwif->OUTB((udma_speed_flag | 0x10), (high_16|0x001f)); @@ -550,31 +539,6 @@ static void pdc202xx_reset (ide_drive_t *drive) #endif } -/* - * Since SUN Cobalt is attempting to do this operation, I should disclose - * this has been a long time ago Thu Jul 27 16:40:57 2000 was the patch date - * HOTSWAP ATA Infrastructure. - */ -static int pdc202xx_tristate (ide_drive_t * drive, int state) -{ - ide_hwif_t *hwif = HWIF(drive); -// unsigned long high_16 = hwif->dma_base - (8*(hwif->channel)); - unsigned long high_16 = hwif->dma_master; - u8 sc1f = hwif->INB(high_16|0x001f); - - if (!hwif) - return -EINVAL; - -// hwif->bus_state = state; - - if (state) { - hwif->OUTB(sc1f | 0x08, (high_16|0x001f)); - } else { - hwif->OUTB(sc1f & ~0x08, (high_16|0x001f)); - } - return 0; -} - static unsigned int __devinit init_chipset_pdc202xx(struct pci_dev *dev, const char *name) { if (dev->resource[PCI_ROM_RESOURCE].start) { @@ -624,10 +588,8 @@ static void __devinit init_hwif_pdc202xx(ide_hwif_t *hwif) hwif->tuneproc = &config_chipset_for_pio; hwif->quirkproc = &pdc202xx_quirkproc; - if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) { - hwif->busproc = &pdc202xx_tristate; + if (hwif->pci_dev->device != PCI_DEVICE_ID_PROMISE_20246) hwif->resetproc = &pdc202xx_reset; - } hwif->speedproc = &pdc202xx_tune_chipset; diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c index e9b83e1a302..7fac6f57b5d 100644 --- a/drivers/ide/pci/piix.c +++ b/drivers/ide/pci/piix.c @@ -222,6 +222,8 @@ static void piix_tune_drive (ide_drive_t *drive, u8 pio) unsigned long flags; u16 master_data; u8 slave_data; + static DEFINE_SPINLOCK(tune_lock); + /* ISP RTC */ u8 timings[][2] = { { 0, 0 }, { 0, 0 }, @@ -230,7 +232,13 @@ static void piix_tune_drive (ide_drive_t *drive, u8 pio) { 2, 3 }, }; pio = ide_get_best_pio_mode(drive, pio, 5, NULL); - spin_lock_irqsave(&ide_lock, flags); + + /* + * Master vs slave is synchronized above us but the slave register is + * shared by the two hwifs so the corner case of two slave timeouts in + * parallel must be locked. 
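+ * A private static lock suffices for that; the global ide_lock
+ * no longer needs to be held across this update.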
+ */ + spin_lock_irqsave(&tune_lock, flags); pci_read_config_word(dev, master_port, &master_data); if (is_slave) { master_data = master_data | 0x4000; @@ -250,7 +258,7 @@ static void piix_tune_drive (ide_drive_t *drive, u8 pio) pci_write_config_word(dev, master_port, master_data); if (is_slave) pci_write_config_byte(dev, slave_port, slave_data); - spin_unlock_irqrestore(&ide_lock, flags); + spin_unlock_irqrestore(&tune_lock, flags); } /** diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c index 5bda15904a0..2d5b57be98c 100644 --- a/drivers/ieee1394/eth1394.c +++ b/drivers/ieee1394/eth1394.c @@ -1074,8 +1074,7 @@ static inline int update_partial_datagram(struct list_head *pdgl, struct list_he /* Move list entry to beginnig of list so that oldest partial * datagrams percolate to the end of the list */ - list_del(lh); - list_add(lh, pdgl); + list_move(lh, pdgl); return 0; } diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c index 20ce539580f..571ea68c0cf 100644 --- a/drivers/ieee1394/raw1394.c +++ b/drivers/ieee1394/raw1394.c @@ -132,8 +132,7 @@ static void free_pending_request(struct pending_request *req) static void __queue_complete_req(struct pending_request *req) { struct file_info *fi = req->file_info; - list_del(&req->list); - list_add_tail(&req->list, &fi->req_complete); + list_move_tail(&req->list, &fi->req_complete); up(&fi->complete_sem); wake_up_interruptible(&fi->poll_wait_complete); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b38e02a5db3..5ed4dab52a6 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1775,11 +1775,9 @@ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; - if (mad_send_wr->refcount == 1) { - list_del(&mad_send_wr->agent_list); - list_add_tail(&mad_send_wr->agent_list, + if (mad_send_wr->refcount == 1) + list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->done_list); - } } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, @@ -2098,8 +2096,7 @@ retry: queued_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); - list_del(&mad_list->list); - list_add_tail(&mad_list->list, &send_queue->list); + list_move_tail(&mad_list->list, &send_queue->list); } spin_unlock_irqrestore(&send_queue->lock, flags); diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index d4704e054e3..ebcd5b18177 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -665,8 +665,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, goto out; mad_send_wr->refcount++; - list_del(&mad_send_wr->agent_list); - list_add_tail(&mad_send_wr->agent_list, + list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } out: diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 216471fa01c..ab40488182b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -864,8 +864,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) if (mcast) { /* Destroy the send only entry */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); rb_replace_node(&mcast->rb_node, &nmcast->rb_node, @@ -890,8 +889,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) rb_erase(&mcast->rb_node, 
&priv->multicast_tree); /* Move to the remove list */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); } } diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 173c899a1fb..2e541fa0202 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -87,6 +87,11 @@ struct capincci; #ifdef CONFIG_ISDN_CAPI_MIDDLEWARE struct capiminor; +struct datahandle_queue { + struct list_head list; + u16 datahandle; +}; + struct capiminor { struct list_head list; struct capincci *nccip; @@ -109,12 +114,9 @@ struct capiminor { int outbytes; /* transmit path */ - struct datahandle_queue { - struct datahandle_queue *next; - u16 datahandle; - } *ackqueue; + struct list_head ackqueue; int nack; - + spinlock_t ackqlock; }; #endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */ @@ -156,48 +158,54 @@ static LIST_HEAD(capiminor_list); static int capincci_add_ack(struct capiminor *mp, u16 datahandle) { - struct datahandle_queue *n, **pp; + struct datahandle_queue *n; + unsigned long flags; n = kmalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { - printk(KERN_ERR "capi: alloc datahandle failed\n"); - return -1; + if (unlikely(!n)) { + printk(KERN_ERR "capi: alloc datahandle failed\n"); + return -1; } - n->next = NULL; n->datahandle = datahandle; - for (pp = &mp->ackqueue; *pp; pp = &(*pp)->next) ; - *pp = n; + INIT_LIST_HEAD(&n->list); + spin_lock_irqsave(&mp->ackqlock, flags); + list_add_tail(&n->list, &mp->ackqueue); mp->nack++; + spin_unlock_irqrestore(&mp->ackqlock, flags); return 0; } static int capiminor_del_ack(struct capiminor *mp, u16 datahandle) { - struct datahandle_queue **pp, *p; + struct datahandle_queue *p, *tmp; + unsigned long flags; - for (pp = &mp->ackqueue; *pp; pp = &(*pp)->next) { - if ((*pp)->datahandle == datahandle) { - p = *pp; - *pp = (*pp)->next; + spin_lock_irqsave(&mp->ackqlock, flags); + list_for_each_entry_safe(p, tmp, &mp->ackqueue, list) { + if (p->datahandle == datahandle) { + list_del(&p->list); kfree(p); mp->nack--; + spin_unlock_irqrestore(&mp->ackqlock, flags); return 0; } } + spin_unlock_irqrestore(&mp->ackqlock, flags); return -1; } static void capiminor_del_all_ack(struct capiminor *mp) { - struct datahandle_queue **pp, *p; + struct datahandle_queue *p, *tmp; + unsigned long flags; - pp = &mp->ackqueue; - while (*pp) { - p = *pp; - *pp = (*pp)->next; + spin_lock_irqsave(&mp->ackqlock, flags); + list_for_each_entry_safe(p, tmp, &mp->ackqueue, list) { + list_del(&p->list); kfree(p); mp->nack--; } + spin_unlock_irqrestore(&mp->ackqlock, flags); } @@ -220,6 +228,8 @@ static struct capiminor *capiminor_alloc(struct capi20_appl *ap, u32 ncci) mp->ncci = ncci; mp->msgid = 0; atomic_set(&mp->ttyopencount,0); + INIT_LIST_HEAD(&mp->ackqueue); + spin_lock_init(&mp->ackqlock); skb_queue_head_init(&mp->inqueue); skb_queue_head_init(&mp->outqueue); diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c index eb41aba3dde..8a45715dd4c 100644 --- a/drivers/isdn/gigaset/bas-gigaset.c +++ b/drivers/isdn/gigaset/bas-gigaset.c @@ -65,23 +65,22 @@ static struct usb_device_id gigaset_table [] = { MODULE_DEVICE_TABLE(usb, gigaset_table); -/*======================= local function prototypes =============================*/ +/*======================= local function prototypes ==========================*/ -/* This function is called if a new device is connected to the USB port. It - * checks whether this new device belongs to this driver. 
- */ +/* function called if a new device belonging to this driver is connected */ static int gigaset_probe(struct usb_interface *interface, const struct usb_device_id *id); /* Function will be called if the device is unplugged */ static void gigaset_disconnect(struct usb_interface *interface); -static void read_ctrl_callback(struct urb *, struct pt_regs *); +static int atread_submit(struct cardstate *, int); static void stopurbs(struct bas_bc_state *); +static int req_submit(struct bc_state *, int, int, int); static int atwrite_submit(struct cardstate *, unsigned char *, int); static int start_cbsend(struct cardstate *); -/*==============================================================================*/ +/*============================================================================*/ struct bas_cardstate { struct usb_device *udev; /* USB device pointer */ @@ -91,6 +90,7 @@ struct bas_cardstate { struct urb *urb_ctrl; /* control pipe default URB */ struct usb_ctrlrequest dr_ctrl; struct timer_list timer_ctrl; /* control request timeout */ + int retry_ctrl; struct timer_list timer_atrdy; /* AT command ready timeout */ struct urb *urb_cmd_out; /* for sending AT commands */ @@ -307,6 +307,7 @@ static int gigaset_set_line_ctrl(struct cardstate *cs, unsigned cflag) * hang up any existing connection because of an unrecoverable error * This function may be called from any context and takes care of scheduling * the necessary actions for execution outside of interrupt context. + * cs->lock must not be held. * argument: * B channel control structure */ @@ -325,14 +326,17 @@ static inline void error_hangup(struct bc_state *bcs) /* error_reset * reset Gigaset device because of an unrecoverable error - * This function may be called from any context, and should take care of + * This function may be called from any context, and takes care of * scheduling the necessary actions for execution outside of interrupt context. - * Right now, it just generates a kernel message calling for help. + * cs->lock must not be held. * argument: * controller state structure */ static inline void error_reset(struct cardstate *cs) { + /* close AT command channel to recover (ignore errors) */ + req_submit(cs->bcs, HD_CLOSE_ATCHANNEL, 0, BAS_TIMEOUT); + //FIXME try to recover without bothering the user dev_err(cs->dev, "unrecoverable error - please disconnect Gigaset base to reset\n"); @@ -403,14 +407,30 @@ static void cmd_in_timeout(unsigned long data) { struct cardstate *cs = (struct cardstate *) data; struct bas_cardstate *ucs = cs->hw.bas; + int rc; if (!ucs->rcvbuf_size) { gig_dbg(DEBUG_USBREQ, "%s: no receive in progress", __func__); return; } - dev_err(cs->dev, "timeout reading AT response\n"); - error_reset(cs); //FIXME retry? 
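+	/* retry a bounded number of times before resetting the device */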
+ if (ucs->retry_cmd_in++ < BAS_RETRY) { + dev_notice(cs->dev, "control read: timeout, retry %d\n", + ucs->retry_cmd_in); + rc = atread_submit(cs, BAS_TIMEOUT); + if (rc >= 0 || rc == -ENODEV) + /* resubmitted or disconnected */ + /* - bypass regular exit block */ + return; + } else { + dev_err(cs->dev, + "control read: timeout, giving up after %d tries\n", + ucs->retry_cmd_in); + } + kfree(ucs->rcvbuf); + ucs->rcvbuf = NULL; + ucs->rcvbuf_size = 0; + error_reset(cs); } /* set/clear bits in base connection state, return previous state @@ -428,6 +448,96 @@ inline static int update_basstate(struct bas_cardstate *ucs, return state; } +/* read_ctrl_callback + * USB completion handler for control pipe input + * called by the USB subsystem in interrupt context + * parameter: + * urb USB request block + * urb->context = inbuf structure for controller state + */ +static void read_ctrl_callback(struct urb *urb, struct pt_regs *regs) +{ + struct inbuf_t *inbuf = urb->context; + struct cardstate *cs = inbuf->cs; + struct bas_cardstate *ucs = cs->hw.bas; + int have_data = 0; + unsigned numbytes; + int rc; + + update_basstate(ucs, 0, BS_ATRDPEND); + + if (!ucs->rcvbuf_size) { + dev_warn(cs->dev, "%s: no receive in progress\n", __func__); + return; + } + + del_timer(&ucs->timer_cmd_in); + + switch (urb->status) { + case 0: /* normal completion */ + numbytes = urb->actual_length; + if (unlikely(numbytes != ucs->rcvbuf_size)) { + dev_warn(cs->dev, + "control read: received %d chars, expected %d\n", + numbytes, ucs->rcvbuf_size); + if (numbytes > ucs->rcvbuf_size) + numbytes = ucs->rcvbuf_size; + } + + /* copy received bytes to inbuf */ + have_data = gigaset_fill_inbuf(inbuf, ucs->rcvbuf, numbytes); + + if (unlikely(numbytes < ucs->rcvbuf_size)) { + /* incomplete - resubmit for remaining bytes */ + ucs->rcvbuf_size -= numbytes; + ucs->retry_cmd_in = 0; + rc = atread_submit(cs, BAS_TIMEOUT); + if (rc >= 0 || rc == -ENODEV) + /* resubmitted or disconnected */ + /* - bypass regular exit block */ + return; + error_reset(cs); + } + break; + + case -ENOENT: /* cancelled */ + case -ECONNRESET: /* cancelled (async) */ + case -EINPROGRESS: /* pending */ + case -ENODEV: /* device removed */ + case -ESHUTDOWN: /* device shut down */ + /* no action necessary */ + gig_dbg(DEBUG_USBREQ, "%s: %s", + __func__, get_usb_statmsg(urb->status)); + break; + + default: /* severe trouble */ + dev_warn(cs->dev, "control read: %s\n", + get_usb_statmsg(urb->status)); + if (ucs->retry_cmd_in++ < BAS_RETRY) { + dev_notice(cs->dev, "control read: retry %d\n", + ucs->retry_cmd_in); + rc = atread_submit(cs, BAS_TIMEOUT); + if (rc >= 0 || rc == -ENODEV) + /* resubmitted or disconnected */ + /* - bypass regular exit block */ + return; + } else { + dev_err(cs->dev, + "control read: giving up after %d tries\n", + ucs->retry_cmd_in); + } + error_reset(cs); + } + + kfree(ucs->rcvbuf); + ucs->rcvbuf = NULL; + ucs->rcvbuf_size = 0; + if (have_data) { + gig_dbg(DEBUG_INTR, "%s-->BH", __func__); + gigaset_schedule_event(cs); + } +} + /* atread_submit * submit an HD_READ_ATMESSAGE command URB and optionally start a timeout * parameters: @@ -466,7 +576,7 @@ static int atread_submit(struct cardstate *cs, int timeout) if ((ret = usb_submit_urb(ucs->urb_cmd_in, SLAB_ATOMIC)) != 0) { update_basstate(ucs, 0, BS_ATRDPEND); dev_err(cs->dev, "could not submit HD_READ_ATMESSAGE: %s\n", - get_usb_statmsg(ret)); + get_usb_rcmsg(ret)); return ret; } @@ -611,9 +721,12 @@ static void read_int_callback(struct urb *urb, struct pt_regs *regs) kfree(ucs->rcvbuf); 
ucs->rcvbuf = NULL; ucs->rcvbuf_size = 0; - if (rc != -ENODEV) + if (rc != -ENODEV) { //FIXME corrective action? + spin_unlock_irqrestore(&cs->lock, flags); error_reset(cs); + break; + } } spin_unlock_irqrestore(&cs->lock, flags); break; @@ -643,97 +756,6 @@ resubmit: } } -/* read_ctrl_callback - * USB completion handler for control pipe input - * called by the USB subsystem in interrupt context - * parameter: - * urb USB request block - * urb->context = inbuf structure for controller state - */ -static void read_ctrl_callback(struct urb *urb, struct pt_regs *regs) -{ - struct inbuf_t *inbuf = urb->context; - struct cardstate *cs = inbuf->cs; - struct bas_cardstate *ucs = cs->hw.bas; - int have_data = 0; - unsigned numbytes; - int rc; - - update_basstate(ucs, 0, BS_ATRDPEND); - - if (!ucs->rcvbuf_size) { - dev_warn(cs->dev, "%s: no receive in progress\n", __func__); - return; - } - - del_timer(&ucs->timer_cmd_in); - - switch (urb->status) { - case 0: /* normal completion */ - numbytes = urb->actual_length; - if (unlikely(numbytes == 0)) { - dev_warn(cs->dev, - "control read: empty block received\n"); - goto retry; - } - if (unlikely(numbytes != ucs->rcvbuf_size)) { - dev_warn(cs->dev, - "control read: received %d chars, expected %d\n", - numbytes, ucs->rcvbuf_size); - if (numbytes > ucs->rcvbuf_size) - numbytes = ucs->rcvbuf_size; - } - - /* copy received bytes to inbuf */ - have_data = gigaset_fill_inbuf(inbuf, ucs->rcvbuf, numbytes); - - if (unlikely(numbytes < ucs->rcvbuf_size)) { - /* incomplete - resubmit for remaining bytes */ - ucs->rcvbuf_size -= numbytes; - ucs->retry_cmd_in = 0; - goto retry; - } - break; - - case -ENOENT: /* cancelled */ - case -ECONNRESET: /* cancelled (async) */ - case -EINPROGRESS: /* pending */ - case -ENODEV: /* device removed */ - case -ESHUTDOWN: /* device shut down */ - /* no action necessary */ - gig_dbg(DEBUG_USBREQ, "%s: %s", - __func__, get_usb_statmsg(urb->status)); - break; - - default: /* severe trouble */ - dev_warn(cs->dev, "control read: %s\n", - get_usb_statmsg(urb->status)); - retry: - if (ucs->retry_cmd_in++ < BAS_RETRY) { - dev_notice(cs->dev, "control read: retry %d\n", - ucs->retry_cmd_in); - rc = atread_submit(cs, BAS_TIMEOUT); - if (rc >= 0 || rc == -ENODEV) - /* resubmitted or disconnected */ - /* - bypass regular exit block */ - return; - } else { - dev_err(cs->dev, - "control read: giving up after %d tries\n", - ucs->retry_cmd_in); - } - error_reset(cs); - } - - kfree(ucs->rcvbuf); - ucs->rcvbuf = NULL; - ucs->rcvbuf_size = 0; - if (have_data) { - gig_dbg(DEBUG_INTR, "%s-->BH", __func__); - gigaset_schedule_event(cs); - } -} - /* read_iso_callback * USB completion handler for B channel isochronous input * called by the USB subsystem in interrupt context @@ -1378,6 +1400,7 @@ static void req_timeout(unsigned long data) case HD_CLOSE_B1CHANNEL: dev_err(bcs->cs->dev, "timeout closing channel %d\n", bcs->channel + 1); + error_reset(bcs->cs); break; default: @@ -1396,22 +1419,61 @@ static void req_timeout(unsigned long data) static void write_ctrl_callback(struct urb *urb, struct pt_regs *regs) { struct bas_cardstate *ucs = urb->context; + int rc; unsigned long flags; - spin_lock_irqsave(&ucs->lock, flags); - if (urb->status && ucs->pending) { - dev_err(&ucs->interface->dev, - "control request 0x%02x failed: %s\n", - ucs->pending, get_usb_statmsg(urb->status)); - del_timer(&ucs->timer_ctrl); - ucs->pending = 0; - } - /* individual handling of specific request types */ - switch (ucs->pending) { - case HD_DEVICE_INIT_ACK: /* no reply 
expected */ - ucs->pending = 0; + /* check status */ + switch (urb->status) { + case 0: /* normal completion */ + spin_lock_irqsave(&ucs->lock, flags); + switch (ucs->pending) { + case HD_DEVICE_INIT_ACK: /* no reply expected */ + del_timer(&ucs->timer_ctrl); + ucs->pending = 0; + break; + } + spin_unlock_irqrestore(&ucs->lock, flags); + return; + + case -ENOENT: /* cancelled */ + case -ECONNRESET: /* cancelled (async) */ + case -EINPROGRESS: /* pending */ + case -ENODEV: /* device removed */ + case -ESHUTDOWN: /* device shut down */ + /* ignore silently */ + gig_dbg(DEBUG_USBREQ, "%s: %s", + __func__, get_usb_statmsg(urb->status)); break; + + default: /* any failure */ + if (++ucs->retry_ctrl > BAS_RETRY) { + dev_err(&ucs->interface->dev, + "control request 0x%02x failed: %s\n", + ucs->dr_ctrl.bRequest, + get_usb_statmsg(urb->status)); + break; /* give up */ + } + dev_notice(&ucs->interface->dev, + "control request 0x%02x: %s, retry %d\n", + ucs->dr_ctrl.bRequest, get_usb_statmsg(urb->status), + ucs->retry_ctrl); + /* urb->dev is clobbered by USB subsystem */ + urb->dev = ucs->udev; + rc = usb_submit_urb(urb, SLAB_ATOMIC); + if (unlikely(rc)) { + dev_err(&ucs->interface->dev, + "could not resubmit request 0x%02x: %s\n", + ucs->dr_ctrl.bRequest, get_usb_rcmsg(rc)); + break; + } + /* resubmitted */ + return; } + + /* failed, clear pending request */ + spin_lock_irqsave(&ucs->lock, flags); + del_timer(&ucs->timer_ctrl); + ucs->pending = 0; spin_unlock_irqrestore(&ucs->lock, flags); } @@ -1455,9 +1517,11 @@ static int req_submit(struct bc_state *bcs, int req, int val, int timeout) usb_sndctrlpipe(ucs->udev, 0), (unsigned char*) &ucs->dr_ctrl, NULL, 0, write_ctrl_callback, ucs); - if ((ret = usb_submit_urb(ucs->urb_ctrl, SLAB_ATOMIC)) != 0) { + ucs->retry_ctrl = 0; + ret = usb_submit_urb(ucs->urb_ctrl, SLAB_ATOMIC); + if (unlikely(ret)) { dev_err(bcs->cs->dev, "could not submit request 0x%02x: %s\n", - req, get_usb_statmsg(ret)); + req, get_usb_rcmsg(ret)); spin_unlock_irqrestore(&ucs->lock, flags); return ret; } diff --git a/drivers/isdn/gigaset/ev-layer.c b/drivers/isdn/gigaset/ev-layer.c index 18e05c09b71..44f02dbd111 100644 --- a/drivers/isdn/gigaset/ev-layer.c +++ b/drivers/isdn/gigaset/ev-layer.c @@ -1262,7 +1262,8 @@ static void do_action(int action, struct cardstate *cs, break; case ACT_HUPMODEM: /* send "+++" (hangup in unimodem mode) */ - cs->ops->write_cmd(cs, "+++", 3, NULL); + if (cs->connected) + cs->ops->write_cmd(cs, "+++", 3, NULL); break; case ACT_RING: /* get fresh AT state structure for new CID */ @@ -1294,7 +1295,6 @@ static void do_action(int action, struct cardstate *cs, break; case ACT_ICALL: handle_icall(cs, bcs, p_at_state); - at_state = *p_at_state; break; case ACT_FAILSDOWN: dev_warn(cs->dev, "Could not shut down the device.\n"); @@ -1334,10 +1334,8 @@ static void do_action(int action, struct cardstate *cs, */ at_state->pending_commands |= PC_DLE0; atomic_set(&cs->commands_pending, 1); - } else { + } else disconnect(p_at_state); - at_state = *p_at_state; - } break; case ACT_FAKEDLE0: at_state->int_var[VAR_ZDLE] = 0; @@ -1354,10 +1352,8 @@ static void do_action(int action, struct cardstate *cs, at_state->cid = -1; if (bcs && cs->onechannel) at_state->pending_commands |= PC_DLE0; - else { + else disconnect(p_at_state); - at_state = *p_at_state; - } schedule_init(cs, MS_RECOVER); break; case ACT_FAILDLE0: @@ -1410,7 +1406,6 @@ static void do_action(int action, struct cardstate *cs, case ACT_ABORTACCEPT: /* hangup/error/timeout during ICALL processing */ 
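The rewritten write_ctrl_callback() and req_submit() cooperate on a per-request retry budget: req_submit() zeroes retry_ctrl when it issues a fresh request, and only the completion handler increments it, so every request starts with the full budget. (The `urb->dev = ucs->udev;` line before resubmission matters too, since the USB core clobbers that field on completion.) A minimal sketch of the counter handshake, with an invented struct name:

    #include <stdio.h>

    #define BAS_RETRY 3

    struct ctrl_state { int retry_ctrl; };

    static void submit_request(struct ctrl_state *s)
    {
            s->retry_ctrl = 0;      /* fresh request: full retry budget */
    }

    static int on_failure(struct ctrl_state *s)
    {
            if (++s->retry_ctrl > BAS_RETRY)
                    return -1;      /* give up, clear the pending request */
            return 0;               /* resubmit the same request */
    }

    int main(void)
    {
            struct ctrl_state s;

            submit_request(&s);
            while (on_failure(&s) == 0)
                    printf("retry %d\n", s.retry_ctrl);
            printf("gave up after %d tries\n", s.retry_ctrl - 1);
            return 0;
    }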
disconnect(p_at_state); - at_state = *p_at_state; break; case ACT_ABORTDIAL: /* error/timeout during dial preparation */ diff --git a/drivers/isdn/hisax/q931.c b/drivers/isdn/hisax/q931.c index abecabf8c27..aacbf0d14b6 100644 --- a/drivers/isdn/hisax/q931.c +++ b/drivers/isdn/hisax/q931.c @@ -1402,12 +1402,12 @@ dlogframe(struct IsdnCardState *cs, struct sk_buff *skb, int dir) } /* No, locate it in the table */ if (cset == 0) { - for (i = 0; i < IESIZE; i++) + for (i = 0; i < IESIZE_NI1; i++) if (*buf == ielist_ni1[i].nr) break; /* When not found, give appropriate msg */ - if (i != IESIZE) { + if (i != IESIZE_NI1) { dp += sprintf(dp, " %s\n", ielist_ni1[i].descr); dp += ielist_ni1[i].f(dp, buf); } else diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac25a48362a..bf869ed03ee 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -90,7 +90,7 @@ config MD_RAID10 depends on BLK_DEV_MD && EXPERIMENTAL ---help--- RAID-10 provides a combination of striping (RAID-0) and - mirroring (RAID-1) with easier configuration and more flexable + mirroring (RAID-1) with easier configuration and more flexible layout. Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to be the same size (or at least, only as much as the smallest device @@ -104,8 +104,8 @@ config MD_RAID10 If unsure, say Y. -config MD_RAID5 - tristate "RAID-4/RAID-5 mode" +config MD_RAID456 + tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides @@ -116,20 +116,28 @@ config MD_RAID5 while a RAID-5 set distributes the parity across the drives in one of the available parity distribution methods. + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + Information about Software RAID on Linux is contained in the Software-RAID mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. There you will also learn where to get the supporting user space utilities raidtools. - If you want to use such a RAID-4/RAID-5 set, say Y. To + If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. To compile this code as a module, choose M here: the module - will be called raid5. + will be called raid456. If unsure, say Y. config MD_RAID5_RESHAPE bool "Support adding drives to a raid-5 array (experimental)" - depends on MD_RAID5 && EXPERIMENTAL + depends on MD_RAID456 && EXPERIMENTAL ---help--- A RAID-5 set can be expanded by adding extra drives. This requires "restriping" the array which means (almost) every @@ -139,7 +147,7 @@ config MD_RAID5_RESHAPE is online. However it is still EXPERIMENTAL code. It should work, but please be sure that you have backups. - You will need mdadm verion 2.4.1 or later to use this + You will need mdadm version 2.4.1 or later to use this feature safely. During the early stage of reshape there is a critical section where live data is being over-written. A crash during this time needs extra care for recovery. The @@ -154,28 +162,6 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. 
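The q931.c hunk above is a classic wrong-bound fix: the NI1 information-element table was being scanned with IESIZE, the length of a different table, so the found/not-found test against the loop bound could misfire. A self-contained model of the corrected scan, with invented table entries:

    #include <stdio.h>

    struct ie { int nr; const char *descr; };

    static const struct ie ielist_ni1[] = {
            { 0x04, "Bearer Capability" },
            { 0x18, "Channel Identification" },
    };
    #define IESIZE_NI1 (sizeof(ielist_ni1) / sizeof(ielist_ni1[0]))

    static const char *lookup(int nr)
    {
            size_t i;

            /* the bound must match the table actually scanned */
            for (i = 0; i < IESIZE_NI1; i++)
                    if (ielist_ni1[i].nr == nr)
                            break;
            return i != IESIZE_NI1 ? ielist_ni1[i].descr : "unknown IE";
    }

    int main(void)
    {
            printf("%s\n%s\n", lookup(0x18), lookup(0x7f));
            return 0;
    }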
-config MD_RAID6 - tristate "RAID-6 mode" - depends on BLK_DEV_MD - ---help--- - A RAID-6 set of N drives with a capacity of C MB per drive - provides the capacity of C * (N - 2) MB, and protects - against a failure of any two drives. For a given sector - (row) number, (N - 2) drives contain data sectors, and two - drives contains two independent redundancy syndromes. Like - RAID-5, RAID-6 distributes the syndromes across the drives - in one of the available parity distribution methods. - - RAID-6 requires mdadm-1.5.0 or later, available at: - - ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/ - - If you want to use such a RAID-6 set, say Y. To compile - this code as a module, choose M here: the module will be - called raid6. - - If unsure, say Y. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD @@ -235,7 +221,7 @@ config DM_SNAPSHOT tristate "Snapshot target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL ---help--- - Allow volume managers to take writeable snapshots of a device. + Allow volume managers to take writable snapshots of a device. config DM_MIRROR tristate "Mirror target (EXPERIMENTAL)" diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d3efedf6a6a..34957a68d92 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -8,7 +8,7 @@ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-mirror-objs := dm-log.o dm-raid1.o md-mod-objs := md.o bitmap.o -raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ +raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ @@ -25,8 +25,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o -obj-$(CONFIG_MD_RAID6) += raid6.o xor.o +obj-$(CONFIG_MD_RAID456) += raid456.o xor.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index f8ffaee20ff..ebbd2d85625 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -7,7 +7,6 @@ * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: * - added disk storage for bitmap * - changes to allow various bitmap chunk sizes - * - added bitmap daemon (to asynchronously clear bitmap bits from disk) */ /* @@ -15,9 +14,6 @@ * * flush after percent set rather than just time based. (maybe both). * wait if count gets too high, wake when it drops to half. - * allow bitmap to be mirrored with superblock (before or after...) - * allow hot-add to re-instate a current device. 
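The RAID-6 help text now folded into MD_RAID456 gives capacity as C * (N - 2), two drives' worth of space going to the syndromes, versus C * (N - 1) for RAID-5. As a quick worked example with invented numbers, eight 500 MB members yield 3000 MB usable under RAID-6 and survive any two failures; the same arithmetic in a few lines of C:

    #include <stdio.h>

    int main(void)
    {
            unsigned long c = 500;  /* MB per drive (assumed) */
            unsigned long n = 8;    /* number of drives (assumed) */

            printf("RAID-5 capacity: %lu MB, survives 1 failure\n",
                   c * (n - 1));
            printf("RAID-6 capacity: %lu MB, survives 2 failures\n",
                   c * (n - 2));
            return 0;
    }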
- * allow hot-add of bitmap after quiessing device */ #include <linux/module.h> @@ -73,24 +69,6 @@ static inline char * bmname(struct bitmap *bitmap) /* - * test if the bitmap is active - */ -int bitmap_active(struct bitmap *bitmap) -{ - unsigned long flags; - int res = 0; - - if (!bitmap) - return res; - spin_lock_irqsave(&bitmap->lock, flags); - res = bitmap->flags & BITMAP_ACTIVE; - spin_unlock_irqrestore(&bitmap->lock, flags); - return res; -} - -#define WRITE_POOL_SIZE 256 - -/* * just a placeholder - calls kmalloc for bitmap pages */ static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) @@ -269,6 +247,8 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { page->index = index; + attach_page_buffers(page, NULL); /* so that free_buffer will + * quietly no-op */ return page; } } @@ -300,77 +280,132 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai */ static int write_page(struct bitmap *bitmap, struct page *page, int wait) { - int ret = -ENOMEM; + struct buffer_head *bh; if (bitmap->file == NULL) return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); - flush_dcache_page(page); /* make sure visible to anyone reading the file */ + bh = page_buffers(page); - if (wait) - lock_page(page); - else { - if (TestSetPageLocked(page)) - return -EAGAIN; /* already locked */ - if (PageWriteback(page)) { - unlock_page(page); - return -EAGAIN; - } + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(WRITE, bh); + bh = bh->b_this_page; } - ret = page->mapping->a_ops->prepare_write(bitmap->file, page, 0, PAGE_SIZE); - if (!ret) - ret = page->mapping->a_ops->commit_write(bitmap->file, page, 0, - PAGE_SIZE); - if (ret) { - unlock_page(page); - return ret; + if (wait) { + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; } + return 0; +} - set_page_dirty(page); /* force it to be written out */ - - if (!wait) { - /* add to list to be waited for by daemon */ - struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); - item->page = page; - get_page(page); - spin_lock(&bitmap->write_lock); - list_add(&item->list, &bitmap->complete_pages); - spin_unlock(&bitmap->write_lock); - md_wakeup_thread(bitmap->writeback_daemon); +static void end_bitmap_write(struct buffer_head *bh, int uptodate) +{ + struct bitmap *bitmap = bh->b_private; + unsigned long flags; + + if (!uptodate) { + spin_lock_irqsave(&bitmap->lock, flags); + bitmap->flags |= BITMAP_WRITE_ERROR; + spin_unlock_irqrestore(&bitmap->lock, flags); + } + if (atomic_dec_and_test(&bitmap->pending_writes)) + wake_up(&bitmap->write_wait); +} + +/* copied from buffer.c */ +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +} +static void free_buffers(struct page *page) +{ + struct buffer_head *bh = page_buffers(page); + + while (bh) { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; } - return write_one_page(page, wait); + __clear_page_buffers(page); + put_page(page); } -/* read a page from a file, pinning it into cache, and return bytes_read */ +/* read a page from a file. + * We both read the page, and attach buffers to the page to record the + * address of each block (using bmap). 
These addresses will be used + * to write the block later, completely bypassing the filesystem. + * This usage is similar to how swap files are handled, and allows us + * to write to a file with no concerns of memory allocation failing. + */ static struct page *read_page(struct file *file, unsigned long index, - unsigned long *bytes_read) + struct bitmap *bitmap, + unsigned long count) { - struct inode *inode = file->f_mapping->host; struct page *page = NULL; - loff_t isize = i_size_read(inode); - unsigned long end_index = isize >> PAGE_SHIFT; + struct inode *inode = file->f_dentry->d_inode; + struct buffer_head *bh; + sector_t block; PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - page = read_cache_page(inode->i_mapping, index, - (filler_t *)inode->i_mapping->a_ops->readpage, file); + page = alloc_page(GFP_KERNEL); + if (!page) + page = ERR_PTR(-ENOMEM); if (IS_ERR(page)) goto out; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) { + + bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); + if (!bh) { put_page(page); - page = ERR_PTR(-EIO); + page = ERR_PTR(-ENOMEM); goto out; } + attach_page_buffers(page, bh); + block = index << (PAGE_SHIFT - inode->i_blkbits); + while (bh) { + if (count == 0) + bh->b_blocknr = 0; + else { + bh->b_blocknr = bmap(inode, block); + if (bh->b_blocknr == 0) { + /* Cannot use this file! */ + free_buffers(page); + page = ERR_PTR(-EINVAL); + goto out; + } + bh->b_bdev = inode->i_sb->s_bdev; + if (count < (1<<inode->i_blkbits)) + count = 0; + else + count -= (1<<inode->i_blkbits); + + bh->b_end_io = end_bitmap_write; + bh->b_private = bitmap; + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(READ, bh); + } + block++; + bh = bh->b_this_page; + } + page->index = index; - if (index > end_index) /* we have read beyond EOF */ - *bytes_read = 0; - else if (index == end_index) /* possible short read */ - *bytes_read = isize & ~PAGE_MASK; - else - *bytes_read = PAGE_SIZE; /* got a full page */ + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + if (bitmap->flags & BITMAP_WRITE_ERROR) { + free_buffers(page); + page = ERR_PTR(-EIO); + } out: if (IS_ERR(page)) printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", @@ -441,16 +476,14 @@ static int bitmap_read_sb(struct bitmap *bitmap) char *reason = NULL; bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - unsigned long bytes_read; unsigned long long events; int err = -EINVAL; /* page 0 is the superblock, read it... 
*/ if (bitmap->file) - bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read); + bitmap->sb_page = read_page(bitmap->file, 0, bitmap, PAGE_SIZE); else { bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0); - bytes_read = PAGE_SIZE; } if (IS_ERR(bitmap->sb_page)) { err = PTR_ERR(bitmap->sb_page); @@ -460,13 +493,6 @@ static int bitmap_read_sb(struct bitmap *bitmap) sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); - if (bytes_read < sizeof(*sb)) { /* short read */ - printk(KERN_INFO "%s: bitmap file superblock truncated\n", - bmname(bitmap)); - err = -ENOSPC; - goto out; - } - chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); write_behind = le32_to_cpu(sb->write_behind); @@ -550,7 +576,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, spin_unlock_irqrestore(&bitmap->lock, flags); return; } - get_page(bitmap->sb_page); spin_unlock_irqrestore(&bitmap->lock, flags); sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); switch (op) { @@ -561,7 +586,6 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, default: BUG(); } kunmap_atomic(sb, KM_USER0); - put_page(bitmap->sb_page); } /* @@ -614,48 +638,17 @@ static void bitmap_file_unmap(struct bitmap *bitmap) while (pages--) if (map[pages]->index != 0) /* 0 is sb_page, release it below */ - put_page(map[pages]); + free_buffers(map[pages]); kfree(map); kfree(attr); - safe_put_page(sb_page); -} - -static void bitmap_stop_daemon(struct bitmap *bitmap); - -/* dequeue the next item in a page list -- don't call from irq context */ -static struct page_list *dequeue_page(struct bitmap *bitmap) -{ - struct page_list *item = NULL; - struct list_head *head = &bitmap->complete_pages; - - spin_lock(&bitmap->write_lock); - if (list_empty(head)) - goto out; - item = list_entry(head->prev, struct page_list, list); - list_del(head->prev); -out: - spin_unlock(&bitmap->write_lock); - return item; -} - -static void drain_write_queues(struct bitmap *bitmap) -{ - struct page_list *item; - - while ((item = dequeue_page(bitmap))) { - /* don't bother to wait */ - put_page(item->page); - mempool_free(item, bitmap->write_pool); - } - - wake_up(&bitmap->write_wait); + if (sb_page) + free_buffers(sb_page); } static void bitmap_file_put(struct bitmap *bitmap) { struct file *file; - struct inode *inode; unsigned long flags; spin_lock_irqsave(&bitmap->lock, flags); @@ -663,17 +656,14 @@ static void bitmap_file_put(struct bitmap *bitmap) bitmap->file = NULL; spin_unlock_irqrestore(&bitmap->lock, flags); - bitmap_stop_daemon(bitmap); - - drain_write_queues(bitmap); - + if (file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); bitmap_file_unmap(bitmap); if (file) { - inode = file->f_mapping->host; - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); /* allow writes again */ - spin_unlock(&inode->i_lock); + struct inode *inode = file->f_dentry->d_inode; + invalidate_inode_pages(inode->i_mapping); fput(file); } } @@ -708,26 +698,27 @@ static void bitmap_file_kick(struct bitmap *bitmap) } enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced - BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared - BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced + BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced + BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared + BITMAP_PAGE_NEEDWRITE=2, // there are 
cleared bits that need to be synced }; static inline void set_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - bitmap->filemap_attr[page->index] |= attr; + __set_bit((page->index<<2) + attr, bitmap->filemap_attr); } static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, enum bitmap_page_attr attr) { - bitmap->filemap_attr[page->index] &= ~attr; + __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); } -static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page) +static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) { - return bitmap->filemap_attr[page->index]; + return test_bit((page->index<<2) + attr, bitmap->filemap_attr); } /* @@ -751,11 +742,6 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) page = filemap_get_page(bitmap, chunk); bit = file_page_offset(chunk); - - /* make sure the page stays cached until it gets written out */ - if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY)) - get_page(page); - /* set the bit */ kaddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) @@ -775,7 +761,8 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) * sync the dirty pages of the bitmap file to disk */ int bitmap_unplug(struct bitmap *bitmap) { - unsigned long i, attr, flags; + unsigned long i, flags; + int dirty, need_write; struct page *page; int wait = 0; int err; @@ -792,35 +779,26 @@ int bitmap_unplug(struct bitmap *bitmap) return 0; } page = bitmap->filemap[i]; - attr = get_page_attr(bitmap, page); + dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - if ((attr & BITMAP_PAGE_DIRTY)) + if (dirty) wait = 1; spin_unlock_irqrestore(&bitmap->lock, flags); - if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) { + if (dirty | need_write) err = write_page(bitmap, page, 0); - if (err == -EAGAIN) { - if (attr & BITMAP_PAGE_DIRTY) - err = write_page(bitmap, page, 1); - else - err = 0; - } - if (err) - return 1; - } } if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->file) { - spin_lock_irq(&bitmap->write_lock); - wait_event_lock_irq(bitmap->write_wait, - list_empty(&bitmap->complete_pages), bitmap->write_lock, - wake_up_process(bitmap->writeback_daemon->tsk)); - spin_unlock_irq(&bitmap->write_lock); - } else + if (bitmap->file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else md_super_wait(bitmap->mddev); } + if (bitmap->flags & BITMAP_WRITE_ERROR) + bitmap_file_kick(bitmap); return 0; } @@ -842,7 +820,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) struct page *page = NULL, *oldpage = NULL; unsigned long num_pages, bit_cnt = 0; struct file *file; - unsigned long bytes, offset, dummy; + unsigned long bytes, offset; int outofdate; int ret = -ENOSPC; void *paddr; @@ -879,7 +857,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (!bitmap->filemap) goto out; - bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL); + /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ + bitmap->filemap_attr = kzalloc( + (((num_pages*4/8)+sizeof(unsigned long)-1) + /sizeof(unsigned long)) + *sizeof(unsigned long), + GFP_KERNEL); if (!bitmap->filemap_attr) 
goto out; @@ -890,7 +873,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) index = file_page_index(i); bit = file_page_offset(i); if (index != oldindex) { /* this is a new page, read it in */ + int count; /* unmap the old page, we're done with it */ + if (index == num_pages-1) + count = bytes - index * PAGE_SIZE; + else + count = PAGE_SIZE; if (index == 0) { /* * if we're here then the superblock page @@ -900,7 +888,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) page = bitmap->sb_page; offset = sizeof(bitmap_super_t); } else if (file) { - page = read_page(file, index, &dummy); + page = read_page(file, index, bitmap, count); offset = 0; } else { page = read_sb_page(bitmap->mddev, bitmap->offset, index); @@ -971,12 +959,11 @@ void bitmap_write_all(struct bitmap *bitmap) /* We don't actually write all bitmap blocks here, * just flag them as needing to be written */ + int i; - unsigned long chunks = bitmap->chunks; - unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t); - unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE; - while (num_pages--) - bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE; + for (i=0; i < bitmap->file_pages; i++) + set_page_attr(bitmap, bitmap->filemap[i], + BITMAP_PAGE_NEEDWRITE); } @@ -1007,7 +994,6 @@ int bitmap_daemon_work(struct bitmap *bitmap) struct page *page = NULL, *lastpage = NULL; int err = 0; int blocks; - int attr; void *paddr; if (bitmap == NULL) @@ -1029,43 +1015,34 @@ int bitmap_daemon_work(struct bitmap *bitmap) if (page != lastpage) { /* skip this page unless it's marked as needing cleaning */ - if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) { - if (attr & BITMAP_PAGE_NEEDWRITE) { - get_page(page); + if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) { + int need_write = test_page_attr(bitmap, page, + BITMAP_PAGE_NEEDWRITE); + if (need_write) clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - } + spin_unlock_irqrestore(&bitmap->lock, flags); - if (attr & BITMAP_PAGE_NEEDWRITE) { + if (need_write) { switch (write_page(bitmap, page, 0)) { - case -EAGAIN: - set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - break; case 0: break; default: bitmap_file_kick(bitmap); } - put_page(page); } continue; } /* grab the new page, sync and release the old */ - get_page(page); if (lastpage != NULL) { - if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { + if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); err = write_page(bitmap, lastpage, 0); - if (err == -EAGAIN) { - err = 0; - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - } } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); } - put_page(lastpage); if (err) bitmap_file_kick(bitmap); } else @@ -1107,131 +1084,19 @@ int bitmap_daemon_work(struct bitmap *bitmap) /* now sync the final page */ if (lastpage != NULL) { spin_lock_irqsave(&bitmap->lock, flags); - if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { + if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); err = write_page(bitmap, lastpage, 0); - if (err == -EAGAIN) { - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - err = 0; - } } else { set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, 
flags); } - - put_page(lastpage); } return err; } -static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon) -{ - mdk_thread_t *dmn; - unsigned long flags; - - /* if no one is waiting on us, we'll free the md thread struct - * and exit, otherwise we let the waiter clean things up */ - spin_lock_irqsave(&bitmap->lock, flags); - if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */ - *daemon = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - kfree(dmn); - complete_and_exit(NULL, 0); /* do_exit not exported */ - } - spin_unlock_irqrestore(&bitmap->lock, flags); -} - -static void bitmap_writeback_daemon(mddev_t *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - struct page *page; - struct page_list *item; - int err = 0; - - if (signal_pending(current)) { - printk(KERN_INFO - "%s: bitmap writeback daemon got signal, exiting...\n", - bmname(bitmap)); - err = -EINTR; - goto out; - } - if (bitmap == NULL) - /* about to be stopped. */ - return; - - PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); - /* wait on bitmap page writebacks */ - while ((item = dequeue_page(bitmap))) { - page = item->page; - mempool_free(item, bitmap->write_pool); - PRINTK("wait on page writeback: %p\n", page); - wait_on_page_writeback(page); - PRINTK("finished page writeback: %p\n", page); - - err = PageError(page); - put_page(page); - if (err) { - printk(KERN_WARNING "%s: bitmap file writeback " - "failed (page %lu): %d\n", - bmname(bitmap), page->index, err); - bitmap_file_kick(bitmap); - goto out; - } - } - out: - wake_up(&bitmap->write_wait); - if (err) { - printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n", - bmname(bitmap), err); - daemon_exit(bitmap, &bitmap->writeback_daemon); - } -} - -static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap, - void (*func)(mddev_t *), char *name) -{ - mdk_thread_t *daemon; - char namebuf[32]; - -#ifdef INJECT_FATAL_FAULT_2 - daemon = NULL; -#else - sprintf(namebuf, "%%s_%s", name); - daemon = md_register_thread(func, bitmap->mddev, namebuf); -#endif - if (!daemon) { - printk(KERN_ERR "%s: failed to start bitmap daemon\n", - bmname(bitmap)); - return ERR_PTR(-ECHILD); - } - - md_wakeup_thread(daemon); /* start it running */ - - PRINTK("%s: %s daemon (pid %d) started...\n", - bmname(bitmap), name, daemon->tsk->pid); - - return daemon; -} - -static void bitmap_stop_daemon(struct bitmap *bitmap) -{ - /* the daemon can't stop itself... it'll just exit instead... */ - if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) && - current->pid != bitmap->writeback_daemon->tsk->pid) { - mdk_thread_t *daemon; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - daemon = bitmap->writeback_daemon; - bitmap->writeback_daemon = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - if (daemon && ! 
IS_ERR(daemon)) - md_unregister_thread(daemon); /* destroy the thread */ - } -} - static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, sector_t offset, int *blocks, int create) @@ -1500,8 +1365,6 @@ static void bitmap_free(struct bitmap *bitmap) /* free all allocated memory */ - mempool_destroy(bitmap->write_pool); - if (bp) /* deallocate the page memory */ for (k = 0; k < pages; k++) if (bp[k].map && !bp[k].hijacked) @@ -1549,20 +1412,20 @@ int bitmap_create(mddev_t *mddev) return -ENOMEM; spin_lock_init(&bitmap->lock); - bitmap->mddev = mddev; - - spin_lock_init(&bitmap->write_lock); - INIT_LIST_HEAD(&bitmap->complete_pages); + atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); - bitmap->write_pool = mempool_create_kmalloc_pool(WRITE_POOL_SIZE, - sizeof(struct page_list)); - err = -ENOMEM; - if (!bitmap->write_pool) - goto error; + + bitmap->mddev = mddev; bitmap->file = file; bitmap->offset = mddev->bitmap_offset; - if (file) get_file(file); + if (file) { + get_file(file); + do_sync_file_range(file, 0, LLONG_MAX, + SYNC_FILE_RANGE_WAIT_BEFORE | + SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER); + } /* read superblock from bitmap file (this sets bitmap->chunksize) */ err = bitmap_read_sb(bitmap); if (err) @@ -1594,8 +1457,6 @@ int bitmap_create(mddev_t *mddev) if (!bitmap->bp) goto error; - bitmap->flags |= BITMAP_ACTIVE; - /* now that we have some pages available, initialize the in-memory * bitmap from the on-disk bitmap */ start = 0; @@ -1613,15 +1474,6 @@ int bitmap_create(mddev_t *mddev) mddev->bitmap = bitmap; - if (file) - /* kick off the bitmap writeback daemon */ - bitmap->writeback_daemon = - bitmap_start_daemon(bitmap, - bitmap_writeback_daemon, - "bitmap_wb"); - - if (IS_ERR(bitmap->writeback_daemon)) - return PTR_ERR(bitmap->writeback_daemon); mddev->thread->timeout = bitmap->daemon_sleep * HZ; return bitmap_update_sb(bitmap); @@ -1638,4 +1490,3 @@ EXPORT_SYMBOL(bitmap_start_sync); EXPORT_SYMBOL(bitmap_end_sync); EXPORT_SYMBOL(bitmap_unplug); EXPORT_SYMBOL(bitmap_close_sync); -EXPORT_SYMBOL(bitmap_daemon_work); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 61a590bb624..6022ed12a79 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -20,7 +20,7 @@ #include "dm.h" -#define PFX "crypt: " +#define DM_MSG_PREFIX "crypt" /* * per bio private data @@ -125,19 +125,19 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, u8 *salt; if (opts == NULL) { - ti->error = PFX "Digest algorithm missing for ESSIV mode"; + ti->error = "Digest algorithm missing for ESSIV mode"; return -EINVAL; } /* Hash the cipher key with the given hash algorithm */ hash_tfm = crypto_alloc_tfm(opts, CRYPTO_TFM_REQ_MAY_SLEEP); if (hash_tfm == NULL) { - ti->error = PFX "Error initializing ESSIV hash"; + ti->error = "Error initializing ESSIV hash"; return -EINVAL; } if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) { - ti->error = PFX "Expected digest algorithm for ESSIV hash"; + ti->error = "Expected digest algorithm for ESSIV hash"; crypto_free_tfm(hash_tfm); return -EINVAL; } @@ -145,7 +145,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, saltsize = crypto_tfm_alg_digestsize(hash_tfm); salt = kmalloc(saltsize, GFP_KERNEL); if (salt == NULL) { - ti->error = PFX "Error kmallocing salt storage in ESSIV"; + ti->error = "Error kmallocing salt storage in ESSIV"; crypto_free_tfm(hash_tfm); return -ENOMEM; } @@ -159,20 +159,20 @@ static int crypt_iv_essiv_ctr(struct 
crypt_config *cc, struct dm_target *ti, CRYPTO_TFM_MODE_ECB | CRYPTO_TFM_REQ_MAY_SLEEP); if (essiv_tfm == NULL) { - ti->error = PFX "Error allocating crypto tfm for ESSIV"; + ti->error = "Error allocating crypto tfm for ESSIV"; kfree(salt); return -EINVAL; } if (crypto_tfm_alg_blocksize(essiv_tfm) != crypto_tfm_alg_ivsize(cc->tfm)) { - ti->error = PFX "Block size of ESSIV cipher does " + ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_tfm(essiv_tfm); kfree(salt); return -EINVAL; } if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) { - ti->error = PFX "Failed to set key for ESSIV cipher"; + ti->error = "Failed to set key for ESSIV cipher"; crypto_free_tfm(essiv_tfm); kfree(salt); return -EINVAL; @@ -521,7 +521,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned long long tmpll; if (argc != 5) { - ti->error = PFX "Not enough arguments"; + ti->error = "Not enough arguments"; return -EINVAL; } @@ -532,21 +532,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ivmode = strsep(&ivopts, ":"); if (tmp) - DMWARN(PFX "Unexpected additional cipher options"); + DMWARN("Unexpected additional cipher options"); key_size = strlen(argv[1]) >> 1; cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); if (cc == NULL) { ti->error = - PFX "Cannot allocate transparent encryption context"; + "Cannot allocate transparent encryption context"; return -ENOMEM; } cc->key_size = key_size; if ((!key_size && strcmp(argv[1], "-") != 0) || (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) { - ti->error = PFX "Error decoding key"; + ti->error = "Error decoding key"; goto bad1; } @@ -562,22 +562,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (strcmp(chainmode, "ecb") == 0) crypto_flags = CRYPTO_TFM_MODE_ECB; else { - ti->error = PFX "Unknown chaining mode"; + ti->error = "Unknown chaining mode"; goto bad1; } if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) { - ti->error = PFX "This chaining mode requires an IV mechanism"; + ti->error = "This chaining mode requires an IV mechanism"; goto bad1; } tfm = crypto_alloc_tfm(cipher, crypto_flags | CRYPTO_TFM_REQ_MAY_SLEEP); if (!tfm) { - ti->error = PFX "Error allocating crypto tfm"; + ti->error = "Error allocating crypto tfm"; goto bad1; } if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) { - ti->error = PFX "Expected cipher algorithm"; + ti->error = "Expected cipher algorithm"; goto bad2; } @@ -595,7 +595,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; else { - ti->error = PFX "Invalid IV mode"; + ti->error = "Invalid IV mode"; goto bad2; } @@ -610,7 +610,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) else { cc->iv_size = 0; if (cc->iv_gen_ops) { - DMWARN(PFX "Selected cipher does not support IVs"); + DMWARN("Selected cipher does not support IVs"); if (cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); cc->iv_gen_ops = NULL; @@ -619,36 +619,36 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { - ti->error = PFX "Cannot allocate crypt io mempool"; + ti->error = "Cannot allocate crypt io mempool"; goto bad3; } cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { - ti->error = PFX "Cannot allocate page mempool"; + 
ti->error = "Cannot allocate page mempool"; goto bad4; } if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) { - ti->error = PFX "Error setting key"; + ti->error = "Error setting key"; goto bad5; } if (sscanf(argv[2], "%llu", &tmpll) != 1) { - ti->error = PFX "Invalid iv_offset sector"; + ti->error = "Invalid iv_offset sector"; goto bad5; } cc->iv_offset = tmpll; if (sscanf(argv[4], "%llu", &tmpll) != 1) { - ti->error = PFX "Invalid device sector"; + ti->error = "Invalid device sector"; goto bad5; } cc->start = tmpll; if (dm_get_device(ti, argv[3], cc->start, ti->len, dm_table_get_mode(ti->table), &cc->dev)) { - ti->error = PFX "Device lookup failed"; + ti->error = "Device lookup failed"; goto bad5; } @@ -657,7 +657,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) *(ivopts - 1) = ':'; cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); if (!cc->iv_mode) { - ti->error = PFX "Error kmallocing iv_mode string"; + ti->error = "Error kmallocing iv_mode string"; goto bad5; } strcpy(cc->iv_mode, ivmode); @@ -918,13 +918,13 @@ static int __init dm_crypt_init(void) _kcryptd_workqueue = create_workqueue("kcryptd"); if (!_kcryptd_workqueue) { r = -ENOMEM; - DMERR(PFX "couldn't create kcryptd"); + DMERR("couldn't create kcryptd"); goto bad1; } r = dm_register_target(&crypt_target); if (r < 0) { - DMERR(PFX "register failed %d", r); + DMERR("register failed %d", r); goto bad2; } @@ -942,7 +942,7 @@ static void __exit dm_crypt_exit(void) int r = dm_unregister_target(&crypt_target); if (r < 0) - DMERR(PFX "unregister failed %d", r); + DMERR("unregister failed %d", r); destroy_workqueue(_kcryptd_workqueue); kmem_cache_destroy(_crypt_io_pool); diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c index c7067674dcb..2a374ccb30d 100644 --- a/drivers/md/dm-emc.c +++ b/drivers/md/dm-emc.c @@ -12,6 +12,8 @@ #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> +#define DM_MSG_PREFIX "multipath emc" + struct emc_handler { spinlock_t lock; @@ -66,7 +68,7 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size) bio = bio_alloc(GFP_ATOMIC, 1); if (!bio) { - DMERR("dm-emc: get_failover_bio: bio_alloc() failed."); + DMERR("get_failover_bio: bio_alloc() failed."); return NULL; } @@ -78,13 +80,13 @@ static struct bio *get_failover_bio(struct path *path, unsigned data_size) page = alloc_page(GFP_ATOMIC); if (!page) { - DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + DMERR("get_failover_bio: alloc_page() failed."); bio_put(bio); return NULL; } if (bio_add_page(bio, page, data_size, 0) != data_size) { - DMERR("dm-emc: get_failover_bio: alloc_page() failed."); + DMERR("get_failover_bio: alloc_page() failed."); __free_page(page); bio_put(bio); return NULL; @@ -103,7 +105,7 @@ static struct request *get_failover_req(struct emc_handler *h, /* FIXME: Figure out why it fails with GFP_ATOMIC. 
*/ rq = blk_get_request(q, WRITE, __GFP_WAIT); if (!rq) { - DMERR("dm-emc: get_failover_req: blk_get_request failed"); + DMERR("get_failover_req: blk_get_request failed"); return NULL; } @@ -160,7 +162,7 @@ static struct request *emc_trespass_get(struct emc_handler *h, bio = get_failover_bio(path, data_size); if (!bio) { - DMERR("dm-emc: emc_trespass_get: no bio"); + DMERR("emc_trespass_get: no bio"); return NULL; } @@ -173,7 +175,7 @@ static struct request *emc_trespass_get(struct emc_handler *h, /* get request for block layer packet command */ rq = get_failover_req(h, bio, path); if (!rq) { - DMERR("dm-emc: emc_trespass_get: no rq"); + DMERR("emc_trespass_get: no rq"); free_bio(bio); return NULL; } @@ -200,18 +202,18 @@ static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed, * initial state passed into us and then get an update here. */ if (!q) { - DMINFO("dm-emc: emc_pg_init: no queue"); + DMINFO("emc_pg_init: no queue"); goto fail_path; } /* FIXME: The request should be pre-allocated. */ rq = emc_trespass_get(hwh->context, path); if (!rq) { - DMERR("dm-emc: emc_pg_init: no rq"); + DMERR("emc_pg_init: no rq"); goto fail_path; } - DMINFO("dm-emc: emc_pg_init: sending switch-over command"); + DMINFO("emc_pg_init: sending switch-over command"); elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1); return; @@ -241,18 +243,18 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) hr = 0; short_trespass = 0; } else if (argc != 2) { - DMWARN("dm-emc hwhandler: incorrect number of arguments"); + DMWARN("incorrect number of arguments"); return -EINVAL; } else { if ((sscanf(argv[0], "%u", &short_trespass) != 1) || (short_trespass > 1)) { - DMWARN("dm-emc: invalid trespass mode selected"); + DMWARN("invalid trespass mode selected"); return -EINVAL; } if ((sscanf(argv[1], "%u", &hr) != 1) || (hr > 1)) { - DMWARN("dm-emc: invalid honor reservation flag selected"); + DMWARN("invalid honor reservation flag selected"); return -EINVAL; } } @@ -264,14 +266,14 @@ static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv) hwh->context = h; if ((h->short_trespass = short_trespass)) - DMWARN("dm-emc: short trespass command will be send"); + DMWARN("short trespass command will be send"); else - DMWARN("dm-emc: long trespass command will be send"); + DMWARN("long trespass command will be send"); if ((h->hr = hr)) - DMWARN("dm-emc: honor reservation bit will be set"); + DMWARN("honor reservation bit will be set"); else - DMWARN("dm-emc: honor reservation bit will not be set (default)"); + DMWARN("honor reservation bit will not be set (default)"); return 0; } @@ -336,9 +338,9 @@ static int __init dm_emc_init(void) int r = dm_register_hw_handler(&emc_hwh); if (r < 0) - DMERR("emc: register failed %d", r); + DMERR("register failed %d", r); - DMINFO("dm-emc version 0.0.3 loaded"); + DMINFO("version 0.0.3 loaded"); return r; } @@ -348,7 +350,7 @@ static void __exit dm_emc_exit(void) int r = dm_unregister_hw_handler(&emc_hwh); if (r < 0) - DMERR("emc: unregister failed %d", r); + DMERR("unregister failed %d", r); } module_init(dm_emc_init); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index cc07bbebbb1..d12379b5cdb 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -16,6 +16,8 @@ #include <linux/vmalloc.h> #include <linux/slab.h> +#define DM_MSG_PREFIX "snapshots" + /*----------------------------------------------------------------- * Persistent snapshots, by persistent we mean that the snapshot * will 
survive a reboot. @@ -91,7 +93,6 @@ struct pstore { struct dm_snapshot *snap; /* up pointer to my snapshot */ int version; int valid; - uint32_t chunk_size; uint32_t exceptions_per_area; /* @@ -133,7 +134,7 @@ static int alloc_area(struct pstore *ps) int r = -ENOMEM; size_t len; - len = ps->chunk_size << SECTOR_SHIFT; + len = ps->snap->chunk_size << SECTOR_SHIFT; /* * Allocate the chunk_size block of memory that will hold @@ -160,8 +161,8 @@ static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) unsigned long bits; where.bdev = ps->snap->cow->bdev; - where.sector = ps->chunk_size * chunk; - where.count = ps->chunk_size; + where.sector = ps->snap->chunk_size * chunk; + where.count = ps->snap->chunk_size; return dm_io_sync_vm(1, &where, rw, ps->area, &bits); } @@ -188,7 +189,7 @@ static int area_io(struct pstore *ps, uint32_t area, int rw) static int zero_area(struct pstore *ps, uint32_t area) { - memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); return area_io(ps, area, WRITE); } @@ -196,6 +197,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) { int r; struct disk_header *dh; + chunk_t chunk_size; r = chunk_io(ps, 0, READ); if (r) @@ -210,8 +212,29 @@ static int read_header(struct pstore *ps, int *new_snapshot) *new_snapshot = 0; ps->valid = le32_to_cpu(dh->valid); ps->version = le32_to_cpu(dh->version); - ps->chunk_size = le32_to_cpu(dh->chunk_size); - + chunk_size = le32_to_cpu(dh->chunk_size); + if (ps->snap->chunk_size != chunk_size) { + DMWARN("chunk size %llu in device metadata overrides " + "table chunk size of %llu.", + (unsigned long long)chunk_size, + (unsigned long long)ps->snap->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); + free_area(ps); + + ps->snap->chunk_size = chunk_size; + ps->snap->chunk_mask = chunk_size - 1; + ps->snap->chunk_shift = ffs(chunk_size) - 1; + + r = alloc_area(ps); + if (r) + return r; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + } } else { DMWARN("Invalid/corrupt snapshot"); r = -ENXIO; @@ -224,13 +247,13 @@ static int write_header(struct pstore *ps) { struct disk_header *dh; - memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); dh = (struct disk_header *) ps->area; dh->magic = cpu_to_le32(SNAP_MAGIC); dh->valid = cpu_to_le32(ps->valid); dh->version = cpu_to_le32(ps->version); - dh->chunk_size = cpu_to_le32(ps->chunk_size); + dh->chunk_size = cpu_to_le32(ps->snap->chunk_size); return chunk_io(ps, 0, WRITE); } @@ -365,7 +388,7 @@ static void persistent_destroy(struct exception_store *store) { struct pstore *ps = get_info(store); - dm_io_put(sectors_to_pages(ps->chunk_size)); + dm_io_put(sectors_to_pages(ps->snap->chunk_size)); vfree(ps->callbacks); free_area(ps); kfree(ps); @@ -384,6 +407,16 @@ static int persistent_read_metadata(struct exception_store *store) return r; /* + * Now we know correct chunk_size, complete the initialisation. + */ + ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->callbacks = dm_vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + if (!ps->callbacks) + return -ENOMEM; + + /* * Do we need to setup a new snapshot ? 
*/ if (new_snapshot) { @@ -533,9 +566,6 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) ps->snap = store->snap; ps->valid = 1; ps->version = SNAPSHOT_DISK_VERSION; - ps->chunk_size = chunk_size; - ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / - sizeof(struct disk_exception); ps->next_free = 2; /* skipping the header and first area */ ps->current_committed = 0; @@ -543,18 +573,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) if (r) goto bad; - /* - * Allocate space for all the callbacks. - */ ps->callback_count = 0; atomic_set(&ps->pending_count, 0); - ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); - - if (!ps->callbacks) { - r = -ENOMEM; - goto bad; - } + ps->callbacks = NULL; store->destroy = persistent_destroy; store->read_metadata = persistent_read_metadata; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 8edd6435414..3edb3477f98 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. - * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004 - 2006 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -19,6 +19,7 @@ #include <asm/uaccess.h> +#define DM_MSG_PREFIX "ioctl" #define DM_DRIVER_EMAIL "dm-devel@redhat.com" /*----------------------------------------------------------------- @@ -48,7 +49,7 @@ struct vers_iter { static struct list_head _name_buckets[NUM_BUCKETS]; static struct list_head _uuid_buckets[NUM_BUCKETS]; -static void dm_hash_remove_all(void); +static void dm_hash_remove_all(int keep_open_devices); /* * Guards access to both hash tables. @@ -73,7 +74,7 @@ static int dm_hash_init(void) static void dm_hash_exit(void) { - dm_hash_remove_all(); + dm_hash_remove_all(0); devfs_remove(DM_DIR); } @@ -102,8 +103,10 @@ static struct hash_cell *__get_name_cell(const char *str) unsigned int h = hash_str(str); list_for_each_entry (hc, _name_buckets + h, name_list) - if (!strcmp(hc->name, str)) + if (!strcmp(hc->name, str)) { + dm_get(hc->md); return hc; + } return NULL; } @@ -114,8 +117,10 @@ static struct hash_cell *__get_uuid_cell(const char *str) unsigned int h = hash_str(str); list_for_each_entry (hc, _uuid_buckets + h, uuid_list) - if (!strcmp(hc->uuid, str)) + if (!strcmp(hc->uuid, str)) { + dm_get(hc->md); return hc; + } return NULL; } @@ -191,7 +196,7 @@ static int unregister_with_devfs(struct hash_cell *hc) */ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) { - struct hash_cell *cell; + struct hash_cell *cell, *hc; /* * Allocate the new cells. @@ -204,14 +209,19 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi * Insert the cell into both hash tables. 
*/ down_write(&_hash_lock); - if (__get_name_cell(name)) + hc = __get_name_cell(name); + if (hc) { + dm_put(hc->md); goto bad; + } list_add(&cell->name_list, _name_buckets + hash_str(name)); if (uuid) { - if (__get_uuid_cell(uuid)) { + hc = __get_uuid_cell(uuid); + if (hc) { list_del(&cell->name_list); + dm_put(hc->md); goto bad; } list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); @@ -251,19 +261,41 @@ static void __hash_remove(struct hash_cell *hc) free_cell(hc); } -static void dm_hash_remove_all(void) +static void dm_hash_remove_all(int keep_open_devices) { - int i; + int i, dev_skipped, dev_removed; struct hash_cell *hc; struct list_head *tmp, *n; down_write(&_hash_lock); + +retry: + dev_skipped = dev_removed = 0; for (i = 0; i < NUM_BUCKETS; i++) { list_for_each_safe (tmp, n, _name_buckets + i) { hc = list_entry(tmp, struct hash_cell, name_list); + + if (keep_open_devices && + dm_lock_for_deletion(hc->md)) { + dev_skipped++; + continue; + } __hash_remove(hc); + dev_removed = 1; } } + + /* + * Some mapped devices may be using other mapped devices, so if any + * still exist, repeat until we make no further progress. + */ + if (dev_skipped) { + if (dev_removed) + goto retry; + + DMWARN("remove_all left %d open device(s)", dev_skipped); + } + up_write(&_hash_lock); } @@ -289,6 +321,7 @@ static int dm_hash_rename(const char *old, const char *new) if (hc) { DMWARN("asked to rename to an already existing name %s -> %s", old, new); + dm_put(hc->md); up_write(&_hash_lock); kfree(new_name); return -EBUSY; @@ -328,6 +361,7 @@ static int dm_hash_rename(const char *old, const char *new) dm_table_put(table); } + dm_put(hc->md); up_write(&_hash_lock); kfree(old_name); return 0; @@ -344,7 +378,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); static int remove_all(struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(); + dm_hash_remove_all(1); param->data_size = 0; return 0; } @@ -524,7 +558,6 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; - struct block_device *bdev; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); @@ -534,20 +567,12 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); - if (!(param->flags & DM_SKIP_BDGET_FLAG)) { - bdev = bdget_disk(disk, 0); - if (!bdev) - return -ENXIO; - - /* - * Yes, this will be out of date by the time it gets back - * to userland, but it is still very useful for - * debugging. - */ - param->open_count = bdev->bd_openers; - bdput(bdev); - } else - param->open_count = -1; + /* + * Yes, this will be out of date by the time it gets back + * to userland, but it is still very useful for + * debugging. 
+ */ + param->open_count = dm_open_count(md); if (disk->policy) param->flags |= DM_READONLY_FLAG; @@ -567,7 +592,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) static int dev_create(struct dm_ioctl *param, size_t param_size) { - int r; + int r, m = DM_ANY_MINOR; struct mapped_device *md; r = check_name(param->name); @@ -575,10 +600,9 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) return r; if (param->flags & DM_PERSISTENT_DEV_FLAG) - r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md); - else - r = dm_create(&md); + m = MINOR(huge_decode_dev(param->dev)); + r = dm_create(m, &md); if (r) return r; @@ -611,10 +635,8 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) return __get_name_cell(param->name); md = dm_get_md(huge_decode_dev(param->dev)); - if (md) { + if (md) mdptr = dm_get_mdptr(md); - dm_put(md); - } return mdptr; } @@ -628,7 +650,6 @@ static struct mapped_device *find_device(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (hc) { md = hc->md; - dm_get(md); /* * Sneakily write in both the name and the uuid @@ -653,6 +674,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) static int dev_remove(struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; + struct mapped_device *md; + int r; down_write(&_hash_lock); hc = __find_device_hash_cell(param); @@ -663,8 +686,22 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) return -ENXIO; } + md = hc->md; + + /* + * Ensure the device is not open and nothing further can open it. + */ + r = dm_lock_for_deletion(md); + if (r) { + DMWARN("unable to remove open device %s", hc->name); + up_write(&_hash_lock); + dm_put(md); + return r; + } + __hash_remove(hc); up_write(&_hash_lock); + dm_put(md); param->data_size = 0; return 0; } @@ -790,7 +827,6 @@ static int do_resume(struct dm_ioctl *param) } md = hc->md; - dm_get(md); new_map = hc->new_map; hc->new_map = NULL; @@ -1078,6 +1114,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; + struct mapped_device *md; down_write(&_hash_lock); @@ -1096,7 +1133,9 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) param->flags &= ~DM_INACTIVE_PRESENT_FLAG; r = __dev_status(hc->md, param); + md = hc->md; up_write(&_hash_lock); + dm_put(md); return r; } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index daf586c0898..47b3c62bbdb 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -12,6 +12,8 @@ #include <linux/bio.h> #include <linux/slab.h> +#define DM_MSG_PREFIX "linear" + /* * Linear: maps a linear range of a device. 
*/ @@ -29,7 +31,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned long long tmp; if (argc != 2) { - ti->error = "dm-linear: Invalid argument count"; + ti->error = "Invalid argument count"; return -EINVAL; } @@ -111,7 +113,7 @@ int __init dm_linear_init(void) int r = dm_register_target(&linear_target); if (r < 0) - DMERR("linear: register failed %d", r); + DMERR("register failed %d", r); return r; } @@ -121,5 +123,5 @@ void dm_linear_exit(void) int r = dm_unregister_target(&linear_target); if (r < 0) - DMERR("linear: unregister failed %d", r); + DMERR("unregister failed %d", r); } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index d73779a4241..64b764bd02c 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -12,6 +12,8 @@ #include "dm-log.h" #include "dm-io.h" +#define DM_MSG_PREFIX "mirror log" + static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); @@ -155,8 +157,6 @@ struct log_c { struct io_region header_location; struct log_header *disk_header; - - struct io_region bits_location; }; /* @@ -241,43 +241,21 @@ static inline int write_header(struct log_c *log) } /*---------------------------------------------------------------- - * Bits IO - *--------------------------------------------------------------*/ -static int read_bits(struct log_c *log) -{ - int r; - unsigned long ebits; - - r = dm_io_sync_vm(1, &log->bits_location, READ, - log->clean_bits, &ebits); - if (r) - return r; - - return 0; -} - -static int write_bits(struct log_c *log) -{ - unsigned long ebits; - return dm_io_sync_vm(1, &log->bits_location, WRITE, - log->clean_bits, &ebits); -} - -/*---------------------------------------------------------------- * core log constructor/destructor * * argv contains region_size followed optionally by [no]sync *--------------------------------------------------------------*/ #define BYTE_SHIFT 3 -static int core_ctr(struct dirty_log *log, struct dm_target *ti, - unsigned int argc, char **argv) +static int create_log_context(struct dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv, + struct dm_dev *dev) { enum sync sync = DEFAULTSYNC; struct log_c *lc; uint32_t region_size; unsigned int region_count; - size_t bitset_size; + size_t bitset_size, buf_size; if (argc < 1 || argc > 2) { DMWARN("wrong number of arguments to mirror log"); @@ -319,22 +297,53 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti, * Work out how many "unsigned long"s we need to hold the bitset. */ bitset_size = dm_round_up(region_count, - sizeof(unsigned long) << BYTE_SHIFT); + sizeof(*lc->clean_bits) << BYTE_SHIFT); bitset_size >>= BYTE_SHIFT; - lc->bitset_uint32_count = bitset_size / 4; - lc->clean_bits = vmalloc(bitset_size); - if (!lc->clean_bits) { - DMWARN("couldn't allocate clean bitset"); - kfree(lc); - return -ENOMEM; + lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits); + + /* + * Disk log? + */ + if (!dev) { + lc->clean_bits = vmalloc(bitset_size); + if (!lc->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(lc); + return -ENOMEM; + } + lc->disk_header = NULL; + } else { + lc->log_dev = dev; + lc->header_location.bdev = lc->log_dev->bdev; + lc->header_location.sector = 0; + + /* + * Buffer holds both header and bitset. 
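create_log_context() above also replaces the hard-coded bitset sizes with sizeof(*lc->clean_bits), so the arithmetic stays correct if the element type ever changes. The sizing itself is: round the region count up to a whole number of machine words of bits, then shift from bits to bytes. A standalone demo of the same arithmetic; dm_round_up() is paraphrased here:

  #include <stdio.h>

  #define BYTE_SHIFT 3

  static size_t round_up(size_t n, size_t step)   /* dm_round_up() equivalent */
  {
          return (n + step - 1) / step * step;
  }

  int main(void)
  {
          size_t region_count = 1000;
          size_t bitset_size;

          /* whole unsigned longs worth of bits ... */
          bitset_size = round_up(region_count,
                                 sizeof(unsigned long) << BYTE_SHIFT);
          /* ... then bits -> bytes */
          bitset_size >>= BYTE_SHIFT;
          printf("%zu regions need %zu bytes of bitset\n",
                 region_count, bitset_size);      /* 1000 regions -> 128 bytes */
          return 0;
  }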
+ */ + buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + + bitset_size, ti->limits.hardsect_size); + lc->header_location.count = buf_size >> SECTOR_SHIFT; + + lc->disk_header = vmalloc(buf_size); + if (!lc->disk_header) { + DMWARN("couldn't allocate disk log buffer"); + kfree(lc); + return -ENOMEM; + } + + lc->clean_bits = (void *)lc->disk_header + + (LOG_OFFSET << SECTOR_SHIFT); } + memset(lc->clean_bits, -1, bitset_size); lc->sync_bits = vmalloc(bitset_size); if (!lc->sync_bits) { DMWARN("couldn't allocate sync bitset"); - vfree(lc->clean_bits); + if (!dev) + vfree(lc->clean_bits); + vfree(lc->disk_header); kfree(lc); return -ENOMEM; } @@ -345,25 +354,40 @@ static int core_ctr(struct dirty_log *log, struct dm_target *ti, if (!lc->recovering_bits) { DMWARN("couldn't allocate sync bitset"); vfree(lc->sync_bits); - vfree(lc->clean_bits); + if (!dev) + vfree(lc->clean_bits); + vfree(lc->disk_header); kfree(lc); return -ENOMEM; } memset(lc->recovering_bits, 0, bitset_size); lc->sync_search = 0; log->context = lc; + return 0; } -static void core_dtr(struct dirty_log *log) +static int core_ctr(struct dirty_log *log, struct dm_target *ti, + unsigned int argc, char **argv) +{ + return create_log_context(log, ti, argc, argv, NULL); +} + +static void destroy_log_context(struct log_c *lc) { - struct log_c *lc = (struct log_c *) log->context; - vfree(lc->clean_bits); vfree(lc->sync_bits); vfree(lc->recovering_bits); kfree(lc); } +static void core_dtr(struct dirty_log *log) +{ + struct log_c *lc = (struct log_c *) log->context; + + vfree(lc->clean_bits); + destroy_log_context(lc); +} + /*---------------------------------------------------------------- * disk log constructor/destructor * @@ -373,8 +397,6 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti, unsigned int argc, char **argv) { int r; - size_t size; - struct log_c *lc; struct dm_dev *dev; if (argc < 2 || argc > 3) { @@ -387,49 +409,22 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti, if (r) return r; - r = core_ctr(log, ti, argc - 1, argv + 1); + r = create_log_context(log, ti, argc - 1, argv + 1, dev); if (r) { dm_put_device(ti, dev); return r; } - lc = (struct log_c *) log->context; - lc->log_dev = dev; - - /* setup the disk header fields */ - lc->header_location.bdev = lc->log_dev->bdev; - lc->header_location.sector = 0; - lc->header_location.count = 1; - - /* - * We can't read less than this amount, even though we'll - * not be using most of this space. 
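The disk-log rework above is the structural change of this file: instead of a one-sector header plus a separately located bitset (the old bits_location with its own read_bits()/write_bits() I/O), the header and the clean bitset now share one vmalloc'ed buffer and go to disk through a single write_header(). A sketch of the layout computation; LOG_OFFSET and the sector size are stand-in values, not taken from the patch:

  #include <stdio.h>
  #include <stdlib.h>

  #define SECTOR_SHIFT 9
  #define LOG_OFFSET   2          /* header sectors; illustrative value */

  int main(void)
  {
          size_t bitset_size = 128;       /* from the sizing demo above */
          size_t hardsect = 512;          /* ti->limits.hardsect_size */
          size_t buf_size;
          char *disk_header;
          void *clean_bits;

          /* header area plus bitset, rounded up to the hardware sector size */
          buf_size = ((LOG_OFFSET << SECTOR_SHIFT) + bitset_size
                      + hardsect - 1) / hardsect * hardsect;

          disk_header = malloc(buf_size);
          if (!disk_header)
                  return 1;
          /* the bitset starts right after the header area, as in the patch */
          clean_bits = disk_header + (LOG_OFFSET << SECTOR_SHIFT);

          printf("buffer: %zu bytes, bitset at offset %d\n",
                 buf_size, LOG_OFFSET << SECTOR_SHIFT);
          free(disk_header);
          (void)clean_bits;
          return 0;
  }

This is also why the error paths above only vfree(lc->clean_bits) for the core (dev-less) log: in the disk case the bitset is not a separate allocation.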
- */ - lc->disk_header = vmalloc(1 << SECTOR_SHIFT); - if (!lc->disk_header) - goto bad; - - /* setup the disk bitset fields */ - lc->bits_location.bdev = lc->log_dev->bdev; - lc->bits_location.sector = LOG_OFFSET; - - size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t), - 1 << SECTOR_SHIFT); - lc->bits_location.count = size >> SECTOR_SHIFT; return 0; - - bad: - dm_put_device(ti, lc->log_dev); - core_dtr(log); - return -ENOMEM; } static void disk_dtr(struct dirty_log *log) { struct log_c *lc = (struct log_c *) log->context; + dm_put_device(lc->ti, lc->log_dev); vfree(lc->disk_header); - core_dtr(log); + destroy_log_context(lc); } static int count_bits32(uint32_t *addr, unsigned size) @@ -454,12 +449,7 @@ static int disk_resume(struct dirty_log *log) if (r) return r; - /* read the bits */ - r = read_bits(lc); - if (r) - return r; - - /* set or clear any new bits */ + /* set or clear any new bits -- device has grown */ if (lc->sync == NOSYNC) for (i = lc->header.nr_regions; i < lc->region_count; i++) /* FIXME: amazingly inefficient */ @@ -469,15 +459,14 @@ static int disk_resume(struct dirty_log *log) /* FIXME: amazingly inefficient */ log_clear_bit(lc, lc->clean_bits, i); + /* clear any old bits -- device has shrunk */ + for (i = lc->region_count; i % (sizeof(*lc->clean_bits) << BYTE_SHIFT); i++) + log_clear_bit(lc, lc->clean_bits, i); + /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); - /* write the bits */ - r = write_bits(lc); - if (r) - return r; - /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; @@ -518,7 +507,7 @@ static int disk_flush(struct dirty_log *log) if (!lc->touched) return 0; - r = write_bits(lc); + r = write_header(lc); if (!r) lc->touched = 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 1816f30678e..217615b3322 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <asm/atomic.h> +#define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) /* Path properties */ @@ -446,8 +447,6 @@ struct param { char *error; }; -#define ESTR(s) ("dm-multipath: " s) - static int read_param(struct param *param, char *str, unsigned *v, char **error) { if (!str || @@ -495,12 +494,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, unsigned ps_argc; static struct param _params[] = { - {0, 1024, ESTR("invalid number of path selector args")}, + {0, 1024, "invalid number of path selector args"}, }; pst = dm_get_path_selector(shift(as)); if (!pst) { - ti->error = ESTR("unknown path selector type"); + ti->error = "unknown path selector type"; return -EINVAL; } @@ -511,7 +510,7 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, r = pst->create(&pg->ps, ps_argc, as->argv); if (r) { dm_put_path_selector(pst); - ti->error = ESTR("path selector constructor failed"); + ti->error = "path selector constructor failed"; return r; } @@ -529,7 +528,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, /* we need at least a path arg */ if (as->argc < 1) { - ti->error = ESTR("no device given"); + ti->error = "no device given"; return NULL; } @@ -540,7 +539,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, r = dm_get_device(ti, shift(as), ti->begin, ti->len, dm_table_get_mode(ti->table), &p->path.dev); if (r) { - ti->error = ESTR("error getting 
device"); + ti->error = "error getting device"; goto bad; } @@ -562,8 +561,8 @@ static struct priority_group *parse_priority_group(struct arg_set *as, struct dm_target *ti) { static struct param _params[] = { - {1, 1024, ESTR("invalid number of paths")}, - {0, 1024, ESTR("invalid number of selector args")} + {1, 1024, "invalid number of paths"}, + {0, 1024, "invalid number of selector args"} }; int r; @@ -572,13 +571,13 @@ static struct priority_group *parse_priority_group(struct arg_set *as, if (as->argc < 2) { as->argc = 0; - ti->error = ESTR("not enough priority group aruments"); + ti->error = "not enough priority group aruments"; return NULL; } pg = alloc_priority_group(); if (!pg) { - ti->error = ESTR("couldn't allocate priority group"); + ti->error = "couldn't allocate priority group"; return NULL; } pg->m = m; @@ -633,7 +632,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m, unsigned hw_argc; static struct param _params[] = { - {0, 1024, ESTR("invalid number of hardware handler args")}, + {0, 1024, "invalid number of hardware handler args"}, }; r = read_param(_params, shift(as), &hw_argc, &ti->error); @@ -645,14 +644,14 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m, hwht = dm_get_hw_handler(shift(as)); if (!hwht) { - ti->error = ESTR("unknown hardware handler type"); + ti->error = "unknown hardware handler type"; return -EINVAL; } r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv); if (r) { dm_put_hw_handler(hwht); - ti->error = ESTR("hardware handler constructor failed"); + ti->error = "hardware handler constructor failed"; return r; } @@ -669,7 +668,7 @@ static int parse_features(struct arg_set *as, struct multipath *m, unsigned argc; static struct param _params[] = { - {0, 1, ESTR("invalid number of feature args")}, + {0, 1, "invalid number of feature args"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -692,8 +691,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, { /* target parameters */ static struct param _params[] = { - {1, 1024, ESTR("invalid number of priority groups")}, - {1, 1024, ESTR("invalid initial priority group number")}, + {1, 1024, "invalid number of priority groups"}, + {1, 1024, "invalid initial priority group number"}, }; int r; @@ -707,7 +706,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, m = alloc_multipath(); if (!m) { - ti->error = ESTR("can't allocate multipath"); + ti->error = "can't allocate multipath"; return -EINVAL; } @@ -746,7 +745,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, } if (pg_count != m->nr_priority_groups) { - ti->error = ESTR("priority group count mismatch"); + ti->error = "priority group count mismatch"; r = -EINVAL; goto bad; } @@ -807,7 +806,7 @@ static int fail_path(struct pgpath *pgpath) if (!pgpath->path.is_active) goto out; - DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name); + DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); pgpath->path.is_active = 0; @@ -1250,7 +1249,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) r = dm_get_device(ti, argv[1], ti->begin, ti->len, dm_table_get_mode(ti->table), &dev); if (r) { - DMWARN("dm-multipath message: error getting device %s", + DMWARN("message: error getting device %s", argv[1]); return -EINVAL; } @@ -1309,7 +1308,7 @@ static int __init dm_multipath_init(void) return -ENOMEM; } - DMINFO("dm-multipath version %u.%u.%u loaded", + 
DMINFO("version %u.%u.%u loaded", multipath_target.version[0], multipath_target.version[1], multipath_target.version[2]); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d12cf3e5e07..be48cedf986 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -20,6 +20,8 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> +#define DM_MSG_PREFIX "raid1" + static struct workqueue_struct *_kmirrord_wq; static struct work_struct _kmirrord_work; @@ -106,12 +108,42 @@ struct region { struct bio_list delayed_bios; }; + +/*----------------------------------------------------------------- + * Mirror set structures. + *---------------------------------------------------------------*/ +struct mirror { + atomic_t error_count; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + struct region_hash rh; + struct kcopyd_client *kcopyd_client; + + spinlock_t lock; /* protects the next two lists */ + struct bio_list reads; + struct bio_list writes; + + /* recovery */ + region_t nr_regions; + int in_sync; + + struct mirror *default_mirror; /* Default mirror */ + + unsigned int nr_mirrors; + struct mirror mirror[0]; +}; + /* * Conversion fns */ static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) { - return bio->bi_sector >> rh->region_shift; + return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift; } static inline sector_t region_to_sector(struct region_hash *rh, region_t region) @@ -458,11 +490,9 @@ static int __rh_recovery_prepare(struct region_hash *rh) /* Already quiesced ? */ if (atomic_read(®->pending)) list_del_init(®->list); + else + list_move(®->list, &rh->quiesced_regions); - else { - list_del_init(®->list); - list_add(®->list, &rh->quiesced_regions); - } spin_unlock_irq(&rh->region_lock); return 1; @@ -541,35 +571,6 @@ static void rh_start_recovery(struct region_hash *rh) wake(); } -/*----------------------------------------------------------------- - * Mirror set structures. - *---------------------------------------------------------------*/ -struct mirror { - atomic_t error_count; - struct dm_dev *dev; - sector_t offset; -}; - -struct mirror_set { - struct dm_target *ti; - struct list_head list; - struct region_hash rh; - struct kcopyd_client *kcopyd_client; - - spinlock_t lock; /* protects the next two lists */ - struct bio_list reads; - struct bio_list writes; - - /* recovery */ - region_t nr_regions; - int in_sync; - - struct mirror *default_mirror; /* Default mirror */ - - unsigned int nr_mirrors; - struct mirror mirror[0]; -}; - /* * Every mirror should look like this one. 
*/ @@ -603,7 +604,7 @@ static void recovery_complete(int read_err, unsigned int write_err, struct region *reg = (struct region *) context; /* FIXME: better error handling */ - rh_recovery_end(reg, read_err || write_err); + rh_recovery_end(reg, !(read_err || write_err)); } static int recover(struct mirror_set *ms, struct region *reg) @@ -893,7 +894,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms = kmalloc(len, GFP_KERNEL); if (!ms) { - ti->error = "dm-mirror: Cannot allocate mirror context"; + ti->error = "Cannot allocate mirror context"; return NULL; } @@ -907,7 +908,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { - ti->error = "dm-mirror: Error creating dirty region hash"; + ti->error = "Error creating dirty region hash"; kfree(ms); return NULL; } @@ -937,14 +938,14 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, unsigned long long offset; if (sscanf(argv[1], "%llu", &offset) != 1) { - ti->error = "dm-mirror: Invalid offset"; + ti->error = "Invalid offset"; return -EINVAL; } if (dm_get_device(ti, argv[0], offset, ti->len, dm_table_get_mode(ti->table), &ms->mirror[mirror].dev)) { - ti->error = "dm-mirror: Device lookup failure"; + ti->error = "Device lookup failure"; return -ENXIO; } @@ -981,30 +982,30 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti, struct dirty_log *dl; if (argc < 2) { - ti->error = "dm-mirror: Insufficient mirror log arguments"; + ti->error = "Insufficient mirror log arguments"; return NULL; } if (sscanf(argv[1], "%u", ¶m_count) != 1) { - ti->error = "dm-mirror: Invalid mirror log argument count"; + ti->error = "Invalid mirror log argument count"; return NULL; } *args_used = 2 + param_count; if (argc < *args_used) { - ti->error = "dm-mirror: Insufficient mirror log arguments"; + ti->error = "Insufficient mirror log arguments"; return NULL; } dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); if (!dl) { - ti->error = "dm-mirror: Error creating mirror dirty log"; + ti->error = "Error creating mirror dirty log"; return NULL; } if (!_check_region_size(ti, dl->type->get_region_size(dl))) { - ti->error = "dm-mirror: Invalid region size"; + ti->error = "Invalid region size"; dm_destroy_dirty_log(dl); return NULL; } @@ -1038,7 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { - ti->error = "dm-mirror: Invalid number of mirrors"; + ti->error = "Invalid number of mirrors"; dm_destroy_dirty_log(dl); return -EINVAL; } @@ -1046,7 +1047,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv++, argc--; if (argc != nr_mirrors * 2) { - ti->error = "dm-mirror: Wrong number of mirror arguments"; + ti->error = "Wrong number of mirror arguments"; dm_destroy_dirty_log(dl); return -EINVAL; } @@ -1115,7 +1116,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, struct mirror *m; struct mirror_set *ms = ti->private; - map_context->ll = bio->bi_sector >> ms->rh.region_shift; + map_context->ll = bio_to_region(&ms->rh, bio); if (rw == WRITE) { queue_bio(ms, bio, rw); @@ -1221,7 +1222,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 1}, + .version = {1, 0, 2}, .module = THIS_MODULE, .ctr = 
mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index d0024865a78..c5a16c55012 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -14,6 +14,8 @@ #include <linux/slab.h> +#define DM_MSG_PREFIX "multipath round-robin" + /*----------------------------------------------------------------- * Path-handling code, paths are held in lists *---------------------------------------------------------------*/ @@ -191,9 +193,9 @@ static int __init dm_rr_init(void) int r = dm_register_path_selector(&rr_ps); if (r < 0) - DMERR("round-robin: register failed %d", r); + DMERR("register failed %d", r); - DMINFO("dm-round-robin version 1.0.0 loaded"); + DMINFO("version 1.0.0 loaded"); return r; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 08312b46463..8eea0ddbf5e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -23,6 +23,8 @@ #include "dm-bio-list.h" #include "kcopyd.h" +#define DM_MSG_PREFIX "snapshots" + /* * The percentage increment we will wake up users at */ @@ -117,7 +119,7 @@ static int init_origin_hash(void) _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), GFP_KERNEL); if (!_origins) { - DMERR("Device mapper: Snapshot: unable to allocate memory"); + DMERR("unable to allocate memory"); return -ENOMEM; } @@ -412,7 +414,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int blocksize; if (argc < 4) { - ti->error = "dm-snapshot: requires exactly 4 arguments"; + ti->error = "requires exactly 4 arguments"; r = -EINVAL; goto bad1; } @@ -530,7 +532,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = s; - ti->split_io = chunk_size; + ti->split_io = s->chunk_size; return 0; @@ -1127,7 +1129,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_dev *dev; if (argc != 1) { - ti->error = "dm-origin: incorrect number of arguments"; + ti->error = "origin: incorrect number of arguments"; return -EINVAL; } @@ -1204,7 +1206,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 1, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1215,7 +1217,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 1, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -1236,7 +1238,7 @@ static int __init dm_snapshot_init(void) r = dm_register_target(&origin_target); if (r < 0) { - DMERR("Device mapper: Origin: register failed %d\n", r); + DMERR("Origin target register failed %d", r); goto bad1; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 08328a8f5a3..6c29fcecd89 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -12,6 +12,8 @@ #include <linux/bio.h> #include <linux/slab.h> +#define DM_MSG_PREFIX "striped" + struct stripe { struct dm_dev *dev; sector_t physical_start; @@ -78,19 +80,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned int i; if (argc < 2) { - ti->error = "dm-stripe: Not enough arguments"; + ti->error = "Not enough arguments"; return -EINVAL; } stripes = simple_strtoul(argv[0], &end, 10); if (*end) { - ti->error = "dm-stripe: Invalid stripe count"; + ti->error = "Invalid stripe count"; return -EINVAL; } chunk_size = 
simple_strtoul(argv[1], &end, 10); if (*end) { - ti->error = "dm-stripe: Invalid chunk_size"; + ti->error = "Invalid chunk_size"; return -EINVAL; } @@ -99,19 +101,19 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) */ if (!chunk_size || (chunk_size & (chunk_size - 1)) || (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { - ti->error = "dm-stripe: Invalid chunk size"; + ti->error = "Invalid chunk size"; return -EINVAL; } if (ti->len & (chunk_size - 1)) { - ti->error = "dm-stripe: Target length not divisible by " + ti->error = "Target length not divisible by " "chunk size"; return -EINVAL; } width = ti->len; if (sector_div(width, stripes)) { - ti->error = "dm-stripe: Target length not divisible by " + ti->error = "Target length not divisible by " "number of stripes"; return -EINVAL; } @@ -120,14 +122,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) * Do we have enough arguments for that many stripes ? */ if (argc != (2 + 2 * stripes)) { - ti->error = "dm-stripe: Not enough destinations " + ti->error = "Not enough destinations " "specified"; return -EINVAL; } sc = alloc_context(stripes); if (!sc) { - ti->error = "dm-stripe: Memory allocation for striped context " + ti->error = "Memory allocation for striped context " "failed"; return -ENOMEM; } @@ -149,8 +151,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) r = get_stripe(ti, sc, i, argv); if (r < 0) { - ti->error = "dm-stripe: Couldn't parse stripe " - "destination"; + ti->error = "Couldn't parse stripe destination"; while (i--) dm_put_device(ti, sc->stripe[i].dev); kfree(sc); @@ -227,7 +228,7 @@ int __init dm_stripe_init(void) r = dm_register_target(&stripe_target); if (r < 0) - DMWARN("striped target registration failed"); + DMWARN("target registration failed"); return r; } @@ -235,7 +236,7 @@ int __init dm_stripe_init(void) void dm_stripe_exit(void) { if (dm_unregister_target(&stripe_target)) - DMWARN("striped target unregistration failed"); + DMWARN("target unregistration failed"); return; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8f56a54cf0c..75fe9493e6a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -17,6 +17,8 @@ #include <linux/mutex.h> #include <asm/atomic.h> +#define DM_MSG_PREFIX "table" + #define MAX_DEPTH 16 #define NODE_SIZE L1_CACHE_BYTES #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) @@ -237,6 +239,44 @@ int dm_table_create(struct dm_table **result, int mode, return 0; } +int dm_create_error_table(struct dm_table **result, struct mapped_device *md) +{ + struct dm_table *t; + sector_t dev_size = 1; + int r; + + /* + * Find current size of device. + * Default to 1 sector if inactive. 
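The dm-stripe constructor above leans on two classic bit tricks: x && !(x & (x - 1)) is true exactly for powers of two, and once chunk_size is known to be a power of two, len & (chunk_size - 1) tests divisibility without a division. A quick demo:

  #include <stdio.h>

  int main(void)
  {
          unsigned chunk = 64, len = 640;

          printf("%u is a power of two: %s\n", chunk,
                 (chunk && !(chunk & (chunk - 1))) ? "yes" : "no");
          printf("%u divisible by %u: %s\n", len, chunk,
                 (len & (chunk - 1)) ? "no" : "yes");
          return 0;
  }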
+ */ + t = dm_get_table(md); + if (t) { + dev_size = dm_table_get_size(t); + dm_table_put(t); + } + + r = dm_table_create(&t, FMODE_READ, 1, md); + if (r) + return r; + + r = dm_table_add_target(t, "error", 0, dev_size, NULL); + if (r) + goto out; + + r = dm_table_complete(t); + if (r) + goto out; + + *result = t; + +out: + if (r) + dm_table_put(t); + + return r; +} +EXPORT_SYMBOL_GPL(dm_create_error_table); + static void free_devices(struct list_head *devices) { struct list_head *tmp, *next; @@ -590,6 +630,12 @@ int dm_split_args(int *argc, char ***argvp, char *input) unsigned array_size = 0; *argc = 0; + + if (!input) { + *argvp = NULL; + return 0; + } + argv = realloc_argv(&array_size, argv); if (!argv) return -ENOMEM; @@ -671,15 +717,14 @@ int dm_table_add_target(struct dm_table *t, const char *type, memset(tgt, 0, sizeof(*tgt)); if (!len) { - tgt->error = "zero-length target"; - DMERR("%s", tgt->error); + DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } tgt->type = dm_get_target_type(type); if (!tgt->type) { - tgt->error = "unknown target type"; - DMERR("%s", tgt->error); + DMERR("%s: %s: unknown target type", dm_device_name(t->md), + type); return -EINVAL; } @@ -716,7 +761,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, return 0; bad: - DMERR("%s", tgt->error); + DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error); dm_put_target_type(tgt->type); return r; } @@ -802,7 +847,7 @@ sector_t dm_table_get_size(struct dm_table *t) struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) { - if (index > t->num_targets) + if (index >= t->num_targets) return NULL; return t->targets + index; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 64fd8e79ea4..477a041a41c 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -12,6 +12,8 @@ #include <linux/bio.h> #include <linux/slab.h> +#define DM_MSG_PREFIX "target" + struct tt_internal { struct target_type tt; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 51c0639b248..ea569f7348d 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -10,13 +10,15 @@ #include <linux/init.h> #include <linux/bio.h> +#define DM_MSG_PREFIX "zero" + /* * Construct a dummy mapping that only returns zeros */ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) { if (argc != 0) { - ti->error = "dm-zero: No arguments required"; + ti->error = "No arguments required"; return -EINVAL; } @@ -60,7 +62,7 @@ static int __init dm_zero_init(void) int r = dm_register_target(&zero_target); if (r < 0) - DMERR("zero: register failed %d", r); + DMERR("register failed %d", r); return r; } @@ -70,7 +72,7 @@ static void __exit dm_zero_exit(void) int r = dm_unregister_target(&zero_target); if (r < 0) - DMERR("zero: unregister failed %d", r); + DMERR("unregister failed %d", r); } module_init(dm_zero_init) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4d710b7a133..3ed2e53b9eb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
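Easy to miss among the dm-table changes above is a textbook off-by-one fix in dm_table_get_target(): with num_targets entries the valid indices run from 0 through num_targets - 1, so the old test index > t->num_targets let an index equal to num_targets through and read one element past the array. A demo of the boundary case:

  #include <stdio.h>

  int main(void)
  {
          unsigned num_targets = 4;
          unsigned index = 4;     /* first invalid index */

          printf("old check (>) rejects:  %s\n",
                 index > num_targets ? "yes" : "no");
          printf("new check (>=) rejects: %s\n",
                 index >= num_targets ? "yes" : "no");
          return 0;
  }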
*/ @@ -21,11 +21,14 @@ #include <linux/hdreg.h> #include <linux/blktrace_api.h> +#define DM_MSG_PREFIX "core" + static const char *_name = DM_NAME; static unsigned int major = 0; static unsigned int _major = 0; +static DEFINE_SPINLOCK(_minor_lock); /* * One of these is allocated per bio. */ @@ -49,23 +52,28 @@ struct target_io { union map_info *dm_get_mapinfo(struct bio *bio) { - if (bio && bio->bi_private) - return &((struct target_io *)bio->bi_private)->info; - return NULL; + if (bio && bio->bi_private) + return &((struct target_io *)bio->bi_private)->info; + return NULL; } +#define MINOR_ALLOCED ((void *)-1) + /* * Bits for the md->flags field. */ #define DMF_BLOCK_IO 0 #define DMF_SUSPENDED 1 #define DMF_FROZEN 2 +#define DMF_FREEING 3 +#define DMF_DELETING 4 struct mapped_device { struct rw_semaphore io_lock; struct semaphore suspend_lock; rwlock_t map_lock; atomic_t holders; + atomic_t open_count; unsigned long flags; @@ -218,9 +226,25 @@ static int dm_blk_open(struct inode *inode, struct file *file) { struct mapped_device *md; + spin_lock(&_minor_lock); + md = inode->i_bdev->bd_disk->private_data; + if (!md) + goto out; + + if (test_bit(DMF_FREEING, &md->flags) || + test_bit(DMF_DELETING, &md->flags)) { + md = NULL; + goto out; + } + dm_get(md); - return 0; + atomic_inc(&md->open_count); + +out: + spin_unlock(&_minor_lock); + + return md ? 0 : -ENXIO; } static int dm_blk_close(struct inode *inode, struct file *file) @@ -228,10 +252,35 @@ static int dm_blk_close(struct inode *inode, struct file *file) struct mapped_device *md; md = inode->i_bdev->bd_disk->private_data; + atomic_dec(&md->open_count); dm_put(md); return 0; } +int dm_open_count(struct mapped_device *md) +{ + return atomic_read(&md->open_count); +} + +/* + * Guarantees nothing is using the device before it's deleted. + */ +int dm_lock_for_deletion(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (dm_open_count(md)) + r = -EBUSY; + else + set_bit(DMF_DELETING, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -456,8 +505,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, if (r > 0) { /* the bio has been remapped so dispatch it */ - blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector, + blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector, clone->bi_sector); generic_make_request(clone); @@ -744,43 +793,39 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DEFINE_MUTEX(_minor_lock); static DEFINE_IDR(_minor_idr); -static void free_minor(unsigned int minor) +static void free_minor(int minor) { - mutex_lock(&_minor_lock); + spin_lock(&_minor_lock); idr_remove(&_minor_idr, minor); - mutex_unlock(&_minor_lock); + spin_unlock(&_minor_lock); } /* * See if the device with a specific minor # is free. 
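dm_blk_open() and dm_lock_for_deletion() above close a classic check-then-act race: both the open count and the DMF_DELETING decision are made under the same _minor_lock, so a device can never be opened after deletion has been committed to, and deletion fails cleanly while any opener remains. The same protocol with a pthread mutex standing in for the spinlock; the names are invented for the sketch:

  #include <pthread.h>
  #include <stdio.h>

  struct dev {
          pthread_mutex_t lock;
          int open_count;
          int deleting;
  };

  static int dev_open(struct dev *d)
  {
          int r = 0;

          pthread_mutex_lock(&d->lock);
          if (d->deleting)
                  r = -1;                 /* like returning -ENXIO above */
          else
                  d->open_count++;
          pthread_mutex_unlock(&d->lock);
          return r;
  }

  static int dev_lock_for_deletion(struct dev *d)
  {
          int r = 0;

          pthread_mutex_lock(&d->lock);
          if (d->open_count)
                  r = -1;                 /* -EBUSY in the patch */
          else
                  d->deleting = 1;        /* DMF_DELETING */
          pthread_mutex_unlock(&d->lock);
          return r;
  }

  int main(void)
  {
          struct dev d = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

          dev_open(&d);
          printf("delete while open: %d\n", dev_lock_for_deletion(&d));
          return 0;
  }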
*/ -static int specific_minor(struct mapped_device *md, unsigned int minor) +static int specific_minor(struct mapped_device *md, int minor) { int r, m; if (minor >= (1 << MINORBITS)) return -EINVAL; - mutex_lock(&_minor_lock); + r = idr_pre_get(&_minor_idr, GFP_KERNEL); + if (!r) + return -ENOMEM; + + spin_lock(&_minor_lock); if (idr_find(&_minor_idr, minor)) { r = -EBUSY; goto out; } - r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) { - r = -ENOMEM; - goto out; - } - - r = idr_get_new_above(&_minor_idr, md, minor, &m); - if (r) { + r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); + if (r) goto out; - } if (m != minor) { idr_remove(&_minor_idr, m); @@ -789,24 +834,21 @@ static int specific_minor(struct mapped_device *md, unsigned int minor) } out: - mutex_unlock(&_minor_lock); + spin_unlock(&_minor_lock); return r; } -static int next_free_minor(struct mapped_device *md, unsigned int *minor) +static int next_free_minor(struct mapped_device *md, int *minor) { - int r; - unsigned int m; - - mutex_lock(&_minor_lock); + int r, m; r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) { - r = -ENOMEM; - goto out; - } + if (!r) + return -ENOMEM; + + spin_lock(&_minor_lock); - r = idr_get_new(&_minor_idr, md, &m); + r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); if (r) { goto out; } @@ -820,7 +862,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor) *minor = m; out: - mutex_unlock(&_minor_lock); + spin_unlock(&_minor_lock); return r; } @@ -829,18 +871,25 @@ static struct block_device_operations dm_blk_dops; /* * Allocate and initialise a blank device with a given minor. */ -static struct mapped_device *alloc_dev(unsigned int minor, int persistent) +static struct mapped_device *alloc_dev(int minor) { int r; struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + void *old_md; if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; } + if (!try_module_get(THIS_MODULE)) + goto bad0; + /* get a minor number for the dev */ - r = persistent ? 
specific_minor(md, minor) : next_free_minor(md, &minor); + if (minor == DM_ANY_MINOR) + r = next_free_minor(md, &minor); + else + r = specific_minor(md, minor); if (r < 0) goto bad1; @@ -849,6 +898,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) init_MUTEX(&md->suspend_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); + atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); md->queue = blk_alloc_queue(GFP_KERNEL); @@ -875,6 +925,10 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) if (!md->disk) goto bad4; + atomic_set(&md->pending, 0); + init_waitqueue_head(&md->wait); + init_waitqueue_head(&md->eventq); + md->disk->major = _major; md->disk->first_minor = minor; md->disk->fops = &dm_blk_dops; @@ -884,9 +938,12 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - atomic_set(&md->pending, 0); - init_waitqueue_head(&md->wait); - init_waitqueue_head(&md->eventq); + /* Populate the mapping, nobody knows we exist yet */ + spin_lock(&_minor_lock); + old_md = idr_replace(&_minor_idr, md, minor); + spin_unlock(&_minor_lock); + + BUG_ON(old_md != MINOR_ALLOCED); return md; @@ -898,13 +955,15 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) blk_cleanup_queue(md->queue); free_minor(minor); bad1: + module_put(THIS_MODULE); + bad0: kfree(md); return NULL; } static void free_dev(struct mapped_device *md) { - unsigned int minor = md->disk->first_minor; + int minor = md->disk->first_minor; if (md->suspended_bdev) { thaw_bdev(md->suspended_bdev, NULL); @@ -914,8 +973,14 @@ static void free_dev(struct mapped_device *md) mempool_destroy(md->io_pool); del_gendisk(md->disk); free_minor(minor); + + spin_lock(&_minor_lock); + md->disk->private_data = NULL; + spin_unlock(&_minor_lock); + put_disk(md->disk); blk_cleanup_queue(md->queue); + module_put(THIS_MODULE); kfree(md); } @@ -984,12 +1049,11 @@ static void __unbind(struct mapped_device *md) /* * Constructor for a new device. 
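The minor-number rework spread across the hunks above has three coordinated parts: the possibly-sleeping idr_pre_get(GFP_KERNEL) moves outside the new spinlock; the IDR slot is first filled with the MINOR_ALLOCED sentinel rather than the half-built device; and only at the end of alloc_dev() does idr_replace() publish the real pointer. A lookup racing with creation sees the sentinel and treats the slot as empty. A compressed userspace sketch of that life cycle, with the IDR reduced to an array and locking elided:

  #include <stdio.h>

  #define MINOR_ALLOCED ((void *)-1)

  static void *minor_table[16];   /* stands in for _minor_idr */

  static int reserve_minor(int minor)
  {
          /* sleeping allocations happen before "locking" in the patch */
          if (minor_table[minor])
                  return -1;                      /* -EBUSY */
          minor_table[minor] = MINOR_ALLOCED;     /* reserve, don't publish */
          return 0;
  }

  static void *find_md(int minor)
  {
          void *md = minor_table[minor];

          return md == MINOR_ALLOCED ? NULL : md; /* sentinel: not ready */
  }

  int main(void)
  {
          int md = 42;    /* stands in for a mapped_device */

          reserve_minor(3);
          printf("lookup during init: %p\n", find_md(3));
          minor_table[3] = &md;   /* idr_replace() at the end of alloc_dev() */
          printf("lookup after publish: %p\n", find_md(3));
          return 0;
  }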
*/ -static int create_aux(unsigned int minor, int persistent, - struct mapped_device **result) +int dm_create(int minor, struct mapped_device **result) { struct mapped_device *md; - md = alloc_dev(minor, persistent); + md = alloc_dev(minor); if (!md) return -ENXIO; @@ -997,16 +1061,6 @@ static int create_aux(unsigned int minor, int persistent, return 0; } -int dm_create(struct mapped_device **result) -{ - return create_aux(0, 0, result); -} - -int dm_create_with_minor(unsigned int minor, struct mapped_device **result) -{ - return create_aux(minor, 1, result); -} - static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; @@ -1015,13 +1069,18 @@ static struct mapped_device *dm_find_md(dev_t dev) if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) return NULL; - mutex_lock(&_minor_lock); + spin_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); - if (!md || (dm_disk(md)->first_minor != minor)) + if (md && (md == MINOR_ALLOCED || + (dm_disk(md)->first_minor != minor) || + test_bit(DMF_FREEING, &md->flags))) { md = NULL; + goto out; + } - mutex_unlock(&_minor_lock); +out: + spin_unlock(&_minor_lock); return md; } @@ -1051,12 +1110,23 @@ void dm_get(struct mapped_device *md) atomic_inc(&md->holders); } +const char *dm_device_name(struct mapped_device *md) +{ + return md->name; +} +EXPORT_SYMBOL_GPL(dm_device_name); + void dm_put(struct mapped_device *md) { struct dm_table *map; - if (atomic_dec_and_test(&md->holders)) { + BUG_ON(test_bit(DMF_FREEING, &md->flags)); + + if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { map = dm_get_table(md); + idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); if (!dm_suspended(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index fd90bc8f9e4..3c03c0ecab7 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -2,7 +2,7 @@ * Internal header file for device mapper * * Copyright (C) 2001, 2002 Sistina Software - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. * * This file is released under the LGPL. */ @@ -17,9 +17,10 @@ #include <linux/hdreg.h> #define DM_NAME "device-mapper" -#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) -#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) -#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +#define DMERR(f, arg...) printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) +#define DMWARN(f, arg...) printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) +#define DMINFO(f, arg...) printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg) #define DMEMIT(x...) sz += ((sz >= maxlen) ? \ 0 : scnprintf(result + sz, maxlen - sz, x)) @@ -39,83 +40,16 @@ struct dm_dev { }; struct dm_table; -struct mapped_device; - -/*----------------------------------------------------------------- - * Functions for manipulating a struct mapped_device. - * Drop the reference with dm_put when you finish with the object. - *---------------------------------------------------------------*/ -int dm_create(struct mapped_device **md); -int dm_create_with_minor(unsigned int minor, struct mapped_device **md); -void dm_set_mdptr(struct mapped_device *md, void *ptr); -void *dm_get_mdptr(struct mapped_device *md); -struct mapped_device *dm_get_md(dev_t dev); - -/* - * Reference counting for md. 
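dm_put() above is the other half of that scheme: atomic_dec_and_lock() acquires _minor_lock only when the count is about to hit zero, and before the lock is released the IDR slot is re-poisoned with MINOR_ALLOCED and DMF_FREEING is set, so a concurrent dm_find_md() or dm_blk_open() can never resurrect a dying device. Semantically it behaves like this sketch, which (unlike the kernel code) simply always takes the lock:

  #include <pthread.h>

  struct md_ref {
          pthread_mutex_t lock;
          int holders;
          int freeing;            /* DMF_FREEING */
  };

  static void put_md(struct md_ref *m, void **idr_slot)
  {
          pthread_mutex_lock(&m->lock);
          if (--m->holders == 0) {
                  *idr_slot = (void *)-1; /* re-poison: MINOR_ALLOCED */
                  m->freeing = 1;         /* lookups now fail */
                  pthread_mutex_unlock(&m->lock);
                  /* ... presuspend, unbind the table, free the device ... */
                  return;
          }
          pthread_mutex_unlock(&m->lock);
  }

  int main(void)
  {
          struct md_ref m = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };
          void *slot = &m;

          put_md(&m, &slot);
          return m.freeing ? 0 : 1;
  }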
- */ -void dm_get(struct mapped_device *md); -void dm_put(struct mapped_device *md); - -/* - * A device can still be used while suspended, but I/O is deferred. - */ -int dm_suspend(struct mapped_device *md, int with_lockfs); -int dm_resume(struct mapped_device *md); - -/* - * The device must be suspended before calling this method. - */ -int dm_swap_table(struct mapped_device *md, struct dm_table *t); - -/* - * Drop a reference on the table when you've finished with the - * result. - */ -struct dm_table *dm_get_table(struct mapped_device *md); - -/* - * Event functions. - */ -uint32_t dm_get_event_nr(struct mapped_device *md); -int dm_wait_event(struct mapped_device *md, int event_nr); - -/* - * Info functions. - */ -struct gendisk *dm_disk(struct mapped_device *md); -int dm_suspended(struct mapped_device *md); - -/* - * Geometry functions. - */ -int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo); -int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); /*----------------------------------------------------------------- - * Functions for manipulating a table. Tables are also reference - * counted. + * Internal table functions. *---------------------------------------------------------------*/ -int dm_table_create(struct dm_table **result, int mode, - unsigned num_targets, struct mapped_device *md); - -void dm_table_get(struct dm_table *t); -void dm_table_put(struct dm_table *t); - -int dm_table_add_target(struct dm_table *t, const char *type, - sector_t start, sector_t len, char *params); -int dm_table_complete(struct dm_table *t); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); -void dm_table_event(struct dm_table *t); -sector_t dm_table_get_size(struct dm_table *t); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); -unsigned int dm_table_get_num_targets(struct dm_table *t); struct list_head *dm_table_get_devices(struct dm_table *t); -int dm_table_get_mode(struct dm_table *t); -struct mapped_device *dm_table_get_md(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); void dm_table_resume_targets(struct dm_table *t); @@ -133,7 +67,6 @@ void dm_put_target_type(struct target_type *t); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); - /*----------------------------------------------------------------- * Useful inlines. *---------------------------------------------------------------*/ @@ -191,5 +124,7 @@ void dm_stripe_exit(void); void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); union map_info *dm_get_mapinfo(struct bio *bio); +int dm_open_count(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md); #endif diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index 72480a48d88..73ab875fb15 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -314,7 +314,7 @@ static void complete_io(unsigned long error, void *context) if (error) { if (job->rw == WRITE) - job->write_err &= error; + job->write_err |= error; else job->read_err = 1; @@ -460,7 +460,7 @@ static void segment_complete(int read_err, job->read_err = 1; if (write_err) - job->write_err &= write_err; + job->write_err |= write_err; /* * Only dispatch more work if there hasn't been an error. 
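The two kcopyd hunks just above fix a one-character bug with real consequences: write_err is a bitmask of failed mirror destinations, so new errors must be OR-ed in. The old code AND-ed them, which not only failed to record the new error but could erase errors already recorded. A demo:

  #include <stdio.h>

  int main(void)
  {
          unsigned long write_err;

          write_err = 0;
          write_err |= 0x1;       /* destination 0 failed */
          write_err |= 0x4;       /* destination 2 failed */
          printf("with |=: 0x%lx\n", write_err);  /* 0x5: both recorded */

          write_err = 0;
          write_err &= 0x1;       /* old code: mask stays 0 */
          write_err &= 0x4;
          printf("with &=: 0x%lx\n", write_err);  /* 0x0: errors lost */
          return 0;
  }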
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 777585458c8..ff83c9b5979 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -111,7 +111,7 @@ static int linear_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } -static int linear_run (mddev_t *mddev) +static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) { linear_conf_t *conf; dev_info_t **table; @@ -121,20 +121,21 @@ static int linear_run (mddev_t *mddev) sector_t curr_offset; struct list_head *tmp; - conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), + conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), GFP_KERNEL); if (!conf) - goto out; + return NULL; + mddev->private = conf; cnt = 0; - mddev->array_size = 0; + conf->array_size = 0; ITERATE_RDEV(mddev,rdev,tmp) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; - if (j < 0 || j > mddev->raid_disks || disk->rdev) { + if (j < 0 || j > raid_disks || disk->rdev) { printk("linear: disk numbering problem. Aborting!\n"); goto out; } @@ -152,11 +153,11 @@ static int linear_run (mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->size = rdev->size; - mddev->array_size += rdev->size; + conf->array_size += rdev->size; cnt++; } - if (cnt != mddev->raid_disks) { + if (cnt != raid_disks) { printk("linear: not enough drives present. Aborting!\n"); goto out; } @@ -200,7 +201,7 @@ static int linear_run (mddev_t *mddev) unsigned round; unsigned long base; - sz = mddev->array_size >> conf->preshift; + sz = conf->array_size >> conf->preshift; sz += 1; /* force round-up */ base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); @@ -247,14 +248,56 @@ static int linear_run (mddev_t *mddev) BUG_ON(table - conf->hash_table > nb_zone); + return conf; + +out: + kfree(conf); + return NULL; +} + +static int linear_run (mddev_t *mddev) +{ + linear_conf_t *conf; + + conf = linear_conf(mddev, mddev->raid_disks); + + if (!conf) + return 1; + mddev->private = conf; + mddev->array_size = conf->array_size; + blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; mddev->queue->issue_flush_fn = linear_issue_flush; return 0; +} -out: - kfree(conf); - return 1; +static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. 
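The comment above ("This avoids races") is the heart of linear_add(): the array is grown by building a complete new linear_conf_t and publishing it with a single pointer store into mddev->private, while the old configuration stays alive on newconf->prev because requests already in flight may still be walking its hash table. linear_stop() then frees the whole chain, as the next hunk shows. A minimal sketch of the swap-and-chain pattern, with invented types:

  #include <stdlib.h>

  struct conf {
          struct conf *prev;      /* older configs, never freed early */
          int raid_disks;
          /* hash table, device table ... */
  };

  static struct conf *grow(struct conf *cur)
  {
          struct conf *nc = calloc(1, sizeof(*nc));

          if (!nc)
                  return NULL;
          nc->raid_disks = cur->raid_disks + 1;
          nc->prev = cur;         /* old readers stay safe */
          return nc;              /* caller publishes this with one store */
  }

  static void stop(struct conf *c)
  {
          while (c) {             /* free the chain, like linear_stop() */
                  struct conf *t = c->prev;

                  free(c);
                  c = t;
          }
  }

  int main(void)
  {
          struct conf *c = calloc(1, sizeof(*c));

          if (!c)
                  return 1;
          c->raid_disks = 2;
          c = grow(c);
          stop(c);
          return 0;
  }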
+ */ + linear_conf_t *newconf; + + if (rdev->raid_disk != mddev->raid_disks) + return -EINVAL; + + newconf = linear_conf(mddev,mddev->raid_disks+1); + + if (!newconf) + return -ENOMEM; + + newconf->prev = mddev_to_conf(mddev); + mddev->private = newconf; + mddev->raid_disks++; + mddev->array_size = newconf->array_size; + set_capacity(mddev->gendisk, mddev->array_size << 1); + return 0; } static int linear_stop (mddev_t *mddev) @@ -262,8 +305,12 @@ static int linear_stop (mddev_t *mddev) linear_conf_t *conf = mddev_to_conf(mddev); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf->hash_table); - kfree(conf); + do { + linear_conf_t *t = conf->prev; + kfree(conf->hash_table); + kfree(conf); + conf = t; + } while (conf); return 0; } @@ -360,6 +407,7 @@ static struct mdk_personality linear_personality = .run = linear_run, .stop = linear_stop, .status = linear_status, + .hot_add_disk = linear_add, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index f19b874753a..306268ec99f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -44,6 +44,7 @@ #include <linux/suspend.h> #include <linux/poll.h> #include <linux/mutex.h> +#include <linux/ctype.h> #include <linux/init.h> @@ -72,6 +73,10 @@ static void autostart_arrays (int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); +static void md_print_devices(void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. @@ -170,7 +175,7 @@ EXPORT_SYMBOL_GPL(md_new_event); /* Alternate version that can be called from interrupts * when calling sysfs_notify isn't needed. */ -void md_new_event_inintr(mddev_t *mddev) +static void md_new_event_inintr(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -732,6 +737,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) { mdp_disk_t *desc; mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + __u64 ev1 = md_event(sb); rdev->raid_disk = -1; rdev->flags = 0; @@ -748,7 +754,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->size = sb->size; - mddev->events = md_event(sb); + mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; @@ -797,7 +803,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ - __u64 ev1 = md_event(sb); ++ev1; if (ev1 < mddev->events) return -EINVAL; @@ -805,19 +810,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. 
*/ - __u64 ev1 = md_event(sb); if (ev1 < mddev->bitmap->events_cleared) return 0; - } else /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } if (mddev->level != LEVEL_MULTIPATH) { desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) && - desc->raid_disk < mddev->raid_disks) { + else if (desc->state & (1<<MD_DISK_SYNC) /* && + desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; } @@ -1100,6 +1107,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; rdev->flags = 0; @@ -1115,7 +1123,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->size = le64_to_cpu(sb->size)/2; - mddev->events = le64_to_cpu(sb->events); + mddev->events = ev1; mddev->bitmap_offset = 0; mddev->default_bitmap_offset = 1024 >> 9; @@ -1149,7 +1157,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ - __u64 ev1 = le64_to_cpu(sb->events); ++ev1; if (ev1 < mddev->events) return -EINVAL; @@ -1157,12 +1164,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* If adding to array with a bitmap, then we can accept an * older device, but not too old. 
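The version-1 superblock hunks around this point add persistence for partially completed recovery: when a member is mid-rebuild, super_1_sync() (just below) stores its rdev->recovery_offset and sets the MD_FEATURE_RECOVERY_OFFSET feature bit, and super_1_validate() (above) reads it back instead of blindly marking the device In_sync. A compressed sketch of the round trip; the feature-bit value is illustrative and the struct is scaffolding, not the on-disk format:

  #include <stdio.h>

  #define MD_FEATURE_RECOVERY_OFFSET 2    /* illustrative bit value */

  struct sb1 {
          unsigned feature_map;
          unsigned long long recovery_offset;
  };

  int main(void)
  {
          struct sb1 sb = { 0, 0 };
          unsigned long long rdev_offset = 4096;  /* rebuilt this far */
          int in_sync = 0;

          /* sync side: record progress for a not-yet-in-sync member */
          if (!in_sync && rdev_offset > 0) {
                  sb.feature_map |= MD_FEATURE_RECOVERY_OFFSET;
                  sb.recovery_offset = rdev_offset;
          }

          /* validate side: resume instead of assuming the device is in sync */
          if (sb.feature_map & MD_FEATURE_RECOVERY_OFFSET)
                  printf("resume recovery at %llu\n", sb.recovery_offset);
          else
                  printf("device is fully in sync\n");
          return 0;
  }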
*/ - __u64 ev1 = le64_to_cpu(sb->events); if (ev1 < mddev->bitmap->events_cleared) return 0; - } else /* just a hot-add of a new device, leave raid_disk at -1 */ - return 0; - + } else { + if (ev1 < mddev->events) + /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + } if (mddev->level != LEVEL_MULTIPATH) { int role; rdev->desc_nr = le32_to_cpu(sb->dev_number); @@ -1174,7 +1182,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: - set_bit(In_sync, &rdev->flags); + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + else + set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } @@ -1198,6 +1210,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->feature_map = 0; sb->pad0 = 0; + sb->recovery_offset = cpu_to_le64(0); memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); @@ -1218,6 +1231,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset > 0) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); + } + if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); sb->reshape_position = cpu_to_le64(mddev->reshape_position); @@ -1242,11 +1263,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); } - sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ sb->sb_csum = calc_sb_1_csum(sb); } @@ -1507,7 +1529,7 @@ static void print_rdev(mdk_rdev_t *rdev) printk(KERN_INFO "md: no rdev superblock!\n"); } -void md_print_devices(void) +static void md_print_devices(void) { struct list_head *tmp, *tmp2; mdk_rdev_t *rdev; @@ -1536,15 +1558,30 @@ void md_print_devices(void) } -static void sync_sbs(mddev_t * mddev) +static void sync_sbs(mddev_t * mddev, int nospares) { + /* Update each superblock (in-memory image), but + * if we are allowed to, skip spares which already + * have the right event counter, or have one earlier + * (which would mean they aren't being marked as dirty + * with the rest of the array) + */ mdk_rdev_t *rdev; struct list_head *tmp; ITERATE_RDEV(mddev,rdev,tmp) { - super_types[mddev->major_version]. - sync_super(mddev, rdev); - rdev->sb_loaded = 1; + if (rdev->sb_events == mddev->events || + (nospares && + rdev->raid_disk < 0 && + (rdev->sb_events&1)==0 && + rdev->sb_events+1 == mddev->events)) { + /* Don't update this superblock */ + rdev->sb_loaded = 2; + } else { + super_types[mddev->major_version]. 
+ sync_super(mddev, rdev); + rdev->sb_loaded = 1; + } } } @@ -1554,12 +1591,42 @@ void md_update_sb(mddev_t * mddev) struct list_head *tmp; mdk_rdev_t *rdev; int sync_req; + int nospares = 0; repeat: spin_lock_irq(&mddev->write_lock); sync_req = mddev->in_sync; mddev->utime = get_seconds(); - mddev->events ++; + if (mddev->sb_dirty == 3) + /* just a clean<-> dirty transition, possibly leave spares alone, + * though if events isn't the right even/odd, we will have to do + * spares after all + */ + nospares = 1; + + /* If this is just a dirty<->clean transition, and the array is clean + * and 'events' is odd, we can roll back to the previous clean state */ + if (mddev->sb_dirty == 3 + && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->events & 1)) + mddev->events--; + else { + /* otherwise we have to go forward and ... */ + mddev->events ++; + if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ + /* .. if the array isn't clean, insist on an odd 'events' */ + if ((mddev->events&1)==0) { + mddev->events++; + nospares = 0; + } + } else { + /* otherwise insist on an even 'events' (for clean states) */ + if ((mddev->events&1)) { + mddev->events++; + nospares = 0; + } + } + } if (!mddev->events) { /* @@ -1571,7 +1638,7 @@ repeat: mddev->events --; } mddev->sb_dirty = 2; - sync_sbs(mddev); + sync_sbs(mddev, nospares); /* * do not write anything to disk if using @@ -1593,6 +1660,8 @@ repeat: ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) + continue; /* no noise on spare devices */ if (test_bit(Faulty, &rdev->flags)) dprintk("(skipping faulty "); @@ -1604,6 +1673,7 @@ repeat: dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), (unsigned long long)rdev->sb_offset); + rdev->sb_events = mddev->events; } else dprintk(")\n"); @@ -1667,6 +1737,10 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(WriteMostly, &rdev->flags)) { + len += sprintf(page+len, "%swrite_mostly",sep); + sep = ","; + } if (!test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); @@ -1675,8 +1749,40 @@ state_show(mdk_rdev_t *rdev, char *page) return len+sprintf(page+len, "\n"); } +static ssize_t +state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + /* can write + * faulty - simulates and error + * remove - disconnects the device + * writemostly - sets write_mostly + * -writemostly - clears write_mostly + */ + int err = -EINVAL; + if (cmd_match(buf, "faulty") && rdev->mddev->pers) { + md_error(rdev->mddev, rdev); + err = 0; + } else if (cmd_match(buf, "remove")) { + if (rdev->raid_disk >= 0) + err = -EBUSY; + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); + md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } + } else if (cmd_match(buf, "writemostly")) { + set_bit(WriteMostly, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-writemostly")) { + clear_bit(WriteMostly, &rdev->flags); + err = 0; + } + return err ? 
err : len; +} static struct rdev_sysfs_entry -rdev_state = __ATTR_RO(state); +rdev_state = __ATTR(state, 0644, state_show, state_store); static ssize_t super_show(mdk_rdev_t *rdev, char *page) @@ -1873,6 +1979,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->desc_nr = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->sb_events = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); @@ -1978,6 +2085,54 @@ static void analyze_sbs(mddev_t * mddev) } static ssize_t +safe_delay_show(mddev_t *mddev, char *page) +{ + int msec = (mddev->safemode_delay*1000)/HZ; + return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); +} +static ssize_t +safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +{ + int scale=1; + int dot=0; + int i; + unsigned long msec; + char buf[30]; + char *e; + /* remove a period, and count digits after it */ + if (len >= sizeof(buf)) + return -EINVAL; + strlcpy(buf, cbuf, len); + buf[len] = 0; + for (i=0; i<len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = simple_strtoul(buf, &e, 10); + if (e == buf || (*e && *e != '\n')) + return -EINVAL; + msec = (msec * 1000) / scale; + if (msec == 0) + mddev->safemode_delay = 0; + else { + mddev->safemode_delay = (msec*HZ)/1000; + if (mddev->safemode_delay == 0) + mddev->safemode_delay = 1; + } + return len; +} +static struct md_sysfs_entry md_safe_delay = +__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store); + +static ssize_t level_show(mddev_t *mddev, char *page) { struct mdk_personality *p = mddev->pers; @@ -2012,6 +2167,32 @@ level_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_level = __ATTR(level, 0644, level_show, level_store); + +static ssize_t +layout_show(mddev_t *mddev, char *page) +{ + /* just a number, not meaningful for all levels */ + return sprintf(page, "%d\n", mddev->layout); +} + +static ssize_t +layout_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + if (mddev->pers) + return -EBUSY; + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->layout = n; + return len; +} +static struct md_sysfs_entry md_layout = +__ATTR(layout, 0655, layout_show, layout_store); + + static ssize_t raid_disks_show(mddev_t *mddev, char *page) { @@ -2067,6 +2248,200 @@ static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); static ssize_t +resync_start_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); +} + +static ssize_t +resync_start_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* can only set chunk_size if array is not yet active */ + char *e; + unsigned long long n = simple_strtoull(buf, &e, 10); + + if (mddev->pers) + return -EBUSY; + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->recovery_cp = n; + return len; +} +static struct md_sysfs_entry md_resync_start = +__ATTR(resync_start, 0644, resync_start_show, resync_start_store); + +/* + * The array state can be: + * + * clear + * No devices, no size, no level + * Equivalent to STOP_ARRAY ioctl + * inactive + * May have some settings, but array is not active + * all IO results in error + * When written, doesn't tear down array, but just stops it + * suspended (not supported yet) + * All IO requests will 
block. The array can be reconfigured. + * Writing this, if accepted, will block until array is quiescent + * readonly + * no resync can happen. no superblocks get written. + * write requests fail + * read-auto + * like readonly, but behaves like 'clean' on a write request. + * + * clean - no pending writes, but otherwise active. + * When written to inactive array, starts without resync + * If a write request arrives then + * if metadata is known, mark 'dirty' and switch to 'active'. + * if not known, block and switch to write-pending + * If written to an active array that has pending writes, then fails. + * active + * fully active: IO and resync can be happening. + * When written to inactive array, starts with resync + * + * write-pending + * clean, but writes are blocked waiting for 'active' to be written. + * + * active-idle + * like active, but no writes have been seen for a while (100msec). + * + */ +enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, + write_pending, active_idle, bad_word}; +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", + "write-pending", "active-idle", NULL }; + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (cmd_match(word, list[n])) + break; + return n; +} + +static ssize_t +array_state_show(mddev_t *mddev, char *page) +{ + enum array_state st = inactive; + + if (mddev->pers) + switch(mddev->ro) { + case 1: + st = readonly; + break; + case 2: + st = read_auto; + break; + case 0: + if (mddev->in_sync) + st = clean; + else if (mddev->safemode) + st = active_idle; + else + st = active; + } + else { + if (list_empty(&mddev->disks) && + mddev->raid_disks == 0 && + mddev->size == 0) + st = clear; + else + st = inactive; + } + return sprintf(page, "%s\n", array_states[st]); +} + +static int do_md_stop(mddev_t * mddev, int ro); +static int do_md_run(mddev_t * mddev); +static int restart_array(mddev_t *mddev); + +static ssize_t +array_state_store(mddev_t *mddev, const char *buf, size_t len) +{ + int err = -EINVAL; + enum array_state st = match_word(buf, array_states); + switch(st) { + case bad_word: + break; + case clear: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 0); + } + break; + case inactive: + /* stopping an active array */ + if (mddev->pers) { + if (atomic_read(&mddev->active) > 1) + return -EBUSY; + err = do_md_stop(mddev, 2); + } + break; + case suspended: + break; /* not supported yet */ + case readonly: + if (mddev->pers) + err = do_md_stop(mddev, 1); + else { + mddev->ro = 1; + err = do_md_run(mddev); + } + break; + case read_auto: + /* stopping an active array */ + if (mddev->pers) { + err = do_md_stop(mddev, 1); + if (err == 0) + mddev->ro = 2; /* FIXME mark devices writable */ + } else { + mddev->ro = 2; + err = do_md_run(mddev); + } + break; + case clean: + if (mddev->pers) { + restart_array(mddev); + spin_lock_irq(&mddev->write_lock); + if (atomic_read(&mddev->writes_pending) == 0) { + mddev->in_sync = 1; + mddev->sb_dirty = 1; + } + spin_unlock_irq(&mddev->write_lock); + } else { + mddev->ro = 0; + mddev->recovery_cp = MaxSector; + err = do_md_run(mddev); + } + break; + case active: + if (mddev->pers) { + restart_array(mddev); + mddev->sb_dirty = 0; + wake_up(&mddev->sb_wait); + err = 0; + } else { + mddev->ro = 0; + err = do_md_run(mddev); + } + break; + case write_pending: + case active_idle: + /* these cannot be set
*/ + break; + } + if (err) + return err; + else + return len; +} +static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store); + +static ssize_t null_show(mddev_t *mddev, char *page) { return -EINVAL; @@ -2428,11 +2803,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static struct attribute *md_default_attrs[] = { &md_level.attr, + &md_layout.attr, &md_raid_disks.attr, &md_chunk_size.attr, &md_size.attr, + &md_resync_start.attr, &md_metadata.attr, &md_new_device.attr, + &md_safe_delay.attr, + &md_array_state.attr, NULL, }; @@ -2553,8 +2932,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) return NULL; } -void md_wakeup_thread(mdk_thread_t *thread); - static void md_safemode_timeout(unsigned long data) { mddev_t *mddev = (mddev_t *) data; @@ -2708,7 +3085,7 @@ static int do_md_run(mddev_t * mddev) mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; - mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; ITERATE_RDEV(mddev,rdev,tmp) @@ -2736,6 +3113,36 @@ static int do_md_run(mddev_t * mddev) mddev->queue->queuedata = mddev; mddev->queue->make_request_fn = mddev->pers->make_request; + /* If there is a partially-recovered drive we need to + * start recovery here. If we leave it to md_check_recovery, + * it will remove the drives and not do the right thing + */ + if (mddev->degraded) { + struct list_head *rtmp; + int spares = 0; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + /* complete an interrupted recovery */ + spares++; + if (spares && mddev->pers->sync_request) { + mddev->recovery = 0; + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "%s_resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + mddev->recovery = 0; + } else + md_wakeup_thread(mddev->sync_thread); + } + } + mddev->changed = 1; md_new_event(mddev); return 0; @@ -2769,18 +3176,47 @@ static int restart_array(mddev_t *mddev) */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); err = 0; - } else { - printk(KERN_ERR "md: %s has no personality assigned.\n", - mdname(mddev)); + } else err = -EINVAL; - } out: return err; } -static int do_md_stop(mddev_t * mddev, int ro) +/* similar to deny_write_access, but accounts for our holding a reference + * to the file ourselves */ +static int deny_bitmap_write_access(struct file * file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 1) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_set(&inode->i_writecount, -1); + spin_unlock(&inode->i_lock); + + return 0; +} + +static void restore_bitmap_write_access(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + atomic_set(&inode->i_writecount, 1); + spin_unlock(&inode->i_lock); +} + +/* mode: + * 0 - completely stop and dis-assemble array + * 1 - switch to readonly + * 2 - stop but do not disassemble array + */ +static int do_md_stop(mddev_t * mddev, int mode) { int err = 0; struct gendisk *disk = 
mddev->gendisk; @@ -2792,6 +3228,7 @@ static int do_md_stop(mddev_t * mddev, int ro) } if (mddev->sync_thread) { + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; @@ -2801,12 +3238,15 @@ invalidate_partition(disk, 0); - if (ro) { + switch(mode) { + case 1: /* readonly */ err = -ENXIO; if (mddev->ro==1) goto out; mddev->ro = 1; - } else { + break; + case 0: /* disassemble */ + case 2: /* stop */ bitmap_flush(mddev); md_super_wait(mddev); if (mddev->ro) @@ -2821,19 +3261,20 @@ if (mddev->ro) mddev->ro = 0; } - if (!mddev->in_sync) { + if (!mddev->in_sync || mddev->sb_dirty) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev); } - if (ro) + if (mode == 1) set_disk_ro(disk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } /* * Free resources if final stop */ - if (!ro) { + if (mode == 0) { mdk_rdev_t *rdev; struct list_head *tmp; struct gendisk *disk; @@ -2841,7 +3282,7 @@ bitmap_destroy(mddev); if (mddev->bitmap_file) { - atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); mddev->bitmap_file = NULL; } @@ -2857,11 +3298,15 @@ export_array(mddev); mddev->array_size = 0; + mddev->size = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + disk = mddev->gendisk; if (disk) set_capacity(disk, 0); mddev->changed = 1; - } else + } else if (mddev->pers) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; @@ -3264,6 +3709,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (!err && !mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks are for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version].
+ validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) + unbind_rdev_from_array(rdev); + } if (err) export_rdev(rdev); @@ -3434,23 +3890,6 @@ abort_export: return err; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) -{ - struct inode *inode = file->f_mapping->host; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - - return 0; -} - static int set_bitmap_file(mddev_t *mddev, int fd) { int err; @@ -3491,12 +3930,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 1); if (fd >= 0) err = bitmap_create(mddev); - if (fd < 0 || err) + if (fd < 0 || err) { bitmap_destroy(mddev); + fd = -1; /* make sure to put the file */ + } mddev->pers->quiesce(mddev, 0); - } else if (fd < 0) { - if (mddev->bitmap_file) + } + if (fd < 0) { + if (mddev->bitmap_file) { + restore_bitmap_write_access(mddev->bitmap_file); fput(mddev->bitmap_file); + } mddev->bitmap_file = NULL; } @@ -3977,11 +4421,6 @@ static int md_ioctl(struct inode *inode, struct file *file, goto done_unlock; default: - if (_IOC_TYPE(cmd) == MD_MAJOR) - printk(KERN_WARNING "md: %s(pid %d) used" - " obsolete MD ioctl, upgrade your" - " software to use new ictls.\n", - current->comm, current->pid); err = -EINVAL; goto abort_unlock; } @@ -4586,7 +5025,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - mddev->sb_dirty = 1; + mddev->sb_dirty = 3; md_wakeup_thread(mddev->thread); } spin_unlock_irq(&mddev->write_lock); @@ -4599,7 +5038,7 @@ void md_write_end(mddev_t *mddev) if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) md_wakeup_thread(mddev->thread); - else + else if (mddev->safemode_delay) mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); } } @@ -4620,10 +5059,14 @@ void md_do_sync(mddev_t *mddev) struct list_head *tmp; sector_t last_check; int skipped = 0; + struct list_head *rtmp; + mdk_rdev_t *rdev; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; + if (mddev->ro) /* never try to sync a read-only array */ + return; /* we overload curr_resync somewhat here. 
* 0 == not engaged in resync at all @@ -4682,17 +5125,30 @@ void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); + j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; + /* we don't use the checkpoint if there's a bitmap */ + if (!mddev->bitmap && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->size << 1; - else + else { /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; + j = MaxSector; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < j) + j = rdev->recovery_offset; + } printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" @@ -4702,12 +5158,7 @@ void md_do_sync(mddev_t *mddev) speed_max(mddev)); is_mddev_idle(mddev); /* this also initializes IO event counters */ - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap - && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->recovery_cp; - else - j = 0; + io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -4828,15 +5279,28 @@ void md_do_sync(mddev_t *mddev) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 2 && - mddev->curr_resync >= mddev->recovery_cp) { - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - printk(KERN_INFO - "md: checkpointing recovery of %s.\n", - mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; - } else - mddev->recovery_cp = MaxSector; + mddev->curr_resync > 2) { + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + if (mddev->curr_resync >= mddev->recovery_cp) { + printk(KERN_INFO + "md: checkpointing recovery of %s.\n", + mdname(mddev)); + mddev->recovery_cp = mddev->curr_resync; + } + } else + mddev->recovery_cp = MaxSector; + } else { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + ITERATE_RDEV(mddev,rdev,rtmp) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + mddev->sb_dirty = 1; + } } skip: @@ -4908,7 +5372,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; - mddev->sb_dirty = 1; + mddev->sb_dirty = 3; } if (mddev->safemode == 1) mddev->safemode = 0; @@ -4957,6 +5421,8 @@ void md_check_recovery(mddev_t *mddev) clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + goto unlock; /* no recovery is running. * remove any failed drives, then * add spares if possible. 
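The md_do_sync change above selects the resume point 'j' in two distinct ways: a resync resumes from the recovery_cp checkpoint (unless a bitmap, or an explicitly requested check/repair, makes that checkpoint meaningless), while a recovery resumes from the smallest recovery_offset among the partially-recovered devices. A minimal userspace sketch of that selection, for illustration only; struct fake_rdev and both helper functions are invented stand-ins, not kernel interfaces:

    /* sketch: mirrors the j-selection logic in md_do_sync above */
    #include <stdio.h>

    #define MaxSector (~(unsigned long long)0)

    struct fake_rdev {                      /* stand-in for mdk_rdev_t */
        int raid_disk;                      /* -1 if not an active slot */
        int faulty, in_sync;
        unsigned long long recovery_offset; /* sectors already recovered */
    };

    /* resync: use the checkpoint unless a bitmap tracks dirty regions
     * or the user explicitly requested a full check */
    static unsigned long long resync_start(int have_bitmap, int requested,
                                           unsigned long long recovery_cp)
    {
        return (!have_bitmap && !requested) ? recovery_cp : 0;
    }

    /* recovery: resume from the least-recovered active, working device */
    static unsigned long long recovery_start(struct fake_rdev *rdev, int n)
    {
        unsigned long long j = MaxSector;
        int i;

        for (i = 0; i < n; i++)
            if (rdev[i].raid_disk >= 0 && !rdev[i].faulty &&
                !rdev[i].in_sync && rdev[i].recovery_offset < j)
                j = rdev[i].recovery_offset;
        return j;
    }

    int main(void)
    {
        struct fake_rdev r[2] = { { 0, 0, 1, MaxSector }, { 1, 0, 0, 4096 } };

        printf("resync resumes at sector %llu\n", resync_start(0, 0, 8192));
        printf("recovery resumes at sector %llu\n", recovery_start(r, 2));
        return 0;
    }

This is also why the md_check_recovery hunk below sets rdev->recovery_offset = 0 on a freshly hot-added spare: with no progress recorded, the minimum forces recovery to start from sector 0.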
@@ -4979,6 +5445,7 @@ void md_check_recovery(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { + rdev->recovery_offset = 0; if (mddev->pers->hot_add_disk(mddev,rdev)) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -5216,7 +5683,6 @@ EXPORT_SYMBOL(md_write_end); EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_print_devices); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); MODULE_ALIAS("md"); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4070eff6f0f..cead918578a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -374,26 +374,26 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { + if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) reschedule_retry(r1_bio); - goto out; - } - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); + else { + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + safe_put_page(bio->bi_io_vec[i].bv_page); + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); + md_write_end(r1_bio->mddev); + raid_end_bio_io(r1_bio); } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); } - out: + if (to_put) bio_put(to_put); @@ -1625,6 +1625,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + conf->fullsync == 0) { + *skipped = 1; + return max_sector - sector_nr; + } if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ @@ -1888,7 +1894,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; - if (!disk->rdev) { + if (!disk->rdev || + !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1440935414e..7f636283a1b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,6 +29,7 @@ * raid_disks * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) + * far_offset (stored in bit 16 of layout ) * * The data to be stored is divided into chunks using chunksize. * Each device is divided into far_copies sections. @@ -36,10 +37,14 @@ * near_copies copies of each chunk is stored (each on a different drive). * The starting device for each section is offset near_copies from the starting * device of the previous section. 
- * Thus there are (near_copies*far_copies) of each chunk, and each is on a different + * Thus there are (near_copies*far_copies) of each chunk, and each is on a different * drive. * near_copies and far_copies must be at least one, and their product is at most * raid_disks. + * + * If far_offset is true, then the far_copies are handled a bit differently. + * The copies are still in different stripes, but instead of being very far apart + * on disk, they are in adjacent stripes. */ /* @@ -357,8 +362,7 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in * With this layout, a block is never stored twice on the one device. * * raid10_find_phys finds the sector offset of a given virtual sector - * on each device that it is on. If a block isn't on a device, - * that entry in the array is set to MaxSector. + * on each device that it is on. * * raid10_find_virt does the reverse mapping, from a device and a * sector offset to a virtual address @@ -381,6 +385,8 @@ static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) chunk *= conf->near_copies; stripe = chunk; dev = sector_div(stripe, conf->raid_disks); + if (conf->far_offset) + stripe *= conf->far_copies; sector += stripe << conf->chunk_shift; @@ -414,16 +420,24 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) { sector_t offset, chunk, vchunk; - while (sector > conf->stride) { - sector -= conf->stride; - if (dev < conf->near_copies) - dev += conf->raid_disks - conf->near_copies; - else - dev -= conf->near_copies; - } - offset = sector & conf->chunk_mask; - chunk = sector >> conf->chunk_shift; + if (conf->far_offset) { + int fc; + chunk = sector >> conf->chunk_shift; + fc = sector_div(chunk, conf->far_copies); + dev -= fc * conf->near_copies; + if (dev < 0) + dev += conf->raid_disks; + } else { + while (sector > conf->stride) { + sector -= conf->stride; + if (dev < conf->near_copies) + dev += conf->raid_disks - conf->near_copies; + else + dev -= conf->near_copies; + } + chunk = sector >> conf->chunk_shift; + } vchunk = chunk * conf->raid_disks + dev; sector_div(vchunk, conf->near_copies); return (vchunk << conf->chunk_shift) + offset; @@ -900,9 +914,12 @@ static void status(struct seq_file *seq, mddev_t *mddev) seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); if (conf->near_copies > 1) seq_printf(seq, " %d near-copies", conf->near_copies); - if (conf->far_copies > 1) - seq_printf(seq, " %d far-copies", conf->far_copies); - + if (conf->far_copies > 1) { + if (conf->far_offset) + seq_printf(seq, " %d offset-copies", conf->far_copies); + else + seq_printf(seq, " %d far-copies", conf->far_copies); + } seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); for (i = 0; i < conf->raid_disks; i++) @@ -1915,7 +1932,7 @@ static int run(mddev_t *mddev) mirror_info_t *disk; mdk_rdev_t *rdev; struct list_head *tmp; - int nc, fc; + int nc, fc, fo; sector_t stride, size; if (mddev->chunk_size == 0) { @@ -1925,8 +1942,9 @@ nc = mddev->layout & 255; fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 16)) { + (mddev->layout >> 17)) { printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", mdname(mddev), mddev->layout); goto out; @@ -1958,12 +1976,16 @@ conf->near_copies = nc; conf->far_copies = fc; conf->copies = nc*fc; + conf->far_offset = fo; conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; conf->chunk_shift =
ffz(~mddev->chunk_size) - 9; - stride = mddev->size >> (conf->chunk_shift-1); - sector_div(stride, fc); - conf->stride = stride << conf->chunk_shift; - + if (fo) + conf->stride = 1 << conf->chunk_shift; + else { + stride = mddev->size >> (conf->chunk_shift-1); + sector_div(stride, fc); + conf->stride = stride << conf->chunk_shift; + } conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (!conf->r10bio_pool) { @@ -2015,7 +2037,8 @@ static int run(mddev_t *mddev) disk = conf->mirrors + i; - if (!disk->rdev) { + if (!disk->rdev || + !test_bit(In_sync, &rdev->flags)) { disk->head_position = 0; mddev->degraded++; } @@ -2037,7 +2060,13 @@ static int run(mddev_t *mddev) /* * Ok, everything is just fine now */ - size = conf->stride * conf->raid_disks; + if (conf->far_offset) { + size = mddev->size >> (conf->chunk_shift-1); + size *= conf->raid_disks; + size <<= conf->chunk_shift; + sector_div(size, conf->far_copies); + } else + size = conf->stride * conf->raid_disks; sector_div(size, conf->near_copies); mddev->array_size = size/2; mddev->resync_max_sectors = size; @@ -2050,7 +2079,7 @@ static int run(mddev_t *mddev) * maybe... */ { - int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; + int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); stripe /= conf->near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31843604049..f920e50ea12 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2,8 +2,11 @@ * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman * Copyright (C) 1999, 2000 Ingo Molnar + * Copyright (C) 2002, 2003 H. Peter Anvin * - * RAID-5 management functions. + * RAID-4/5/6 management functions. + * Thanks to Penguin Computing for making the RAID-6 development possible + * by donating a test server! * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,11 +22,11 @@ #include <linux/config.h> #include <linux/module.h> #include <linux/slab.h> -#include <linux/raid/raid5.h> #include <linux/highmem.h> #include <linux/bitops.h> #include <linux/kthread.h> #include <asm/atomic.h> +#include "raid6.h" #include <linux/raid/bitmap.h> @@ -68,6 +71,16 @@ #define __inline__ #endif +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +#endif + +static inline int raid6_next_disk(int disk, int raid_disks) +{ + disk++; + return (disk < raid_disks) ? 
disk : 0; +} static void print_raid5_conf (raid5_conf_t *conf); static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) @@ -104,7 +117,7 @@ static void release_stripe(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; unsigned long flags; - + spin_lock_irqsave(&conf->device_lock, flags); __release_stripe(conf, sh); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -117,7 +130,7 @@ static inline void remove_hash(struct stripe_head *sh) hlist_del_init(&sh->hash); } -static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { struct hlist_head *hp = stripe_hash(conf, sh->sector); @@ -190,7 +203,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int (unsigned long long)sh->sector); remove_hash(sh); - + sh->sector = sector; sh->pd_idx = pd_idx; sh->state = 0; @@ -269,8 +282,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (!list_empty(&sh->lru)) - list_del_init(&sh->lru); + if (list_empty(&sh->lru)) + BUG(); + list_del_init(&sh->lru); } } } while (sh == NULL); @@ -321,10 +335,9 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) { + while (num--) if (!grow_one_stripe(conf)) return 1; - } return 0; } @@ -631,8 +644,7 @@ static void raid5_build_block (struct stripe_head *sh, int i) dev->req.bi_private = sh; dev->flags = 0; - if (i != sh->pd_idx) - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i); } static void error(mddev_t *mddev, mdk_rdev_t *rdev) @@ -659,7 +671,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) " Operation continuing on %d devices\n", bdevname(rdev->bdev,b), conf->working_disks); } -} +} /* * Input: a 'big' sector number, @@ -697,9 +709,12 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, /* * Select the parity disk based on the user selected algorithm. 
*/ - if (conf->level == 4) + switch(conf->level) { + case 4: *pd_idx = data_disks; - else switch (conf->algorithm) { + break; + case 5: + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= *pd_idx) @@ -721,6 +736,39 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); + } + break; + case 6: + + /**** FIX THIS ****/ + switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + *pd_idx = raid_disks - 1 - (stripe % raid_disks); + if (*pd_idx == raid_disks-1) + (*dd_idx)++; /* Q D D D P */ + else if (*dd_idx >= *pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + *pd_idx = stripe % raid_disks; + if (*pd_idx == raid_disks-1) + (*dd_idx)++; /* Q D D D P */ + else if (*dd_idx >= *pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + *pd_idx = raid_disks - 1 - (stripe % raid_disks); + *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + *pd_idx = stripe % raid_disks; + *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + break; + default: + printk (KERN_CRIT "raid6: unsupported algorithm %d\n", + conf->algorithm); + } + break; } /* @@ -742,12 +790,17 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) int chunk_number, dummy1, dummy2, dd_idx = i; sector_t r_sector; + chunk_offset = sector_div(new_sector, sectors_per_chunk); stripe = new_sector; BUG_ON(new_sector != stripe); - - switch (conf->algorithm) { + if (i == sh->pd_idx) + return 0; + switch(conf->level) { + case 4: break; + case 5: + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -761,7 +814,37 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) break; default: printk(KERN_ERR "raid5: unsupported algorithm %d\n", + conf->algorithm); + } + break; + case 6: + data_disks = raid_disks - 2; + if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + return 0; /* It is the Q disk */ + switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + case ALGORITHM_RIGHT_ASYMMETRIC: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else if (i > sh->pd_idx) + i -= 2; /* D D P Q D */ + break; + case ALGORITHM_LEFT_SYMMETRIC: + case ALGORITHM_RIGHT_SYMMETRIC: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ + else { + /* D D P Q D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 2); + } + break; + default: + printk (KERN_CRIT "raid6: unsupported algorithm %d\n", conf->algorithm); + } + break; } chunk_number = stripe * data_disks + i; @@ -778,10 +861,11 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) /* - * Copy data between a page in the stripe cache, and a bio. - * There are no alignment or size guarantees between the page or the - * bio except that there is some overlap. - * All iovecs in the bio must be considered. + * Copy data between a page in the stripe cache, and one or more bion + * The page could align with the middle of the bio, or there could be + * several bion, each with several bio_vecs, which cover part of the page + * Multiple bion are linked together on bi_next. There may be extras + * at the end of this list. We ignore them. 
*/ static void copy_data(int frombio, struct bio *bio, struct page *page, @@ -810,7 +894,7 @@ static void copy_data(int frombio, struct bio *bio, if (len > 0 && page_offset + len > STRIPE_SIZE) clen = STRIPE_SIZE - page_offset; else clen = len; - + if (clen > 0) { char *ba = __bio_kmap_atomic(bio, i, KM_USER0); if (frombio) @@ -862,14 +946,14 @@ static void compute_block(struct stripe_head *sh, int dd_idx) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); } -static void compute_parity(struct stripe_head *sh, int method) +static void compute_parity5(struct stripe_head *sh, int method) { raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, disks = sh->disks, count; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; - PRINTK("compute_parity, stripe %llu, method %d\n", + PRINTK("compute_parity5, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); count = 1; @@ -956,9 +1040,195 @@ static void compute_parity(struct stripe_head *sh, int method) clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); } +static void compute_parity6(struct stripe_head *sh, int method) +{ + raid6_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + struct bio *chosen; + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + + qd_idx = raid6_next_disk(pd_idx, disks); + d0_idx = raid6_next_disk(qd_idx, disks); + + PRINTK("compute_parity, stripe %llu, method %d\n", + (unsigned long long)sh->sector, method); + + switch(method) { + case READ_MODIFY_WRITE: + BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ + case RECONSTRUCT_WRITE: + for (i= disks; i-- ;) + if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { + chosen = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + if (sh->dev[i].written) BUG(); + sh->dev[i].written = chosen; + } + break; + case CHECK_PARITY: + BUG(); /* Not implemented yet */ + } + + for (i = disks; i--;) + if (sh->dev[i].written) { + sector_t sector = sh->dev[i].sector; + struct bio *wbi = sh->dev[i].written; + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { + copy_data(1, wbi, sh->dev[i].page, sector); + wbi = r5_next_bio(wbi, sector); + } + + set_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(R5_UPTODATE, &sh->dev[i].flags); + } + +// switch(method) { +// case RECONSTRUCT_WRITE: +// case CHECK_PARITY: +// case UPDATE_PARITY: + /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ + /* FIX: Is this ordering of drives even remotely optimal? 
*/ + count = 0; + i = d0_idx; + do { + ptrs[count++] = page_address(sh->dev[i].page); + if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) + printk("block %d/%d not uptodate on parity calc\n", i,count); + i = raid6_next_disk(i, disks); + } while ( i != d0_idx ); +// break; +// } + + raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); + + switch(method) { + case RECONSTRUCT_WRITE: + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); + break; + case UPDATE_PARITY: + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); + break; + } +} + + +/* Compute one missing block */ +static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) +{ + raid6_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + void *ptr[MAX_XOR_BLOCKS], *p; + int pd_idx = sh->pd_idx; + int qd_idx = raid6_next_disk(pd_idx, disks); + + PRINTK("compute_block_1, stripe %llu, idx %d\n", + (unsigned long long)sh->sector, dd_idx); + + if ( dd_idx == qd_idx ) { + /* We're actually computing the Q drive */ + compute_parity6(sh, UPDATE_PARITY); + } else { + ptr[0] = page_address(sh->dev[dd_idx].page); + if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); + count = 1; + for (i = disks ; i--; ) { + if (i == dd_idx || i == qd_idx) + continue; + p = page_address(sh->dev[i].page); + if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) + ptr[count++] = p; + else + printk("compute_block() %d, stripe %llu, %d" + " not present\n", dd_idx, + (unsigned long long)sh->sector, i); + + check_xor(); + } + if (count != 1) + xor_block(count, STRIPE_SIZE, ptr); + if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + } +} + +/* Compute two missing blocks */ +static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) +{ + raid6_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + int pd_idx = sh->pd_idx; + int qd_idx = raid6_next_disk(pd_idx, disks); + int d0_idx = raid6_next_disk(qd_idx, disks); + int faila, failb; + + /* faila and failb are disk numbers relative to d0_idx */ + /* pd_idx become disks-2 and qd_idx become disks-1 */ + faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; + failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; + + BUG_ON(faila == failb); + if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } + + PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", + (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); + + if ( failb == disks-1 ) { + /* Q disk is one of the missing disks */ + if ( faila == disks-2 ) { + /* Missing P+Q, just recompute */ + compute_parity6(sh, UPDATE_PARITY); + return; + } else { + /* We're missing D+Q; recompute D from P */ + compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); + compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? 
*/ + return; + } + } + + /* We're missing D+P or D+D; build pointer table */ + { + /**** FIX THIS: This could be very bad if disks is close to 256 ****/ + void *ptrs[disks]; + + count = 0; + i = d0_idx; + do { + ptrs[count++] = page_address(sh->dev[i].page); + i = raid6_next_disk(i, disks); + if (i != dd_idx1 && i != dd_idx2 && + !test_bit(R5_UPTODATE, &sh->dev[i].flags)) + printk("compute_2 with missing block %d/%d\n", count, i); + } while ( i != d0_idx ); + + if ( failb == disks-2 ) { + /* We're missing D+P. */ + raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); + } else { + /* We're missing D+D. */ + raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); + } + + /* Both the above update both missing blocks */ + set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); + set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); + } +} + + + /* * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. + * toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) @@ -1031,6 +1301,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in static void end_reshape(raid5_conf_t *conf); +static int page_is_zero(struct page *p) +{ + char *a = page_address(p); + return ((*(u32*)a) == 0 && + memcmp(a, a+4, STRIPE_SIZE-4)==0); +} + static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; @@ -1062,7 +1339,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) * */ -static void handle_stripe(struct stripe_head *sh) +static void handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; @@ -1394,7 +1671,7 @@ static void handle_stripe(struct stripe_head *sh) if (locked == 0 && (rcw == 0 ||rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); - compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); + compute_parity5(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ for (i=disks; i--;) if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { @@ -1421,13 +1698,10 @@ static void handle_stripe(struct stripe_head *sh) !test_bit(STRIPE_INSYNC, &sh->state)) { set_bit(STRIPE_HANDLE, &sh->state); if (failed == 0) { - char *pagea; BUG_ON(uptodate != disks); - compute_parity(sh, CHECK_PARITY); + compute_parity5(sh, CHECK_PARITY); uptodate--; - pagea = page_address(sh->dev[sh->pd_idx].page); - if ((*(u32*)pagea) == 0 && - !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { + if (page_is_zero(sh->dev[sh->pd_idx].page)) { /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); } else { @@ -1487,7 +1761,7 @@ static void handle_stripe(struct stripe_head *sh) /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity(sh, RECONSTRUCT_WRITE); + compute_parity5(sh, RECONSTRUCT_WRITE); for (i= conf->raid_disks; i--;) { set_bit(R5_LOCKED, &sh->dev[i].flags); locked++; @@ -1615,6 +1889,569 @@ static void handle_stripe(struct stripe_head *sh) } } +static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) +{ + raid6_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; + struct bio *return_bi= NULL; + struct bio *bi; + int i; + int syncing; + int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; + int non_overwrite = 0; + int failed_num[2] = {0, 0}; + struct r5dev *dev, *pdev, *qdev; + int pd_idx = sh->pd_idx; + int qd_idx = raid6_next_disk(pd_idx, disks); + int p_failed, q_failed; + + PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", + (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), + pd_idx, qd_idx); + + spin_lock(&sh->lock); + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + + syncing = test_bit(STRIPE_SYNCING, &sh->state); + /* Now to look around and see what can be done */ + + rcu_read_lock(); + for (i=disks; i--; ) { + mdk_rdev_t *rdev; + dev = &sh->dev[i]; + clear_bit(R5_Insync, &dev->flags); + + PRINTK("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { + struct bio *rbi, *rbi2; + PRINTK("Return read for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + rbi = dev->toread; + dev->toread = NULL; + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&conf->wait_for_overlap); + spin_unlock_irq(&conf->device_lock); + while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, dev->sector); + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + } + } + + /* now count some things */ + if (test_bit(R5_LOCKED, &dev->flags)) locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + + + if (dev->toread) to_read++; + if (dev->towrite) { + to_write++; + if (!test_bit(R5_OVERWRITE, &dev->flags)) + non_overwrite++; + } + if (dev->written) written++; + rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || !test_bit(In_sync, &rdev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } 
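+ /* RAID-6 tolerates two failed devices, so unlike the raid5 path we remember up to two failed slot indices here; compute_block_2() and the repair logic below need both */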
+ if (!rdev || !test_bit(In_sync, &rdev->flags) + || test_bit(R5_ReadError, &dev->flags)) { + if ( failed < 2 ) + failed_num[failed] = i; + failed++; + } else + set_bit(R5_Insync, &dev->flags); + } + rcu_read_unlock(); + PRINTK("locked=%d uptodate=%d to_read=%d" + " to_write=%d failed=%d failed_num=%d,%d\n", + locked, uptodate, to_read, to_write, failed, + failed_num[0], failed_num[1]); + /* check if the array has lost >2 devices and, if so, some requests might + * need to be failed + */ + if (failed > 2 && to_read+to_write+written) { + for (i=disks; i--; ) { + int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + mdk_rdev_t *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) + /* multiple read failures in one stripe */ + md_error(conf->mddev, rdev); + rcu_read_unlock(); + } + + spin_lock_irq(&conf->device_lock); + /* fail all writes first */ + bi = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + if (bi) { to_write--; bitmap_end = 1; } + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ + struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + bi->bi_next = return_bi; + return_bi = bi; + } + bi = nextbi; + } + /* and fail all 'written' */ + bi = sh->dev[i].written; + sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; + while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + bi->bi_next = return_bi; + return_bi = bi; + } + bi = bi2; + } + + /* fail any reads if this device is non-operational */ + if (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags)) { + bi = sh->dev[i].toread; + sh->dev[i].toread = NULL; + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + if (bi) to_read--; + while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ + struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + bi->bi_next = return_bi; + return_bi = bi; + } + bi = nextbi; + } + } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + } + } + if (failed > 2 && syncing) { + md_done_sync(conf->mddev, STRIPE_SECTORS,0); + clear_bit(STRIPE_SYNCING, &sh->state); + syncing = 0; + } + + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[pd_idx]; + p_failed = (failed >= 1 && failed_num[0] == pd_idx) + || (failed >= 2 && failed_num[1] == pd_idx); + qdev = &sh->dev[qd_idx]; + q_failed = (failed >= 1 && failed_num[0] == qd_idx) + || (failed >= 2 && failed_num[1] == qd_idx); + + if ( written && + ( p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && test_bit(R5_UPTODATE, &pdev->flags))) ) && + ( q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { + /* any written block on an uptodate or failed drive can be + * returned. 
Note that if we 'wrote' to a failed drive, + * it will be UPTODATE, but never LOCKED, so we don't need + * to test 'failed' directly. + */ + for (i=disks; i--; ) + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags) ) { + /* We can return any write requests */ + int bitmap_end = 0; + struct bio *wbi, *wbi2; + PRINTK("Return write for stripe %llu disc %d\n", + (unsigned long long)sh->sector, i); + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + wbi->bi_next = return_bi; + return_bi = wbi; + } + wbi = wbi2; + } + if (dev->towrite == NULL) + bitmap_end = 1; + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); + } + } + } + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. + */ + if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { + for (i=disks; i--;) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + syncing || + (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || + (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) + ) + ) { + /* we would like to get this block, possibly + * by computing it, but we might not be able to + */ + if (uptodate == disks-1) { + PRINTK("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + compute_block_1(sh, i, 0); + uptodate++; + } else if ( uptodate == disks-2 && failed >= 2 ) { + /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ + int other; + for (other=disks; other--;) { + if ( other == i ) + continue; + if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) + break; + } + BUG_ON(other < 0); + PRINTK("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, i, other); + compute_block_2(sh, i, other); + uptodate += 2; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); +#if 0 + /* if I am just reading this block and we don't have + a failed drive, or any pending writes then sidestep the cache */ + if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && + ! 
syncing && !failed && !to_write) { + sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; + sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; + } +#endif + locked++; + PRINTK("Reading block %d (sync=%d)\n", + i, syncing); + } + } + } + set_bit(STRIPE_HANDLE, &sh->state); + } + + /* now to consider writing and what else, if anything should be read */ + if (to_write) { + int rcw=0, must_compute=0; + for (i=disks ; i--;) { + dev = &sh->dev[i]; + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) + && i != pd_idx && i != qd_idx + && (!test_bit(R5_LOCKED, &dev->flags) +#if 0 + || sh->bh_page[i] != bh->b_page +#endif + ) && + !test_bit(R5_UPTODATE, &dev->flags)) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; + else { + PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); + must_compute++; + } + } + } + PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", + (unsigned long long)sh->sector, rcw, must_compute); + set_bit(STRIPE_HANDLE, &sh->state); + + if (rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i=disks; i--;) { + dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) + && !(failed == 0 && (i == pd_idx || i == qd_idx)) + && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Insync, &dev->flags)) { + if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + { + PRINTK("Read_old stripe %llu block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + locked++; + } else { + PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, we can start a write request */ + if (locked == 0 && rcw == 0 && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { + if ( must_compute > 0 ) { + /* We have failed blocks and need to compute them */ + switch ( failed ) { + case 0: BUG(); + case 1: compute_block_1(sh, failed_num[0], 0); break; + case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; + default: BUG(); /* This request should have been failed? */ + } + } + + PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); + compute_parity6(sh, RECONSTRUCT_WRITE); + /* now every locked buffer is ready to be written */ + for (i=disks; i--;) + if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { + PRINTK("Writing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ + set_bit(STRIPE_INSYNC, &sh->state); + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } + } + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough data + * is available + */ + if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { + int update_p = 0, update_q = 0; + struct r5dev *dev; + + set_bit(STRIPE_HANDLE, &sh->state); + + BUG_ON(failed>2); + BUG_ON(uptodate < disks); + /* Want to check and possibly repair P and Q. 
+ * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ + + /* If !tmp_page, we cannot do the calculations, + * but as we have set STRIPE_HANDLE, we will soon be called + * by stripe_handle with a tmp_page - just wait until then. + */ + if (tmp_page) { + if (failed == q_failed) { + /* The only possible failed device holds 'Q', so it makes + * sense to check P (If anything else were failed, we would + * have used P to recreate it). + */ + compute_block_1(sh, pd_idx, 1); + if (!page_is_zero(sh->dev[pd_idx].page)) { + compute_block_1(sh,pd_idx,0); + update_p = 1; + } + } + if (!q_failed && failed < 2) { + /* q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + memcpy(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE); + compute_parity6(sh, UPDATE_PARITY); + if (memcmp(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE)!= 0) { + clear_bit(STRIPE_INSYNC, &sh->state); + update_q = 1; + } + } + if (update_p || update_q) { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + update_p = update_q = 0; + } + + /* now write out any block on a failed drive, + * or P or Q if they need it + */ + + if (failed == 2) { + dev = &sh->dev[failed_num[1]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (failed >= 1) { + dev = &sh->dev[failed_num[0]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + + if (update_p) { + dev = &sh->dev[pd_idx]; + locked ++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (update_q) { + dev = &sh->dev[qd_idx]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + clear_bit(STRIPE_DEGRADED, &sh->state); + + set_bit(STRIPE_INSYNC, &sh->state); + } + } + + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, STRIPE_SECTORS,1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + /* If the failed drives are just a ReadError, then we might need + * to progress the repair/check process + */ + if (failed <= 2 && ! 
conf->mddev->ro) + for (i=0; i<failed;i++) { + dev = &sh->dev[failed_num[i]]; + if (test_bit(R5_ReadError, &dev->flags) + && !test_bit(R5_LOCKED, &dev->flags) + && test_bit(R5_UPTODATE, &dev->flags) + ) { + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } + } + } + spin_unlock(&sh->lock); + + while ((bi=return_bi)) { + int bytes = bi->bi_size; + + return_bi = bi->bi_next; + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, 0); + } + for (i=disks; i-- ;) { + int rw; + struct bio *bi; + mdk_rdev_t *rdev; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) + rw = 1; + else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = 0; + else + continue; + + bi = &sh->dev[i].req; + + bi->bi_rw = rw; + if (rw) + bi->bi_end_io = raid5_end_write_request; + else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (rdev) { + if (syncing) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + bi->bi_bdev = rdev->bdev; + PRINTK("for %llu schedule op %ld on disc %d\n", + (unsigned long long)sh->sector, bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; + bi->bi_idx = 0; + bi->bi_io_vec = &sh->dev[i].vec; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + generic_make_request(bi); + } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); + PRINTK("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) +{ + if (sh->raid_conf->level == 6) + handle_stripe6(sh, tmp_page); + else + handle_stripe5(sh); +} + + + static void raid5_activate_delayed(raid5_conf_t *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { @@ -1753,7 +2590,7 @@ static int make_request(request_queue_t *q, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); - int disks; + int disks, data_disks; retry: prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); @@ -1781,7 +2618,9 @@ static int make_request(request_queue_t *q, struct bio * bi) } spin_unlock_irq(&conf->device_lock); } - new_sector = raid5_compute_sector(logical_sector, disks, disks - 1, + data_disks = disks - conf->max_degraded; + + new_sector = raid5_compute_sector(logical_sector, disks, data_disks, &dd_idx, &pd_idx, conf); PRINTK("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, @@ -1833,7 +2672,7 @@ static int make_request(request_queue_t *q, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); raid5_plug_device(conf); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -1849,7 +2688,7 @@ static int make_request(request_queue_t *q, 
struct bio * bi) if (remaining == 0) { int bytes = bi->bi_size; - if ( bio_data_dir(bi) == WRITE ) + if ( rw == WRITE ) md_write_end(mddev); bi->bi_size = 0; bi->bi_end_io(bi, bytes, 0); @@ -1857,17 +2696,142 @@ static int make_request(request_queue_t *q, struct bio * bi) return 0; } -/* FIXME go_faster isn't used */ -static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) { + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. + */ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; int pd_idx; sector_t first_sector, last_sector; + int raid_disks; + int data_disks; + int i; + int dd_idx; + sector_t writepos, safepos, gap; + + if (sector_nr == 0 && + conf->expand_progress != 0) { + /* restarting in the middle, skip the initial sectors */ + sector_nr = conf->expand_progress; + sector_div(sector_nr, conf->raid_disks-1); + *skipped = 1; + return sector_nr; + } + + /* we update the metadata when there is more than 3Meg + * in the block range (that is rather arbitrary, should + * probably be time based) or when the data about to be + * copied would over-write the source of the data at + * the front of the range. + * i.e. one new_stripe forward from expand_progress new_maps + * to after where expand_lo old_maps to + */ + writepos = conf->expand_progress + + conf->chunk_size/512*(conf->raid_disks-1); + sector_div(writepos, conf->raid_disks-1); + safepos = conf->expand_lo; + sector_div(safepos, conf->previous_raid_disks-1); + gap = conf->expand_progress - conf->expand_lo; + + if (writepos >= safepos || + gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { + /* Cannot proceed until we've updated the superblock... 
*/ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + spin_lock_irq(&conf->device_lock); + conf->expand_lo = mddev->reshape_position; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_for_overlap); + } + + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + int j; + int skipped = 0; + pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); + sh = get_active_stripe(conf, sector_nr+i, + conf->raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + s = compute_blocknr(sh, j); + if (s < (mddev->array_size<<1)) { + skipped = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + release_stripe(sh); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripe are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes. + */ + raid_disks = conf->previous_raid_disks; + data_disks = raid_disks - 1; + first_sector = + raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + last_sector = + raid5_compute_sector((sector_nr+conf->chunk_size/512) + *(conf->raid_disks-1) -1, + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + if (last_sector >= (mddev->size<<1)) + last_sector = (mddev->size<<1)-1; + while (first_sector <= last_sector) { + pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + sh = get_active_stripe(conf, first_sector, + conf->previous_raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + first_sector += STRIPE_SECTORS; + } + return conf->chunk_size>>9; +} + +/* FIXME go_faster isn't used */ +static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; + int pd_idx; int raid_disks = conf->raid_disks; - int data_disks = raid_disks-1; + int data_disks = raid_disks - conf->max_degraded; sector_t max_sector = mddev->size << 1; int sync_blocks; + int still_degraded = 0; + int i; if (sector_nr >= max_sector) { /* just being told to finish up .. nothing much to do */ @@ -1880,134 +2844,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (mddev->curr_resync < max_sector) /* aborted */ bitmap_end_sync(mddev->bitmap, mddev->curr_resync, &sync_blocks, 1); - else /* compelted sync */ + else /* completed sync */ conf->fullsync = 0; bitmap_close_sync(mddev->bitmap); return 0; } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - /* reshaping is quite different to recovery/resync so it is - * handled quite separately ... here. 
- * - * On each call to sync_request, we gather one chunk worth of - * destination stripes and flag them as expanding. - * Then we find all the source stripes and request reads. - * As the reads complete, handle_stripe will copy the data - * into the destination stripe and release that stripe. - */ - int i; - int dd_idx; - sector_t writepos, safepos, gap; - - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; - sector_div(sector_nr, conf->raid_disks-1); - *skipped = 1; - return sector_nr; - } - - /* we update the metadata when there is more than 3Meg - * in the block range (that is rather arbitrary, should - * probably be time based) or when the data about to be - * copied would over-write the source of the data at - * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to - */ - writepos = conf->expand_progress + - conf->chunk_size/512*(conf->raid_disks-1); - sector_div(writepos, conf->raid_disks-1); - safepos = conf->expand_lo; - sector_div(safepos, conf->previous_raid_disks-1); - gap = conf->expand_progress - conf->expand_lo; - - if (writepos >= safepos || - gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { - /* Cannot proceed until we've updated the superblock... */ - wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; - mddev->sb_dirty = 1; - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || - kthread_should_stop()); - spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; - spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_for_overlap); - } - - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { - int j; - int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); - set_bit(STRIPE_EXPANDING, &sh->state); - atomic_inc(&conf->reshape_stripes); - /* If any of this stripe is beyond the end of the old - * array, then we need to zero those blocks - */ - for (j=sh->disks; j--;) { - sector_t s; - if (j == sh->pd_idx) - continue; - s = compute_blocknr(sh, j); - if (s < (mddev->array_size<<1)) { - skipped = 1; - continue; - } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); - set_bit(R5_Expanded, &sh->dev[j].flags); - set_bit(R5_UPTODATE, &sh->dev[j].flags); - } - if (!skipped) { - set_bit(STRIPE_EXPAND_READY, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - release_stripe(sh); - } - spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); - spin_unlock_irq(&conf->device_lock); - /* Ok, those stripe are ready. We can start scheduling - * reads on the source stripes. - * The source stripes are determined by mapping the first and last - * block on the destination stripes. 
- */ - raid_disks = conf->previous_raid_disks; - data_disks = raid_disks - 1; - first_sector = - raid5_compute_sector(sector_nr*(conf->raid_disks-1), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(conf->raid_disks-1) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = (mddev->size<<1)-1; - while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); - set_bit(STRIPE_EXPAND_SOURCE, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - first_sector += STRIPE_SECTORS; - } - return conf->chunk_size>>9; - } - /* if there is 1 or more failed drives and we are trying + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + /* if there is too many failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. */ - if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (mddev->degraded >= conf->max_degraded && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { sector_t rv = (mddev->size << 1) - sector_nr; *skipped = 1; return rv; @@ -2026,17 +2878,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (sh == NULL) { sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); /* make sure we don't swamp the stripe cache if someone else - * is trying to get access + * is trying to get access */ schedule_timeout_uninterruptible(1); } - bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); - spin_lock(&sh->lock); + /* Need to check if array will still be degraded after recovery/resync + * We don't need to check the 'failed' flag as when that gets set, + * recovery aborts. + */ + for (i=0; i<mddev->raid_disks; i++) + if (conf->disks[i].rdev == NULL) + still_degraded = 1; + + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); + + spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); spin_unlock(&sh->lock); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); return STRIPE_SECTORS; @@ -2091,7 +2952,7 @@ static void raid5d (mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - handle_stripe(sh); + handle_stripe(sh, conf->spare_page); release_stripe(sh); spin_lock_irq(&conf->device_lock); @@ -2181,8 +3042,8 @@ static int run(mddev_t *mddev) struct disk_info *disk; struct list_head *tmp; - if (mddev->level != 5 && mddev->level != 4) { - printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", + if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { + printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", mdname(mddev), mddev->level); return -EIO; } @@ -2251,6 +3112,11 @@ static int run(mddev_t *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + if (mddev->level == 6) { + conf->spare_page = alloc_page(GFP_KERNEL); + if (!conf->spare_page) + goto abort; + } spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -2282,12 +3148,16 @@ static int run(mddev_t *mddev) } /* - * 0 for a fully functional array, 1 for a degraded array. + * 0 for a fully functional array, 1 or 2 for a degraded array. 
*/ mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; conf->mddev = mddev; conf->chunk_size = mddev->chunk_size; conf->level = mddev->level; + if (conf->level == 6) + conf->max_degraded = 2; + else + conf->max_degraded = 1; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; conf->expand_progress = mddev->reshape_position; @@ -2296,6 +3166,11 @@ static int run(mddev_t *mddev) mddev->size &= ~(mddev->chunk_size/1024 -1); mddev->resync_max_sectors = mddev->size << 1; + if (conf->level == 6 && conf->raid_disks < 4) { + printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", + mdname(mddev), conf->raid_disks); + goto abort; + } if (!conf->chunk_size || conf->chunk_size % 4) { printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", conf->chunk_size, mdname(mddev)); @@ -2307,14 +3182,14 @@ static int run(mddev_t *mddev) conf->algorithm, mdname(mddev)); goto abort; } - if (mddev->degraded > 1) { + if (mddev->degraded > conf->max_degraded) { printk(KERN_ERR "raid5: not enough operational devices for %s" " (%d/%d failed)\n", mdname(mddev), conf->failed_disks, conf->raid_disks); goto abort; } - if (mddev->degraded == 1 && + if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING @@ -2379,11 +3254,12 @@ static int run(mddev_t *mddev) } /* read-ahead size must cover two whole stripes, which is - * 2 * (n-1) * chunksize where 'n' is the number of raid devices + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ { - int stripe = (mddev->raid_disks-1) * mddev->chunk_size - / PAGE_SIZE; + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + (mddev->chunk_size / PAGE_SIZE); if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -2393,12 +3269,14 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; - mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); + mddev->array_size = mddev->size * (conf->previous_raid_disks - + conf->max_degraded); return 0; abort: if (conf) { print_raid5_conf(conf); + safe_put_page(conf->spare_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); kfree(conf); @@ -2427,23 +3305,23 @@ static int stop(mddev_t *mddev) } #if RAID5_DEBUG -static void print_sh (struct stripe_head *sh) +static void print_sh (struct seq_file *seq, struct stripe_head *sh) { int i; - printk("sh %llu, pd_idx %d, state %ld.\n", - (unsigned long long)sh->sector, sh->pd_idx, sh->state); - printk("sh %llu, count %d.\n", - (unsigned long long)sh->sector, atomic_read(&sh->count)); - printk("sh %llu, ", (unsigned long long)sh->sector); + seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", + (unsigned long long)sh->sector, sh->pd_idx, sh->state); + seq_printf(seq, "sh %llu, count %d.\n", + (unsigned long long)sh->sector, atomic_read(&sh->count)); + seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); for (i = 0; i < sh->disks; i++) { - printk("(cache%d: %p %ld) ", - i, sh->dev[i].page, sh->dev[i].flags); + seq_printf(seq, "(cache%d: %p %ld) ", + i, sh->dev[i].page, sh->dev[i].flags); } - printk("\n"); + seq_printf(seq, "\n"); } -static void printall (raid5_conf_t *conf) +static void printall (struct seq_file *seq, raid5_conf_t *conf) { struct stripe_head *sh; struct hlist_node *hn; @@ -2454,7 +3332,7 @@ static void printall (raid5_conf_t *conf) hlist_for_each_entry(sh, 
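/* Editor's note: the run() hunks above replace the hard-coded
 * 'raid_disks - 1' with 'raid_disks - max_degraded', so one formula
 * covers RAID-4/5 (one parity device) and RAID-6 (two).  A small
 * illustration of the resulting capacity and read-ahead arithmetic;
 * data_disks() and the constants are hypothetical, and the real page
 * size comes from the kernel, not a macro.
 */
#include <stdio.h>

#define PAGE_BYTES 4096

static long data_disks(long raid_disks, int level)
{
        return raid_disks - (level == 6 ? 2 : 1);   /* max_degraded */
}

int main(void)
{
        long disks = 6, chunk_kb = 64;
        long dd = data_disks(disks, 6);
        /* read-ahead must cover two whole stripes, counted in pages */
        long ra_pages = 2 * dd * (chunk_kb * 1024 / PAGE_BYTES);

        printf("raid6, %ld disks: %ld data disks, ra_pages = %ld\n",
               disks, dd, ra_pages);
        return 0;
}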
hn, &conf->stripe_hashtbl[i], hash) { if (sh->raid_conf != conf) continue; - print_sh(sh); + print_sh(seq, sh); } } spin_unlock_irq(&conf->device_lock); @@ -2474,9 +3352,8 @@ static void status (struct seq_file *seq, mddev_t *mddev) test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); #if RAID5_DEBUG -#define D(x) \ - seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) - printall(conf); + seq_printf (seq, "\n"); + printall(seq, conf); #endif } @@ -2560,14 +3437,20 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int disk; struct disk_info *p; - if (mddev->degraded > 1) + if (mddev->degraded > conf->max_degraded) /* no point adding a device */ return 0; /* - * find the disk ... + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. */ - for (disk=0; disk < conf->raid_disks; disk++) + if (rdev->saved_raid_disk >= 0 && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else + disk = 0; + for ( ; disk < conf->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; @@ -2590,8 +3473,10 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ + raid5_conf_t *conf = mddev_to_conf(mddev); + sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; + mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; set_capacity(mddev->gendisk, mddev->array_size << 1); mddev->changed = 1; if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { @@ -2680,6 +3565,7 @@ static int raid5_start_reshape(mddev_t *mddev) set_bit(In_sync, &rdev->flags); conf->working_disks++; added_devices++; + rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else @@ -2731,6 +3617,17 @@ static void end_reshape(raid5_conf_t *conf) conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); conf->mddev->reshape_position = MaxSector; + + /* read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices + */ + { + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + (conf->mddev->chunk_size / PAGE_SIZE); + if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + } } } @@ -2762,6 +3659,23 @@ static void raid5_quiesce(mddev_t *mddev, int state) } } +static struct mdk_personality raid6_personality = +{ + .name = "raid6", + .level = 6, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = sync_request, + .resize = raid5_resize, + .quiesce = raid5_quiesce, +}; static struct mdk_personality raid5_personality = { .name = "raid5", @@ -2804,6 +3718,12 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { + int e; + + e = raid6_select_algo(); + if ( e ) + return e; + register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); return 0; @@ -2811,6 +3731,7 @@ static int __init raid5_init(void) static void raid5_exit(void) { + 
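/* Editor's note: raid5_init() above now registers the raid4, raid5 and
 * raid6 personalities from a single module, after raid6_select_algo()
 * has picked a syndrome implementation (historically by timing the
 * candidates).  The pick-the-best-implementation idea, reduced to a
 * hypothetical function-pointer table with made-up scores:
 */
#include <stdio.h>

struct algo {
        const char *name;
        long (*bench)(void);            /* higher score = faster */
};

static long bench_generic(void)  { return 100; }
static long bench_unrolled(void) { return 250; }

static const struct algo algos[] = {
        { "generic",  bench_generic },
        { "unrolled", bench_unrolled },
};

static const struct algo *select_algo(void)
{
        const struct algo *best = &algos[0];
        unsigned i;

        for (i = 1; i < sizeof(algos) / sizeof(algos[0]); i++)
                if (algos[i].bench() > best->bench())
                        best = &algos[i];
        printf("using %s\n", best->name);
        return best;
}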
unregister_md_personality(&raid6_personality); unregister_md_personality(&raid5_personality); unregister_md_personality(&raid4_personality); } @@ -2823,3 +3744,10 @@ MODULE_ALIAS("md-raid5"); MODULE_ALIAS("md-raid4"); MODULE_ALIAS("md-level-5"); MODULE_ALIAS("md-level-4"); +MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-raid6"); +MODULE_ALIAS("md-level-6"); + +/* This used to be two separate modules, they were: */ +MODULE_ALIAS("raid5"); +MODULE_ALIAS("raid6"); diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c deleted file mode 100644 index bc69355e010..00000000000 --- a/drivers/md/raid6main.c +++ /dev/null @@ -1,2427 +0,0 @@ -/* - * raid6main.c : Multiple Devices driver for Linux - * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * Copyright (C) 1999, 2000 Ingo Molnar - * Copyright (C) 2002, 2003 H. Peter Anvin - * - * RAID-6 management functions. This code is derived from raid5.c. - * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1). - * - * Thanks to Penguin Computing for making the RAID-6 development possible - * by donating a test server! - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/highmem.h> -#include <linux/bitops.h> -#include <asm/atomic.h> -#include "raid6.h" - -#include <linux/raid/bitmap.h> - -/* - * Stripe cache - */ - -#define NR_STRIPES 256 -#define STRIPE_SIZE PAGE_SIZE -#define STRIPE_SHIFT (PAGE_SHIFT - 9) -#define STRIPE_SECTORS (STRIPE_SIZE>>9) -#define IO_THRESHOLD 1 -#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) -#define HASH_MASK (NR_HASH - 1) - -#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) - -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This macro is used to determine the 'next' bio in the list, given the sector - * of the current stripe+device - */ -#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) -/* - * The following can be used to debug the driver - */ -#define RAID6_DEBUG 0 /* Extremely verbose printk */ -#define RAID6_PARANOIA 1 /* Check spinlocks */ -#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */ -#if RAID6_PARANOIA && defined(CONFIG_SMP) -# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) -#else -# define CHECK_DEVLOCK() -#endif - -#define PRINTK(x...) 
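/* Editor's note: the stripe_hash() macro in the deleted file (raid5.c
 * keeps the same scheme) buckets a stripe by its start sector shifted
 * down by STRIPE_SHIFT and masked into a power-of-two table.  A
 * standalone model with hypothetical *_MODEL names:
 */
#include <stdint.h>

#define STRIPE_SHIFT_MODEL 3            /* PAGE_SHIFT - 9 on 4K pages */
#define NR_HASH_MODEL      512          /* must stay a power of two */
#define HASH_MASK_MODEL    (NR_HASH_MODEL - 1)

static unsigned stripe_hash_bucket(uint64_t sector)
{
        /* consecutive stripes fall into consecutive buckets */
        return (unsigned)((sector >> STRIPE_SHIFT_MODEL) & HASH_MASK_MODEL);
}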
((void)(RAID6_DEBUG && printk(KERN_DEBUG x))) -#if RAID6_DEBUG -#undef inline -#undef __inline__ -#define inline -#define __inline__ -#endif - -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - -static inline int raid6_next_disk(int disk, int raid_disks) -{ - disk++; - return (disk < raid_disks) ? disk : 0; -} - -static void print_raid6_conf (raid6_conf_t *conf); - -static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) -{ - if (atomic_dec_and_test(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - BUG_ON(atomic_read(&conf->active_stripes)==0); - if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) - list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - conf->seq_write == sh->bm_seq) - list_add_tail(&sh->lru, &conf->bitmap_list); - else { - clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); - } - md_wakeup_thread(conf->mddev->thread); - } else { - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - list_add_tail(&sh->lru, &conf->inactive_list); - atomic_dec(&conf->active_stripes); - if (!conf->inactive_blocked || - atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) - wake_up(&conf->wait_for_stripe); - } - } -} -static void release_stripe(struct stripe_head *sh) -{ - raid6_conf_t *conf = sh->raid_conf; - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static inline void remove_hash(struct stripe_head *sh) -{ - PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); - - hlist_del_init(&sh->hash); -} - -static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) -{ - struct hlist_head *hp = stripe_hash(conf, sh->sector); - - PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); - - CHECK_DEVLOCK(); - hlist_add_head(&sh->hash, hp); -} - - -/* find an idle stripe, make sure it is unhashed, and return it. 
*/ -static struct stripe_head *get_free_stripe(raid6_conf_t *conf) -{ - struct stripe_head *sh = NULL; - struct list_head *first; - - CHECK_DEVLOCK(); - if (list_empty(&conf->inactive_list)) - goto out; - first = conf->inactive_list.next; - sh = list_entry(first, struct stripe_head, lru); - list_del_init(first); - remove_hash(sh); - atomic_inc(&conf->active_stripes); -out: - return sh; -} - -static void shrink_buffers(struct stripe_head *sh, int num) -{ - struct page *p; - int i; - - for (i=0; i<num ; i++) { - p = sh->dev[i].page; - if (!p) - continue; - sh->dev[i].page = NULL; - put_page(p); - } -} - -static int grow_buffers(struct stripe_head *sh, int num) -{ - int i; - - for (i=0; i<num; i++) { - struct page *page; - - if (!(page = alloc_page(GFP_KERNEL))) { - return 1; - } - sh->dev[i].page = page; - } - return 0; -} - -static void raid6_build_block (struct stripe_head *sh, int i); - -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) -{ - raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; - - BUG_ON(atomic_read(&sh->count) != 0); - BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - - CHECK_DEVLOCK(); - PRINTK("init_stripe called, stripe %llu\n", - (unsigned long long)sh->sector); - - remove_hash(sh); - - sh->sector = sector; - sh->pd_idx = pd_idx; - sh->state = 0; - - for (i=disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - - if (dev->toread || dev->towrite || dev->written || - test_bit(R5_LOCKED, &dev->flags)) { - PRINTK("sector=%llx i=%d %p %p %p %d\n", - (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, - test_bit(R5_LOCKED, &dev->flags)); - BUG(); - } - dev->flags = 0; - raid6_build_block(sh, i); - } - insert_hash(conf, sh); -} - -static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) -{ - struct stripe_head *sh; - struct hlist_node *hn; - - CHECK_DEVLOCK(); - PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); - hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector) - return sh; - PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); - return NULL; -} - -static void unplug_slaves(mddev_t *mddev); - -static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector, - int pd_idx, int noblock) -{ - struct stripe_head *sh; - - PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); - - spin_lock_irq(&conf->device_lock); - - do { - wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, - conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector); - if (!sh) { - if (!conf->inactive_blocked) - sh = get_free_stripe(conf); - if (noblock && sh == NULL) - break; - if (!sh) { - conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock, - unplug_slaves(conf->mddev); - ); - conf->inactive_blocked = 0; - } else - init_stripe(sh, sector, pd_idx); - } else { - if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - } else { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru)); - list_del_init(&sh->lru); - } - } - } while (sh == NULL); - - if (sh) - atomic_inc(&sh->count); - - spin_unlock_irq(&conf->device_lock); - return sh; -} - -static int grow_one_stripe(raid6_conf_t *conf) -{ - struct stripe_head *sh; - sh = kmem_cache_alloc(conf->slab_cache, 
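/* Editor's note: grow_one_stripe() above allocates the stripe_head
 * from a slab cache and then one page per device; if grow_buffers()
 * fails part-way, shrink_buffers() releases whatever was obtained
 * before the head itself is freed.  The allocate-all-or-roll-back
 * shape, modelled with plain malloc (all names hypothetical):
 */
#include <stdlib.h>

#define NDEV 8

static void *bufs[NDEV];

static void shrink_bufs(int num)
{
        int i;

        for (i = 0; i < num; i++) {
                free(bufs[i]);          /* free(NULL) is a no-op */
                bufs[i] = NULL;
        }
}

static int grow_bufs(int num)
{
        int i;

        for (i = 0; i < num; i++) {
                bufs[i] = malloc(4096);
                if (!bufs[i]) {
                        shrink_bufs(num);   /* undo the partial allocation */
                        return 1;           /* non-zero on failure, as above */
                }
        }
        return 0;
}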
GFP_KERNEL); - if (!sh) - return 0; - memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); - sh->raid_conf = conf; - spin_lock_init(&sh->lock); - - if (grow_buffers(sh, conf->raid_disks)) { - shrink_buffers(sh, conf->raid_disks); - kmem_cache_free(conf->slab_cache, sh); - return 0; - } - /* we just created an active stripe so... */ - atomic_set(&sh->count, 1); - atomic_inc(&conf->active_stripes); - INIT_LIST_HEAD(&sh->lru); - release_stripe(sh); - return 1; -} - -static int grow_stripes(raid6_conf_t *conf, int num) -{ - kmem_cache_t *sc; - int devs = conf->raid_disks; - - sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev)); - - sc = kmem_cache_create(conf->cache_name[0], - sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), - 0, 0, NULL, NULL); - if (!sc) - return 1; - conf->slab_cache = sc; - while (num--) - if (!grow_one_stripe(conf)) - return 1; - return 0; -} - -static int drop_one_stripe(raid6_conf_t *conf) -{ - struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); - if (!sh) - return 0; - BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->raid_disks); - kmem_cache_free(conf->slab_cache, sh); - atomic_dec(&conf->active_stripes); - return 1; -} - -static void shrink_stripes(raid6_conf_t *conf) -{ - while (drop_one_stripe(conf)) - ; - - if (conf->slab_cache) - kmem_cache_destroy(conf->slab_cache); - conf->slab_cache = NULL; -} - -static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done, - int error) -{ - struct stripe_head *sh = bi->bi_private; - raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - - if (bi->bi_size) - return 1; - - for (i=0 ; i<disks; i++) - if (bi == &sh->dev[i].req) - break; - - PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", - (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); - if (i == disks) { - BUG(); - return 0; - } - - if (uptodate) { -#if 0 - struct bio *bio; - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - /* we can return a buffer if we bypassed the cache or - * if the top buffer is not in highmem. If there are - * multiple buffers, leave the extra work to - * handle_stripe - */ - buffer = sh->bh_read[i]; - if (buffer && - (!PageHighMem(buffer->b_page) - || buffer->b_page == bh->b_page ) - ) { - sh->bh_read[i] = buffer->b_reqnext; - buffer->b_reqnext = NULL; - } else - buffer = NULL; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (sh->bh_page[i]==bh->b_page) - set_buffer_uptodate(bh); - if (buffer) { - if (buffer->b_page != bh->b_page) - memcpy(buffer->b_data, bh->b_data, bh->b_size); - buffer->b_end_io(buffer, 1); - } -#else - set_bit(R5_UPTODATE, &sh->dev[i].flags); -#endif - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - printk(KERN_INFO "raid6: read error corrected!!\n"); - clear_bit(R5_ReadError, &sh->dev[i].flags); - clear_bit(R5_ReWrite, &sh->dev[i].flags); - } - if (atomic_read(&conf->disks[i].rdev->read_errors)) - atomic_set(&conf->disks[i].rdev->read_errors, 0); - } else { - int retry = 0; - clear_bit(R5_UPTODATE, &sh->dev[i].flags); - atomic_inc(&conf->disks[i].rdev->read_errors); - if (conf->mddev->degraded) - printk(KERN_WARNING "raid6: read error not correctable.\n"); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) - /* Oh, no!!! 
*/ - printk(KERN_WARNING "raid6: read error NOT corrected!!\n"); - else if (atomic_read(&conf->disks[i].rdev->read_errors) - > conf->max_nr_stripes) - printk(KERN_WARNING - "raid6: Too many read errors, failing device.\n"); - else - retry = 1; - if (retry) - set_bit(R5_ReadError, &sh->dev[i].flags); - else { - clear_bit(R5_ReadError, &sh->dev[i].flags); - clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, conf->disks[i].rdev); - } - } - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); -#if 0 - /* must restore b_page before unlocking buffer... */ - if (sh->bh_page[i] != bh->b_page) { - bh->b_page = sh->bh_page[i]; - bh->b_data = page_address(bh->b_page); - clear_buffer_uptodate(bh); - } -#endif - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - return 0; -} - -static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done, - int error) -{ - struct stripe_head *sh = bi->bi_private; - raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks, i; - unsigned long flags; - int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - - if (bi->bi_size) - return 1; - - for (i=0 ; i<disks; i++) - if (bi == &sh->dev[i].req) - break; - - PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", - (unsigned long long)sh->sector, i, atomic_read(&sh->count), - uptodate); - if (i == disks) { - BUG(); - return 0; - } - - spin_lock_irqsave(&conf->device_lock, flags); - if (!uptodate) - md_error(conf->mddev, conf->disks[i].rdev); - - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); - - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); - return 0; -} - - -static sector_t compute_blocknr(struct stripe_head *sh, int i); - -static void raid6_build_block (struct stripe_head *sh, int i) -{ - struct r5dev *dev = &sh->dev[i]; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks); - - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; - - dev->req.bi_sector = sh->sector; - dev->req.bi_private = sh; - - dev->flags = 0; - if (i != pd_idx && i != qd_idx) - dev->sector = compute_blocknr(sh, i); -} - -static void error(mddev_t *mddev, mdk_rdev_t *rdev) -{ - char b[BDEVNAME_SIZE]; - raid6_conf_t *conf = (raid6_conf_t *) mddev->private; - PRINTK("raid6: error called\n"); - - if (!test_bit(Faulty, &rdev->flags)) { - mddev->sb_dirty = 1; - if (test_bit(In_sync, &rdev->flags)) { - conf->working_disks--; - mddev->degraded++; - conf->failed_disks++; - clear_bit(In_sync, &rdev->flags); - /* - * if recovery was running, make sure it aborts. - */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); - } - set_bit(Faulty, &rdev->flags); - printk (KERN_ALERT - "raid6: Disk failure on %s, disabling device." - " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); - } -} - -/* - * Input: a 'big' sector number, - * Output: index of the data and parity disk, and the sector # in them. 
- */ -static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid6_conf_t *conf) -{ - long stripe; - unsigned long chunk_number; - unsigned int chunk_offset; - sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; - - /* First compute the information on this sector */ - - /* - * Compute the chunk number and the sector offset inside the chunk - */ - chunk_offset = sector_div(r_sector, sectors_per_chunk); - chunk_number = r_sector; - if ( r_sector != chunk_number ) { - printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n", - (unsigned long long)r_sector, (unsigned long)chunk_number); - BUG(); - } - - /* - * Compute the stripe number - */ - stripe = chunk_number / data_disks; - - /* - * Compute the data disk and parity disk indexes inside the stripe - */ - *dd_idx = chunk_number % data_disks; - - /* - * Select the parity disk based on the user selected algorithm. - */ - - /**** FIX THIS ****/ - switch (conf->algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - break; - case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) - (*dd_idx) += 2; /* D D P Q D */ - break; - case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; - break; - case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; - break; - default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); - } - - PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n", - chunk_number, *pd_idx, *dd_idx); - - /* - * Finally, compute the new sector number - */ - new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset; - return new_sector; -} - - -static sector_t compute_blocknr(struct stripe_head *sh, int i) -{ - raid6_conf_t *conf = sh->raid_conf; - int raid_disks = conf->raid_disks, data_disks = raid_disks - 2; - sector_t new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; - sector_t stripe; - int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; - sector_t r_sector; - int i0 = i; - - chunk_offset = sector_div(new_sector, sectors_per_chunk); - stripe = new_sector; - if ( new_sector != stripe ) { - printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n", - (unsigned long long)new_sector, (unsigned long)stripe); - BUG(); - } - - switch (conf->algorithm) { - case ALGORITHM_LEFT_ASYMMETRIC: - case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ - else if (i > sh->pd_idx) - i -= 2; /* D D P Q D */ - break; - case ALGORITHM_LEFT_SYMMETRIC: - case ALGORITHM_RIGHT_SYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ - else { - /* D D P Q D */ - if (i < sh->pd_idx) - i += raid_disks; - i -= (sh->pd_idx + 2); - } - break; - default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); - } - - PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i); - - chunk_number = stripe * data_disks + i; - r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - - check = raid6_compute_sector (r_sector, raid_disks, 
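/* Editor's note: in the layout switch above, RAID-6 keeps Q on the
 * device immediately after P, so the data index must step over both
 * ('Q D D D P' when P sits on the last device, 'D D P Q D' otherwise).
 * A worked model of the LEFT_ASYMMETRIC branch; map_left_asymmetric()
 * is a hypothetical name.  With 5 disks and stripe 0 it yields P on
 * disk 4, Q wrapped onto disk 0, and data on disks 1-3.
 */
static void map_left_asymmetric(long stripe, int raid_disks,
                                int *dd_idx, int *pd_idx)
{
        *pd_idx = raid_disks - 1 - (int)(stripe % raid_disks);
        if (*pd_idx == raid_disks - 1)
                (*dd_idx)++;            /* Q D D D P: Q wrapped to disk 0 */
        else if (*dd_idx >= *pd_idx)
                (*dd_idx) += 2;         /* D D P Q D: skip both P and Q */
}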
data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { - printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n"); - return 0; - } - return r_sector; -} - - - -/* - * Copy data between a page in the stripe cache, and one or more bion - * The page could align with the middle of the bio, or there could be - * several bion, each with several bio_vecs, which cover part of the page - * Multiple bion are linked together on bi_next. There may be extras - * at the end of this list. We ignore them. - */ -static void copy_data(int frombio, struct bio *bio, - struct page *page, - sector_t sector) -{ - char *pa = page_address(page); - struct bio_vec *bvl; - int i; - int page_offset; - - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; - else - page_offset = (signed)(sector - bio->bi_sector) * -512; - bio_for_each_segment(bvl, bio, i) { - int len = bio_iovec_idx(bio,i)->bv_len; - int clen; - int b_offset = 0; - - if (page_offset < 0) { - b_offset = -page_offset; - page_offset += b_offset; - len -= b_offset; - } - - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; - else clen = len; - - if (clen > 0) { - char *ba = __bio_kmap_atomic(bio, i, KM_USER0); - if (frombio) - memcpy(pa+page_offset, ba+b_offset, clen); - else - memcpy(ba+b_offset, pa+page_offset, clen); - __bio_kunmap_atomic(ba, KM_USER0); - } - if (clen < len) /* hit end of page */ - break; - page_offset += len; - } -} - -#define check_xor() do { \ - if (count == MAX_XOR_BLOCKS) { \ - xor_block(count, STRIPE_SIZE, ptr); \ - count = 1; \ - } \ - } while(0) - -/* Compute P and Q syndromes */ -static void compute_parity(struct stripe_head *sh, int method) -{ - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; - struct bio *chosen; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); - - PRINTK("compute_parity, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - switch(method) { - case READ_MODIFY_WRITE: - BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ - case RECONSTRUCT_WRITE: - for (i= disks; i-- ;) - if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - BUG(); /* Not implemented yet */ - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? 
*/ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } - - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); - - switch(method) { - case RECONSTRUCT_WRITE: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); - break; - case UPDATE_PARITY: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - break; - } -} - -/* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) -{ - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; - void *ptr[MAX_XOR_BLOCKS], *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - - PRINTK("compute_block_1, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - if ( dd_idx == qd_idx ) { - /* We're actually computing the Q drive */ - compute_parity(sh, UPDATE_PARITY); - } else { - ptr[0] = page_address(sh->dev[dd_idx].page); - if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); - count = 1; - for (i = disks ; i--; ) { - if (i == dd_idx || i == qd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk("compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); - if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - } -} - -/* Compute two missing blocks */ -static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) -{ - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; - - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; - - BUG_ON(faila == failb); - if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } - - PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); - - if ( failb == disks-1 ) { - /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { - /* Missing P+Q, just recompute */ - compute_parity(sh, UPDATE_PARITY); - return; - } else { - /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); - compute_parity(sh, UPDATE_PARITY); /* Is this necessary? 
*/ - return; - } - } - - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. */ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); - } -} - - -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. - */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) -{ - struct bio **bip; - raid6_conf_t *conf = sh->raid_conf; - int firstwrite=0; - - PRINTK("adding bh b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, - (unsigned long long)sh->sector); - - - spin_lock(&sh->lock); - spin_lock_irq(&conf->device_lock); - if (forwrite) { - bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL && sh->dev[dd_idx].written == NULL) - firstwrite = 1; - } else - bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) - goto overlap; - bip = &(*bip)->bi_next; - } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) - goto overlap; - - BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); - if (*bip) - bi->bi_next = *bip; - *bip = bi; - bi->bi_phys_segments ++; - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); - - PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)bi->bi_sector, - (unsigned long long)sh->sector, dd_idx); - - if (conf->mddev->bitmap && firstwrite) { - sh->bm_seq = conf->seq_write; - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); - set_bit(STRIPE_BIT_DELAY, &sh->state); - } - - if (forwrite) { - /* check if page is covered */ - sector_t sector = sh->dev[dd_idx].sector; - for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); - } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) - set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); - } - return 1; - - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); - return 0; -} - - -static int page_is_zero(struct page *p) -{ - char *a = page_address(p); - return ((*(u32*)a) == 0 && - memcmp(a, a+4, STRIPE_SIZE-4)==0); -} -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe and then examine the state of various bits - * to see what needs to be done. 
- * Possible results: - * return some read request which now have data - * return some write requests which are safely on disc - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - * Parity calculations are done inside the stripe lock - * buffers are taken off read_list or write_list, and bh_cache buffers - * get BH_Lock set before the stripe lock is released. - * - */ - -static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) -{ - raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; - struct bio *return_bi= NULL; - struct bio *bi; - int i; - int syncing; - int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; - int failed_num[2] = {0, 0}; - struct r5dev *dev, *pdev, *qdev; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int p_failed, q_failed; - - PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", - (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), - pd_idx, qd_idx); - - spin_lock(&sh->lock); - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - - syncing = test_bit(STRIPE_SYNCING, &sh->state); - /* Now to look around and see what can be done */ - - rcu_read_lock(); - for (i=disks; i--; ) { - mdk_rdev_t *rdev; - dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); - - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } - } - - /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; - - - if (dev->toread) to_read++; - if (dev->towrite) { - to_write++; - if (!test_bit(R5_OVERWRITE, &dev->flags)) - non_overwrite++; - } - if (dev->written) written++; - rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev || !test_bit(In_sync, &rdev->flags)) { - /* The ReadError flag will just be confusing now */ - clear_bit(R5_ReadError, &dev->flags); - clear_bit(R5_ReWrite, &dev->flags); - } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { - if ( failed < 2 ) - failed_num[failed] = i; - failed++; - } else - set_bit(R5_Insync, &dev->flags); - } - rcu_read_unlock(); - PRINTK("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d,%d\n", - locked, uptodate, to_read, to_write, failed, - failed_num[0], failed_num[1]); - /* check if the array has lost >2 devices and, if so, some requests might - * need to be failed - */ - if (failed > 2 && to_read+to_write+written) { - for (i=disks; i--; ) { - int bitmap_end = 0; - - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && 
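/* Editor's note: page_is_zero() above tests the first 32-bit word and
 * then memcmp()s the page against itself offset by four bytes; a zero
 * head plus a self-repeating tail proves the whole page is zero with
 * no second buffer.  The same overlap trick, byte-granular and
 * portable (buf_is_zero() is a hypothetical name):
 */
#include <string.h>

static int buf_is_zero(const char *a, size_t len)
{
        /* memcmp only reads, so the overlapping ranges are safe */
        return len > 0 && a[0] == 0 && memcmp(a, a + 1, len - 1) == 0;
}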
test_bit(In_sync, &rdev->flags)) - /* multiple read failures in one stripe */ - md_error(conf->mddev, rdev); - rcu_read_unlock(); - } - - spin_lock_irq(&conf->device_lock); - /* fail all writes first */ - bi = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - if (bi) { to_write--; bitmap_end = 1; } - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - /* and fail all 'written' */ - bi = sh->dev[i].written; - sh->dev[i].written = NULL; - if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = bi2; - } - - /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags) || - test_bit(R5_ReadError, &sh->dev[i].flags)) { - bi = sh->dev[i].toread; - sh->dev[i].toread = NULL; - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - if (bi) to_read--; - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - } - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); - } - } - if (failed > 2 && syncing) { - md_done_sync(conf->mddev, STRIPE_SECTORS,0); - clear_bit(STRIPE_SYNCING, &sh->state); - syncing = 0; - } - - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[pd_idx]; - p_failed = (failed >= 1 && failed_num[0] == pd_idx) - || (failed >= 2 && failed_num[1] == pd_idx); - qdev = &sh->dev[qd_idx]; - q_failed = (failed >= 1 && failed_num[0] == qd_idx) - || (failed >= 2 && failed_num[1] == qd_idx); - - if ( written && - ( p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags))) ) && - ( q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { - /* any written block on an uptodate or failed drive can be - * returned. Note that if we 'wrote' to a failed drive, - * it will be UPTODATE, but never LOCKED, so we don't need - * to test 'failed' directly. 
- */ - for (i=disks; i--; ) - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags) ) { - /* We can return any write requests */ - int bitmap_end = 0; - struct bio *wbi, *wbi2; - PRINTK("Return write for stripe %llu disc %d\n", - (unsigned long long)sh->sector, i); - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; - while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - wbi->bi_next = return_bi; - return_bi = wbi; - } - wbi = wbi2; - } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } - } - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - syncing || - (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || - (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) - ) - ) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if (uptodate == disks-1) { - PRINTK("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - uptodate++; - } else if ( uptodate == disks-2 && failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ - int other; - for (other=disks; other--;) { - if ( other == i ) - continue; - if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) - break; - } - BUG_ON(other < 0); - PRINTK("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, i, other); - compute_block_2(sh, i, other); - uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); -#if 0 - /* if I am just reading this block and we don't have - a failed drive, or any pending writes then sidestep the cache */ - if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && - ! 
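/* Editor's note: the read-scheduling block above only computes a wanted
 * block instead of reading it when enough peers are already up to date:
 * one missing block is a single XOR pass, two missing blocks take the
 * far costlier P+Q recovery, and anything else goes to disk.  The
 * decision, reduced to a hypothetical helper:
 */
enum fetch { FETCH_COMPUTE_1, FETCH_COMPUTE_2, FETCH_READ };

static enum fetch choose_fetch(int uptodate, int disks, int failed)
{
        if (uptodate == disks - 1)
                return FETCH_COMPUTE_1;     /* single XOR reconstruction */
        if (uptodate == disks - 2 && failed >= 2)
                return FETCH_COMPUTE_2;     /* expensive two-failure path */
        return FETCH_READ;                  /* otherwise read the block */
}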
syncing && !failed && !to_write) { - sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; - sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; - } -#endif - locked++; - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - } - } - } - set_bit(STRIPE_HANDLE, &sh->state); - } - - /* now to consider writing and what else, if anything should be read */ - if (to_write) { - int rcw=0, must_compute=0; - for (i=disks ; i--;) { - dev = &sh->dev[i]; - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) -#if 0 - || sh->bh_page[i] != bh->b_page -#endif - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else { - PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); - must_compute++; - } - } - } - PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", - (unsigned long long)sh->sector, rcw, must_compute); - set_bit(STRIPE_HANDLE, &sh->state); - - if (rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && !(failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - { - PRINTK("Read_old stripe %llu block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - } else { - PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && rcw == 0 && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - if ( must_compute > 0 ) { - /* We have failed blocks and need to compute them */ - switch ( failed ) { - case 0: BUG(); - case 1: compute_block_1(sh, failed_num[0], 0); break; - case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; - default: BUG(); /* This request should have been failed? */ - } - } - - PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); - compute_parity(sh, RECONSTRUCT_WRITE); - /* now every locked buffer is ready to be written */ - for (i=disks; i--;) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - PRINTK("Writing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ - set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } - } - - /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough data - * is available - */ - if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { - int update_p = 0, update_q = 0; - struct r5dev *dev; - - set_bit(STRIPE_HANDLE, &sh->state); - - BUG_ON(failed>2); - BUG_ON(uptodate < disks); - /* Want to check and possibly repair P and Q. 
- * However there could be one 'failed' device, in which - * case we can only check one of them, possibly using the - * other to generate missing data - */ - - /* If !tmp_page, we cannot do the calculations, - * but as we have set STRIPE_HANDLE, we will soon be called - * by stripe_handle with a tmp_page - just wait until then. - */ - if (tmp_page) { - if (failed == q_failed) { - /* The only possible failed device holds 'Q', so it makes - * sense to check P (If anything else were failed, we would - * have used P to recreate it). - */ - compute_block_1(sh, pd_idx, 1); - if (!page_is_zero(sh->dev[pd_idx].page)) { - compute_block_1(sh,pd_idx,0); - update_p = 1; - } - } - if (!q_failed && failed < 2) { - /* q is not failed, and we didn't use it to generate - * anything, so it makes sense to check it - */ - memcpy(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE); - compute_parity(sh, UPDATE_PARITY); - if (memcmp(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE)!= 0) { - clear_bit(STRIPE_INSYNC, &sh->state); - update_q = 1; - } - } - if (update_p || update_q) { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - update_p = update_q = 0; - } - - /* now write out any block on a failed drive, - * or P or Q if they need it - */ - - if (failed == 2) { - dev = &sh->dev[failed_num[1]]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (failed >= 1) { - dev = &sh->dev[failed_num[0]]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - - if (update_p) { - dev = &sh->dev[pd_idx]; - locked ++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (update_q) { - dev = &sh->dev[qd_idx]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - clear_bit(STRIPE_DEGRADED, &sh->state); - - set_bit(STRIPE_INSYNC, &sh->state); - } - } - - if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS,1); - clear_bit(STRIPE_SYNCING, &sh->state); - } - - /* If the failed drives are just a ReadError, then we might need - * to progress the repair/check process - */ - if (failed <= 2 && ! 
conf->mddev->ro) - for (i=0; i<failed;i++) { - dev = &sh->dev[failed_num[i]]; - if (test_bit(R5_ReadError, &dev->flags) - && !test_bit(R5_LOCKED, &dev->flags) - && test_bit(R5_UPTODATE, &dev->flags) - ) { - if (!test_bit(R5_ReWrite, &dev->flags)) { - set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - } else { - /* let's read it back */ - set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - } - } - } - spin_unlock(&sh->lock); - - while ((bi=return_bi)) { - int bytes = bi->bi_size; - - return_bi = bi->bi_next; - bi->bi_next = NULL; - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); - } - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = 1; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = 0; - else - continue; - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw) - bi->bi_end_io = raid6_end_write_request; - else - bi->bi_end_io = raid6_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (syncing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == 1) - set_bit(STRIPE_DEGRADED, &sh->state); - PRINTK("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } -} - -static void raid6_activate_delayed(raid6_conf_t *conf) -{ - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { - while (!list_empty(&conf->delayed_list)) { - struct list_head *l = conf->delayed_list.next; - struct stripe_head *sh; - sh = list_entry(l, struct stripe_head, lru); - list_del_init(l); - clear_bit(STRIPE_DELAYED, &sh->state); - if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); - } - } -} - -static void activate_bit_delay(raid6_conf_t *conf) -{ - /* device_lock is held */ - struct list_head head; - list_add(&head, &conf->bitmap_list); - list_del_init(&conf->bitmap_list); - while (!list_empty(&head)) { - struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); - list_del_init(&sh->lru); - atomic_inc(&sh->count); - __release_stripe(conf, sh); - } -} - -static void unplug_slaves(mddev_t *mddev) -{ - raid6_conf_t *conf = mddev_to_conf(mddev); - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - request_queue_t *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (r_queue->unplug_fn) - 
r_queue->unplug_fn(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void raid6_unplug_device(request_queue_t *q) -{ - mddev_t *mddev = q->queuedata; - raid6_conf_t *conf = mddev_to_conf(mddev); - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - - if (blk_remove_plug(q)) { - conf->seq_flush++; - raid6_activate_delayed(conf); - } - md_wakeup_thread(mddev->thread); - - spin_unlock_irqrestore(&conf->device_lock, flags); - - unplug_slaves(mddev); -} - -static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk, - sector_t *error_sector) -{ - mddev_t *mddev = q->queuedata; - raid6_conf_t *conf = mddev_to_conf(mddev); - int i, ret = 0; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct block_device *bdev = rdev->bdev; - request_queue_t *r_queue = bdev_get_queue(bdev); - - if (!r_queue->issue_flush_fn) - ret = -EOPNOTSUPP; - else { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, - error_sector); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - } - rcu_read_unlock(); - return ret; -} - -static inline void raid6_plug_device(raid6_conf_t *conf) -{ - spin_lock_irq(&conf->device_lock); - blk_plug_device(conf->mddev->queue); - spin_unlock_irq(&conf->device_lock); -} - -static int make_request (request_queue_t *q, struct bio * bi) -{ - mddev_t *mddev = q->queuedata; - raid6_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - 2; - unsigned int dd_idx, pd_idx; - sector_t new_sector; - sector_t logical_sector, last_sector; - struct stripe_head *sh; - const int rw = bio_data_dir(bi); - - if (unlikely(bio_barrier(bi))) { - bio_endio(bi, bi->bi_size, -EOPNOTSUPP); - return 0; - } - - md_write_start(mddev, bi); - - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); - - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); - - bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { - DEFINE_WAIT(w); - - new_sector = raid6_compute_sector(logical_sector, - raid_disks, data_disks, &dd_idx, &pd_idx, conf); - - PRINTK("raid6: make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); - - retry: - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK)); - if (sh) { - if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { - /* Add failed due to overlap. 
Flush everything - * and wait a while - */ - raid6_unplug_device(mddev->queue); - release_stripe(sh); - schedule(); - goto retry; - } - finish_wait(&conf->wait_for_overlap, &w); - raid6_plug_device(conf); - handle_stripe(sh, NULL); - release_stripe(sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - clear_bit(BIO_UPTODATE, &bi->bi_flags); - finish_wait(&conf->wait_for_overlap, &w); - break; - } - - } - spin_lock_irq(&conf->device_lock); - if (--bi->bi_phys_segments == 0) { - int bytes = bi->bi_size; - - if (rw == WRITE ) - md_write_end(mddev); - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, 0); - } - spin_unlock_irq(&conf->device_lock); - return 0; -} - -/* FIXME go_faster isn't used */ -static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) -{ - raid6_conf_t *conf = (raid6_conf_t *) mddev->private; - struct stripe_head *sh; - int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x; - unsigned long stripe; - int chunk_offset; - int dd_idx, pd_idx; - sector_t first_sector; - int raid_disks = conf->raid_disks; - int data_disks = raid_disks - 2; - sector_t max_sector = mddev->size << 1; - int sync_blocks; - int still_degraded = 0; - int i; - - if (sector_nr >= max_sector) { - /* just being told to finish up .. nothing much to do */ - unplug_slaves(mddev); - - if (mddev->curr_resync < max_sector) /* aborted */ - bitmap_end_sync(mddev->bitmap, mddev->curr_resync, - &sync_blocks, 1); - else /* completed sync */ - conf->fullsync = 0; - bitmap_close_sync(mddev->bitmap); - - return 0; - } - /* if there are 2 or more failed drives and we are trying - * to resync, then assert that we are finished, because there is - * nothing we can do. - */ - if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = (mddev->size << 1) - sector_nr; - *skipped = 1; - return rv; - } - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { - /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; - *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ - } - - x = sector_nr; - chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; - BUG_ON(x != stripe); - - first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk - + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); - sh = get_active_stripe(conf, sector_nr, pd_idx, 1); - if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, pd_idx, 0); - /* make sure we don't swamp the stripe cache if someone else - * is trying to get access - */ - schedule_timeout_uninterruptible(1); - } - /* Need to check if array will still be degraded after recovery/resync - * We don't need to check the 'failed' flag as when that gets set, - * recovery aborts. - */ - for (i=0; i<mddev->raid_disks; i++) - if (conf->disks[i].rdev == NULL) - still_degraded = 1; - - bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); - - spin_lock(&sh->lock); - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); - spin_unlock(&sh->lock); - - handle_stripe(sh, NULL); - release_stripe(sh); - - return STRIPE_SECTORS; -} - -/* - * This is our raid6 kernel thread. - * - * We scan the hash table for stripes which can be handled now. 
- * During the scan, completed stripes are saved for us by the interrupt - * handler, so that they will not have to wait for our next wakeup. - */ -static void raid6d (mddev_t *mddev) -{ - struct stripe_head *sh; - raid6_conf_t *conf = mddev_to_conf(mddev); - int handled; - - PRINTK("+++ raid6d active\n"); - - md_check_recovery(mddev); - - handled = 0; - spin_lock_irq(&conf->device_lock); - while (1) { - struct list_head *first; - - if (conf->seq_flush - conf->seq_write > 0) { - int seq = conf->seq_flush; - spin_unlock_irq(&conf->device_lock); - bitmap_unplug(mddev->bitmap); - spin_lock_irq(&conf->device_lock); - conf->seq_write = seq; - activate_bit_delay(conf); - } - - if (list_empty(&conf->handle_list) && - atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && - !blk_queue_plugged(mddev->queue) && - !list_empty(&conf->delayed_list)) - raid6_activate_delayed(conf); - - if (list_empty(&conf->handle_list)) - break; - - first = conf->handle_list.next; - sh = list_entry(first, struct stripe_head, lru); - - list_del_init(first); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count)!= 1); - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh, conf->spare_page); - release_stripe(sh); - - spin_lock_irq(&conf->device_lock); - } - PRINTK("%d stripes handled\n", handled); - - spin_unlock_irq(&conf->device_lock); - - unplug_slaves(mddev); - - PRINTK("--- raid6d inactive\n"); -} - -static ssize_t -raid6_show_stripe_cache_size(mddev_t *mddev, char *page) -{ - raid6_conf_t *conf = mddev_to_conf(mddev); - if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; -} - -static ssize_t -raid6_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) -{ - raid6_conf_t *conf = mddev_to_conf(mddev); - char *end; - int new; - if (len >= PAGE_SIZE) - return -EINVAL; - if (!conf) - return -ENODEV; - - new = simple_strtoul(page, &end, 10); - if (!*page || (*end && *end != '\n') ) - return -EINVAL; - if (new <= 16 || new > 32768) - return -EINVAL; - while (new < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) - conf->max_nr_stripes--; - else - break; - } - while (new > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) - conf->max_nr_stripes++; - else break; - } - return len; -} - -static struct md_sysfs_entry -raid6_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, - raid6_show_stripe_cache_size, - raid6_store_stripe_cache_size); - -static ssize_t -stripe_cache_active_show(mddev_t *mddev, char *page) -{ - raid6_conf_t *conf = mddev_to_conf(mddev); - if (conf) - return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); - else - return 0; -} - -static struct md_sysfs_entry -raid6_stripecache_active = __ATTR_RO(stripe_cache_active); - -static struct attribute *raid6_attrs[] = { - &raid6_stripecache_size.attr, - &raid6_stripecache_active.attr, - NULL, -}; -static struct attribute_group raid6_attrs_group = { - .name = NULL, - .attrs = raid6_attrs, -}; - -static int run(mddev_t *mddev) -{ - raid6_conf_t *conf; - int raid_disk, memory; - mdk_rdev_t *rdev; - struct disk_info *disk; - struct list_head *tmp; - - if (mddev->level != 6) { - PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level); - return -EIO; - } - - mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) - goto abort; - conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); - if (!conf->disks) - goto abort; - - conf->mddev = mddev; - - if ((conf->stripe_hashtbl = 
kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) - goto abort; - - conf->spare_page = alloc_page(GFP_KERNEL); - if (!conf->spare_page) - goto abort; - - spin_lock_init(&conf->device_lock); - init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); - INIT_LIST_HEAD(&conf->handle_list); - INIT_LIST_HEAD(&conf->delayed_list); - INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); - atomic_set(&conf->active_stripes, 0); - atomic_set(&conf->preread_active_stripes, 0); - - PRINTK("raid6: run(%s) called.\n", mdname(mddev)); - - ITERATE_RDEV(mddev,rdev,tmp) { - raid_disk = rdev->raid_disk; - if (raid_disk >= mddev->raid_disks - || raid_disk < 0) - continue; - disk = conf->disks + raid_disk; - - disk->rdev = rdev; - - if (test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid6: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); - conf->working_disks++; - } - } - - conf->raid_disks = mddev->raid_disks; - - /* - * 0 for a fully functional array, 1 or 2 for a degraded array. - */ - mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; - conf->algorithm = mddev->layout; - conf->max_nr_stripes = NR_STRIPES; - - /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; - - if (conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); - goto abort; - } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid6: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); - goto abort; - } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid6: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; - } - if (mddev->degraded > 2) { - printk(KERN_ERR "raid6: not enough operational devices for %s" - " (%d/%d failed)\n", - mdname(mddev), conf->failed_disks, conf->raid_disks); - goto abort; - } - - if (mddev->degraded > 0 && - mddev->recovery_cp != MaxSector) { - if (mddev->ok_start_degraded) - printk(KERN_WARNING "raid6: starting dirty degraded array:%s" - "- data corruption possible.\n", - mdname(mddev)); - else { - printk(KERN_ERR "raid6: cannot start dirty degraded array" - " for %s\n", mdname(mddev)); - goto abort; - } - } - - { - mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); - if (!mddev->thread) { - printk(KERN_ERR - "raid6: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid6: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid6: allocated %dkB for %s\n", - memory, mdname(mddev)); - - if (mddev->degraded == 0) - printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); - else - printk(KERN_ALERT "raid6: raid level %d set %s active with %d" - " out of %d devices, algorithm %d\n", conf->level, - mdname(mddev), mddev->raid_disks 
- mddev->degraded, - mddev->raid_disks, conf->algorithm); - - print_raid6_conf(conf); - - /* read-ahead size must cover two whole stripes, which is - * 2 * (n-2) * chunksize where 'n' is the number of raid devices - */ - { - int stripe = (mddev->raid_disks-2) * mddev->chunk_size - / PAGE_SIZE; - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - } - - /* Ok, everything is just fine now */ - sysfs_create_group(&mddev->kobj, &raid6_attrs_group); - - mddev->array_size = mddev->size * (mddev->raid_disks - 2); - - mddev->queue->unplug_fn = raid6_unplug_device; - mddev->queue->issue_flush_fn = raid6_issue_flush; - return 0; -abort: - if (conf) { - print_raid6_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->stripe_hashtbl); - kfree(conf->disks); - kfree(conf); - } - mddev->private = NULL; - printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev)); - return -EIO; -} - - - -static int stop (mddev_t *mddev) -{ - raid6_conf_t *conf = (raid6_conf_t *) mddev->private; - - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - sysfs_remove_group(&mddev->kobj, &raid6_attrs_group); - kfree(conf); - mddev->private = NULL; - return 0; -} - -#if RAID6_DUMPSTATE -static void print_sh (struct seq_file *seq, struct stripe_head *sh) -{ - int i; - - seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", - (unsigned long long)sh->sector, sh->pd_idx, sh->state); - seq_printf(seq, "sh %llu, count %d.\n", - (unsigned long long)sh->sector, atomic_read(&sh->count)); - seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); - for (i = 0; i < sh->raid_conf->raid_disks; i++) { - seq_printf(seq, "(cache%d: %p %ld) ", - i, sh->dev[i].page, sh->dev[i].flags); - } - seq_printf(seq, "\n"); -} - -static void printall (struct seq_file *seq, raid6_conf_t *conf) -{ - struct stripe_head *sh; - struct hlist_node *hn; - int i; - - spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[i]; - hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { - if (sh->raid_conf != conf) - continue; - print_sh(seq, sh); - } - } - spin_unlock_irq(&conf->device_lock); -} -#endif - -static void status (struct seq_file *seq, mddev_t *mddev) -{ - raid6_conf_t *conf = (raid6_conf_t *) mddev->private; - int i; - - seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); - seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->disks[i].rdev && - test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); - seq_printf (seq, "]"); -#if RAID6_DUMPSTATE - seq_printf (seq, "\n"); - printall(seq, conf); -#endif -} - -static void print_raid6_conf (raid6_conf_t *conf) -{ - int i; - struct disk_info *tmp; - - printk("RAID6 conf printout:\n"); - if (!conf) { - printk("(conf==NULL)\n"); - return; - } - printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, - conf->working_disks, conf->failed_disks); - - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->disks + i; - if (tmp->rdev) - printk(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); - } -} - -static int raid6_spare_active(mddev_t *mddev) -{ - int i; - raid6_conf_t *conf = mddev->private; - struct disk_info *tmp; - - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->disks + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - mddev->degraded--; - conf->failed_disks--; - conf->working_disks++; - set_bit(In_sync, &tmp->rdev->flags); - } - } - print_raid6_conf(conf); - return 0; -} - -static int raid6_remove_disk(mddev_t *mddev, int number) -{ - raid6_conf_t *conf = mddev->private; - int err = 0; - mdk_rdev_t *rdev; - struct disk_info *p = conf->disks + number; - - print_raid6_conf(conf); - rdev = p->rdev; - if (rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - } - } - -abort: - - print_raid6_conf(conf); - return err; -} - -static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) -{ - raid6_conf_t *conf = mddev->private; - int found = 0; - int disk; - struct disk_info *p; - - if (mddev->degraded > 2) - /* no point adding a device */ - return 0; - /* - * find the disk ... but prefer rdev->saved_raid_disk - * if possible. - */ - if (rdev->saved_raid_disk >= 0 && - conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = 0; - for ( ; disk < mddev->raid_disks; disk++) - if ((p=conf->disks + disk)->rdev == NULL) { - clear_bit(In_sync, &rdev->flags); - rdev->raid_disk = disk; - found = 1; - if (rdev->saved_raid_disk != disk) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); - break; - } - print_raid6_conf(conf); - return found; -} - -static int raid6_resize(mddev_t *mddev, sector_t sectors) -{ - /* no resync is happening, and there is enough space - * on all devices, so we can resize. - * We need to make sure resync covers any new space. - * If the array is shrinking we should possibly wait until - * any io in the removed space completes, but it hardly seems - * worth it. 
- */ - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-2))>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); - mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - } - mddev->size = sectors /2; - mddev->resync_max_sectors = sectors; - return 0; -} - -static void raid6_quiesce(mddev_t *mddev, int state) -{ - raid6_conf_t *conf = mddev_to_conf(mddev); - - switch(state) { - case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); - conf->quiesce = 1; - wait_event_lock_irq(conf->wait_for_stripe, - atomic_read(&conf->active_stripes) == 0, - conf->device_lock, /* nothing */); - spin_unlock_irq(&conf->device_lock); - break; - - case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); - conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); - spin_unlock_irq(&conf->device_lock); - break; - } -} - -static struct mdk_personality raid6_personality = -{ - .name = "raid6", - .level = 6, - .owner = THIS_MODULE, - .make_request = make_request, - .run = run, - .stop = stop, - .status = status, - .error_handler = error, - .hot_add_disk = raid6_add_disk, - .hot_remove_disk= raid6_remove_disk, - .spare_active = raid6_spare_active, - .sync_request = sync_request, - .resize = raid6_resize, - .quiesce = raid6_quiesce, -}; - -static int __init raid6_init(void) -{ - int e; - - e = raid6_select_algo(); - if ( e ) - return e; - - return register_md_personality(&raid6_personality); -} - -static void raid6_exit (void) -{ - unregister_md_personality(&raid6_personality); -} - -module_init(raid6_init); -module_exit(raid6_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("md-personality-8"); /* RAID6 */ -MODULE_ALIAS("md-raid6"); -MODULE_ALIAS("md-level-6"); diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index 694d1d80ff3..dcda5291b99 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -494,8 +494,7 @@ static int restart_video_queue(struct cx8800_dev *dev, return 0; buf = list_entry(q->queued.next, struct cx88_buffer, vb.queue); if (NULL == prev) { - list_del(&buf->vb.queue); - list_add_tail(&buf->vb.queue,&q->active); + list_move_tail(&buf->vb.queue, &q->active); start_video_dma(dev, q, buf); buf->vb.state = STATE_ACTIVE; buf->count = q->count++; @@ -506,8 +505,7 @@ static int restart_video_queue(struct cx8800_dev *dev, } else if (prev->vb.width == buf->vb.width && prev->vb.height == buf->vb.height && prev->fmt == buf->fmt) { - list_del(&buf->vb.queue); - list_add_tail(&buf->vb.queue,&q->active); + list_move_tail(&buf->vb.queue, &q->active); buf->vb.state = STATE_ACTIVE; buf->count = q->count++; prev->risc.jmp[1] = cpu_to_le32(buf->risc.dma); diff --git a/drivers/media/video/usbvideo/quickcam_messenger.c b/drivers/media/video/usbvideo/quickcam_messenger.c index 3f3182a24da..56e01b62241 100644 --- a/drivers/media/video/usbvideo/quickcam_messenger.c +++ b/drivers/media/video/usbvideo/quickcam_messenger.c @@ -33,7 +33,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> -#include <linux/usb_input.h> +#include <linux/usb/input.h> #include "usbvideo.h" #include "quickcam_messenger.h" diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c index cc7ff8f00e4..cb62f2a9676 100644 --- a/drivers/net/irda/nsc-ircc.c +++ b/drivers/net/irda/nsc-ircc.c @@ -115,8 +115,12 @@ static nsc_chip_t chips[] = { 
/* Contributed by Jan Frey - IBM A30/A31 */ { "PC8739x", { 0x2e, 0x4e, 0x0 }, 0x20, 0xea, 0xff, nsc_ircc_probe_39x, nsc_ircc_init_39x }, - { "IBM", { 0x2e, 0x4e, 0x0 }, 0x20, 0xf4, 0xff, - nsc_ircc_probe_39x, nsc_ircc_init_39x }, + /* IBM ThinkPads using PC8738x (T60/X60/Z60) */ + { "IBM-PC8738x", { 0x2e, 0x4e, 0x0 }, 0x20, 0xf4, 0xff, + nsc_ircc_probe_39x, nsc_ircc_init_39x }, + /* IBM ThinkPads using PC8394T (T43/R52/?) */ + { "IBM-PC8394T", { 0x2e, 0x4e, 0x0 }, 0x20, 0xf9, 0xff, + nsc_ircc_probe_39x, nsc_ircc_init_39x }, { NULL } }; diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 01cd8ec751e..d643a097faa 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -2578,8 +2578,7 @@ ppp_find_channel(int unit) list_for_each_entry(pch, &new_channels, list) { if (pch->file.index == unit) { - list_del(&pch->list); - list_add(&pch->list, &all_channels); + list_move(&pch->list, &all_channels); return pch; } } diff --git a/drivers/net/wireless/bcm43xx/Kconfig b/drivers/net/wireless/bcm43xx/Kconfig index 25ea4748f0b..533993f538f 100644 --- a/drivers/net/wireless/bcm43xx/Kconfig +++ b/drivers/net/wireless/bcm43xx/Kconfig @@ -2,6 +2,7 @@ config BCM43XX tristate "Broadcom BCM43xx wireless support" depends on PCI && IEEE80211 && IEEE80211_SOFTMAC && NET_RADIO && EXPERIMENTAL select FW_LOADER + select HW_RANDOM ---help--- This is an experimental driver for the Broadcom 43xx wireless chip, found in the Apple Airport Extreme and various other devices. diff --git a/drivers/net/wireless/bcm43xx/bcm43xx.h b/drivers/net/wireless/bcm43xx/bcm43xx.h index d8f917c21ea..17a56828e23 100644 --- a/drivers/net/wireless/bcm43xx/bcm43xx.h +++ b/drivers/net/wireless/bcm43xx/bcm43xx.h @@ -1,6 +1,7 @@ #ifndef BCM43xx_H_ #define BCM43xx_H_ +#include <linux/hw_random.h> #include <linux/version.h> #include <linux/kernel.h> #include <linux/spinlock.h> @@ -82,6 +83,7 @@ #define BCM43xx_MMIO_TSF_1 0x634 /* core rev < 3 only */ #define BCM43xx_MMIO_TSF_2 0x636 /* core rev < 3 only */ #define BCM43xx_MMIO_TSF_3 0x638 /* core rev < 3 only */ +#define BCM43xx_MMIO_RNG 0x65A #define BCM43xx_MMIO_POWERUP_DELAY 0x6A8 /* SPROM offsets. */ @@ -750,6 +752,10 @@ struct bcm43xx_private { const struct firmware *initvals0; const struct firmware *initvals1; + /* Random Number Generator. */ + struct hwrng rng; + char rng_name[20 + 1]; + /* Debugging stuff follows. 
*/ #ifdef CONFIG_BCM43XX_DEBUG struct bcm43xx_dfsentry *dfsentry; diff --git a/drivers/net/wireless/bcm43xx/bcm43xx_main.c b/drivers/net/wireless/bcm43xx/bcm43xx_main.c index 085d7857fe3..27bcf47228e 100644 --- a/drivers/net/wireless/bcm43xx/bcm43xx_main.c +++ b/drivers/net/wireless/bcm43xx/bcm43xx_main.c @@ -3237,6 +3237,39 @@ static void bcm43xx_security_init(struct bcm43xx_private *bcm) bcm43xx_clear_keys(bcm); } +static int bcm43xx_rng_read(struct hwrng *rng, u32 *data) +{ + struct bcm43xx_private *bcm = (struct bcm43xx_private *)rng->priv; + unsigned long flags; + + bcm43xx_lock_irqonly(bcm, flags); + *data = bcm43xx_read16(bcm, BCM43xx_MMIO_RNG); + bcm43xx_unlock_irqonly(bcm, flags); + + return (sizeof(u16)); +} + +static void bcm43xx_rng_exit(struct bcm43xx_private *bcm) +{ + hwrng_unregister(&bcm->rng); +} + +static int bcm43xx_rng_init(struct bcm43xx_private *bcm) +{ + int err; + + snprintf(bcm->rng_name, ARRAY_SIZE(bcm->rng_name), + "%s_%s", KBUILD_MODNAME, bcm->net_dev->name); + bcm->rng.name = bcm->rng_name; + bcm->rng.data_read = bcm43xx_rng_read; + bcm->rng.priv = (unsigned long)bcm; + err = hwrng_register(&bcm->rng); + if (err) + printk(KERN_ERR PFX "RNG init failed (%d)\n", err); + + return err; +} + /* This is the opposite of bcm43xx_init_board() */ static void bcm43xx_free_board(struct bcm43xx_private *bcm) { @@ -3248,6 +3281,7 @@ static void bcm43xx_free_board(struct bcm43xx_private *bcm) bcm43xx_set_status(bcm, BCM43xx_STAT_SHUTTINGDOWN); + bcm43xx_rng_exit(bcm); for (i = 0; i < BCM43xx_MAX_80211_CORES; i++) { if (!bcm->core_80211[i].available) continue; @@ -3325,6 +3359,9 @@ static int bcm43xx_init_board(struct bcm43xx_private *bcm) bcm43xx_switch_core(bcm, &bcm->core_80211[0]); bcm43xx_mac_enable(bcm); } + err = bcm43xx_rng_init(bcm); + if (err) + goto err_80211_unwind; bcm43xx_macfilter_clear(bcm, BCM43xx_MACFILTER_ASSOC); bcm43xx_macfilter_set(bcm, BCM43xx_MACFILTER_SELF, (u8 *)(bcm->net_dev->dev_addr)); dprintk(KERN_INFO PFX "80211 cores initialized\n"); diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index f94419b334f..2eded55ae88 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -1140,10 +1140,9 @@ list_modified: } } /* re-insert all entries from the failed_list into ipm_list */ - list_for_each_entry_safe(ipm, tmp, &failed_list, list) { - list_del_init(&ipm->list); - list_add_tail(&ipm->list, &card->ipm_list); - } + list_for_each_entry_safe(ipm, tmp, &failed_list, list) + list_move_tail(&ipm->list, &card->ipm_list); + spin_unlock_irqrestore(&card->ipm_lock, flags); } diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c index 6ab035590ee..b28712df0b7 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c @@ -5118,8 +5118,7 @@ static void ncr_ccb_skipped(struct ncb *np, struct ccb *cp) cp->host_status &= ~HS_SKIPMASK; cp->start.schedule.l_paddr = cpu_to_scr(NCB_SCRIPT_PHYS (np, select)); - list_del(&cp->link_ccbq); - list_add_tail(&cp->link_ccbq, &lp->skip_ccbq); + list_move_tail(&cp->link_ccbq, &lp->skip_ccbq); if (cp->queued) { --lp->queuedccbs; } diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index aef093db597..3d4487eac9b 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -2258,8 +2258,7 @@ qla2x00_configure_fabric(scsi_qla_host_t *ha) } /* Remove device from the new list and add it to DB */ - list_del(&fcport->list); - list_add_tail(&fcport->list, &ha->fcports); + list_move_tail(&fcport->list, &ha->fcports); /* Login and update database 
*/ qla2x00_fabric_dev_login(ha, fcport, &next_loopid); diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c index 2fe7fd19437..4a22909518f 100644 --- a/drivers/usb/host/hc_crisv10.c +++ b/drivers/usb/host/hc_crisv10.c @@ -411,8 +411,7 @@ static inline void urb_list_move_last(struct urb *urb, int epid) urb_entry_t *urb_entry = __urb_list_entry(urb, epid); assert(urb_entry); - list_del(&urb_entry->list); - list_add_tail(&urb_entry->list, &urb_list[epid]); + list_move_tail(&urb_entry->list, &urb_list[epid]); } /* Get the next urb in the list. */ diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c index 5b06fa36609..56ffc81302f 100644 --- a/drivers/usb/serial/whiteheat.c +++ b/drivers/usb/serial/whiteheat.c @@ -686,19 +686,16 @@ static void whiteheat_close(struct usb_serial_port *port, struct file * filp) wrap = list_entry(tmp, struct whiteheat_urb_wrap, list); urb = wrap->urb; usb_kill_urb(urb); - list_del(tmp); - list_add(tmp, &info->rx_urbs_free); - } - list_for_each_safe(tmp, tmp2, &info->rx_urb_q) { - list_del(tmp); - list_add(tmp, &info->rx_urbs_free); + list_move(tmp, &info->rx_urbs_free); } + list_for_each_safe(tmp, tmp2, &info->rx_urb_q) + list_move(tmp, &info->rx_urbs_free); + list_for_each_safe(tmp, tmp2, &info->tx_urbs_submitted) { wrap = list_entry(tmp, struct whiteheat_urb_wrap, list); urb = wrap->urb; usb_kill_urb(urb); - list_del(tmp); - list_add(tmp, &info->tx_urbs_free); + list_move(tmp, &info->tx_urbs_free); } spin_unlock_irqrestore(&info->lock, flags); @@ -1080,8 +1077,7 @@ static void whiteheat_write_callback(struct urb *urb, struct pt_regs *regs) err("%s - Not my urb!", __FUNCTION__); return; } - list_del(&wrap->list); - list_add(&wrap->list, &info->tx_urbs_free); + list_move(&wrap->list, &info->tx_urbs_free); spin_unlock(&info->lock); if (urb->status) { @@ -1371,8 +1367,7 @@ static int start_port_read(struct usb_serial_port *port) wrap = list_entry(tmp, struct whiteheat_urb_wrap, list); urb = wrap->urb; usb_kill_urb(urb); - list_del(tmp); - list_add(tmp, &info->rx_urbs_free); + list_move(tmp, &info->rx_urbs_free); } break; } diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 168ede7902b..17de4c84db6 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -4,6 +4,21 @@ menu "Graphics support" +config FIRMWARE_EDID + bool "Enable firmware EDID" + default y + ---help--- + This enables access to the EDID transferred from the firmware. + On the i386, this is from the Video BIOS. Enable this if DDC/I2C + transfers do not work for your driver and if you are using + nvidiafb, i810fb or savagefb. + + In general, choosing Y for this option is safe. If you + experience extremely long delays while booting before you get + something on your display, try setting this to N. Matrox cards in + combination with certain motherboards and monitors are known to + suffer from this problem. + config FB tristate "Support for frame buffer devices" ---help--- @@ -70,22 +85,6 @@ config FB_MACMODES depends on FB default n -config FB_FIRMWARE_EDID - bool "Enable firmware EDID" - depends on FB - default y - ---help--- - This enables access to the EDID transferred from the firmware. - On the i386, this is from the Video BIOS. Enable this if DDC/I2C - transfers do not work for your driver and if you are using - nvidiafb, i810fb or savagefb. - - In general, choosing Y for this option is safe. If you - experience extremely long delays while booting before you get - something on your display, try setting this to N. 
Matrox cards in - combination with certain motherboards and monitors are known to - suffer from this problem. - config FB_BACKLIGHT bool depends on FB @@ -551,10 +550,14 @@ config FB_VESA You will get a boot time penguin logo at no additional cost. Please read <file:Documentation/fb/vesafb.txt>. If unsure, say Y. -config VIDEO_SELECT - bool - depends on FB_VESA - default y +config FB_IMAC + bool "Intel-based Macintosh Framebuffer Support" + depends on (FB = y) && X86 + select FB_CFB_FILLRECT + select FB_CFB_COPYAREA + select FB_CFB_IMAGEBLIT + help + This is the frame buffer device driver for the Intel-based Macintosh config FB_HGA tristate "Hercules mono graphics support" @@ -578,12 +581,6 @@ config FB_HGA_ACCEL This will compile the Hercules mono graphics with acceleration functions. - -config VIDEO_SELECT - bool - depends on (FB = y) && X86 - default y - config FB_SGIVW tristate "SGI Visual Workstation framebuffer support" depends on FB && X86_VISWS diff --git a/drivers/video/Makefile b/drivers/video/Makefile index 23de3b2c785..c335e9bc3b2 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -4,15 +4,15 @@ # Each configuration option enables a list of files. -obj-$(CONFIG_VT) += console/ -obj-$(CONFIG_LOGO) += logo/ -obj-$(CONFIG_SYSFS) += backlight/ - obj-$(CONFIG_FB) += fb.o fb-y := fbmem.o fbmon.o fbcmap.o fbsysfs.o \ modedb.o fbcvt.o fb-objs := $(fb-y) +obj-$(CONFIG_VT) += console/ +obj-$(CONFIG_LOGO) += logo/ +obj-$(CONFIG_SYSFS) += backlight/ + obj-$(CONFIG_FB_CFB_FILLRECT) += cfbfillrect.o obj-$(CONFIG_FB_CFB_COPYAREA) += cfbcopyarea.o obj-$(CONFIG_FB_CFB_IMAGEBLIT) += cfbimgblt.o @@ -97,6 +97,7 @@ obj-$(CONFIG_FB_S3C2410) += s3c2410fb.o # Platform or fallback drivers go here obj-$(CONFIG_FB_VESA) += vesafb.o +obj-$(CONFIG_FB_IMAC) += imacfb.o obj-$(CONFIG_FB_VGA16) += vga16fb.o vgastate.o obj-$(CONFIG_FB_OF) += offb.o diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c index db878fd55fb..11cf7fcb1d5 100644 --- a/drivers/video/aty/aty128fb.c +++ b/drivers/video/aty/aty128fb.c @@ -100,7 +100,7 @@ #ifndef CONFIG_PPC_PMAC /* default mode */ -static struct fb_var_screeninfo default_var __initdata = { +static struct fb_var_screeninfo default_var __devinitdata = { /* 640x480, 60 Hz, Non-Interlaced (25.175 MHz dotclock) */ 640, 480, 640, 480, 0, 0, 8, 0, {0, 8, 0}, {0, 8, 0}, {0, 8, 0}, {0, 0, 0}, @@ -123,7 +123,7 @@ static struct fb_var_screeninfo default_var = { /* default modedb mode */ /* 640x480, 60 Hz, Non-Interlaced (25.172 MHz dotclock) */ -static struct fb_videomode defaultmode __initdata = { +static struct fb_videomode defaultmode __devinitdata = { .refresh = 60, .xres = 640, .yres = 480, @@ -335,7 +335,7 @@ static const struct aty128_meminfo sdr_sgram = static const struct aty128_meminfo ddr_sgram = { 4, 4, 3, 3, 2, 3, 1, 16, 31, 16, "64-bit DDR SGRAM" }; -static struct fb_fix_screeninfo aty128fb_fix __initdata = { +static struct fb_fix_screeninfo aty128fb_fix __devinitdata = { .id = "ATY Rage128", .type = FB_TYPE_PACKED_PIXELS, .visual = FB_VISUAL_PSEUDOCOLOR, @@ -345,15 +345,15 @@ static struct fb_fix_screeninfo aty128fb_fix __initdata = { .accel = FB_ACCEL_ATI_RAGE128, }; -static char *mode_option __initdata = NULL; +static char *mode_option __devinitdata = NULL; #ifdef CONFIG_PPC_PMAC -static int default_vmode __initdata = VMODE_1024_768_60; -static int default_cmode __initdata = CMODE_8; +static int default_vmode __devinitdata = VMODE_1024_768_60; +static int default_cmode __devinitdata = CMODE_8; #endif -static int default_crt_on 
__initdata = 0; -static int default_lcd_on __initdata = 1; +static int default_crt_on __devinitdata = 0; +static int default_lcd_on __devinitdata = 1; #ifdef CONFIG_MTRR static int mtrr = 1; @@ -445,9 +445,9 @@ static int aty128_encode_var(struct fb_var_screeninfo *var, static int aty128_decode_var(struct fb_var_screeninfo *var, struct aty128fb_par *par); #if 0 -static void __init aty128_get_pllinfo(struct aty128fb_par *par, +static void __devinit aty128_get_pllinfo(struct aty128fb_par *par, void __iomem *bios); -static void __init __iomem *aty128_map_ROM(struct pci_dev *pdev, const struct aty128fb_par *par); +static void __devinit __iomem *aty128_map_ROM(struct pci_dev *pdev, const struct aty128fb_par *par); #endif static void aty128_timings(struct aty128fb_par *par); static void aty128_init_engine(struct aty128fb_par *par); @@ -573,7 +573,7 @@ static void aty_pll_writeupdate(const struct aty128fb_par *par) /* write to the scratch register to test r/w functionality */ -static int __init register_test(const struct aty128fb_par *par) +static int __devinit register_test(const struct aty128fb_par *par) { u32 val; int flag = 0; @@ -772,7 +772,7 @@ static u32 depth_to_dst(u32 depth) #ifndef __sparc__ -static void __iomem * __init aty128_map_ROM(const struct aty128fb_par *par, struct pci_dev *dev) +static void __iomem * __devinit aty128_map_ROM(const struct aty128fb_par *par, struct pci_dev *dev) { u16 dptr; u8 rom_type; @@ -856,7 +856,7 @@ static void __iomem * __init aty128_map_ROM(const struct aty128fb_par *par, stru return NULL; } -static void __init aty128_get_pllinfo(struct aty128fb_par *par, unsigned char __iomem *bios) +static void __devinit aty128_get_pllinfo(struct aty128fb_par *par, unsigned char __iomem *bios) { unsigned int bios_hdr; unsigned int bios_pll; @@ -903,7 +903,7 @@ static void __iomem * __devinit aty128_find_mem_vbios(struct aty128fb_par *par) #endif /* ndef(__sparc__) */ /* fill in known card constants if pll_block is not available */ -static void __init aty128_timings(struct aty128fb_par *par) +static void __devinit aty128_timings(struct aty128fb_par *par) { #ifdef CONFIG_PPC_OF /* instead of a table lookup, assume OF has properly @@ -1645,7 +1645,7 @@ static int aty128fb_sync(struct fb_info *info) } #ifndef MODULE -static int __init aty128fb_setup(char *options) +static int __devinit aty128fb_setup(char *options) { char *this_opt; @@ -1893,7 +1893,7 @@ static void aty128_early_resume(void *data) } #endif /* CONFIG_PPC_PMAC */ -static int __init aty128_init(struct pci_dev *pdev, const struct pci_device_id *ent) +static int __devinit aty128_init(struct pci_dev *pdev, const struct pci_device_id *ent) { struct fb_info *info = pci_get_drvdata(pdev); struct aty128fb_par *par = info->par; @@ -2037,7 +2037,7 @@ static int __init aty128_init(struct pci_dev *pdev, const struct pci_device_id * #ifdef CONFIG_PCI /* register a card ++ajoshi */ -static int __init aty128_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +static int __devinit aty128_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { unsigned long fb_addr, reg_addr; struct aty128fb_par *par; @@ -2556,7 +2556,7 @@ static int aty128_pci_resume(struct pci_dev *pdev) } -static int __init aty128fb_init(void) +static int __devinit aty128fb_init(void) { #ifndef MODULE char *option = NULL; diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index c5185f7cf4b..22e720611bf 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -316,12 +316,12 @@ static 
int vram; static int pll; static int mclk; static int xclk; -static int comp_sync __initdata = -1; +static int comp_sync __devinitdata = -1; static char *mode; #ifdef CONFIG_PPC -static int default_vmode __initdata = VMODE_CHOOSE; -static int default_cmode __initdata = CMODE_CHOOSE; +static int default_vmode __devinitdata = VMODE_CHOOSE; +static int default_cmode __devinitdata = CMODE_CHOOSE; module_param_named(vmode, default_vmode, int, 0); MODULE_PARM_DESC(vmode, "int: video mode for mac"); @@ -330,10 +330,10 @@ MODULE_PARM_DESC(cmode, "int: color mode for mac"); #endif #ifdef CONFIG_ATARI -static unsigned int mach64_count __initdata = 0; -static unsigned long phys_vmembase[FB_MAX] __initdata = { 0, }; -static unsigned long phys_size[FB_MAX] __initdata = { 0, }; -static unsigned long phys_guiregbase[FB_MAX] __initdata = { 0, }; +static unsigned int mach64_count __devinitdata = 0; +static unsigned long phys_vmembase[FB_MAX] __devinitdata = { 0, }; +static unsigned long phys_size[FB_MAX] __devinitdata = { 0, }; +static unsigned long phys_guiregbase[FB_MAX] __devinitdata = { 0, }; #endif /* top -> down is an evolution of mach64 chipset, any corrections? */ @@ -583,7 +583,7 @@ static u32 atyfb_get_pixclock(struct fb_var_screeninfo *var, struct atyfb_par *p * Apple monitor sense */ -static int __init read_aty_sense(const struct atyfb_par *par) +static int __devinit read_aty_sense(const struct atyfb_par *par) { int sense, i; @@ -1281,6 +1281,14 @@ static int atyfb_set_par(struct fb_info *info) par->accel_flags = var->accel_flags; /* hack */ + if (var->accel_flags) { + info->fbops->fb_sync = atyfb_sync; + info->flags &= ~FBINFO_HWACCEL_DISABLED; + } else { + info->fbops->fb_sync = NULL; + info->flags |= FBINFO_HWACCEL_DISABLED; + } + if (par->blitter_may_be_busy) wait_for_idle(par); @@ -2253,7 +2261,7 @@ static void aty_bl_exit(struct atyfb_par *par) #endif /* CONFIG_FB_ATY_BACKLIGHT */ -static void __init aty_calc_mem_refresh(struct atyfb_par *par, int xclk) +static void __devinit aty_calc_mem_refresh(struct atyfb_par *par, int xclk) { const int ragepro_tbl[] = { 44, 50, 55, 66, 75, 80, 100 @@ -2313,7 +2321,7 @@ static int __devinit atyfb_get_timings_from_lcd(struct atyfb_par *par, } #endif /* defined(__i386__) && defined(CONFIG_FB_ATY_GENERIC_LCD) */ -static int __init aty_init(struct fb_info *info, const char *name) +static int __devinit aty_init(struct fb_info *info, const char *name) { struct atyfb_par *par = (struct atyfb_par *) info->par; const char *ramname = NULL, *xtal; @@ -2394,12 +2402,15 @@ static int __init aty_init(struct fb_info *info, const char *name) break; } switch (clk_type) { +#ifdef CONFIG_ATARI case CLK_ATI18818_1: par->pll_ops = &aty_pll_ati18818_1; break; +#else case CLK_IBMRGB514: par->pll_ops = &aty_pll_ibm514; break; +#endif #if 0 /* dead code */ case CLK_STG1703: par->pll_ops = &aty_pll_stg1703; @@ -2604,7 +2615,11 @@ static int __init aty_init(struct fb_info *info, const char *name) info->fbops = &atyfb_ops; info->pseudo_palette = pseudo_palette; - info->flags = FBINFO_FLAG_DEFAULT; + info->flags = FBINFO_DEFAULT | + FBINFO_HWACCEL_IMAGEBLIT | + FBINFO_HWACCEL_FILLRECT | + FBINFO_HWACCEL_COPYAREA | + FBINFO_HWACCEL_YPAN; #ifdef CONFIG_PMAC_BACKLIGHT if (M64_HAS(G3_PB_1_1) && machine_is_compatible("PowerBook1,1")) { @@ -2733,7 +2748,7 @@ aty_init_exit: } #ifdef CONFIG_ATARI -static int __init store_video_par(char *video_str, unsigned char m64_num) +static int __devinit store_video_par(char *video_str, unsigned char m64_num) { char *p; unsigned long vmembase, size, 
guiregbase; @@ -3764,7 +3779,7 @@ static struct pci_driver atyfb_driver = { #endif /* CONFIG_PCI */ #ifndef MODULE -static int __init atyfb_setup(char *options) +static int __devinit atyfb_setup(char *options) { char *this_opt; @@ -3836,7 +3851,7 @@ static int __init atyfb_setup(char *options) } #endif /* MODULE */ -static int __init atyfb_init(void) +static int __devinit atyfb_init(void) { #ifndef MODULE char *option = NULL; diff --git a/drivers/video/aty/mach64_accel.c b/drivers/video/aty/mach64_accel.c index c98f4a44213..1490e5e1c23 100644 --- a/drivers/video/aty/mach64_accel.c +++ b/drivers/video/aty/mach64_accel.c @@ -200,8 +200,6 @@ void atyfb_copyarea(struct fb_info *info, const struct fb_copyarea *area) if (!area->width || !area->height) return; if (!par->accel_flags) { - if (par->blitter_may_be_busy) - wait_for_idle(par); cfb_copyarea(info, area); return; } @@ -248,8 +246,6 @@ void atyfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect) if (!rect->width || !rect->height) return; if (!par->accel_flags) { - if (par->blitter_may_be_busy) - wait_for_idle(par); cfb_fillrect(info, rect); return; } @@ -288,14 +284,10 @@ void atyfb_imageblit(struct fb_info *info, const struct fb_image *image) return; if (!par->accel_flags || (image->depth != 1 && info->var.bits_per_pixel != image->depth)) { - if (par->blitter_may_be_busy) - wait_for_idle(par); - cfb_imageblit(info, image); return; } - wait_for_idle(par); pix_width = pix_width_save = aty_ld_le32(DP_PIX_WIDTH, par); host_cntl = aty_ld_le32(HOST_CNTL, par) | HOST_BYTE_ALIGN; @@ -425,8 +417,6 @@ void atyfb_imageblit(struct fb_info *info, const struct fb_image *image) } } - wait_for_idle(par); - /* restore pix_width */ wait_for_fifo(1, par); aty_st_le32(DP_PIX_WIDTH, pix_width_save, par); diff --git a/drivers/video/aty/mach64_cursor.c b/drivers/video/aty/mach64_cursor.c index ad8b7496f85..2a7f381c330 100644 --- a/drivers/video/aty/mach64_cursor.c +++ b/drivers/video/aty/mach64_cursor.c @@ -66,11 +66,6 @@ static const u8 cursor_bits_lookup[16] = { 0x01, 0x41, 0x11, 0x51, 0x05, 0x45, 0x15, 0x55 }; -static const u8 cursor_mask_lookup[16] = { - 0xaa, 0x2a, 0x8a, 0x0a, 0xa2, 0x22, 0x82, 0x02, - 0xa8, 0x28, 0x88, 0x08, 0xa0, 0x20, 0x80, 0x00 -}; - static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor) { struct atyfb_par *par = (struct atyfb_par *) info->par; @@ -130,13 +125,13 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor) fg_idx = cursor->image.fg_color; bg_idx = cursor->image.bg_color; - fg = (info->cmap.red[fg_idx] << 24) | - (info->cmap.green[fg_idx] << 16) | - (info->cmap.blue[fg_idx] << 8) | 15; + fg = ((info->cmap.red[fg_idx] & 0xff) << 24) | + ((info->cmap.green[fg_idx] & 0xff) << 16) | + ((info->cmap.blue[fg_idx] & 0xff) << 8) | 0xff; - bg = (info->cmap.red[bg_idx] << 24) | - (info->cmap.green[bg_idx] << 16) | - (info->cmap.blue[bg_idx] << 8); + bg = ((info->cmap.red[bg_idx] & 0xff) << 24) | + ((info->cmap.green[bg_idx] & 0xff) << 16) | + ((info->cmap.blue[bg_idx] & 0xff) << 8); wait_for_fifo(2, par); aty_st_le32(CUR_CLR0, bg, par); @@ -166,19 +161,17 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor) switch (cursor->rop) { case ROP_XOR: // Upper 4 bits of mask data - fb_writeb(cursor_mask_lookup[m >> 4 ] | - cursor_bits_lookup[(b ^ m) >> 4], dst++); + fb_writeb(cursor_bits_lookup[(b ^ m) >> 4], dst++); // Lower 4 bits of mask - fb_writeb(cursor_mask_lookup[m & 0x0f ] | - cursor_bits_lookup[(b ^ m) & 0x0f], dst++); + fb_writeb(cursor_bits_lookup[(b ^ m) & 
0x0f], + dst++); break; case ROP_COPY: // Upper 4 bits of mask data - fb_writeb(cursor_mask_lookup[m >> 4 ] | - cursor_bits_lookup[(b & m) >> 4], dst++); + fb_writeb(cursor_bits_lookup[(b & m) >> 4], dst++); // Lower 4 bits of mask - fb_writeb(cursor_mask_lookup[m & 0x0f ] | - cursor_bits_lookup[(b & m) & 0x0f], dst++); + fb_writeb(cursor_bits_lookup[(b & m) & 0x0f], + dst++); break; } } @@ -194,7 +187,7 @@ static int atyfb_cursor(struct fb_info *info, struct fb_cursor *cursor) return 0; } -int __init aty_init_cursor(struct fb_info *info) +int __devinit aty_init_cursor(struct fb_info *info) { unsigned long addr; diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c index c5ecbb02e01..68b15645b89 100644 --- a/drivers/video/aty/radeon_base.c +++ b/drivers/video/aty/radeon_base.c @@ -2379,7 +2379,6 @@ err_release_pci0: err_release_fb: framebuffer_release(info); err_disable: - pci_disable_device(pdev); err_out: return ret; } @@ -2436,7 +2435,6 @@ static void __devexit radeonfb_pci_unregister (struct pci_dev *pdev) #endif fb_dealloc_cmap(&info->cmap); framebuffer_release(info); - pci_disable_device(pdev); } diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c index 789450bb0bc..d63c3f48585 100644 --- a/drivers/video/au1100fb.c +++ b/drivers/video/au1100fb.c @@ -7,6 +7,8 @@ * Karl Lessard <klessard@sunrisetelecom.com> * <c.pellegrin@exadron.com> * + * PM support added by Rodolfo Giometti <giometti@linux.it> + * * Copyright 2002 MontaVista Software * Author: MontaVista Software, Inc. * ppopov@mvista.com or source@mvista.com @@ -602,17 +604,52 @@ int au1100fb_drv_remove(struct device *dev) return 0; } +#ifdef CONFIG_PM +static u32 sys_clksrc; +static struct au1100fb_regs fbregs; + int au1100fb_drv_suspend(struct device *dev, pm_message_t state) { - /* TODO */ + struct au1100fb_device *fbdev = dev_get_drvdata(dev); + + if (!fbdev) + return 0; + + /* Save the clock source state */ + sys_clksrc = au_readl(SYS_CLKSRC); + + /* Blank the LCD */ + au1100fb_fb_blank(VESA_POWERDOWN, &fbdev->info); + + /* Stop LCD clocking */ + au_writel(sys_clksrc & ~SYS_CS_ML_MASK, SYS_CLKSRC); + + memcpy(&fbregs, fbdev->regs, sizeof(struct au1100fb_regs)); + return 0; } int au1100fb_drv_resume(struct device *dev) { - /* TODO */ + struct au1100fb_device *fbdev = dev_get_drvdata(dev); + + if (!fbdev) + return 0; + + memcpy(fbdev->regs, &fbregs, sizeof(struct au1100fb_regs)); + + /* Restart LCD clocking */ + au_writel(sys_clksrc, SYS_CLKSRC); + + /* Unblank the LCD */ + au1100fb_fb_blank(VESA_NO_BLANKING, &fbdev->info); + return 0; } +#else +#define au1100fb_drv_suspend NULL +#define au1100fb_drv_resume NULL +#endif static struct device_driver au1100fb_driver = { .name = "au1100-lcd", diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index b895eaaa73f..022f9d3473f 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -10,7 +10,7 @@ menuconfig BACKLIGHT_LCD_SUPPORT config BACKLIGHT_CLASS_DEVICE tristate "Lowlevel Backlight controls" - depends on BACKLIGHT_LCD_SUPPORT + depends on BACKLIGHT_LCD_SUPPORT && FB default m help This framework adds support for low-level control of the LCD @@ -26,7 +26,7 @@ config BACKLIGHT_DEVICE config LCD_CLASS_DEVICE tristate "Lowlevel LCD controls" - depends on BACKLIGHT_LCD_SUPPORT + depends on BACKLIGHT_LCD_SUPPORT && FB default m help This framework adds support for low-level control of LCD. 
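/*
 * The au1100fb power-management hunk above follows a common
 * suspend/resume shape: save the volatile clock source and register
 * file, blank the panel and stop its clock, then replay the saved
 * state on resume.  What follows is a freestanding sketch of that
 * pattern, not the driver's code: the register layout, the accessor
 * signatures and the 0x380 clock mask are hypothetical stand-ins for
 * au_readl()/au_writel() and SYS_CS_ML_MASK.
 */
#include <stdint.h>
#include <string.h>

struct fb_regs { uint32_t ctrl; uint32_t clkcontrol; };  /* stand-in for struct au1100fb_regs */

static uint32_t saved_clksrc;      /* mirrors 'sys_clksrc' in the patch */
static struct fb_regs saved_regs;  /* mirrors 'fbregs' in the patch */

static int fb_suspend(uint32_t *clksrc, struct fb_regs *hw)
{
        saved_clksrc = *clksrc;            /* 1. save the clock source */
        /* 2. blank the LCD here, before cutting its clock */
        *clksrc = saved_clksrc & ~0x380u;  /* 3. stop LCD clocking (mask is hypothetical) */
        memcpy(&saved_regs, hw, sizeof(saved_regs));  /* 4. snapshot the register file */
        return 0;
}

static int fb_resume(uint32_t *clksrc, struct fb_regs *hw)
{
        memcpy(hw, &saved_regs, sizeof(saved_regs));  /* restore registers first */
        *clksrc = saved_clksrc;            /* restart LCD clocking */
        /* finally unblank the LCD */
        return 0;
}

int main(void)
{
        uint32_t clksrc = 0x3a0;
        struct fb_regs hw = { 0x1, 0x7 };
        fb_suspend(&clksrc, &hw);
        fb_resume(&clksrc, &hw);
        return (clksrc == 0x3a0 && hw.ctrl == 0x1) ? 0 : 1;
}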
@@ -50,6 +50,14 @@ config BACKLIGHT_CORGI If you have a Sharp Zaurus SL-C7xx, SL-Cxx00 or SL-6000x say y to enable the backlight driver. +config BACKLIGHT_LOCOMO + tristate "Sharp LOCOMO LCD/Backlight Driver" + depends on BACKLIGHT_DEVICE && SHARP_LOCOMO + default y + help + If you have a Sharp Zaurus SL-5500 (Collie) or SL-5600 (Poodle) say y to + enable the LCD/backlight driver. + config BACKLIGHT_HP680 tristate "HP Jornada 680 Backlight Driver" depends on BACKLIGHT_DEVICE && SH_HP6XX diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 744210c38e7..65e5553fc84 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_LCD_CLASS_DEVICE) += lcd.o obj-$(CONFIG_BACKLIGHT_CLASS_DEVICE) += backlight.o obj-$(CONFIG_BACKLIGHT_CORGI) += corgi_bl.o obj-$(CONFIG_BACKLIGHT_HP680) += hp680_bl.o -obj-$(CONFIG_SHARP_LOCOMO) += locomolcd.o +obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o diff --git a/drivers/video/backlight/locomolcd.c b/drivers/video/backlight/locomolcd.c index 60831bb2368..bd879b7ec11 100644 --- a/drivers/video/backlight/locomolcd.c +++ b/drivers/video/backlight/locomolcd.c @@ -17,6 +17,8 @@ #include <linux/delay.h> #include <linux/device.h> #include <linux/interrupt.h> +#include <linux/fb.h> +#include <linux/backlight.h> #include <asm/hardware/locomo.h> #include <asm/irq.h> @@ -25,7 +27,10 @@ #include "../../../arch/arm/mach-sa1100/generic.h" +static struct backlight_device *locomolcd_bl_device; static struct locomo_dev *locomolcd_dev; +static unsigned long locomolcd_flags; +#define LOCOMOLCD_SUSPENDED 0x01 static void locomolcd_on(int comadj) { @@ -89,12 +94,10 @@ void locomolcd_power(int on) } /* read comadj */ - if (comadj == -1) { - if (machine_is_poodle()) - comadj = 118; - if (machine_is_collie()) - comadj = 128; - } + if (comadj == -1 && machine_is_collie()) + comadj = 128; + if (comadj == -1 && machine_is_poodle()) + comadj = 118; if (on) locomolcd_on(comadj); @@ -105,26 +108,100 @@ void locomolcd_power(int on) } EXPORT_SYMBOL(locomolcd_power); -static int poodle_lcd_probe(struct locomo_dev *dev) + +static int current_intensity; + +static int locomolcd_set_intensity(struct backlight_device *bd) +{ + int intensity = bd->props->brightness; + + if (bd->props->power != FB_BLANK_UNBLANK) + intensity = 0; + if (bd->props->fb_blank != FB_BLANK_UNBLANK) + intensity = 0; + if (locomolcd_flags & LOCOMOLCD_SUSPENDED) + intensity = 0; + + switch (intensity) { + /* AC and non-AC are handled differently, but produce same results in sharp code? 
*/ + case 0: locomo_frontlight_set(locomolcd_dev, 0, 0, 161); break; + case 1: locomo_frontlight_set(locomolcd_dev, 117, 0, 161); break; + case 2: locomo_frontlight_set(locomolcd_dev, 163, 0, 148); break; + case 3: locomo_frontlight_set(locomolcd_dev, 194, 0, 161); break; + case 4: locomo_frontlight_set(locomolcd_dev, 194, 1, 161); break; + + default: + return -ENODEV; + } + current_intensity = intensity; + return 0; +} + +static int locomolcd_get_intensity(struct backlight_device *bd) +{ + return current_intensity; +} + +static struct backlight_properties locomobl_data = { + .owner = THIS_MODULE, + .get_brightness = locomolcd_get_intensity, + .update_status = locomolcd_set_intensity, + .max_brightness = 4, +}; + +#ifdef CONFIG_PM +static int locomolcd_suspend(struct locomo_dev *dev, pm_message_t state) +{ + locomolcd_flags |= LOCOMOLCD_SUSPENDED; + locomolcd_set_intensity(locomolcd_bl_device); + return 0; +} + +static int locomolcd_resume(struct locomo_dev *dev) +{ + locomolcd_flags &= ~LOCOMOLCD_SUSPENDED; + locomolcd_set_intensity(locomolcd_bl_device); + return 0; +} +#else +#define locomolcd_suspend NULL +#define locomolcd_resume NULL +#endif + +static int locomolcd_probe(struct locomo_dev *dev) { unsigned long flags; local_irq_save(flags); locomolcd_dev = dev; + locomo_gpio_set_dir(dev, LOCOMO_GPIO_FL_VR, 0); + /* the poodle_lcd_power function is called for the first time * from fs_initcall, which is before locomo is activated. * We need to recall poodle_lcd_power here*/ -#ifdef CONFIG_MACH_POODLE - locomolcd_power(1); -#endif + if (machine_is_poodle()) + locomolcd_power(1); + local_irq_restore(flags); + + locomolcd_bl_device = backlight_device_register("locomo-bl", NULL, &locomobl_data); + + if (IS_ERR (locomolcd_bl_device)) + return PTR_ERR (locomolcd_bl_device); + + /* Set up frontlight so that screen is readable */ + locomobl_data.brightness = 2; + locomolcd_set_intensity(locomolcd_bl_device); + return 0; } -static int poodle_lcd_remove(struct locomo_dev *dev) +static int locomolcd_remove(struct locomo_dev *dev) { unsigned long flags; + + backlight_device_unregister(locomolcd_bl_device); local_irq_save(flags); locomolcd_dev = NULL; local_irq_restore(flags); @@ -136,19 +213,33 @@ static struct locomo_driver poodle_lcd_driver = { .name = "locomo-backlight", }, .devid = LOCOMO_DEVID_BACKLIGHT, - .probe = poodle_lcd_probe, - .remove = poodle_lcd_remove, + .probe = locomolcd_probe, + .remove = locomolcd_remove, + .suspend = locomolcd_suspend, + .resume = locomolcd_resume, }; -static int __init poodle_lcd_init(void) + +static int __init locomolcd_init(void) { int ret = locomo_driver_register(&poodle_lcd_driver); - if (ret) return ret; + if (ret) + return ret; #ifdef CONFIG_SA1100_COLLIE sa1100fb_lcd_power = locomolcd_power; #endif return 0; } -device_initcall(poodle_lcd_init); +static void __exit locomolcd_exit(void) +{ + locomo_driver_unregister(&poodle_lcd_driver); +} + +module_init(locomolcd_init); +module_exit(locomolcd_exit); + +MODULE_AUTHOR("John Lenz <lenz@cs.wisc.edu>, Pavel Machek <pavel@suse.cz>"); +MODULE_DESCRIPTION("Collie LCD driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/video/cfbimgblt.c b/drivers/video/cfbimgblt.c index 8ba6152db2f..ad8a89bf8ea 100644 --- a/drivers/video/cfbimgblt.c +++ b/drivers/video/cfbimgblt.c @@ -230,6 +230,7 @@ static inline void fast_imageblit(const struct fb_image *image, struct fb_info * tab = cfb_tab16; break; case 32: + default: tab = cfb_tab32; break; } diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c index 
1103010af54..dda240eb736 100644 --- a/drivers/video/cirrusfb.c +++ b/drivers/video/cirrusfb.c @@ -2227,7 +2227,6 @@ static void cirrusfb_pci_unmap (struct cirrusfb_info *cinfo) release_region(0x3C0, 32); pci_release_regions(pdev); framebuffer_release(cinfo->info); - pci_disable_device(pdev); } #endif /* CONFIG_PCI */ @@ -2458,7 +2457,6 @@ err_release_regions: err_release_fb: framebuffer_release(info); err_disable: - pci_disable_device(pdev); err_out: return ret; } diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 47ba1a79adc..5dc4083552d 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -125,6 +125,8 @@ static int softback_lines; static int first_fb_vc; static int last_fb_vc = MAX_NR_CONSOLES - 1; static int fbcon_is_default = 1; +static int fbcon_has_exited; + /* font data */ static char fontname[40]; @@ -140,7 +142,6 @@ static const struct consw fb_con; #define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row) -static void fbcon_free_font(struct display *); static int fbcon_set_origin(struct vc_data *); #define CURSOR_DRAW_DELAY (1) @@ -194,6 +195,9 @@ static void fbcon_redraw_move(struct vc_data *vc, struct display *p, int line, int count, int dy); static void fbcon_modechanged(struct fb_info *info); static void fbcon_set_all_vcs(struct fb_info *info); +static void fbcon_start(void); +static void fbcon_exit(void); +static struct class_device *fbcon_class_device; #ifdef CONFIG_MAC /* @@ -252,7 +256,7 @@ static void fbcon_rotate_all(struct fb_info *info, u32 rotate) if (!ops || ops->currcon < 0 || rotate > 3) return; - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { vc = vc_cons[i].d; if (!vc || vc->vc_mode != KD_TEXT || registered_fb[con2fb_map[i]] != info) @@ -389,15 +393,18 @@ static void fb_flashcursor(void *private) int c; int mode; - if (ops->currcon != -1) + acquire_console_sem(); + if (ops && ops->currcon != -1) vc = vc_cons[ops->currcon].d; if (!vc || !CON_IS_VISIBLE(vc) || fbcon_is_inactive(vc, info) || registered_fb[con2fb_map[vc->vc_num]] != info || - vc_cons[ops->currcon].d->vc_deccm != 1) + vc_cons[ops->currcon].d->vc_deccm != 1) { + release_console_sem(); return; - acquire_console_sem(); + } + p = &fb_display[vc->vc_num]; c = scr_readw((u16 *) vc->vc_pos); mode = (!ops->cursor_flash || ops->cursor_state.enable) ? 
@@ -528,7 +535,7 @@ static int search_fb_in_map(int idx) { int i, retval = 0; - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map[i] == idx) retval = 1; } @@ -539,7 +546,7 @@ static int search_for_mapped_con(void) { int i, retval = 0; - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map[i] != -1) retval = 1; } @@ -561,6 +568,7 @@ static int fbcon_takeover(int show_logo) err = take_over_console(&fb_con, first_fb_vc, last_fb_vc, fbcon_is_default); + if (err) { for (i = first_fb_vc; i <= last_fb_vc; i++) { con2fb_map[i] = -1; @@ -795,8 +803,8 @@ static int set_con2fb_map(int unit, int newidx, int user) if (oldidx == newidx) return 0; - if (!info) - err = -EINVAL; + if (!info || fbcon_has_exited) + return -EINVAL; if (!err && !search_for_mapped_con()) { info_idx = newidx; @@ -832,6 +840,9 @@ static int set_con2fb_map(int unit, int newidx, int user) con2fb_init_display(vc, info, unit, show_logo); } + if (!search_fb_in_map(info_idx)) + info_idx = newidx; + release_console_sem(); return err; } @@ -1034,6 +1045,7 @@ static const char *fbcon_startup(void) #endif /* CONFIG_MAC */ fbcon_add_cursor_timer(info); + fbcon_has_exited = 0; return display_desc; } @@ -1061,17 +1073,36 @@ static void fbcon_init(struct vc_data *vc, int init) /* If we are not the first console on this fb, copy the font from that console */ - t = &fb_display[svc->vc_num]; - if (!vc->vc_font.data) { - vc->vc_font.data = (void *)(p->fontdata = t->fontdata); - vc->vc_font.width = (*default_mode)->vc_font.width; - vc->vc_font.height = (*default_mode)->vc_font.height; - p->userfont = t->userfont; - if (p->userfont) - REFCOUNT(p->fontdata)++; + t = &fb_display[fg_console]; + if (!p->fontdata) { + if (t->fontdata) { + struct vc_data *fvc = vc_cons[fg_console].d; + + vc->vc_font.data = (void *)(p->fontdata = + fvc->vc_font.data); + vc->vc_font.width = fvc->vc_font.width; + vc->vc_font.height = fvc->vc_font.height; + p->userfont = t->userfont; + + if (p->userfont) + REFCOUNT(p->fontdata)++; + } else { + const struct font_desc *font = NULL; + + if (!fontname[0] || !(font = find_font(fontname))) + font = get_default_font(info->var.xres, + info->var.yres); + vc->vc_font.width = font->width; + vc->vc_font.height = font->height; + vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.charcount = 256; /* FIXME Need to + support more fonts */ + } } + if (p->userfont) charcnt = FNTCHARCNT(p->fontdata); + vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1); vc->vc_complement_mask = vc->vc_can_do_color ? 
0x7700 : 0x0800; if (charcnt == 256) { @@ -1145,13 +1176,47 @@ static void fbcon_init(struct vc_data *vc, int init) ops->p = &fb_display[fg_console]; } +static void fbcon_free_font(struct display *p) +{ + if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) + kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); + p->fontdata = NULL; + p->userfont = 0; +} + static void fbcon_deinit(struct vc_data *vc) { struct display *p = &fb_display[vc->vc_num]; + struct fb_info *info; + struct fbcon_ops *ops; + int idx; - if (info_idx != -1) - return; fbcon_free_font(p); + idx = con2fb_map[vc->vc_num]; + + if (idx == -1) + goto finished; + + info = registered_fb[idx]; + + if (!info) + goto finished; + + ops = info->fbcon_par; + + if (!ops) + goto finished; + + if (CON_IS_VISIBLE(vc)) + fbcon_del_cursor_timer(info); + + ops->flags &= ~FBCON_FLAGS_INIT; +finished: + + if (!con_is_bound(&fb_con)) + fbcon_exit(); + + return; } /* ====================================================================== */ @@ -2099,12 +2164,11 @@ static int fbcon_switch(struct vc_data *vc) if (info->fbops->fb_set_par) info->fbops->fb_set_par(info); - if (old_info != info) { + if (old_info != info) fbcon_del_cursor_timer(old_info); - fbcon_add_cursor_timer(info); - } } + fbcon_add_cursor_timer(info); set_blitting_type(vc, info); ops->cursor_reset = 1; @@ -2222,14 +2286,6 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch) return 0; } -static void fbcon_free_font(struct display *p) -{ - if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) - kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); - p->fontdata = NULL; - p->userfont = 0; -} - static int fbcon_get_font(struct vc_data *vc, struct console_font *font) { u8 *fontdata = vc->vc_font.data; @@ -2443,7 +2499,7 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, unsigne FNTSUM(new_data) = csum; /* Check if the same font is on some other console already */ - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { struct vc_data *tmp = vc_cons[i].d; if (fb_display[i].userfont && @@ -2768,7 +2824,7 @@ static void fbcon_set_all_vcs(struct fb_info *info) if (!ops || ops->currcon < 0) return; - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { vc = vc_cons[i].d; if (!vc || vc->vc_mode != KD_TEXT || registered_fb[con2fb_map[i]] != info) @@ -2830,22 +2886,57 @@ static int fbcon_mode_deleted(struct fb_info *info, return found; } +static int fbcon_fb_unregistered(int idx) +{ + int i; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == idx) + con2fb_map[i] = -1; + } + + if (idx == info_idx) { + info_idx = -1; + + for (i = 0; i < FB_MAX; i++) { + if (registered_fb[i] != NULL) { + info_idx = i; + break; + } + } + } + + if (info_idx != -1) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == -1) + con2fb_map[i] = info_idx; + } + } + + if (!num_registered_fb) + unregister_con_driver(&fb_con); + + return 0; +} + static int fbcon_fb_registered(int idx) { int ret = 0, i; if (info_idx == -1) { - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map_boot[i] == idx) { info_idx = idx; break; } } + if (info_idx != -1) ret = fbcon_takeover(1); } else { - for (i = 0; i < MAX_NR_CONSOLES; i++) { - if (con2fb_map_boot[i] == idx) + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map_boot[i] == idx && + con2fb_map[i] == -1) set_con2fb_map(i, idx, 0); } } @@ -2882,7 
+2973,7 @@ static void fbcon_new_modelist(struct fb_info *info) struct fb_var_screeninfo var; struct fb_videomode *mode; - for (i = 0; i < MAX_NR_CONSOLES; i++) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { if (registered_fb[con2fb_map[i]] != info) continue; if (!fb_display[i].mode) @@ -2910,6 +3001,14 @@ static int fbcon_event_notify(struct notifier_block *self, struct fb_con2fbmap *con2fb; int ret = 0; + /* + * ignore all events except driver registration and deregistration + * if fbcon is not active + */ + if (fbcon_has_exited && !(action == FB_EVENT_FB_REGISTERED || + action == FB_EVENT_FB_UNREGISTERED)) + goto done; + switch(action) { case FB_EVENT_SUSPEND: fbcon_suspended(info); @@ -2930,6 +3029,9 @@ static int fbcon_event_notify(struct notifier_block *self, case FB_EVENT_FB_REGISTERED: ret = fbcon_fb_registered(info->node); break; + case FB_EVENT_FB_UNREGISTERED: + ret = fbcon_fb_unregistered(info->node); + break; case FB_EVENT_SET_CONSOLE_MAP: con2fb = event->data; ret = set_con2fb_map(con2fb->console - 1, @@ -2945,16 +3047,9 @@ static int fbcon_event_notify(struct notifier_block *self, case FB_EVENT_NEW_MODELIST: fbcon_new_modelist(info); break; - case FB_EVENT_SET_CON_ROTATE: - fbcon_rotate(info, *(int *)event->data); - break; - case FB_EVENT_GET_CON_ROTATE: - ret = fbcon_get_rotate(info); - break; - case FB_EVENT_SET_CON_ROTATE_ALL: - fbcon_rotate_all(info, *(int *)event->data); } +done: return ret; } @@ -2992,27 +3087,181 @@ static struct notifier_block fbcon_event_notifier = { .notifier_call = fbcon_event_notify, }; -static int __init fb_console_init(void) +static ssize_t store_rotate(struct class_device *class_device, + const char *buf, size_t count) { - int i; + struct fb_info *info; + int rotate, idx; + char **last = NULL; + + if (fbcon_has_exited) + return count; acquire_console_sem(); - fb_register_client(&fbcon_event_notifier); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + rotate = simple_strtoul(buf, last, 0); + fbcon_rotate(info, rotate); +err: release_console_sem(); + return count; +} - for (i = 0; i < MAX_NR_CONSOLES; i++) - con2fb_map[i] = -1; +static ssize_t store_rotate_all(struct class_device *class_device, + const char *buf, size_t count) +{ + struct fb_info *info; + int rotate, idx; + char **last = NULL; + + if (fbcon_has_exited) + return count; + + acquire_console_sem(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + info = registered_fb[idx]; + rotate = simple_strtoul(buf, last, 0); + fbcon_rotate_all(info, rotate); +err: + release_console_sem(); + return count; +} + +static ssize_t show_rotate(struct class_device *class_device, char *buf) +{ + struct fb_info *info; + int rotate = 0, idx; + + if (fbcon_has_exited) + return 0; + + acquire_console_sem(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + rotate = fbcon_get_rotate(info); +err: + release_console_sem(); + return snprintf(buf, PAGE_SIZE, "%d\n", rotate); +} + +static struct class_device_attribute class_device_attrs[] = { + __ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate), + __ATTR(rotate_all, S_IWUSR, NULL, store_rotate_all), +}; + +static int fbcon_init_class_device(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++) + class_device_create_file(fbcon_class_device, + &class_device_attrs[i]); + return 0; +} + +static void fbcon_start(void) +{ if (num_registered_fb) 
{ + int i; + + acquire_console_sem(); + for (i = 0; i < FB_MAX; i++) { if (registered_fb[i] != NULL) { info_idx = i; break; } } + + release_console_sem(); fbcon_takeover(0); } +} + +static void fbcon_exit(void) +{ + struct fb_info *info; + int i, j, mapped; + + if (fbcon_has_exited) + return; + +#ifdef CONFIG_ATARI + free_irq(IRQ_AUTO_4, fbcon_vbl_handler); +#endif +#ifdef CONFIG_MAC + if (MACH_IS_MAC && vbl_detected) + free_irq(IRQ_MAC_VBL, fbcon_vbl_handler); +#endif + + kfree((void *)softback_buf); + softback_buf = 0UL; + + for (i = 0; i < FB_MAX; i++) { + mapped = 0; + info = registered_fb[i]; + + if (info == NULL) + continue; + + for (j = first_fb_vc; j <= last_fb_vc; j++) { + if (con2fb_map[j] == i) + mapped = 1; + } + + if (mapped) { + if (info->fbops->fb_release) + info->fbops->fb_release(info, 0); + module_put(info->fbops->owner); + + if (info->fbcon_par) { + fbcon_del_cursor_timer(info); + kfree(info->fbcon_par); + info->fbcon_par = NULL; + } + if (info->queue.func == fb_flashcursor) + info->queue.func = NULL; + } + } + + fbcon_has_exited = 1; +} + +static int __init fb_console_init(void) +{ + int i; + + acquire_console_sem(); + fb_register_client(&fbcon_event_notifier); + fbcon_class_device = + class_device_create(fb_class, NULL, MKDEV(0, 0), NULL, "fbcon"); + + if (IS_ERR(fbcon_class_device)) { + printk(KERN_WARNING "Unable to create class_device " + "for fbcon; errno = %ld\n", + PTR_ERR(fbcon_class_device)); + fbcon_class_device = NULL; + } else + fbcon_init_class_device(); + + for (i = 0; i < MAX_NR_CONSOLES; i++) + con2fb_map[i] = -1; + + release_console_sem(); + fbcon_start(); return 0; } @@ -3020,12 +3269,24 @@ module_init(fb_console_init); #ifdef MODULE +static void __exit fbcon_deinit_class_device(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class_device_attrs); i++) + class_device_remove_file(fbcon_class_device, + &class_device_attrs[i]); +} + static void __exit fb_console_exit(void) { acquire_console_sem(); fb_unregister_client(&fbcon_event_notifier); + fbcon_deinit_class_device(); + class_device_destroy(fb_class, MKDEV(0, 0)); + fbcon_exit(); release_console_sem(); - give_up_console(&fb_con); + unregister_con_driver(&fb_con); } module_exit(fb_console_exit); diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h index c38c3d8e7a7..3487a636370 100644 --- a/drivers/video/console/fbcon.h +++ b/drivers/video/console/fbcon.h @@ -175,6 +175,7 @@ extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); #endif extern void fbcon_set_bitops(struct fbcon_ops *ops); extern int soft_cursor(struct fb_info *info, struct fb_cursor *cursor); +extern struct class *fb_class; #define FBCON_ATTRIBUTE_UNDERLINE 1 #define FBCON_ATTRIBUTE_REVERSE 2 diff --git a/drivers/video/console/mdacon.c b/drivers/video/console/mdacon.c index 7f939d066a5..c89f90edf8a 100644 --- a/drivers/video/console/mdacon.c +++ b/drivers/video/console/mdacon.c @@ -308,7 +308,7 @@ static void __init mda_initialize(void) outb_p(0x00, mda_gfx_port); } -static const char __init *mdacon_startup(void) +static const char *mdacon_startup(void) { mda_num_columns = 80; mda_num_lines = 25; diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index e99fe30e568..03041311711 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -51,6 +51,7 @@ static int topscan; static int xcurs_correction = 29; static int newport_xsize; static int newport_ysize; +static int newport_has_init; static int newport_set_def_font(int unit, 
struct console_font *op); @@ -283,6 +284,15 @@ static void newport_get_revisions(void) xcurs_correction = 21; } +static void newport_exit(void) +{ + int i; + + /* free memory used by user font */ + for (i = 0; i < MAX_NR_CONSOLES; i++) + newport_set_def_font(i, NULL); +} + /* Can't be __init, take_over_console may call it later */ static const char *newport_startup(void) { @@ -290,8 +300,10 @@ static const char *newport_startup(void) if (!sgi_gfxaddr) return NULL; - npregs = (struct newport_regs *) /* ioremap cannot fail */ - ioremap(sgi_gfxaddr, sizeof(struct newport_regs)); + + if (!npregs) + npregs = (struct newport_regs *)/* ioremap cannot fail */ + ioremap(sgi_gfxaddr, sizeof(struct newport_regs)); npregs->cset.config = NPORT_CFG_GD0; if (newport_wait(npregs)) @@ -307,11 +319,11 @@ static const char *newport_startup(void) newport_reset(); newport_get_revisions(); newport_get_screensize(); + newport_has_init = 1; return "SGI Newport"; out_unmap: - iounmap((void *)npregs); return NULL; } @@ -324,11 +336,10 @@ static void newport_init(struct vc_data *vc, int init) static void newport_deinit(struct vc_data *c) { - int i; - - /* free memory used by user font */ - for (i = 0; i < MAX_NR_CONSOLES; i++) - newport_set_def_font(i, NULL); + if (!con_is_bound(&newport_con) && newport_has_init) { + newport_exit(); + newport_has_init = 0; + } } static void newport_clear(struct vc_data *vc, int sy, int sx, int height, @@ -728,16 +739,23 @@ const struct consw newport_con = { #ifdef MODULE static int __init newport_console_init(void) { + + if (!sgi_gfxaddr) + return -ENODEV; + + if (!npregs) + npregs = (struct newport_regs *)/* ioremap cannot fail */ + ioremap(sgi_gfxaddr, sizeof(struct newport_regs)); + return take_over_console(&newport_con, 0, MAX_NR_CONSOLES - 1, 1); } +module_init(newport_console_init); static void __exit newport_console_exit(void) { give_up_console(&newport_con); iounmap((void *)npregs); } - -module_init(newport_console_init); module_exit(newport_console_exit); #endif diff --git a/drivers/video/console/promcon.c b/drivers/video/console/promcon.c index 04f42fcaac5..d6e6ad537f9 100644 --- a/drivers/video/console/promcon.c +++ b/drivers/video/console/promcon.c @@ -109,7 +109,7 @@ promcon_end(struct vc_data *conp, char *b) return b - p; } -const char __init *promcon_startup(void) +const char *promcon_startup(void) { const char *display_desc = "PROM"; int node; @@ -133,7 +133,7 @@ const char __init *promcon_startup(void) return display_desc; } -static void __init +static void promcon_init_unimap(struct vc_data *conp) { mm_segment_t old_fs = get_fs(); diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index fd5940f4127..45c4f227e56 100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -75,7 +75,7 @@ static inline void cursor_undrawn(void) cursor_drawn = 0; } -static const char *__init sticon_startup(void) +static const char *sticon_startup(void) { return "STI console"; } diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index e64d42e2449..f32b590730f 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -114,6 +114,7 @@ static int vga_512_chars; static int vga_video_font_height; static int vga_scan_lines; static unsigned int vga_rolled_over = 0; +static int vga_init_done; static int __init no_scroll(char *str) { @@ -190,7 +191,7 @@ static void vgacon_scrollback_init(int pitch) } } -static void __init vgacon_scrollback_startup(void) +static void vgacon_scrollback_startup(void) {
vgacon_scrollback = alloc_bootmem(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE * 1024); @@ -355,7 +356,7 @@ static int vgacon_scrolldelta(struct vc_data *c, int lines) } #endif /* CONFIG_VGACON_SOFT_SCROLLBACK */ -static const char __init *vgacon_startup(void) +static const char *vgacon_startup(void) { const char *display_desc = NULL; u16 saved1, saved2; @@ -523,7 +524,12 @@ static const char __init *vgacon_startup(void) vgacon_xres = ORIG_VIDEO_COLS * VGA_FONTWIDTH; vgacon_yres = vga_scan_lines; - vgacon_scrollback_startup(); + + if (!vga_init_done) { + vgacon_scrollback_startup(); + vga_init_done = 1; + } + return display_desc; } @@ -531,10 +537,20 @@ static void vgacon_init(struct vc_data *c, int init) { unsigned long p; - /* We cannot be loaded as a module, therefore init is always 1 */ + /* + * We cannot be loaded as a module, therefore init is always 1, + * but vgacon_init can be called more than once, and init will + * not be 1. + */ c->vc_can_do_color = vga_can_do_color; - c->vc_cols = vga_video_num_columns; - c->vc_rows = vga_video_num_lines; + + /* set dimensions manually if init != 0 since vc_resize() will fail */ + if (init) { + c->vc_cols = vga_video_num_columns; + c->vc_rows = vga_video_num_lines; + } else + vc_resize(c, vga_video_num_columns, vga_video_num_lines); + c->vc_scan_lines = vga_scan_lines; c->vc_font.height = vga_video_font_height; c->vc_complement_mask = 0x7700; diff --git a/drivers/video/epson1355fb.c b/drivers/video/epson1355fb.c index 082759447bf..f0a621ecc28 100644 --- a/drivers/video/epson1355fb.c +++ b/drivers/video/epson1355fb.c @@ -605,11 +605,6 @@ static void clearfb16(struct fb_info *info) fb_writeb(0, dst); } -static void epson1355fb_platform_release(struct device *device) -{ - dev_err(device, "This driver is broken, please bug the authors so they will fix it.\n"); -} - static int epson1355fb_remove(struct platform_device *dev) { struct fb_info *info = platform_get_drvdata(dev); @@ -733,13 +728,7 @@ static struct platform_driver epson1355fb_driver = { }, }; -static struct platform_device epson1355fb_device = { - .name = "epson1355fb", - .id = 0, - .dev = { - .release = epson1355fb_platform_release, - } -}; +static struct platform_device *epson1355fb_device; int __init epson1355fb_init(void) { @@ -749,11 +738,21 @@ int __init epson1355fb_init(void) return -ENODEV; ret = platform_driver_register(&epson1355fb_driver); + if (!ret) { - ret = platform_device_register(&epson1355fb_device); - if (ret) + epson1355fb_device = platform_device_alloc("epson1355fb", 0); + + if (epson1355fb_device) + ret = platform_device_add(epson1355fb_device); + else + ret = -ENOMEM; + + if (ret) { + platform_device_put(epson1355fb_device); platform_driver_unregister(&epson1355fb_driver); + } } + return ret; } @@ -762,7 +761,7 @@ module_init(epson1355fb_init); #ifdef MODULE static void __exit epson1355fb_exit(void) { - platform_device_unregister(&epson1355fb_device); + platform_device_unregister(epson1355fb_device); platform_driver_unregister(&epson1355fb_driver); } diff --git a/drivers/video/fbcvt.c b/drivers/video/fbcvt.c index ac90883dc3a..b5498999c4e 100644 --- a/drivers/video/fbcvt.c +++ b/drivers/video/fbcvt.c @@ -376,4 +376,3 @@ int fb_find_mode_cvt(struct fb_videomode *mode, int margins, int rb) return 0; } -EXPORT_SYMBOL(fb_find_mode_cvt); diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 372aa177682..31143afe7c9 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -34,7 +34,6 @@ #endif #include <linux/devfs_fs_kernel.h> #include <linux/err.h> -#include 
<linux/kernel.h> #include <linux/device.h> #include <linux/efi.h> @@ -162,7 +161,6 @@ char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size } #ifdef CONFIG_LOGO -#include <linux/linux_logo.h> static inline unsigned safe_shift(unsigned d, int n) { @@ -336,11 +334,11 @@ static void fb_rotate_logo_ud(const u8 *in, u8 *out, u32 width, u32 height) static void fb_rotate_logo_cw(const u8 *in, u8 *out, u32 width, u32 height) { - int i, j, w = width - 1; + int i, j, h = height - 1; for (i = 0; i < height; i++) for (j = 0; j < width; j++) - out[height * j + w - i] = *in++; + out[height * j + h - i] = *in++; } static void fb_rotate_logo_ccw(const u8 *in, u8 *out, u32 width, u32 height) @@ -358,24 +356,24 @@ static void fb_rotate_logo(struct fb_info *info, u8 *dst, u32 tmp; if (rotate == FB_ROTATE_UD) { - image->dx = info->var.xres - image->width; - image->dy = info->var.yres - image->height; fb_rotate_logo_ud(image->data, dst, image->width, image->height); + image->dx = info->var.xres - image->width; + image->dy = info->var.yres - image->height; } else if (rotate == FB_ROTATE_CW) { - tmp = image->width; - image->width = image->height; - image->height = tmp; - image->dx = info->var.xres - image->height; fb_rotate_logo_cw(image->data, dst, image->width, image->height); - } else if (rotate == FB_ROTATE_CCW) { tmp = image->width; image->width = image->height; image->height = tmp; - image->dy = info->var.yres - image->width; + image->dx = info->var.xres - image->width; + } else if (rotate == FB_ROTATE_CCW) { fb_rotate_logo_ccw(image->data, dst, image->width, image->height); + tmp = image->width; + image->width = image->height; + image->height = tmp; + image->dy = info->var.yres - image->height; } image->data = dst; @@ -435,7 +433,7 @@ int fb_prepare_logo(struct fb_info *info, int rotate) depth = info->var.green.length; } - if (info->fix.visual == FB_VISUAL_STATIC_PSEUDOCOLOR) { + if (info->fix.visual == FB_VISUAL_STATIC_PSEUDOCOLOR && depth > 4) { /* assume console colormap */ depth = 4; } @@ -1278,8 +1276,8 @@ static struct file_operations fb_fops = { #endif }; -static struct class *fb_class; - +struct class *fb_class; +EXPORT_SYMBOL(fb_class); /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure @@ -1355,6 +1353,7 @@ register_framebuffer(struct fb_info *fb_info) int unregister_framebuffer(struct fb_info *fb_info) { + struct fb_event event; int i; i = fb_info->node; @@ -1362,13 +1361,17 @@ unregister_framebuffer(struct fb_info *fb_info) return -EINVAL; devfs_remove("fb/%d", i); - if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) + if (fb_info->pixmap.addr && + (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) kfree(fb_info->pixmap.addr); fb_destroy_modelist(&fb_info->modelist); registered_fb[i]=NULL; num_registered_fb--; fb_cleanup_class_device(fb_info); class_device_destroy(fb_class, MKDEV(FB_MAJOR, i)); + event.info = fb_info; + blocking_notifier_call_chain(&fb_notifier_list, + FB_EVENT_FB_UNREGISTERED, &event); return 0; } @@ -1491,28 +1494,6 @@ int fb_new_modelist(struct fb_info *info) return err; } -/** - * fb_con_duit - user<->fbcon passthrough - * @info: struct fb_info - * @event: notification event to be passed to fbcon - * @data: private data - * - * DESCRIPTION - * This function is an fbcon-user event passing channel - * which bypasses fbdev. 
This is hopefully temporary - * until a user interface for fbcon is created - */ -int fb_con_duit(struct fb_info *info, int event, void *data) -{ - struct fb_event evnt; - - evnt.info = info; - evnt.data = data; - - return blocking_notifier_call_chain(&fb_notifier_list, event, &evnt); -} -EXPORT_SYMBOL(fb_con_duit); - static char *video_options[FB_MAX]; static int ofonly; @@ -1622,6 +1603,5 @@ EXPORT_SYMBOL(fb_set_suspend); EXPORT_SYMBOL(fb_register_client); EXPORT_SYMBOL(fb_unregister_client); EXPORT_SYMBOL(fb_get_options); -EXPORT_SYMBOL(fb_new_modelist); MODULE_LICENSE("GPL"); diff --git a/drivers/video/fbmon.c b/drivers/video/fbmon.c index 53beeb4a999..3ccfff715a5 100644 --- a/drivers/video/fbmon.c +++ b/drivers/video/fbmon.c @@ -29,9 +29,9 @@ #include <linux/tty.h> #include <linux/fb.h> #include <linux/module.h> +#include <linux/pci.h> #include <video/edid.h> #ifdef CONFIG_PPC_OF -#include <linux/pci.h> #include <asm/prom.h> #include <asm/pci-bridge.h> #endif @@ -605,6 +605,7 @@ static int fb_get_monitor_limits(unsigned char *edid, struct fb_monspecs *specs) block = edid + DETAILED_TIMING_DESCRIPTIONS_START; DPRINTK(" Monitor Operating Limits: "); + for (i = 0; i < 4; i++, block += DETAILED_TIMING_DESCRIPTION_SIZE) { if (edid_is_limits_block(block)) { specs->hfmin = H_MIN_RATE * 1000; @@ -618,11 +619,12 @@ static int fb_get_monitor_limits(unsigned char *edid, struct fb_monspecs *specs) break; } } - + /* estimate monitor limits based on modes supported */ if (retval) { - struct fb_videomode *modes; + struct fb_videomode *modes, *mode; int num_modes, i, hz, hscan, pixclock; + int vtotal, htotal; modes = fb_create_modedb(edid, &num_modes); if (!modes) { @@ -632,20 +634,38 @@ static int fb_get_monitor_limits(unsigned char *edid, struct fb_monspecs *specs) retval = 0; for (i = 0; i < num_modes; i++) { - hz = modes[i].refresh; + mode = &modes[i]; pixclock = PICOS2KHZ(modes[i].pixclock) * 1000; - hscan = (modes[i].yres * 105 * hz + 5000)/100; + htotal = mode->xres + mode->right_margin + mode->hsync_len + + mode->left_margin; + vtotal = mode->yres + mode->lower_margin + mode->vsync_len + + mode->upper_margin; + + if (mode->vmode & FB_VMODE_INTERLACED) + vtotal /= 2; + + if (mode->vmode & FB_VMODE_DOUBLE) + vtotal *= 2; + + hscan = (pixclock + htotal / 2) / htotal; + hscan = (hscan + 500) / 1000 * 1000; + hz = (hscan + vtotal / 2) / vtotal; if (specs->dclkmax == 0 || specs->dclkmax < pixclock) specs->dclkmax = pixclock; + if (specs->dclkmin == 0 || specs->dclkmin > pixclock) specs->dclkmin = pixclock; + if (specs->hfmax == 0 || specs->hfmax < hscan) specs->hfmax = hscan; + if (specs->hfmin == 0 || specs->hfmin > hscan) specs->hfmin = hscan; + if (specs->vfmax == 0 || specs->vfmax < hz) specs->vfmax = hz; + if (specs->vfmin == 0 || specs->vfmin > hz) specs->vfmin = hz; } @@ -1281,8 +1301,7 @@ int fb_validate_mode(const struct fb_var_screeninfo *var, struct fb_info *info) -EINVAL : 0; } -#if defined(CONFIG_FB_FIRMWARE_EDID) && defined(__i386__) -#include <linux/pci.h> +#if defined(CONFIG_FIRMWARE_EDID) && defined(CONFIG_X86) /* * We need to ensure that the EDID block is only returned for diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c index 3ceb8c1b392..4f78f234473 100644 --- a/drivers/video/fbsysfs.c +++ b/drivers/video/fbsysfs.c @@ -100,13 +100,22 @@ static int mode_string(char *buf, unsigned int offset, const struct fb_videomode *mode) { char m = 'U'; + char v = 'p'; + if (mode->flag & FB_MODE_IS_DETAILED) m = 'D'; if (mode->flag & FB_MODE_IS_VESA) m = 'V'; if (mode->flag & 
FB_MODE_IS_STANDARD) m = 'S'; - return snprintf(&buf[offset], PAGE_SIZE - offset, "%c:%dx%d-%d\n", m, mode->xres, mode->yres, mode->refresh); + + if (mode->vmode & FB_VMODE_INTERLACED) + v = 'i'; + if (mode->vmode & FB_VMODE_DOUBLE) + v = 'd'; + + return snprintf(&buf[offset], PAGE_SIZE - offset, "%c:%dx%d%c-%d\n", + m, mode->xres, mode->yres, v, mode->refresh); } static ssize_t store_mode(struct class_device *class_device, const char * buf, @@ -238,45 +247,6 @@ static ssize_t show_rotate(struct class_device *class_device, char *buf) return snprintf(buf, PAGE_SIZE, "%d\n", fb_info->var.rotate); } -static ssize_t store_con_rotate(struct class_device *class_device, - const char *buf, size_t count) -{ - struct fb_info *fb_info = class_get_devdata(class_device); - int rotate; - char **last = NULL; - - acquire_console_sem(); - rotate = simple_strtoul(buf, last, 0); - fb_con_duit(fb_info, FB_EVENT_SET_CON_ROTATE, &rotate); - release_console_sem(); - return count; -} - -static ssize_t store_con_rotate_all(struct class_device *class_device, - const char *buf, size_t count) -{ - struct fb_info *fb_info = class_get_devdata(class_device); - int rotate; - char **last = NULL; - - acquire_console_sem(); - rotate = simple_strtoul(buf, last, 0); - fb_con_duit(fb_info, FB_EVENT_SET_CON_ROTATE_ALL, &rotate); - release_console_sem(); - return count; -} - -static ssize_t show_con_rotate(struct class_device *class_device, char *buf) -{ - struct fb_info *fb_info = class_get_devdata(class_device); - int rotate; - - acquire_console_sem(); - rotate = fb_con_duit(fb_info, FB_EVENT_GET_CON_ROTATE, NULL); - release_console_sem(); - return snprintf(buf, PAGE_SIZE, "%d\n", rotate); -} - static ssize_t store_virtual(struct class_device *class_device, const char * buf, size_t count) { @@ -493,8 +463,6 @@ static struct class_device_attribute class_device_attrs[] = { __ATTR(name, S_IRUGO, show_name, NULL), __ATTR(stride, S_IRUGO, show_stride, NULL), __ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate), - __ATTR(con_rotate, S_IRUGO|S_IWUSR, show_con_rotate, store_con_rotate), - __ATTR(con_rotate_all, S_IWUSR, NULL, store_con_rotate_all), __ATTR(state, S_IRUGO|S_IWUSR, show_fbstate, store_fbstate), #ifdef CONFIG_FB_BACKLIGHT __ATTR(bl_curve, S_IRUGO|S_IWUSR, show_bl_curve, store_bl_curve), diff --git a/drivers/video/geode/gx1fb_core.c b/drivers/video/geode/gx1fb_core.c index 20e69156d72..4d3a8871d3d 100644 --- a/drivers/video/geode/gx1fb_core.c +++ b/drivers/video/geode/gx1fb_core.c @@ -376,8 +376,6 @@ static int __init gx1fb_probe(struct pci_dev *pdev, const struct pci_device_id * release_mem_region(gx1_gx_base() + 0x8300, 0x100); } - pci_disable_device(pdev); - if (info) framebuffer_release(info); return ret; @@ -399,7 +397,6 @@ static void gx1fb_remove(struct pci_dev *pdev) iounmap(par->dc_regs); release_mem_region(gx1_gx_base() + 0x8300, 0x100); - pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); framebuffer_release(info); diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c index 89c34b15f5d..5ef12a3dfa5 100644 --- a/drivers/video/geode/gxfb_core.c +++ b/drivers/video/geode/gxfb_core.c @@ -354,8 +354,6 @@ static int __init gxfb_probe(struct pci_dev *pdev, const struct pci_device_id *i pci_release_region(pdev, 2); } - pci_disable_device(pdev); - if (info) framebuffer_release(info); return ret; @@ -377,7 +375,6 @@ static void gxfb_remove(struct pci_dev *pdev) iounmap(par->dc_regs); pci_release_region(pdev, 2); - pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); 
framebuffer_release(info); diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c index 44aa2ffff97..a1f7d80f0ac 100644 --- a/drivers/video/i810/i810_main.c +++ b/drivers/video/i810/i810_main.c @@ -2110,9 +2110,6 @@ static void i810fb_release_resource(struct fb_info *info, if (par->res_flags & MMIO_REQ) release_mem_region(par->mmio_start_phys, MMIO_SIZE); - if (par->res_flags & PCI_DEVICE_ENABLED) - pci_disable_device(par->dev); - framebuffer_release(info); } diff --git a/drivers/video/imacfb.c b/drivers/video/imacfb.c new file mode 100644 index 00000000000..7b1c168c834 --- /dev/null +++ b/drivers/video/imacfb.c @@ -0,0 +1,345 @@ +/* + * framebuffer driver for Intel-based Macs + * + * (c) 2006 Edgar Hucek <gimli@dark-green.com> + * Original imac driver written by Gerd Knorr <kraxel@goldbach.in-berlin.de> + * + */ + +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fb.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/tty.h> + +#include <asm/io.h> + +#include <video/vga.h> + +typedef enum _MAC_TYPE { + M_I17, + M_I20, + M_MINI, + M_MACBOOK, + M_NEW +} MAC_TYPE; + +/* --------------------------------------------------------------------- */ + +static struct fb_var_screeninfo imacfb_defined __initdata = { + .activate = FB_ACTIVATE_NOW, + .height = -1, + .width = -1, + .right_margin = 32, + .upper_margin = 16, + .lower_margin = 4, + .vsync_len = 4, + .vmode = FB_VMODE_NONINTERLACED, +}; + +static struct fb_fix_screeninfo imacfb_fix __initdata = { + .id = "IMAC VGA", + .type = FB_TYPE_PACKED_PIXELS, + .accel = FB_ACCEL_NONE, + .visual = FB_VISUAL_TRUECOLOR, +}; + +static int inverse; +static int model = M_NEW; +static int manual_height; +static int manual_width; + +#define DEFAULT_FB_MEM 1024*1024*16 + +/* --------------------------------------------------------------------- */ + +static int imacfb_setcolreg(unsigned regno, unsigned red, unsigned green, + unsigned blue, unsigned transp, + struct fb_info *info) +{ + /* + * Set a single color register. The values supplied are + * already rounded down to the hardware's capabilities + * (according to the entries in the `var' structure). Return + * != 0 for invalid regno.
+ */ + + if (regno >= info->cmap.len) + return 1; + + if (regno < 16) { + red >>= 8; + green >>= 8; + blue >>= 8; + ((u32 *)(info->pseudo_palette))[regno] = + (red << info->var.red.offset) | + (green << info->var.green.offset) | + (blue << info->var.blue.offset); + } + return 0; +} + +static struct fb_ops imacfb_ops = { + .owner = THIS_MODULE, + .fb_setcolreg = imacfb_setcolreg, + .fb_fillrect = cfb_fillrect, + .fb_copyarea = cfb_copyarea, + .fb_imageblit = cfb_imageblit, +}; + +static int __init imacfb_setup(char *options) +{ + char *this_opt; + + if (!options || !*options) + return 0; + + while ((this_opt = strsep(&options, ",")) != NULL) { + if (!*this_opt) continue; + + if (!strcmp(this_opt, "inverse")) + inverse = 1; + else if (!strcmp(this_opt, "i17")) + model = M_I17; + else if (!strcmp(this_opt, "i20")) + model = M_I20; + else if (!strcmp(this_opt, "mini")) + model = M_MINI; + else if (!strcmp(this_opt, "macbook")) + model = M_MACBOOK; + else if (!strncmp(this_opt, "height:", 7)) + manual_height = simple_strtoul(this_opt+7, NULL, 0); + else if (!strncmp(this_opt, "width:", 6)) + manual_width = simple_strtoul(this_opt+6, NULL, 0); + } + return 0; +} + +static int __init imacfb_probe(struct platform_device *dev) +{ + struct fb_info *info; + int err; + unsigned int size_vmode; + unsigned int size_remap; + unsigned int size_total; + + screen_info.lfb_depth = 32; + screen_info.lfb_size = DEFAULT_FB_MEM / 0x10000; + screen_info.pages=1; + screen_info.blue_size = 8; + screen_info.blue_pos = 0; + screen_info.green_size = 8; + screen_info.green_pos = 8; + screen_info.red_size = 8; + screen_info.red_pos = 16; + screen_info.rsvd_size = 8; + screen_info.rsvd_pos = 24; + + switch (model) { + case M_I17: + screen_info.lfb_width = 1440; + screen_info.lfb_height = 900; + screen_info.lfb_linelength = 1472 * 4; + screen_info.lfb_base = 0x80010000; + break; + case M_NEW: + case M_I20: + screen_info.lfb_width = 1680; + screen_info.lfb_height = 1050; + screen_info.lfb_linelength = 1728 * 4; + screen_info.lfb_base = 0x80010000; + break; + case M_MINI: + screen_info.lfb_width = 1024; + screen_info.lfb_height = 768; + screen_info.lfb_linelength = 2048 * 4; + screen_info.lfb_base = 0x80000000; + break; + case M_MACBOOK: + screen_info.lfb_width = 1280; + screen_info.lfb_height = 800; + screen_info.lfb_linelength = 2048 * 4; + screen_info.lfb_base = 0x80000000; + break; + } + + /* if the user wants to manually specify height/width, + we will override the defaults */ + /* TODO: eventually get auto-detection working */ + if (manual_height > 0) + screen_info.lfb_height = manual_height; + if (manual_width > 0) + screen_info.lfb_width = manual_width; + + imacfb_fix.smem_start = screen_info.lfb_base; + imacfb_defined.bits_per_pixel = screen_info.lfb_depth; + imacfb_defined.xres = screen_info.lfb_width; + imacfb_defined.yres = screen_info.lfb_height; + imacfb_fix.line_length = screen_info.lfb_linelength; + + /* size_vmode -- that is the amount of memory needed for the + * used video mode, i.e. the minimum amount of + * memory we need. */ + size_vmode = imacfb_defined.yres * imacfb_fix.line_length; + + /* size_total -- all video memory we have. Used for + * entries, resource allocation and bounds + * checking. */ + size_total = screen_info.lfb_size * 65536; + if (size_total < size_vmode) + size_total = size_vmode; + + /* size_remap -- the amount of video memory we are going to + * use for imacfb. With modern cards it is not an + * option to simply use size_total as that + * wastes plenty of kernel address space.
*/ + size_remap = size_vmode * 2; + if (size_remap < size_vmode) + size_remap = size_vmode; + if (size_remap > size_total) + size_remap = size_total; + imacfb_fix.smem_len = size_remap; + +#ifndef __i386__ + screen_info.imacpm_seg = 0; +#endif + + if (!request_mem_region(imacfb_fix.smem_start, size_total, "imacfb")) { + printk(KERN_WARNING + "imacfb: cannot reserve video memory at 0x%lx\n", + imacfb_fix.smem_start); + /* We cannot make this fatal. Sometimes this comes from magic + spaces our resource handlers simply don't know about */ + } + + info = framebuffer_alloc(sizeof(u32) * 16, &dev->dev); + if (!info) { + err = -ENOMEM; + goto err_release_mem; + } + info->pseudo_palette = info->par; + info->par = NULL; + + info->screen_base = ioremap(imacfb_fix.smem_start, imacfb_fix.smem_len); + if (!info->screen_base) { + printk(KERN_ERR "imacfb: abort, cannot ioremap video memory " + "0x%x @ 0x%lx\n", + imacfb_fix.smem_len, imacfb_fix.smem_start); + err = -EIO; + goto err_unmap; + } + + printk(KERN_INFO "imacfb: framebuffer at 0x%lx, mapped to 0x%p, " + "using %dk, total %dk\n", + imacfb_fix.smem_start, info->screen_base, + size_remap/1024, size_total/1024); + printk(KERN_INFO "imacfb: mode is %dx%dx%d, linelength=%d, pages=%d\n", + imacfb_defined.xres, imacfb_defined.yres, + imacfb_defined.bits_per_pixel, imacfb_fix.line_length, + screen_info.pages); + + imacfb_defined.xres_virtual = imacfb_defined.xres; + imacfb_defined.yres_virtual = imacfb_fix.smem_len / + imacfb_fix.line_length; + printk(KERN_INFO "imacfb: scrolling: redraw\n"); + imacfb_defined.yres_virtual = imacfb_defined.yres; + + /* some dummy values for timing to make fbset happy */ + imacfb_defined.pixclock = 10000000 / imacfb_defined.xres * + 1000 / imacfb_defined.yres; + imacfb_defined.left_margin = (imacfb_defined.xres / 8) & 0xf8; + imacfb_defined.hsync_len = (imacfb_defined.xres / 8) & 0xf8; + + imacfb_defined.red.offset = screen_info.red_pos; + imacfb_defined.red.length = screen_info.red_size; + imacfb_defined.green.offset = screen_info.green_pos; + imacfb_defined.green.length = screen_info.green_size; + imacfb_defined.blue.offset = screen_info.blue_pos; + imacfb_defined.blue.length = screen_info.blue_size; + imacfb_defined.transp.offset = screen_info.rsvd_pos; + imacfb_defined.transp.length = screen_info.rsvd_size; + + printk(KERN_INFO "imacfb: %s: " + "size=%d:%d:%d:%d, shift=%d:%d:%d:%d\n", + "Truecolor", + screen_info.rsvd_size, + screen_info.red_size, + screen_info.green_size, + screen_info.blue_size, + screen_info.rsvd_pos, + screen_info.red_pos, + screen_info.green_pos, + screen_info.blue_pos); + + imacfb_fix.ypanstep = 0; + imacfb_fix.ywrapstep = 0; + + /* request failure does not faze us, as vgacon probably has this + * region already (FIXME) */ + request_region(0x3c0, 32, "imacfb"); + + info->fbops = &imacfb_ops; + info->var = imacfb_defined; + info->fix = imacfb_fix; + info->flags = FBINFO_FLAG_DEFAULT; + + if (fb_alloc_cmap(&info->cmap, 256, 0) < 0) { + err = -ENOMEM; + goto err_unmap; + } + if (register_framebuffer(info)<0) { + err = -EINVAL; + goto err_fb_dealoc; + } + printk(KERN_INFO "fb%d: %s frame buffer device\n", + info->node, info->fix.id); + return 0; + +err_fb_dealoc: + fb_dealloc_cmap(&info->cmap); +err_unmap: + iounmap(info->screen_base); + framebuffer_release(info); +err_release_mem: + release_mem_region(imacfb_fix.smem_start, size_total); + return err; +} + +static struct platform_driver imacfb_driver = { + .probe = imacfb_probe, + .driver = { + .name = "imacfb", + }, +}; + +static struct 
platform_device imacfb_device = { + .name = "imacfb", +}; + +static int __init imacfb_init(void) +{ + int ret; + char *option = NULL; + + /* ignore error return of fb_get_options */ + fb_get_options("imacfb", &option); + imacfb_setup(option); + ret = platform_driver_register(&imacfb_driver); + + if (!ret) { + ret = platform_device_register(&imacfb_device); + if (ret) + platform_driver_unregister(&imacfb_driver); + } + return ret; +} +module_init(imacfb_init); + +MODULE_LICENSE("GPL"); diff --git a/drivers/video/macmodes.c b/drivers/video/macmodes.c index c0385c6f7db..d21321ca7c3 100644 --- a/drivers/video/macmodes.c +++ b/drivers/video/macmodes.c @@ -327,7 +327,6 @@ int mac_var_to_vmode(const struct fb_var_screeninfo *var, int *vmode, } return -EINVAL; } -EXPORT_SYMBOL(mac_var_to_vmode); /** * mac_map_monitor_sense - Convert monitor sense to vmode @@ -371,8 +370,9 @@ EXPORT_SYMBOL(mac_map_monitor_sense); * */ -int __init mac_find_mode(struct fb_var_screeninfo *var, struct fb_info *info, - const char *mode_option, unsigned int default_bpp) +int __devinit mac_find_mode(struct fb_var_screeninfo *var, + struct fb_info *info, const char *mode_option, + unsigned int default_bpp) { const struct fb_videomode *db = NULL; unsigned int dbsize = 0; diff --git a/drivers/video/macmodes.h b/drivers/video/macmodes.h index 232f5a09a49..babeb81f467 100644 --- a/drivers/video/macmodes.h +++ b/drivers/video/macmodes.h @@ -55,9 +55,10 @@ extern int mac_vmode_to_var(int vmode, int cmode, extern int mac_var_to_vmode(const struct fb_var_screeninfo *var, int *vmode, int *cmode); extern int mac_map_monitor_sense(int sense); -extern int __init mac_find_mode(struct fb_var_screeninfo *var, - struct fb_info *info, const char *mode_option, - unsigned int default_bpp); +extern int __devinit mac_find_mode(struct fb_var_screeninfo *var, + struct fb_info *info, + const char *mode_option, + unsigned int default_bpp); /* diff --git a/drivers/video/modedb.c b/drivers/video/modedb.c index 26a1c618a20..ff5454601e2 100644 --- a/drivers/video/modedb.c +++ b/drivers/video/modedb.c @@ -259,6 +259,10 @@ static const struct fb_videomode modedb[] = { /* 1152x768, 60 Hz, PowerBook G4 Titanium I and II */ NULL, 60, 1152, 768, 15386, 158, 26, 29, 3, 136, 6, FB_SYNC_HOR_HIGH_ACT|FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED + }, { + /* 1366x768, 60 Hz, 47.403 kHz hsync, WXGA 16:9 aspect ratio */ + NULL, 60, 1366, 768, 13806, 120, 10, 14, 3, 32, 5, + 0, FB_VMODE_NONINTERLACED }, }; @@ -787,8 +791,9 @@ struct fb_videomode *fb_find_best_mode(struct fb_var_screeninfo *var, if (diff > d) { diff = d; best = mode; - } else if (diff == d && mode->refresh > best->refresh) - best = mode; + } else if (diff == d && best && + mode->refresh > best->refresh) + best = mode; } } return best; @@ -1016,8 +1021,6 @@ EXPORT_SYMBOL(fb_videomode_to_var); EXPORT_SYMBOL(fb_var_to_videomode); EXPORT_SYMBOL(fb_mode_is_equal); EXPORT_SYMBOL(fb_add_videomode); -EXPORT_SYMBOL(fb_delete_videomode); -EXPORT_SYMBOL(fb_destroy_modelist); EXPORT_SYMBOL(fb_match_mode); EXPORT_SYMBOL(fb_find_best_mode); EXPORT_SYMBOL(fb_find_nearest_mode); diff --git a/drivers/video/neofb.c b/drivers/video/neofb.c index 24b12f71d5a..2f156b724d1 100644 --- a/drivers/video/neofb.c +++ b/drivers/video/neofb.c @@ -1333,17 +1333,22 @@ static int neofb_blank(int blank_mode, struct fb_info *info) * run "setterm -powersave powerdown" to take advantage */ struct neofb_par *par = info->par; - int seqflags, lcdflags, dpmsflags, reg; - + int seqflags, lcdflags, dpmsflags, reg, tmpdisp; /* - * Reload the 
value stored in the register, if sensible. It might have - * been changed via FN keystroke. + * Read back the register bits related to display configuration. They might + * have been changed underneath the driver via an Fn keystroke. + */ + neoUnlock(); + tmpdisp = vga_rgfx(NULL, 0x20) & 0x03; + neoLock(&par->state); + + /* In case we blank the screen, we want to store the possibly new + * configuration in the driver. During un-blank, we re-apply this setting, + * since the LCD bit will be cleared in order to switch off the backlight. */ if (par->PanelDispCntlRegRead) { - neoUnlock(); - par->PanelDispCntlReg1 = vga_rgfx(NULL, 0x20) & 0x03; - neoLock(&par->state); + par->PanelDispCntlReg1 = tmpdisp; } par->PanelDispCntlRegRead = !blank_mode; @@ -1378,12 +1383,21 @@ static int neofb_blank(int blank_mode, struct fb_info *info) break; case FB_BLANK_NORMAL: /* just blank screen (backlight stays on) */ seqflags = VGA_SR01_SCREEN_OFF; /* Disable sequencer */ - lcdflags = par->PanelDispCntlReg1 & 0x02; /* LCD normal */ + /* + * During a blank operation with the lid shut, we might store "LCD off" + * by mistake. Due to timing issues, the BIOS may switch the lights + * back on, and we turn it back off once we "unblank". + * + * So here is an attempt to implement ">=" - if we are in the process + * of unblanking, and the LCD bit is unset in the driver but set in the + * register, we must keep it. + */ + lcdflags = ((par->PanelDispCntlReg1 | tmpdisp) & 0x02); /* LCD normal */ dpmsflags = 0x00; /* no hsync/vsync suppression */ break; case FB_BLANK_UNBLANK: /* unblank */ seqflags = 0; /* Enable sequencer */ - lcdflags = par->PanelDispCntlReg1 & 0x02; /* LCD normal */ + lcdflags = ((par->PanelDispCntlReg1 | tmpdisp) & 0x02); /* LCD normal */ dpmsflags = 0x00; /* no hsync/vsync suppression */ #ifdef CONFIG_TOSHIBA /* Do we still need this ?
*/ diff --git a/drivers/video/nvidia/nv_hw.c b/drivers/video/nvidia/nv_hw.c index 99c3a8e6a23..9ed640d3572 100644 --- a/drivers/video/nvidia/nv_hw.c +++ b/drivers/video/nvidia/nv_hw.c @@ -886,7 +886,10 @@ void NVCalcStateExt(struct nvidia_par *par, case NV_ARCH_20: case NV_ARCH_30: default: - if (((par->Chipset & 0xffff) == 0x01A0) || + if ((par->Chipset & 0xfff0) == 0x0240) { + state->arbitration0 = 256; + state->arbitration1 = 0x0480; + } else if (((par->Chipset & 0xffff) == 0x01A0) || ((par->Chipset & 0xffff) == 0x01f0)) { nForceUpdateArbitrationSettings(VClk, pixelDepth * 8, @@ -1235,6 +1238,7 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state) break; case 0x0160: case 0x01D0: + case 0x0240: NV_WR32(par->PMC, 0x1700, NV_RD32(par->PFB, 0x020C)); NV_WR32(par->PMC, 0x1704, 0); @@ -1359,7 +1363,9 @@ void NVLoadStateExt(struct nvidia_par *par, RIVA_HW_STATE * state) if(((par->Chipset & 0xfff0) != 0x0160) && ((par->Chipset & 0xfff0) - != 0x0220)) + != 0x0220) && + ((par->Chipset & 0xfff0) + != 0x240)) NV_WR32(par->PGRAPH, 0x6900 + i*4, NV_RD32(par->PFB, diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c index 03a7c1e9ce3..7b5cffb2785 100644 --- a/drivers/video/nvidia/nvidia.c +++ b/drivers/video/nvidia/nvidia.c @@ -67,359 +67,10 @@ #define MAX_CURS 32 static struct pci_device_id nvidiafb_pci_tbl[] = { - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_TNT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_TNT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_UTNT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_TNT_UNKNOWN, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_VTNT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_UVTNT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_ITNT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_SDR, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_DDR, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_MX, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_MX2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO2_MXR, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_GTS, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_GTS2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE2_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO2_PRO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_460, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440_SE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_440_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - 
{PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_420_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_460_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_420_GO_M32, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_500XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_440_GO_M64, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_550XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_500_GOGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_410_GO_M16, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440_8X, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_440SE_8X, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_420_8X, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_4000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_448_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_488_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_580_XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_MX_MAC, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_280_NVS, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_380_XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_IGEFORCE2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE3, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE3_1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE3_2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_DDC, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4600, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4400, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_900XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_750XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_700XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800_8X, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_TI_4800SE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE4_4200_GO, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_980_XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_780_XGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 
0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO4_700_GOGL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5800_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5800, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_2000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5600_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5600, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5600SE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5600, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5650, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_GO700, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5200_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5200_1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5200SE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5250, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5250_32, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO_5200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_NVS_280_PCI, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_500, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5300, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5100, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5900_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5900, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5900XT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5950_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_3000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5700_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5700, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5700LE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5700VE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5700_1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5700_2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_GO1000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - 
{PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5500, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5100, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_700, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_5900ZT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800_LE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800_GT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_4000, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6600_GT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6600, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6610_XL, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_QUADRO_FX_540, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_ALT1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT2, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6200_ALT1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_GT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_QUADRO_NVS280, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0252, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0313, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0316, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0317, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x031D, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x031E, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x031F, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0329, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x032F, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0345, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0349, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x034B, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x034F, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x00c0, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_GEFORCE_6800A, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_GEFORCE_6800A_LE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_GEFORCE_GO_6800, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_GEFORCE_GO_6800_ULTRA, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_QUADRO_FX_GO1400, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x00cd, - 
PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_QUADRO_FX_1400, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0142, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0143, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0144, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0145, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0146, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0147, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0148, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0149, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x014b, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x14c, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x014d, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0160, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6200_TURBOCACHE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0162, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0163, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_6200, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0165, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_6250, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_6200_1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_6250_1, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0169, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x016b, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x016c, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x016d, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x016e, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0210, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800B, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800B_LE, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_6800B_GT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_7800_GT, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_7800_GTX, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_7800, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_GEFORCE_GO_7800_GTX, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x021d, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x021e, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0220, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0221, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0222, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {PCI_VENDOR_ID_NVIDIA, 0x0228, - PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {0,} /* terminate list */ + {PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, + PCI_BASE_CLASS_DISPLAY << 16, 0xff0000, 0}, + { 0, } }; - MODULE_DEVICE_TABLE(pci, nvidiafb_pci_tbl); /* command line data, set in nvidiafb_setup() */ @@ -1465,10 +1116,10 @@ static u32 __devinit nvidia_get_chipset(struct fb_info *info) struct nvidia_par *par = 
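The table rewrite that ends here replaces the long per-device list with a single catch-all entry: any NVIDIA device whose PCI class code carries PCI_BASE_CLASS_DISPLAY in its top byte now matches, the 0xff0000 class mask selecting exactly that byte. A sketch of how such an entry is matched, assuming the usual pci_device_id field order (vendor, device, subvendor, subdevice, class, class_mask, driver_data); the struct and helper below are illustrative stand-ins, not the kernel's own:

    #include <stdio.h>

    #define ANY_ID (~0u)                /* stands in for PCI_ANY_ID */
    #define BASE_CLASS_DISPLAY 0x03     /* PCI_BASE_CLASS_DISPLAY */

    struct id_entry {
            unsigned int vendor, device;
            unsigned int subvendor, subdevice;
            unsigned int class, class_mask;
    };

    static const struct id_entry tbl[] = {
            /* every NVIDIA display-class device, like the new entry */
            { 0x10de, ANY_ID, ANY_ID, ANY_ID,
              BASE_CLASS_DISPLAY << 16, 0xff0000 },
            { 0 },                      /* terminator */
    };

    /* The PCI core matches roughly like this: class bits are compared
     * only where class_mask has bits set. */
    static int matches(const struct id_entry *id,
                       unsigned int vendor, unsigned int dev_class)
    {
            return (id->vendor == ANY_ID || id->vendor == vendor) &&
                   (dev_class & id->class_mask) == id->class;
    }

    int main(void)
    {
            /* 0x030000 is the VGA display-controller class code */
            printf("%d\n", matches(&tbl[0], 0x10de, 0x030000));  /* 1 */
            printf("%d\n", matches(&tbl[0], 0x10de, 0x020000));  /* 0 */
            return 0;
    }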
info->par; u32 id = (par->pci_dev->vendor << 16) | par->pci_dev->device; - printk("nvidiafb: PCI id - %x\n", id); + printk(KERN_INFO PFX "Device ID: %x \n", id); + if ((id & 0xfff0) == 0x00f0) { /* pci-e */ - printk("nvidiafb: PCI-E card\n"); id = NV_RD32(par->REGS, 0x1800); if ((id & 0x0000ffff) == 0x000010DE) @@ -1476,9 +1127,9 @@ static u32 __devinit nvidia_get_chipset(struct fb_info *info) else if ((id & 0xffff0000) == 0xDE100000) /* wrong endian */ id = 0x10DE0000 | ((id << 8) & 0x0000ff00) | ((id >> 8) & 0x000000ff); + printk(KERN_INFO PFX "Subsystem ID: %x \n", id); } - printk("nvidiafb: Actual id - %x\n", id); return id; } @@ -1520,6 +1171,7 @@ static u32 __devinit nvidia_get_arch(struct fb_info *info) case 0x0210: case 0x0220: case 0x0230: + case 0x0240: case 0x0290: case 0x0390: arch = NV_ARCH_40; @@ -1567,7 +1219,7 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd, if (pci_request_regions(pd, "nvidiafb")) { printk(KERN_ERR PFX "cannot request PCI regions\n"); - goto err_out_request; + goto err_out_enable; } par->FlatPanel = flatpanel; @@ -1596,7 +1248,6 @@ static int __devinit nvidiafb_probe(struct pci_dev *pd, } par->Chipset = nvidia_get_chipset(info); - printk(KERN_INFO PFX "nVidia device/chipset %X\n", par->Chipset); par->Architecture = nvidia_get_arch(info); if (par->Architecture == 0) { @@ -1687,10 +1338,8 @@ err_out_free_base1: nvidia_delete_i2c_busses(par); err_out_arch: iounmap(par->REGS); -err_out_free_base0: + err_out_free_base0: pci_release_regions(pd); -err_out_request: - pci_disable_device(pd); err_out_enable: kfree(info->pixmap.addr); err_out_kfree: @@ -1720,7 +1369,6 @@ static void __exit nvidiafb_remove(struct pci_dev *pd) nvidia_delete_i2c_busses(par); iounmap(par->REGS); pci_release_regions(pd); - pci_disable_device(pd); kfree(info->pixmap.addr); framebuffer_release(info); pci_set_drvdata(pd, NULL); diff --git a/drivers/video/riva/fbdev.c b/drivers/video/riva/fbdev.c index d4384ab1df6..12af58c5cf1 100644 --- a/drivers/video/riva/fbdev.c +++ b/drivers/video/riva/fbdev.c @@ -2152,7 +2152,6 @@ err_iounmap_ctrl_base: err_release_region: pci_release_regions(pd); err_disable_device: - pci_disable_device(pd); err_free_pixmap: kfree(info->pixmap.addr); err_framebuffer_release: @@ -2187,7 +2186,6 @@ static void __exit rivafb_remove(struct pci_dev *pd) if (par->riva.Architecture == NV_ARCH_03) iounmap(par->riva.PRAMIN); pci_release_regions(pd); - pci_disable_device(pd); kfree(info->pixmap.addr); framebuffer_release(info); pci_set_drvdata(pd, NULL); diff --git a/drivers/video/s3c2410fb.c b/drivers/video/s3c2410fb.c index 9451932fbaf..fbc41185068 100644 --- a/drivers/video/s3c2410fb.c +++ b/drivers/video/s3c2410fb.c @@ -641,6 +641,7 @@ static int __init s3c2410fb_probe(struct platform_device *pdev) int ret; int irq; int i; + u32 lcdcon1; mach_info = pdev->dev.platform_data; if (mach_info == NULL) { @@ -672,6 +673,11 @@ static int __init s3c2410fb_probe(struct platform_device *pdev) memcpy(&info->regs, &mach_info->regs, sizeof(info->regs)); + /* Stop the video and unset ENVID if set */ + info->regs.lcdcon1 &= ~S3C2410_LCDCON1_ENVID; + lcdcon1 = readl(S3C2410_LCDCON1); + writel(lcdcon1 & ~S3C2410_LCDCON1_ENVID, S3C2410_LCDCON1); + info->mach_info = pdev->dev.platform_data; fbinfo->fix.type = FB_TYPE_PACKED_PIXELS; @@ -794,15 +800,14 @@ dealloc_fb: * shutdown the lcd controller */ -static void s3c2410fb_stop_lcd(void) +static void s3c2410fb_stop_lcd(struct s3c2410fb_info *fbi) { unsigned long flags; - unsigned long tmp; local_irq_save(flags); - tmp = 
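Both s3c2410fb hunks in this stretch make the same point: before reprogramming the controller, clear the ENVID (video enable) bit with a read-modify-write so the remaining LCDCON1 fields survive, and keep the driver's cached copy of the register in step with the hardware. A self-contained sketch of that pattern; the fake register and accessors stand in for readl()/writel() on S3C2410_LCDCON1, and the ENVID bit position is assumed:

    #include <stdint.h>
    #include <stdio.h>

    #define ENVID (1u << 0)              /* hypothetical enable bit */

    static uint32_t lcdcon1_hw = 0x1f7;  /* fake hardware register */

    static uint32_t reg_read(void)       { return lcdcon1_hw; }
    static void reg_write(uint32_t v)    { lcdcon1_hw = v; }

    /* Clear only ENVID, and mirror the change into the cached copy,
     * as the patch does with info->regs.lcdcon1 in probe and in
     * s3c2410fb_stop_lcd(). */
    static void stop_lcd(uint32_t *cached)
    {
            *cached &= ~ENVID;
            reg_write(reg_read() & ~ENVID);
    }

    int main(void)
    {
            uint32_t cached = lcdcon1_hw;

            stop_lcd(&cached);
            printf("%#x %#x\n", cached, lcdcon1_hw);  /* both 0x1f6 */
            return 0;
    }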
readl(S3C2410_LCDCON1); - writel(tmp & ~S3C2410_LCDCON1_ENVID, S3C2410_LCDCON1); + fbi->regs.lcdcon1 &= ~S3C2410_LCDCON1_ENVID; + writel(fbi->regs.lcdcon1, S3C2410_LCDCON1); local_irq_restore(flags); } @@ -816,7 +821,7 @@ static int s3c2410fb_remove(struct platform_device *pdev) struct s3c2410fb_info *info = fbinfo->par; int irq; - s3c2410fb_stop_lcd(); + s3c2410fb_stop_lcd(info); msleep(1); s3c2410fb_unmap_video_memory(info); @@ -844,7 +849,7 @@ static int s3c2410fb_suspend(struct platform_device *dev, pm_message_t state) struct fb_info *fbinfo = platform_get_drvdata(dev); struct s3c2410fb_info *info = fbinfo->par; - s3c2410fb_stop_lcd(); + s3c2410fb_stop_lcd(info); /* sleep before disabling the clock, we need to ensure * the LCD DMA engine is not going to get back on the bus diff --git a/drivers/video/savage/savagefb.h b/drivers/video/savage/savagefb.h index 58cfdfb4183..e648a6c0f6d 100644 --- a/drivers/video/savage/savagefb.h +++ b/drivers/video/savage/savagefb.h @@ -147,7 +147,27 @@ struct xtimings { int interlaced; }; +struct savage_reg { + unsigned char MiscOutReg; /* Misc */ + unsigned char CRTC[25]; /* Crtc Controller */ + unsigned char Sequencer[5]; /* Video Sequencer */ + unsigned char Graphics[9]; /* Video Graphics */ + unsigned char Attribute[21]; /* Video Atribute */ + unsigned int mode, refresh; + unsigned char SR08, SR0E, SR0F; + unsigned char SR10, SR11, SR12, SR13, SR15, SR18, SR29, SR30; + unsigned char SR54[8]; + unsigned char Clock; + unsigned char CR31, CR32, CR33, CR34, CR36, CR3A, CR3B, CR3C; + unsigned char CR40, CR41, CR42, CR43, CR45; + unsigned char CR50, CR51, CR53, CR55, CR58, CR5B, CR5D, CR5E; + unsigned char CR60, CR63, CR65, CR66, CR67, CR68, CR69, CR6D, CR6F; + unsigned char CR86, CR88; + unsigned char CR90, CR91, CRB0; + unsigned int STREAMS[22]; /* yuck, streams regs */ + unsigned int MMPR0, MMPR1, MMPR2, MMPR3; +}; /* --------------------------------------------------------------------- */ #define NR_PALETTE 256 @@ -167,6 +187,8 @@ struct savagefb_par { struct pci_dev *pcidev; savage_chipset chip; struct savagefb_i2c_chan chan; + struct savage_reg state; + struct savage_reg save; unsigned char *edid; u32 pseudo_palette[16]; int paletteEnabled; @@ -179,6 +201,7 @@ struct savagefb_par { int minClock; int numClocks; int clock[4]; + int MCLK, REFCLK, LCDclk; struct { u8 __iomem *vbase; u32 pbase; @@ -196,7 +219,6 @@ struct savagefb_par { volatile u32 __iomem *bci_base; unsigned int bci_ptr; - u32 cob_offset; u32 cob_size; int cob_index; @@ -204,7 +226,6 @@ struct savagefb_par { void (*SavageWaitIdle) (struct savagefb_par *par); void (*SavageWaitFifo) (struct savagefb_par *par, int space); - int MCLK, REFCLK, LCDclk; int HorizScaleFactor; /* Panels size */ @@ -217,26 +238,6 @@ struct savagefb_par { int depth; int vwidth; - - unsigned char MiscOutReg; /* Misc */ - unsigned char CRTC[25]; /* Crtc Controller */ - unsigned char Sequencer[5]; /* Video Sequencer */ - unsigned char Graphics[9]; /* Video Graphics */ - unsigned char Attribute[21]; /* Video Atribute */ - - unsigned int mode, refresh; - unsigned char SR08, SR0E, SR0F; - unsigned char SR10, SR11, SR12, SR13, SR15, SR18, SR29, SR30; - unsigned char SR54[8]; - unsigned char Clock; - unsigned char CR31, CR32, CR33, CR34, CR36, CR3A, CR3B, CR3C; - unsigned char CR40, CR41, CR42, CR43, CR45; - unsigned char CR50, CR51, CR53, CR55, CR58, CR5B, CR5D, CR5E; - unsigned char CR60, CR63, CR65, CR66, CR67, CR68, CR69, CR6D, CR6F; - unsigned char CR86, CR88; - unsigned char CR90, CR91, CRB0; - unsigned int 
STREAMS[22]; /* yuck, streams regs */ - unsigned int MMPR0, MMPR1, MMPR2, MMPR3; }; #define BCI_BD_BW_DISABLE 0x10000000 diff --git a/drivers/video/savage/savagefb_driver.c b/drivers/video/savage/savagefb_driver.c index 0da624e6524..78883cf66a4 100644 --- a/drivers/video/savage/savagefb_driver.c +++ b/drivers/video/savage/savagefb_driver.c @@ -86,15 +86,15 @@ MODULE_DESCRIPTION("FBDev driver for S3 Savage PCI/AGP Chips"); /* --------------------------------------------------------------------- */ -static void vgaHWSeqReset (struct savagefb_par *par, int start) +static void vgaHWSeqReset(struct savagefb_par *par, int start) { if (start) - VGAwSEQ (0x00, 0x01, par); /* Synchronous Reset */ + VGAwSEQ(0x00, 0x01, par); /* Synchronous Reset */ else - VGAwSEQ (0x00, 0x03, par); /* End Reset */ + VGAwSEQ(0x00, 0x03, par); /* End Reset */ } -static void vgaHWProtect (struct savagefb_par *par, int on) +static void vgaHWProtect(struct savagefb_par *par, int on) { unsigned char tmp; @@ -102,10 +102,10 @@ static void vgaHWProtect (struct savagefb_par *par, int on) /* * Turn off screen and disable sequencer. */ - tmp = VGArSEQ (0x01, par); + tmp = VGArSEQ(0x01, par); - vgaHWSeqReset (par, 1); /* start synchronous reset */ - VGAwSEQ (0x01, tmp | 0x20, par);/* disable the display */ + vgaHWSeqReset(par, 1); /* start synchronous reset */ + VGAwSEQ(0x01, tmp | 0x20, par);/* disable the display */ VGAenablePalette(par); } else { @@ -113,75 +113,76 @@ static void vgaHWProtect (struct savagefb_par *par, int on) * Reenable sequencer, then turn on screen. */ - tmp = VGArSEQ (0x01, par); + tmp = VGArSEQ(0x01, par); - VGAwSEQ (0x01, tmp & ~0x20, par);/* reenable display */ - vgaHWSeqReset (par, 0); /* clear synchronous reset */ + VGAwSEQ(0x01, tmp & ~0x20, par);/* reenable display */ + vgaHWSeqReset(par, 0); /* clear synchronous reset */ VGAdisablePalette(par); } } -static void vgaHWRestore (struct savagefb_par *par) +static void vgaHWRestore(struct savagefb_par *par, struct savage_reg *reg) { int i; - VGAwMISC (par->MiscOutReg, par); + VGAwMISC(reg->MiscOutReg, par); for (i = 1; i < 5; i++) - VGAwSEQ (i, par->Sequencer[i], par); + VGAwSEQ(i, reg->Sequencer[i], par); /* Ensure CRTC registers 0-7 are unlocked by clearing bit 7 or CRTC[17] */ - VGAwCR (17, par->CRTC[17] & ~0x80, par); + VGAwCR(17, reg->CRTC[17] & ~0x80, par); for (i = 0; i < 25; i++) - VGAwCR (i, par->CRTC[i], par); + VGAwCR(i, reg->CRTC[i], par); for (i = 0; i < 9; i++) - VGAwGR (i, par->Graphics[i], par); + VGAwGR(i, reg->Graphics[i], par); VGAenablePalette(par); for (i = 0; i < 21; i++) - VGAwATTR (i, par->Attribute[i], par); + VGAwATTR(i, reg->Attribute[i], par); VGAdisablePalette(par); } -static void vgaHWInit (struct fb_var_screeninfo *var, - struct savagefb_par *par, - struct xtimings *timings) +static void vgaHWInit(struct fb_var_screeninfo *var, + struct savagefb_par *par, + struct xtimings *timings, + struct savage_reg *reg) { - par->MiscOutReg = 0x23; + reg->MiscOutReg = 0x23; if (!(timings->sync & FB_SYNC_HOR_HIGH_ACT)) - par->MiscOutReg |= 0x40; + reg->MiscOutReg |= 0x40; if (!(timings->sync & FB_SYNC_VERT_HIGH_ACT)) - par->MiscOutReg |= 0x80; + reg->MiscOutReg |= 0x80; /* * Time Sequencer */ - par->Sequencer[0x00] = 0x00; - par->Sequencer[0x01] = 0x01; - par->Sequencer[0x02] = 0x0F; - par->Sequencer[0x03] = 0x00; /* Font select */ - par->Sequencer[0x04] = 0x0E; /* Misc */ + reg->Sequencer[0x00] = 0x00; + reg->Sequencer[0x01] = 0x01; + reg->Sequencer[0x02] = 0x0F; + reg->Sequencer[0x03] = 0x00; /* Font select */ + reg->Sequencer[0x04] 
= 0x0E; /* Misc */ /* * CRTC Controller */ - par->CRTC[0x00] = (timings->HTotal >> 3) - 5; - par->CRTC[0x01] = (timings->HDisplay >> 3) - 1; - par->CRTC[0x02] = (timings->HSyncStart >> 3) - 1; - par->CRTC[0x03] = (((timings->HSyncEnd >> 3) - 1) & 0x1f) | 0x80; - par->CRTC[0x04] = (timings->HSyncStart >> 3); - par->CRTC[0x05] = ((((timings->HSyncEnd >> 3) - 1) & 0x20) << 2) | + reg->CRTC[0x00] = (timings->HTotal >> 3) - 5; + reg->CRTC[0x01] = (timings->HDisplay >> 3) - 1; + reg->CRTC[0x02] = (timings->HSyncStart >> 3) - 1; + reg->CRTC[0x03] = (((timings->HSyncEnd >> 3) - 1) & 0x1f) | 0x80; + reg->CRTC[0x04] = (timings->HSyncStart >> 3); + reg->CRTC[0x05] = ((((timings->HSyncEnd >> 3) - 1) & 0x20) << 2) | (((timings->HSyncEnd >> 3)) & 0x1f); - par->CRTC[0x06] = (timings->VTotal - 2) & 0xFF; - par->CRTC[0x07] = (((timings->VTotal - 2) & 0x100) >> 8) | + reg->CRTC[0x06] = (timings->VTotal - 2) & 0xFF; + reg->CRTC[0x07] = (((timings->VTotal - 2) & 0x100) >> 8) | (((timings->VDisplay - 1) & 0x100) >> 7) | ((timings->VSyncStart & 0x100) >> 6) | (((timings->VSyncStart - 1) & 0x100) >> 5) | @@ -189,27 +190,27 @@ static void vgaHWInit (struct fb_var_screeninfo *var, (((timings->VTotal - 2) & 0x200) >> 4) | (((timings->VDisplay - 1) & 0x200) >> 3) | ((timings->VSyncStart & 0x200) >> 2); - par->CRTC[0x08] = 0x00; - par->CRTC[0x09] = (((timings->VSyncStart - 1) & 0x200) >> 4) | 0x40; + reg->CRTC[0x08] = 0x00; + reg->CRTC[0x09] = (((timings->VSyncStart - 1) & 0x200) >> 4) | 0x40; if (timings->dblscan) - par->CRTC[0x09] |= 0x80; - - par->CRTC[0x0a] = 0x00; - par->CRTC[0x0b] = 0x00; - par->CRTC[0x0c] = 0x00; - par->CRTC[0x0d] = 0x00; - par->CRTC[0x0e] = 0x00; - par->CRTC[0x0f] = 0x00; - par->CRTC[0x10] = timings->VSyncStart & 0xff; - par->CRTC[0x11] = (timings->VSyncEnd & 0x0f) | 0x20; - par->CRTC[0x12] = (timings->VDisplay - 1) & 0xff; - par->CRTC[0x13] = var->xres_virtual >> 4; - par->CRTC[0x14] = 0x00; - par->CRTC[0x15] = (timings->VSyncStart - 1) & 0xff; - par->CRTC[0x16] = (timings->VSyncEnd - 1) & 0xff; - par->CRTC[0x17] = 0xc3; - par->CRTC[0x18] = 0xff; + reg->CRTC[0x09] |= 0x80; + + reg->CRTC[0x0a] = 0x00; + reg->CRTC[0x0b] = 0x00; + reg->CRTC[0x0c] = 0x00; + reg->CRTC[0x0d] = 0x00; + reg->CRTC[0x0e] = 0x00; + reg->CRTC[0x0f] = 0x00; + reg->CRTC[0x10] = timings->VSyncStart & 0xff; + reg->CRTC[0x11] = (timings->VSyncEnd & 0x0f) | 0x20; + reg->CRTC[0x12] = (timings->VDisplay - 1) & 0xff; + reg->CRTC[0x13] = var->xres_virtual >> 4; + reg->CRTC[0x14] = 0x00; + reg->CRTC[0x15] = (timings->VSyncStart - 1) & 0xff; + reg->CRTC[0x16] = (timings->VSyncEnd - 1) & 0xff; + reg->CRTC[0x17] = 0xc3; + reg->CRTC[0x18] = 0xff; /* * are these unnecessary? @@ -220,38 +221,38 @@ static void vgaHWInit (struct fb_var_screeninfo *var, /* * Graphics Display Controller */ - par->Graphics[0x00] = 0x00; - par->Graphics[0x01] = 0x00; - par->Graphics[0x02] = 0x00; - par->Graphics[0x03] = 0x00; - par->Graphics[0x04] = 0x00; - par->Graphics[0x05] = 0x40; - par->Graphics[0x06] = 0x05; /* only map 64k VGA memory !!!! 
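vgaHWInit() above is pure register packing: horizontal timings are converted from pixels to character clocks (a right shift by 3), CR0 gets the VGA-mandated -5 bias, and the sync-end bit that does not fit in CR3's five-bit field spills into CR5. A compilable sketch of just the horizontal half, using the same formulas as the hunk; the 1024x768 timings in main() are only an example:

    #include <stdio.h>

    /* Pack horizontal timings (in pixels) into CRTC regs 0-5 the way
     * the driver does. */
    static void pack_horiz(int htotal, int hdisplay,
                           int hsync_start, int hsync_end,
                           unsigned char crtc[6])
    {
            crtc[0] = (htotal >> 3) - 5;
            crtc[1] = (hdisplay >> 3) - 1;
            crtc[2] = (hsync_start >> 3) - 1;
            crtc[3] = (((hsync_end >> 3) - 1) & 0x1f) | 0x80;
            crtc[4] = hsync_start >> 3;
            crtc[5] = ((((hsync_end >> 3) - 1) & 0x20) << 2) |
                      ((hsync_end >> 3) & 0x1f);
    }

    int main(void)
    {
            unsigned char crtc[6];
            int i;

            pack_horiz(1344, 1024, 1048, 1184, crtc);
            for (i = 0; i < 6; i++)
                    printf("CR%d = 0x%02x\n", i, crtc[i]);
            return 0;
    }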
*/ - par->Graphics[0x07] = 0x0F; - par->Graphics[0x08] = 0xFF; - - - par->Attribute[0x00] = 0x00; /* standard colormap translation */ - par->Attribute[0x01] = 0x01; - par->Attribute[0x02] = 0x02; - par->Attribute[0x03] = 0x03; - par->Attribute[0x04] = 0x04; - par->Attribute[0x05] = 0x05; - par->Attribute[0x06] = 0x06; - par->Attribute[0x07] = 0x07; - par->Attribute[0x08] = 0x08; - par->Attribute[0x09] = 0x09; - par->Attribute[0x0a] = 0x0A; - par->Attribute[0x0b] = 0x0B; - par->Attribute[0x0c] = 0x0C; - par->Attribute[0x0d] = 0x0D; - par->Attribute[0x0e] = 0x0E; - par->Attribute[0x0f] = 0x0F; - par->Attribute[0x10] = 0x41; - par->Attribute[0x11] = 0xFF; - par->Attribute[0x12] = 0x0F; - par->Attribute[0x13] = 0x00; - par->Attribute[0x14] = 0x00; + reg->Graphics[0x00] = 0x00; + reg->Graphics[0x01] = 0x00; + reg->Graphics[0x02] = 0x00; + reg->Graphics[0x03] = 0x00; + reg->Graphics[0x04] = 0x00; + reg->Graphics[0x05] = 0x40; + reg->Graphics[0x06] = 0x05; /* only map 64k VGA memory !!!! */ + reg->Graphics[0x07] = 0x0F; + reg->Graphics[0x08] = 0xFF; + + + reg->Attribute[0x00] = 0x00; /* standard colormap translation */ + reg->Attribute[0x01] = 0x01; + reg->Attribute[0x02] = 0x02; + reg->Attribute[0x03] = 0x03; + reg->Attribute[0x04] = 0x04; + reg->Attribute[0x05] = 0x05; + reg->Attribute[0x06] = 0x06; + reg->Attribute[0x07] = 0x07; + reg->Attribute[0x08] = 0x08; + reg->Attribute[0x09] = 0x09; + reg->Attribute[0x0a] = 0x0A; + reg->Attribute[0x0b] = 0x0B; + reg->Attribute[0x0c] = 0x0C; + reg->Attribute[0x0d] = 0x0D; + reg->Attribute[0x0e] = 0x0E; + reg->Attribute[0x0f] = 0x0F; + reg->Attribute[0x10] = 0x41; + reg->Attribute[0x11] = 0xFF; + reg->Attribute[0x12] = 0x0F; + reg->Attribute[0x13] = 0x00; + reg->Attribute[0x14] = 0x00; } /* -------------------- Hardware specific routines ------------------------- */ @@ -304,15 +305,15 @@ savage2000_waitidle(struct savagefb_par *par) while ((savage_in32(0x48C60, par) & 0x009fffff)); } - +#ifdef CONFIG_FB_SAVAGE_ACCEL static void -SavageSetup2DEngine (struct savagefb_par *par) +SavageSetup2DEngine(struct savagefb_par *par) { unsigned long GlobalBitmapDescriptor; GlobalBitmapDescriptor = 1 | 8 | BCI_BD_BW_DISABLE; - BCI_BD_SET_BPP (GlobalBitmapDescriptor, par->depth); - BCI_BD_SET_STRIDE (GlobalBitmapDescriptor, par->vwidth); + BCI_BD_SET_BPP(GlobalBitmapDescriptor, par->depth); + BCI_BD_SET_STRIDE(GlobalBitmapDescriptor, par->vwidth); switch(par->chip) { case S3_SAVAGE3D: @@ -361,32 +362,48 @@ SavageSetup2DEngine (struct savagefb_par *par) vga_out8(0x3d5, 0x0c, par); /* Set stride to use GBD. */ - vga_out8 (0x3d4, 0x50, par); - vga_out8 (0x3d5, vga_in8(0x3d5, par) | 0xC1, par); + vga_out8(0x3d4, 0x50, par); + vga_out8(0x3d5, vga_in8(0x3d5, par) | 0xC1, par); /* Enable 2D engine. 
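The CONFIG_FB_SAVAGE_ACCEL guard introduced above ends with an empty SavageSetup2DEngine() stub in the #else branch, so call sites need no conditional compilation of their own and the no-op vanishes at compile time when acceleration is configured out. The idiom in isolation, with HAVE_ACCEL and engine_init() as placeholder names:

    #ifdef HAVE_ACCEL
    static void engine_init(void)
    {
            /* program the 2D engine, GBD, clipping, ... */
    }
    #else
    /* disabled build still sees a definition, so callers stay plain */
    static void engine_init(void) {}
    #endif

    int main(void)
    {
            engine_init();          /* unconditional call site */
            return 0;
    }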
*/ - vga_out8 (0x3d4, 0x40, par); - vga_out8 (0x3d5, 0x01, par); + vga_out8(0x3d4, 0x40, par); + vga_out8(0x3d5, 0x01, par); - savage_out32 (MONO_PAT_0, ~0, par); - savage_out32 (MONO_PAT_1, ~0, par); + savage_out32(MONO_PAT_0, ~0, par); + savage_out32(MONO_PAT_1, ~0, par); /* Setup plane masks */ - savage_out32 (0x8128, ~0, par); /* enable all write planes */ - savage_out32 (0x812C, ~0, par); /* enable all read planes */ - savage_out16 (0x8134, 0x27, par); - savage_out16 (0x8136, 0x07, par); + savage_out32(0x8128, ~0, par); /* enable all write planes */ + savage_out32(0x812C, ~0, par); /* enable all read planes */ + savage_out16(0x8134, 0x27, par); + savage_out16(0x8136, 0x07, par); /* Now set the GBD */ par->bci_ptr = 0; - par->SavageWaitFifo (par, 4); + par->SavageWaitFifo(par, 4); - BCI_SEND( BCI_CMD_SETREG | (1 << 16) | BCI_GBD1 ); - BCI_SEND( 0 ); - BCI_SEND( BCI_CMD_SETREG | (1 << 16) | BCI_GBD2 ); - BCI_SEND( GlobalBitmapDescriptor ); + BCI_SEND(BCI_CMD_SETREG | (1 << 16) | BCI_GBD1); + BCI_SEND(0); + BCI_SEND(BCI_CMD_SETREG | (1 << 16) | BCI_GBD2); + BCI_SEND(GlobalBitmapDescriptor); } +static void savagefb_set_clip(struct fb_info *info) +{ + struct savagefb_par *par = info->par; + int cmd; + + cmd = BCI_CMD_NOP | BCI_CMD_CLIP_NEW; + par->bci_ptr = 0; + par->SavageWaitFifo(par,3); + BCI_SEND(cmd); + BCI_SEND(BCI_CLIP_TL(0, 0)); + BCI_SEND(BCI_CLIP_BR(0xfff, 0xfff)); +} +#else +static void SavageSetup2DEngine(struct savagefb_par *par) {} + +#endif static void SavageCalcClock(long freq, int min_m, int min_n1, int max_n1, int min_n2, int max_n2, long freq_min, @@ -398,11 +415,11 @@ static void SavageCalcClock(long freq, int min_m, int min_n1, int max_n1, unsigned char n1, n2, best_n1=16+2, best_n2=2, best_m=125+2; if (freq < freq_min / (1 << max_n2)) { - printk (KERN_ERR "invalid frequency %ld Khz\n", freq); + printk(KERN_ERR "invalid frequency %ld Khz\n", freq); freq = freq_min / (1 << max_n2); } if (freq > freq_max / (1 << min_n2)) { - printk (KERN_ERR "invalid frequency %ld Khz\n", freq); + printk(KERN_ERR "invalid frequency %ld Khz\n", freq); freq = freq_max / (1 << min_n2); } @@ -453,12 +470,12 @@ static int common_calc_clock(long freq, int min_m, int min_n1, int max_n1, BASE_FREQ; if (m < min_m + 2 || m > 127+2) continue; - if((m * BASE_FREQ >= freq_min * n1) && - (m * BASE_FREQ <= freq_max * n1)) { + if ((m * BASE_FREQ >= freq_min * n1) && + (m * BASE_FREQ <= freq_max * n1)) { diff = freq * (1 << n2) * n1 - BASE_FREQ * m; - if(diff < 0) + if (diff < 0) diff = -diff; - if(diff < best_diff) { + if (diff < best_diff) { best_diff = diff; best_m = m; best_n1 = n1; @@ -468,7 +485,7 @@ static int common_calc_clock(long freq, int min_m, int min_n1, int max_n1, } } - if(max_n1 == 63) + if (max_n1 == 63) *ndiv = (best_n1 - 2) | (best_n2 << 6); else *ndiv = (best_n1 - 2) | (best_n2 << 5); @@ -488,23 +505,23 @@ static void SavagePrintRegs(void) int vgaCRReg = 0x3d5; printk(KERN_DEBUG "SR x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE " - "xF" ); + "xF"); - for( i = 0; i < 0x70; i++ ) { - if( !(i % 16) ) - printk(KERN_DEBUG "\nSR%xx ", i >> 4 ); - vga_out8( 0x3c4, i, par); - printk(KERN_DEBUG " %02x", vga_in8(0x3c5, par) ); + for (i = 0; i < 0x70; i++) { + if (!(i % 16)) + printk(KERN_DEBUG "\nSR%xx ", i >> 4); + vga_out8(0x3c4, i, par); + printk(KERN_DEBUG " %02x", vga_in8(0x3c5, par)); } printk(KERN_DEBUG "\n\nCR x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC " - "xD xE xF" ); + "xD xE xF"); - for( i = 0; i < 0xB7; i++ ) { - if( !(i % 16) ) - printk(KERN_DEBUG "\nCR%xx ", i >> 4 ); - vga_out8( 
vgaCRIndex, i, par); - printk(KERN_DEBUG " %02x", vga_in8(vgaCRReg, par) ); + for (i = 0; i < 0xB7; i++) { + if (!(i % 16)) + printk(KERN_DEBUG "\nCR%xx ", i >> 4); + vga_out8(vgaCRIndex, i, par); + printk(KERN_DEBUG " %02x", vga_in8(vgaCRReg, par)); } printk(KERN_DEBUG "\n\n"); @@ -513,156 +530,309 @@ static void SavagePrintRegs(void) /* --------------------------------------------------------------------- */ -static void savage_get_default_par(struct savagefb_par *par) +static void savage_get_default_par(struct savagefb_par *par, struct savage_reg *reg) { unsigned char cr3a, cr53, cr66; - vga_out16 (0x3d4, 0x4838, par); - vga_out16 (0x3d4, 0xa039, par); - vga_out16 (0x3c4, 0x0608, par); - - vga_out8 (0x3d4, 0x66, par); - cr66 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr66 | 0x80, par); - vga_out8 (0x3d4, 0x3a, par); - cr3a = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr3a | 0x80, par); - vga_out8 (0x3d4, 0x53, par); - cr53 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr53 & 0x7f, par); - - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, cr66, par); - vga_out8 (0x3d4, 0x3a, par); - vga_out8 (0x3d5, cr3a, par); - - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, cr66, par); - vga_out8 (0x3d4, 0x3a, par); - vga_out8 (0x3d5, cr3a, par); + vga_out16(0x3d4, 0x4838, par); + vga_out16(0x3d4, 0xa039, par); + vga_out16(0x3c4, 0x0608, par); + + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x80, par); + vga_out8(0x3d4, 0x3a, par); + cr3a = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3a | 0x80, par); + vga_out8(0x3d4, 0x53, par); + cr53 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr53 & 0x7f, par); + + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); + + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); /* unlock extended seq regs */ - vga_out8 (0x3c4, 0x08, par); - par->SR08 = vga_in8 (0x3c5, par); - vga_out8 (0x3c5, 0x06, par); + vga_out8(0x3c4, 0x08, par); + reg->SR08 = vga_in8(0x3c5, par); + vga_out8(0x3c5, 0x06, par); /* now save all the extended regs we need */ - vga_out8 (0x3d4, 0x31, par); - par->CR31 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x32, par); - par->CR32 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x34, par); - par->CR34 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x36, par); - par->CR36 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x3a, par); - par->CR3A = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x40, par); - par->CR40 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x42, par); - par->CR42 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x45, par); - par->CR45 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x50, par); - par->CR50 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x51, par); - par->CR51 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x53, par); - par->CR53 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x58, par); - par->CR58 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x60, par); - par->CR60 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x66, par); - par->CR66 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x67, par); - par->CR67 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x68, par); - par->CR68 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x69, par); - par->CR69 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x6f, par); - par->CR6F = vga_in8 (0x3d5, par); - - vga_out8 (0x3d4, 0x33, par); - par->CR33 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x86, par); - par->CR86 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x88, par); - par->CR88 = vga_in8 
(0x3d5, par); - vga_out8 (0x3d4, 0x90, par); - par->CR90 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x91, par); - par->CR91 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0xb0, par); - par->CRB0 = vga_in8 (0x3d5, par) | 0x80; + vga_out8(0x3d4, 0x31, par); + reg->CR31 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x32, par); + reg->CR32 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x34, par); + reg->CR34 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x36, par); + reg->CR36 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x3a, par); + reg->CR3A = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x40, par); + reg->CR40 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x42, par); + reg->CR42 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x45, par); + reg->CR45 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x50, par); + reg->CR50 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x51, par); + reg->CR51 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x53, par); + reg->CR53 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x58, par); + reg->CR58 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x60, par); + reg->CR60 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x66, par); + reg->CR66 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x67, par); + reg->CR67 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x68, par); + reg->CR68 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x69, par); + reg->CR69 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x6f, par); + reg->CR6F = vga_in8(0x3d5, par); + + vga_out8(0x3d4, 0x33, par); + reg->CR33 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x86, par); + reg->CR86 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x88, par); + reg->CR88 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x90, par); + reg->CR90 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x91, par); + reg->CR91 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0xb0, par); + reg->CRB0 = vga_in8(0x3d5, par) | 0x80; + + /* extended mode timing regs */ + vga_out8(0x3d4, 0x3b, par); + reg->CR3B = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x3c, par); + reg->CR3C = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x43, par); + reg->CR43 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x5d, par); + reg->CR5D = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x5e, par); + reg->CR5E = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x65, par); + reg->CR65 = vga_in8(0x3d5, par); + + /* save seq extended regs for DCLK PLL programming */ + vga_out8(0x3c4, 0x0e, par); + reg->SR0E = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x0f, par); + reg->SR0F = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x10, par); + reg->SR10 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x11, par); + reg->SR11 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x12, par); + reg->SR12 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x13, par); + reg->SR13 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x29, par); + reg->SR29 = vga_in8(0x3c5, par); + + vga_out8(0x3c4, 0x15, par); + reg->SR15 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x30, par); + reg->SR30 = vga_in8(0x3c5, par); + vga_out8(0x3c4, 0x18, par); + reg->SR18 = vga_in8(0x3c5, par); + + /* Save flat panel expansion regsters. 
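Mechanical as this savage rework looks, it has one idea behind it: the mode registers move out of savagefb_par into struct savage_reg, so the driver can keep two snapshots ('state' for the current mode, 'save' for the boot-time contents) and feed either one to the same helpers. A reduced sketch of the snapshot pattern against a two-register pretend device; all names here are illustrative:

    #include <stdio.h>

    struct hw_state {                    /* miniature savage_reg */
            unsigned char cr31, cr67;
    };

    static struct hw_state chip = { 0x8c, 0x50 };   /* fake hardware */

    /* One save/restore pair serves any snapshot - the point of passing
     * struct savage_reg * instead of poking fields of par directly. */
    static void hw_save(struct hw_state *snap)           { *snap = chip; }
    static void hw_restore(const struct hw_state *snap)  { chip = *snap; }

    int main(void)
    {
            struct hw_state save, state;

            hw_save(&save);          /* boot state, kept for later */
            state = save;
            state.cr67 = 0x40;       /* program a new mode */
            hw_restore(&state);
            hw_restore(&save);       /* and we can still go back */
            printf("cr67 = %#x\n", chip.cr67);   /* 0x50 again */
            return 0;
    }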
*/ + if (par->chip == S3_SAVAGE_MX) { + int i; + + for (i = 0; i < 8; i++) { + vga_out8(0x3c4, 0x54+i, par); + reg->SR54[i] = vga_in8(0x3c5, par); + } + } + + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x80, par); + vga_out8(0x3d4, 0x3a, par); + cr3a = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3a | 0x80, par); + + /* now save MIU regs */ + if (par->chip != S3_SAVAGE_MX) { + reg->MMPR0 = savage_in32(FIFO_CONTROL_REG, par); + reg->MMPR1 = savage_in32(MIU_CONTROL_REG, par); + reg->MMPR2 = savage_in32(STREAMS_TIMEOUT_REG, par); + reg->MMPR3 = savage_in32(MISC_TIMEOUT_REG, par); + } + + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); +} + +static void savage_set_default_par(struct savagefb_par *par, + struct savage_reg *reg) +{ + unsigned char cr3a, cr53, cr66; + + vga_out16(0x3d4, 0x4838, par); + vga_out16(0x3d4, 0xa039, par); + vga_out16(0x3c4, 0x0608, par); + + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x80, par); + vga_out8(0x3d4, 0x3a, par); + cr3a = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3a | 0x80, par); + vga_out8(0x3d4, 0x53, par); + cr53 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr53 & 0x7f, par); + + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); + + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); + + /* unlock extended seq regs */ + vga_out8(0x3c4, 0x08, par); + vga_out8(0x3c5, reg->SR08, par); + vga_out8(0x3c5, 0x06, par); + + /* now restore all the extended regs we need */ + vga_out8(0x3d4, 0x31, par); + vga_out8(0x3d5, reg->CR31, par); + vga_out8(0x3d4, 0x32, par); + vga_out8(0x3d5, reg->CR32, par); + vga_out8(0x3d4, 0x34, par); + vga_out8(0x3d5, reg->CR34, par); + vga_out8(0x3d4, 0x36, par); + vga_out8(0x3d5,reg->CR36, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, reg->CR3A, par); + vga_out8(0x3d4, 0x40, par); + vga_out8(0x3d5, reg->CR40, par); + vga_out8(0x3d4, 0x42, par); + vga_out8(0x3d5, reg->CR42, par); + vga_out8(0x3d4, 0x45, par); + vga_out8(0x3d5, reg->CR45, par); + vga_out8(0x3d4, 0x50, par); + vga_out8(0x3d5, reg->CR50, par); + vga_out8(0x3d4, 0x51, par); + vga_out8(0x3d5, reg->CR51, par); + vga_out8(0x3d4, 0x53, par); + vga_out8(0x3d5, reg->CR53, par); + vga_out8(0x3d4, 0x58, par); + vga_out8(0x3d5, reg->CR58, par); + vga_out8(0x3d4, 0x60, par); + vga_out8(0x3d5, reg->CR60, par); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, reg->CR66, par); + vga_out8(0x3d4, 0x67, par); + vga_out8(0x3d5, reg->CR67, par); + vga_out8(0x3d4, 0x68, par); + vga_out8(0x3d5, reg->CR68, par); + vga_out8(0x3d4, 0x69, par); + vga_out8(0x3d5, reg->CR69, par); + vga_out8(0x3d4, 0x6f, par); + vga_out8(0x3d5, reg->CR6F, par); + + vga_out8(0x3d4, 0x33, par); + vga_out8(0x3d5, reg->CR33, par); + vga_out8(0x3d4, 0x86, par); + vga_out8(0x3d5, reg->CR86, par); + vga_out8(0x3d4, 0x88, par); + vga_out8(0x3d5, reg->CR88, par); + vga_out8(0x3d4, 0x90, par); + vga_out8(0x3d5, reg->CR90, par); + vga_out8(0x3d4, 0x91, par); + vga_out8(0x3d5, reg->CR91, par); + vga_out8(0x3d4, 0xb0, par); + vga_out8(0x3d5, reg->CRB0, par); /* extended mode timing regs */ - vga_out8 (0x3d4, 0x3b, par); - par->CR3B = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x3c, par); - par->CR3C = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x43, par); - par->CR43 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x5d, par); - par->CR5D = vga_in8 (0x3d5, 
par); - vga_out8 (0x3d4, 0x5e, par); - par->CR5E = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x65, par); - par->CR65 = vga_in8 (0x3d5, par); + vga_out8(0x3d4, 0x3b, par); + vga_out8(0x3d5, reg->CR3B, par); + vga_out8(0x3d4, 0x3c, par); + vga_out8(0x3d5, reg->CR3C, par); + vga_out8(0x3d4, 0x43, par); + vga_out8(0x3d5, reg->CR43, par); + vga_out8(0x3d4, 0x5d, par); + vga_out8(0x3d5, reg->CR5D, par); + vga_out8(0x3d4, 0x5e, par); + vga_out8(0x3d5, reg->CR5E, par); + vga_out8(0x3d4, 0x65, par); + vga_out8(0x3d5, reg->CR65, par); /* save seq extended regs for DCLK PLL programming */ - vga_out8 (0x3c4, 0x0e, par); - par->SR0E = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x0f, par); - par->SR0F = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x10, par); - par->SR10 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x11, par); - par->SR11 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x12, par); - par->SR12 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x13, par); - par->SR13 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x29, par); - par->SR29 = vga_in8 (0x3c5, par); - - vga_out8 (0x3c4, 0x15, par); - par->SR15 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x30, par); - par->SR30 = vga_in8 (0x3c5, par); - vga_out8 (0x3c4, 0x18, par); - par->SR18 = vga_in8 (0x3c5, par); + vga_out8(0x3c4, 0x0e, par); + vga_out8(0x3c5, reg->SR0E, par); + vga_out8(0x3c4, 0x0f, par); + vga_out8(0x3c5, reg->SR0F, par); + vga_out8(0x3c4, 0x10, par); + vga_out8(0x3c5, reg->SR10, par); + vga_out8(0x3c4, 0x11, par); + vga_out8(0x3c5, reg->SR11, par); + vga_out8(0x3c4, 0x12, par); + vga_out8(0x3c5, reg->SR12, par); + vga_out8(0x3c4, 0x13, par); + vga_out8(0x3c5, reg->SR13, par); + vga_out8(0x3c4, 0x29, par); + vga_out8(0x3c5, reg->SR29, par); + + vga_out8(0x3c4, 0x15, par); + vga_out8(0x3c5, reg->SR15, par); + vga_out8(0x3c4, 0x30, par); + vga_out8(0x3c5, reg->SR30, par); + vga_out8(0x3c4, 0x18, par); + vga_out8(0x3c5, reg->SR18, par); /* Save flat panel expansion regsters. 
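Nearly every statement in these hunks is the same two-step indexed-I/O dance: write a register index to the CRT controller index port 0x3d4, then read or write the data port 0x3d5. Wrapped as helpers, the pattern is easier to see; the flat port array below fakes the hardware (it does not model per-index storage), and the struct savagefb_par argument of the real vga_out8()/vga_in8() is dropped for brevity:

    #include <stdio.h>

    static unsigned char ports[0x10000];    /* fake I/O space */

    static void vga_out8(unsigned short port, unsigned char val)
    {
            ports[port] = val;
    }

    static unsigned char vga_in8(unsigned short port)
    {
            return ports[port];
    }

    /* Indexed access: select the register via 0x3d4, touch 0x3d5. */
    static unsigned char read_cr(unsigned char index)
    {
            vga_out8(0x3d4, index);
            return vga_in8(0x3d5);
    }

    static void write_cr(unsigned char index, unsigned char val)
    {
            vga_out8(0x3d4, index);
            vga_out8(0x3d5, val);
    }

    int main(void)
    {
            write_cr(0x31, 0x8c);
            printf("CR31 = %#x\n", read_cr(0x31));  /* 0x8c */
            return 0;
    }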
*/ if (par->chip == S3_SAVAGE_MX) { int i; for (i = 0; i < 8; i++) { - vga_out8 (0x3c4, 0x54+i, par); - par->SR54[i] = vga_in8 (0x3c5, par); + vga_out8(0x3c4, 0x54+i, par); + vga_out8(0x3c5, reg->SR54[i], par); } } - vga_out8 (0x3d4, 0x66, par); - cr66 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr66 | 0x80, par); - vga_out8 (0x3d4, 0x3a, par); - cr3a = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr3a | 0x80, par); + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x80, par); + vga_out8(0x3d4, 0x3a, par); + cr3a = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3a | 0x80, par); /* now save MIU regs */ if (par->chip != S3_SAVAGE_MX) { - par->MMPR0 = savage_in32(FIFO_CONTROL_REG, par); - par->MMPR1 = savage_in32(MIU_CONTROL_REG, par); - par->MMPR2 = savage_in32(STREAMS_TIMEOUT_REG, par); - par->MMPR3 = savage_in32(MISC_TIMEOUT_REG, par); + savage_out32(FIFO_CONTROL_REG, reg->MMPR0, par); + savage_out32(MIU_CONTROL_REG, reg->MMPR1, par); + savage_out32(STREAMS_TIMEOUT_REG, reg->MMPR2, par); + savage_out32(MISC_TIMEOUT_REG, reg->MMPR3, par); } - vga_out8 (0x3d4, 0x3a, par); - vga_out8 (0x3d5, cr3a, par); - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); } static void savage_update_var(struct fb_var_screeninfo *var, struct fb_videomode *modedb) @@ -683,8 +853,8 @@ static void savage_update_var(struct fb_var_screeninfo *var, struct fb_videomode var->vmode = modedb->vmode; } -static int savagefb_check_var (struct fb_var_screeninfo *var, - struct fb_info *info) +static int savagefb_check_var(struct fb_var_screeninfo *var, + struct fb_info *info) { struct savagefb_par *par = info->par; int memlen, vramlen, mode_valid = 0; @@ -750,10 +920,10 @@ static int savagefb_check_var (struct fb_var_screeninfo *var, if (par->SavagePanelWidth && (var->xres > par->SavagePanelWidth || var->yres > par->SavagePanelHeight)) { - printk (KERN_INFO "Mode (%dx%d) larger than the LCD panel " - "(%dx%d)\n", var->xres, var->yres, - par->SavagePanelWidth, - par->SavagePanelHeight); + printk(KERN_INFO "Mode (%dx%d) larger than the LCD panel " + "(%dx%d)\n", var->xres, var->yres, + par->SavagePanelWidth, + par->SavagePanelHeight); return -1; } @@ -788,8 +958,9 @@ static int savagefb_check_var (struct fb_var_screeninfo *var, } -static int savagefb_decode_var (struct fb_var_screeninfo *var, - struct savagefb_par *par) +static int savagefb_decode_var(struct fb_var_screeninfo *var, + struct savagefb_par *par, + struct savage_reg *reg) { struct xtimings timings; int width, dclk, i, j; /*, refresh; */ @@ -799,7 +970,7 @@ static int savagefb_decode_var (struct fb_var_screeninfo *var, DBG("savagefb_decode_var"); - memset (&timings, 0, sizeof(timings)); + memset(&timings, 0, sizeof(timings)); if (!pixclock) pixclock = 10000; /* 10ns = 100MHz */ timings.Clock = 1000000000 / pixclock; @@ -831,39 +1002,39 @@ static int savagefb_decode_var (struct fb_var_screeninfo *var, * This will allocate the datastructure and initialize all of the * generic VGA registers. */ - vgaHWInit (var, par, &timings); + vgaHWInit(var, par, &timings, reg); /* We need to set CR67 whether or not we use the BIOS. 
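The SavageCalcClock()/common_calc_clock() calls a little further down solve f_out = BASE_FREQ * m / (n1 * 2^n2) by brute force: for every feasible divider pair they derive the nearest multiplier m and keep the combination with the least error, comparing errors in the cross-multiplied form freq * 2^n2 * n1 - BASE_FREQ * m so no division is needed. A compilable sketch of that search; the 14318 kHz reference is the conventional VGA base frequency, and the divider/multiplier bounds are illustrative placeholders rather than the driver's:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define BASE_FREQ 14318L                /* kHz */

    /* Brute-force PLL search for f_out = BASE_FREQ * m / (n1 << n2). */
    static void best_pll(long target, int *bm, int *bn1, int *bn2)
    {
            long best_diff = LONG_MAX;
            int n1, n2;

            for (n2 = 0; n2 <= 3; n2++) {
                    for (n1 = 2; n1 <= 33; n1++) {
                            /* nearest multiplier for these dividers */
                            long m = (target * n1 * (1 << n2) +
                                      BASE_FREQ / 2) / BASE_FREQ;
                            long diff;

                            if (m < 2 || m > 129)
                                    continue;
                            diff = labs(target * (1 << n2) * n1 -
                                        BASE_FREQ * m);
                            if (diff < best_diff) {
                                    best_diff = diff;
                                    *bm = m;
                                    *bn1 = n1;
                                    *bn2 = n2;
                            }
                    }
            }
    }

    int main(void)
    {
            int m = 0, n1 = 1, n2 = 0;

            best_pll(65000, &m, &n1, &n2);  /* 65 MHz pixel clock */
            printf("m=%d n1=%d n2=%d -> %ld kHz\n",
                   m, n1, n2, BASE_FREQ * m / (n1 << n2));
            return 0;
    }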
*/ dclk = timings.Clock; - par->CR67 = 0x00; + reg->CR67 = 0x00; - switch( var->bits_per_pixel ) { + switch(var->bits_per_pixel) { case 8: - if( (par->chip == S3_SAVAGE2000) && (dclk >= 230000) ) - par->CR67 = 0x10; /* 8bpp, 2 pixels/clock */ + if ((par->chip == S3_SAVAGE2000) && (dclk >= 230000)) + reg->CR67 = 0x10; /* 8bpp, 2 pixels/clock */ else - par->CR67 = 0x00; /* 8bpp, 1 pixel/clock */ + reg->CR67 = 0x00; /* 8bpp, 1 pixel/clock */ break; case 15: - if ( S3_SAVAGE_MOBILE_SERIES(par->chip) || - ((par->chip == S3_SAVAGE2000) && (dclk >= 230000)) ) - par->CR67 = 0x30; /* 15bpp, 2 pixel/clock */ + if (S3_SAVAGE_MOBILE_SERIES(par->chip) || + ((par->chip == S3_SAVAGE2000) && (dclk >= 230000))) + reg->CR67 = 0x30; /* 15bpp, 2 pixel/clock */ else - par->CR67 = 0x20; /* 15bpp, 1 pixels/clock */ + reg->CR67 = 0x20; /* 15bpp, 1 pixels/clock */ break; case 16: - if( S3_SAVAGE_MOBILE_SERIES(par->chip) || - ((par->chip == S3_SAVAGE2000) && (dclk >= 230000)) ) - par->CR67 = 0x50; /* 16bpp, 2 pixel/clock */ + if (S3_SAVAGE_MOBILE_SERIES(par->chip) || + ((par->chip == S3_SAVAGE2000) && (dclk >= 230000))) + reg->CR67 = 0x50; /* 16bpp, 2 pixel/clock */ else - par->CR67 = 0x40; /* 16bpp, 1 pixels/clock */ + reg->CR67 = 0x40; /* 16bpp, 1 pixels/clock */ break; case 24: - par->CR67 = 0x70; + reg->CR67 = 0x70; break; case 32: - par->CR67 = 0xd0; + reg->CR67 = 0xd0; break; } @@ -872,61 +1043,61 @@ static int savagefb_decode_var (struct fb_var_screeninfo *var, * match. Fall back to traditional register-crunching. */ - vga_out8 (0x3d4, 0x3a, par); - tmp = vga_in8 (0x3d5, par); + vga_out8(0x3d4, 0x3a, par); + tmp = vga_in8(0x3d5, par); if (1 /*FIXME:psav->pci_burst*/) - par->CR3A = (tmp & 0x7f) | 0x15; + reg->CR3A = (tmp & 0x7f) | 0x15; else - par->CR3A = tmp | 0x95; + reg->CR3A = tmp | 0x95; - par->CR53 = 0x00; - par->CR31 = 0x8c; - par->CR66 = 0x89; + reg->CR53 = 0x00; + reg->CR31 = 0x8c; + reg->CR66 = 0x89; - vga_out8 (0x3d4, 0x58, par); - par->CR58 = vga_in8 (0x3d5, par) & 0x80; - par->CR58 |= 0x13; + vga_out8(0x3d4, 0x58, par); + reg->CR58 = vga_in8(0x3d5, par) & 0x80; + reg->CR58 |= 0x13; - par->SR15 = 0x03 | 0x80; - par->SR18 = 0x00; - par->CR43 = par->CR45 = par->CR65 = 0x00; + reg->SR15 = 0x03 | 0x80; + reg->SR18 = 0x00; + reg->CR43 = reg->CR45 = reg->CR65 = 0x00; - vga_out8 (0x3d4, 0x40, par); - par->CR40 = vga_in8 (0x3d5, par) & ~0x01; + vga_out8(0x3d4, 0x40, par); + reg->CR40 = vga_in8(0x3d5, par) & ~0x01; - par->MMPR0 = 0x010400; - par->MMPR1 = 0x00; - par->MMPR2 = 0x0808; - par->MMPR3 = 0x08080810; + reg->MMPR0 = 0x010400; + reg->MMPR1 = 0x00; + reg->MMPR2 = 0x0808; + reg->MMPR3 = 0x08080810; - SavageCalcClock (dclk, 1, 1, 127, 0, 4, 180000, 360000, &m, &n, &r); + SavageCalcClock(dclk, 1, 1, 127, 0, 4, 180000, 360000, &m, &n, &r); /* m = 107; n = 4; r = 2; */ if (par->MCLK <= 0) { - par->SR10 = 255; - par->SR11 = 255; + reg->SR10 = 255; + reg->SR11 = 255; } else { - common_calc_clock (par->MCLK, 1, 1, 31, 0, 3, 135000, 270000, - &par->SR11, &par->SR10); - /* par->SR10 = 80; // MCLK == 286000 */ - /* par->SR11 = 125; */ + common_calc_clock(par->MCLK, 1, 1, 31, 0, 3, 135000, 270000, + ®->SR11, ®->SR10); + /* reg->SR10 = 80; // MCLK == 286000 */ + /* reg->SR11 = 125; */ } - par->SR12 = (r << 6) | (n & 0x3f); - par->SR13 = m & 0xff; - par->SR29 = (r & 4) | (m & 0x100) >> 5 | (n & 0x40) >> 2; + reg->SR12 = (r << 6) | (n & 0x3f); + reg->SR13 = m & 0xff; + reg->SR29 = (r & 4) | (m & 0x100) >> 5 | (n & 0x40) >> 2; if (var->bits_per_pixel < 24) - par->MMPR0 -= 0x8000; + reg->MMPR0 -= 0x8000; else - 
par->MMPR0 -= 0x4000; + reg->MMPR0 -= 0x4000; if (timings.interlaced) - par->CR42 = 0x20; + reg->CR42 = 0x20; else - par->CR42 = 0x00; + reg->CR42 = 0x00; - par->CR34 = 0x10; /* display fifo */ + reg->CR34 = 0x10; /* display fifo */ i = ((((timings.HTotal >> 3) - 5) & 0x100) >> 8) | ((((timings.HDisplay >> 3) - 1) & 0x100) >> 7) | @@ -938,77 +1109,77 @@ static int savagefb_decode_var (struct fb_var_screeninfo *var, if ((timings.HSyncEnd >> 3) - (timings.HSyncStart >> 3) > 32) i |= 0x20; - j = (par->CRTC[0] + ((i & 0x01) << 8) + - par->CRTC[4] + ((i & 0x10) << 4) + 1) / 2; + j = (reg->CRTC[0] + ((i & 0x01) << 8) + + reg->CRTC[4] + ((i & 0x10) << 4) + 1) / 2; - if (j - (par->CRTC[4] + ((i & 0x10) << 4)) < 4) { - if (par->CRTC[4] + ((i & 0x10) << 4) + 4 <= - par->CRTC[0] + ((i & 0x01) << 8)) - j = par->CRTC[4] + ((i & 0x10) << 4) + 4; + if (j - (reg->CRTC[4] + ((i & 0x10) << 4)) < 4) { + if (reg->CRTC[4] + ((i & 0x10) << 4) + 4 <= + reg->CRTC[0] + ((i & 0x01) << 8)) + j = reg->CRTC[4] + ((i & 0x10) << 4) + 4; else - j = par->CRTC[0] + ((i & 0x01) << 8) + 1; + j = reg->CRTC[0] + ((i & 0x01) << 8) + 1; } - par->CR3B = j & 0xff; + reg->CR3B = j & 0xff; i |= (j & 0x100) >> 2; - par->CR3C = (par->CRTC[0] + ((i & 0x01) << 8)) / 2; - par->CR5D = i; - par->CR5E = (((timings.VTotal - 2) & 0x400) >> 10) | + reg->CR3C = (reg->CRTC[0] + ((i & 0x01) << 8)) / 2; + reg->CR5D = i; + reg->CR5E = (((timings.VTotal - 2) & 0x400) >> 10) | (((timings.VDisplay - 1) & 0x400) >> 9) | (((timings.VSyncStart) & 0x400) >> 8) | (((timings.VSyncStart) & 0x400) >> 6) | 0x40; width = (var->xres_virtual * ((var->bits_per_pixel+7) / 8)) >> 3; - par->CR91 = par->CRTC[19] = 0xff & width; - par->CR51 = (0x300 & width) >> 4; - par->CR90 = 0x80 | (width >> 8); - par->MiscOutReg |= 0x0c; + reg->CR91 = reg->CRTC[19] = 0xff & width; + reg->CR51 = (0x300 & width) >> 4; + reg->CR90 = 0x80 | (width >> 8); + reg->MiscOutReg |= 0x0c; /* Set frame buffer description. 
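The stride code above is a classic wider-than-one-register split: the row width, in 8-byte units, lands in three places - low byte in CR91 (and in classic CRTC[19]), bits 8-9 shifted into CR51, and the high bits again in CR90 together with a flag bit the driver always sets. A sketch of the split using the same shifts and masks; the 1280x32bpp numbers are only an example:

    #include <stdio.h>

    /* Split a row width (in 8-byte units) across the three Savage
     * stride registers exactly as savagefb_decode_var() does. */
    static void split_width(unsigned int width, unsigned char *cr91,
                            unsigned char *cr51, unsigned char *cr90)
    {
            *cr91 = width & 0xff;          /* low 8 bits */
            *cr51 = (width & 0x300) >> 4;  /* bits 8-9 at bits 4-5 */
            *cr90 = 0x80 | (width >> 8);   /* high bits plus flag */
    }

    int main(void)
    {
            /* 1280 pixels at 32bpp: (1280 * 4) >> 3 = 640 units */
            unsigned char cr91, cr51, cr90;

            split_width((1280 * 4) >> 3, &cr91, &cr51, &cr90);
            printf("CR91=%#x CR51=%#x CR90=%#x\n", cr91, cr51, cr90);
            return 0;
    }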
*/ if (var->bits_per_pixel <= 8) - par->CR50 = 0; + reg->CR50 = 0; else if (var->bits_per_pixel <= 16) - par->CR50 = 0x10; + reg->CR50 = 0x10; else - par->CR50 = 0x30; + reg->CR50 = 0x30; if (var->xres_virtual <= 640) - par->CR50 |= 0x40; + reg->CR50 |= 0x40; else if (var->xres_virtual == 800) - par->CR50 |= 0x80; + reg->CR50 |= 0x80; else if (var->xres_virtual == 1024) - par->CR50 |= 0x00; + reg->CR50 |= 0x00; else if (var->xres_virtual == 1152) - par->CR50 |= 0x01; + reg->CR50 |= 0x01; else if (var->xres_virtual == 1280) - par->CR50 |= 0xc0; + reg->CR50 |= 0xc0; else if (var->xres_virtual == 1600) - par->CR50 |= 0x81; + reg->CR50 |= 0x81; else - par->CR50 |= 0xc1; /* Use GBD */ + reg->CR50 |= 0xc1; /* Use GBD */ - if( par->chip == S3_SAVAGE2000 ) - par->CR33 = 0x08; + if (par->chip == S3_SAVAGE2000) + reg->CR33 = 0x08; else - par->CR33 = 0x20; + reg->CR33 = 0x20; - par->CRTC[0x17] = 0xeb; + reg->CRTC[0x17] = 0xeb; - par->CR67 |= 1; + reg->CR67 |= 1; vga_out8(0x3d4, 0x36, par); - par->CR36 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x68, par); - par->CR68 = vga_in8 (0x3d5, par); - par->CR69 = 0; - vga_out8 (0x3d4, 0x6f, par); - par->CR6F = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x86, par); - par->CR86 = vga_in8 (0x3d5, par); - vga_out8 (0x3d4, 0x88, par); - par->CR88 = vga_in8 (0x3d5, par) | 0x08; - vga_out8 (0x3d4, 0xb0, par); - par->CRB0 = vga_in8 (0x3d5, par) | 0x80; + reg->CR36 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x68, par); + reg->CR68 = vga_in8(0x3d5, par); + reg->CR69 = 0; + vga_out8(0x3d4, 0x6f, par); + reg->CR6F = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x86, par); + reg->CR86 = vga_in8(0x3d5, par); + vga_out8(0x3d4, 0x88, par); + reg->CR88 = vga_in8(0x3d5, par) | 0x08; + vga_out8(0x3d4, 0xb0, par); + reg->CRB0 = vga_in8(0x3d5, par) | 0x80; return 0; } @@ -1037,11 +1208,11 @@ static int savagefb_setcolreg(unsigned regno, switch (info->var.bits_per_pixel) { case 8: - vga_out8 (0x3c8, regno, par); + vga_out8(0x3c8, regno, par); - vga_out8 (0x3c9, red >> 10, par); - vga_out8 (0x3c9, green >> 10, par); - vga_out8 (0x3c9, blue >> 10, par); + vga_out8(0x3c9, red >> 10, par); + vga_out8(0x3c9, green >> 10, par); + vga_out8(0x3c9, blue >> 10, par); break; case 16: @@ -1075,21 +1246,21 @@ static int savagefb_setcolreg(unsigned regno, return 0; } -static void savagefb_set_par_int (struct savagefb_par *par) +static void savagefb_set_par_int(struct savagefb_par *par, struct savage_reg *reg) { unsigned char tmp, cr3a, cr66, cr67; - DBG ("savagefb_set_par_int"); + DBG("savagefb_set_par_int"); - par->SavageWaitIdle (par); + par->SavageWaitIdle(par); - vga_out8 (0x3c2, 0x23, par); + vga_out8(0x3c2, 0x23, par); - vga_out16 (0x3d4, 0x4838, par); - vga_out16 (0x3d4, 0xa539, par); - vga_out16 (0x3c4, 0x0608, par); + vga_out16(0x3d4, 0x4838, par); + vga_out16(0x3d4, 0xa539, par); + vga_out16(0x3c4, 0x0608, par); - vgaHWProtect (par, 1); + vgaHWProtect(par, 1); /* * Some Savage/MX and /IX systems go nuts when trying to exit the @@ -1099,203 +1270,202 @@ static void savagefb_set_par_int (struct savagefb_par *par) */ VerticalRetraceWait(par); - vga_out8 (0x3d4, 0x67, par); - cr67 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr67/*par->CR67*/ & ~0x0c, par); /* no STREAMS yet */ + vga_out8(0x3d4, 0x67, par); + cr67 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr67/*par->CR67*/ & ~0x0c, par); /* no STREAMS yet */ - vga_out8 (0x3d4, 0x23, par); - vga_out8 (0x3d5, 0x00, par); - vga_out8 (0x3d4, 0x26, par); - vga_out8 (0x3d5, 0x00, par); + vga_out8(0x3d4, 0x23, par); + vga_out8(0x3d5, 0x00, par); + 
vga_out8(0x3d4, 0x26, par); + vga_out8(0x3d5, 0x00, par); /* restore extended regs */ - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, par->CR66, par); - vga_out8 (0x3d4, 0x3a, par); - vga_out8 (0x3d5, par->CR3A, par); - vga_out8 (0x3d4, 0x31, par); - vga_out8 (0x3d5, par->CR31, par); - vga_out8 (0x3d4, 0x32, par); - vga_out8 (0x3d5, par->CR32, par); - vga_out8 (0x3d4, 0x58, par); - vga_out8 (0x3d5, par->CR58, par); - vga_out8 (0x3d4, 0x53, par); - vga_out8 (0x3d5, par->CR53 & 0x7f, par); - - vga_out16 (0x3c4, 0x0608, par); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, reg->CR66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, reg->CR3A, par); + vga_out8(0x3d4, 0x31, par); + vga_out8(0x3d5, reg->CR31, par); + vga_out8(0x3d4, 0x32, par); + vga_out8(0x3d5, reg->CR32, par); + vga_out8(0x3d4, 0x58, par); + vga_out8(0x3d5, reg->CR58, par); + vga_out8(0x3d4, 0x53, par); + vga_out8(0x3d5, reg->CR53 & 0x7f, par); + + vga_out16(0x3c4, 0x0608, par); /* Restore DCLK registers. */ - vga_out8 (0x3c4, 0x0e, par); - vga_out8 (0x3c5, par->SR0E, par); - vga_out8 (0x3c4, 0x0f, par); - vga_out8 (0x3c5, par->SR0F, par); - vga_out8 (0x3c4, 0x29, par); - vga_out8 (0x3c5, par->SR29, par); - vga_out8 (0x3c4, 0x15, par); - vga_out8 (0x3c5, par->SR15, par); + vga_out8(0x3c4, 0x0e, par); + vga_out8(0x3c5, reg->SR0E, par); + vga_out8(0x3c4, 0x0f, par); + vga_out8(0x3c5, reg->SR0F, par); + vga_out8(0x3c4, 0x29, par); + vga_out8(0x3c5, reg->SR29, par); + vga_out8(0x3c4, 0x15, par); + vga_out8(0x3c5, reg->SR15, par); /* Restore flat panel expansion registers. */ - if( par->chip == S3_SAVAGE_MX ) { + if (par->chip == S3_SAVAGE_MX) { int i; - for( i = 0; i < 8; i++ ) { - vga_out8 (0x3c4, 0x54+i, par); - vga_out8 (0x3c5, par->SR54[i], par); + for (i = 0; i < 8; i++) { + vga_out8(0x3c4, 0x54+i, par); + vga_out8(0x3c5, reg->SR54[i], par); } } - vgaHWRestore (par); + vgaHWRestore (par, reg); /* extended mode timing registers */ - vga_out8 (0x3d4, 0x53, par); - vga_out8 (0x3d5, par->CR53, par); - vga_out8 (0x3d4, 0x5d, par); - vga_out8 (0x3d5, par->CR5D, par); - vga_out8 (0x3d4, 0x5e, par); - vga_out8 (0x3d5, par->CR5E, par); - vga_out8 (0x3d4, 0x3b, par); - vga_out8 (0x3d5, par->CR3B, par); - vga_out8 (0x3d4, 0x3c, par); - vga_out8 (0x3d5, par->CR3C, par); - vga_out8 (0x3d4, 0x43, par); - vga_out8 (0x3d5, par->CR43, par); - vga_out8 (0x3d4, 0x65, par); - vga_out8 (0x3d5, par->CR65, par); + vga_out8(0x3d4, 0x53, par); + vga_out8(0x3d5, reg->CR53, par); + vga_out8(0x3d4, 0x5d, par); + vga_out8(0x3d5, reg->CR5D, par); + vga_out8(0x3d4, 0x5e, par); + vga_out8(0x3d5, reg->CR5E, par); + vga_out8(0x3d4, 0x3b, par); + vga_out8(0x3d5, reg->CR3B, par); + vga_out8(0x3d4, 0x3c, par); + vga_out8(0x3d5, reg->CR3C, par); + vga_out8(0x3d4, 0x43, par); + vga_out8(0x3d5, reg->CR43, par); + vga_out8(0x3d4, 0x65, par); + vga_out8(0x3d5, reg->CR65, par); /* restore the desired video mode with cr67 */ - vga_out8 (0x3d4, 0x67, par); + vga_out8(0x3d4, 0x67, par); /* following part not present in X11 driver */ - cr67 = vga_in8 (0x3d5, par) & 0xf; - vga_out8 (0x3d5, 0x50 | cr67, par); - udelay (10000); - vga_out8 (0x3d4, 0x67, par); + cr67 = vga_in8(0x3d5, par) & 0xf; + vga_out8(0x3d5, 0x50 | cr67, par); + udelay(10000); + vga_out8(0x3d4, 0x67, par); /* end of part */ - vga_out8 (0x3d5, par->CR67 & ~0x0c, par); + vga_out8(0x3d5, reg->CR67 & ~0x0c, par); /* other mode timing and extended regs */ - vga_out8 (0x3d4, 0x34, par); - vga_out8 (0x3d5, par->CR34, par); - vga_out8 (0x3d4, 0x40, par); - vga_out8 (0x3d5, par->CR40, par); - vga_out8
(0x3d4, 0x42, par); - vga_out8 (0x3d5, par->CR42, par); - vga_out8 (0x3d4, 0x45, par); - vga_out8 (0x3d5, par->CR45, par); - vga_out8 (0x3d4, 0x50, par); - vga_out8 (0x3d5, par->CR50, par); - vga_out8 (0x3d4, 0x51, par); - vga_out8 (0x3d5, par->CR51, par); + vga_out8(0x3d4, 0x34, par); + vga_out8(0x3d5, reg->CR34, par); + vga_out8(0x3d4, 0x40, par); + vga_out8(0x3d5, reg->CR40, par); + vga_out8(0x3d4, 0x42, par); + vga_out8(0x3d5, reg->CR42, par); + vga_out8(0x3d4, 0x45, par); + vga_out8(0x3d5, reg->CR45, par); + vga_out8(0x3d4, 0x50, par); + vga_out8(0x3d5, reg->CR50, par); + vga_out8(0x3d4, 0x51, par); + vga_out8(0x3d5, reg->CR51, par); /* memory timings */ - vga_out8 (0x3d4, 0x36, par); - vga_out8 (0x3d5, par->CR36, par); - vga_out8 (0x3d4, 0x60, par); - vga_out8 (0x3d5, par->CR60, par); - vga_out8 (0x3d4, 0x68, par); - vga_out8 (0x3d5, par->CR68, par); - vga_out8 (0x3d4, 0x69, par); - vga_out8 (0x3d5, par->CR69, par); - vga_out8 (0x3d4, 0x6f, par); - vga_out8 (0x3d5, par->CR6F, par); - - vga_out8 (0x3d4, 0x33, par); - vga_out8 (0x3d5, par->CR33, par); - vga_out8 (0x3d4, 0x86, par); - vga_out8 (0x3d5, par->CR86, par); - vga_out8 (0x3d4, 0x88, par); - vga_out8 (0x3d5, par->CR88, par); - vga_out8 (0x3d4, 0x90, par); - vga_out8 (0x3d5, par->CR90, par); - vga_out8 (0x3d4, 0x91, par); - vga_out8 (0x3d5, par->CR91, par); + vga_out8(0x3d4, 0x36, par); + vga_out8(0x3d5, reg->CR36, par); + vga_out8(0x3d4, 0x60, par); + vga_out8(0x3d5, reg->CR60, par); + vga_out8(0x3d4, 0x68, par); + vga_out8(0x3d5, reg->CR68, par); + vga_out8(0x3d4, 0x69, par); + vga_out8(0x3d5, reg->CR69, par); + vga_out8(0x3d4, 0x6f, par); + vga_out8(0x3d5, reg->CR6F, par); + + vga_out8(0x3d4, 0x33, par); + vga_out8(0x3d5, reg->CR33, par); + vga_out8(0x3d4, 0x86, par); + vga_out8(0x3d5, reg->CR86, par); + vga_out8(0x3d4, 0x88, par); + vga_out8(0x3d5, reg->CR88, par); + vga_out8(0x3d4, 0x90, par); + vga_out8(0x3d5, reg->CR90, par); + vga_out8(0x3d4, 0x91, par); + vga_out8(0x3d5, reg->CR91, par); if (par->chip == S3_SAVAGE4) { - vga_out8 (0x3d4, 0xb0, par); - vga_out8 (0x3d5, par->CRB0, par); + vga_out8(0x3d4, 0xb0, par); + vga_out8(0x3d5, reg->CRB0, par); } - vga_out8 (0x3d4, 0x32, par); - vga_out8 (0x3d5, par->CR32, par); + vga_out8(0x3d4, 0x32, par); + vga_out8(0x3d5, reg->CR32, par); /* unlock extended seq regs */ - vga_out8 (0x3c4, 0x08, par); - vga_out8 (0x3c5, 0x06, par); + vga_out8(0x3c4, 0x08, par); + vga_out8(0x3c5, 0x06, par); /* Restore extended sequencer regs for MCLK. SR10 == 255 indicates * that we should leave the default SR10 and SR11 values there. 
*/ - if (par->SR10 != 255) { - vga_out8 (0x3c4, 0x10, par); - vga_out8 (0x3c5, par->SR10, par); - vga_out8 (0x3c4, 0x11, par); - vga_out8 (0x3c5, par->SR11, par); + if (reg->SR10 != 255) { + vga_out8(0x3c4, 0x10, par); + vga_out8(0x3c5, reg->SR10, par); + vga_out8(0x3c4, 0x11, par); + vga_out8(0x3c5, reg->SR11, par); } /* restore extended seq regs for dclk */ - vga_out8 (0x3c4, 0x0e, par); - vga_out8 (0x3c5, par->SR0E, par); - vga_out8 (0x3c4, 0x0f, par); - vga_out8 (0x3c5, par->SR0F, par); - vga_out8 (0x3c4, 0x12, par); - vga_out8 (0x3c5, par->SR12, par); - vga_out8 (0x3c4, 0x13, par); - vga_out8 (0x3c5, par->SR13, par); - vga_out8 (0x3c4, 0x29, par); - vga_out8 (0x3c5, par->SR29, par); - - vga_out8 (0x3c4, 0x18, par); - vga_out8 (0x3c5, par->SR18, par); + vga_out8(0x3c4, 0x0e, par); + vga_out8(0x3c5, reg->SR0E, par); + vga_out8(0x3c4, 0x0f, par); + vga_out8(0x3c5, reg->SR0F, par); + vga_out8(0x3c4, 0x12, par); + vga_out8(0x3c5, reg->SR12, par); + vga_out8(0x3c4, 0x13, par); + vga_out8(0x3c5, reg->SR13, par); + vga_out8(0x3c4, 0x29, par); + vga_out8(0x3c5, reg->SR29, par); + vga_out8(0x3c4, 0x18, par); + vga_out8(0x3c5, reg->SR18, par); /* load new m, n pll values for dclk & mclk */ - vga_out8 (0x3c4, 0x15, par); - tmp = vga_in8 (0x3c5, par) & ~0x21; + vga_out8(0x3c4, 0x15, par); + tmp = vga_in8(0x3c5, par) & ~0x21; - vga_out8 (0x3c5, tmp | 0x03, par); - vga_out8 (0x3c5, tmp | 0x23, par); - vga_out8 (0x3c5, tmp | 0x03, par); - vga_out8 (0x3c5, par->SR15, par); - udelay (100); + vga_out8(0x3c5, tmp | 0x03, par); + vga_out8(0x3c5, tmp | 0x23, par); + vga_out8(0x3c5, tmp | 0x03, par); + vga_out8(0x3c5, reg->SR15, par); + udelay(100); - vga_out8 (0x3c4, 0x30, par); - vga_out8 (0x3c5, par->SR30, par); - vga_out8 (0x3c4, 0x08, par); - vga_out8 (0x3c5, par->SR08, par); + vga_out8(0x3c4, 0x30, par); + vga_out8(0x3c5, reg->SR30, par); + vga_out8(0x3c4, 0x08, par); + vga_out8(0x3c5, reg->SR08, par); /* now write out cr67 in full, possibly starting STREAMS */ VerticalRetraceWait(par); - vga_out8 (0x3d4, 0x67, par); - vga_out8 (0x3d5, par->CR67, par); + vga_out8(0x3d4, 0x67, par); + vga_out8(0x3d5, reg->CR67, par); - vga_out8 (0x3d4, 0x66, par); - cr66 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr66 | 0x80, par); - vga_out8 (0x3d4, 0x3a, par); - cr3a = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr3a | 0x80, par); + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x80, par); + vga_out8(0x3d4, 0x3a, par); + cr3a = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3a | 0x80, par); if (par->chip != S3_SAVAGE_MX) { VerticalRetraceWait(par); - savage_out32 (FIFO_CONTROL_REG, par->MMPR0, par); - par->SavageWaitIdle (par); - savage_out32 (MIU_CONTROL_REG, par->MMPR1, par); - par->SavageWaitIdle (par); - savage_out32 (STREAMS_TIMEOUT_REG, par->MMPR2, par); - par->SavageWaitIdle (par); - savage_out32 (MISC_TIMEOUT_REG, par->MMPR3, par); + savage_out32(FIFO_CONTROL_REG, reg->MMPR0, par); + par->SavageWaitIdle(par); + savage_out32(MIU_CONTROL_REG, reg->MMPR1, par); + par->SavageWaitIdle(par); + savage_out32(STREAMS_TIMEOUT_REG, reg->MMPR2, par); + par->SavageWaitIdle(par); + savage_out32(MISC_TIMEOUT_REG, reg->MMPR3, par); } - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, cr66, par); - vga_out8 (0x3d4, 0x3a, par); - vga_out8 (0x3d5, cr3a, par); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66, par); + vga_out8(0x3d4, 0x3a, par); + vga_out8(0x3d5, cr3a, par); - SavageSetup2DEngine (par); - vgaHWProtect (par, 0); + SavageSetup2DEngine(par); + vgaHWProtect(par, 0); } -static void 
savagefb_update_start (struct savagefb_par *par, - struct fb_var_screeninfo *var) +static void savagefb_update_start(struct savagefb_par *par, + struct fb_var_screeninfo *var) { int base; @@ -1305,8 +1475,8 @@ static void savagefb_update_start (struct savagefb_par *par, /* now program the start address registers */ vga_out16(0x3d4, (base & 0x00ff00) | 0x0c, par); vga_out16(0x3d4, ((base & 0x00ff) << 8) | 0x0d, par); - vga_out8 (0x3d4, 0x69, par); - vga_out8 (0x3d5, (base & 0x7f0000) >> 16, par); + vga_out8(0x3d4, 0x69, par); + vga_out8(0x3d5, (base & 0x7f0000) >> 16, par); } @@ -1325,29 +1495,14 @@ static void savagefb_set_fix(struct fb_info *info) } -#if defined(CONFIG_FB_SAVAGE_ACCEL) -static void savagefb_set_clip(struct fb_info *info) -{ - struct savagefb_par *par = info->par; - int cmd; - - cmd = BCI_CMD_NOP | BCI_CMD_CLIP_NEW; - par->bci_ptr = 0; - par->SavageWaitFifo(par,3); - BCI_SEND(cmd); - BCI_SEND(BCI_CLIP_TL(0, 0)); - BCI_SEND(BCI_CLIP_BR(0xfff, 0xfff)); -} -#endif - -static int savagefb_set_par (struct fb_info *info) +static int savagefb_set_par(struct fb_info *info) { struct savagefb_par *par = info->par; struct fb_var_screeninfo *var = &info->var; int err; DBG("savagefb_set_par"); - err = savagefb_decode_var (var, par); + err = savagefb_decode_var(var, par, &par->state); if (err) return err; @@ -1366,8 +1521,8 @@ static int savagefb_set_par (struct fb_info *info) par->maxClock = par->dacSpeedBpp; par->minClock = 10000; - savagefb_set_par_int (par); - fb_set_cmap (&info->cmap, info); + savagefb_set_par_int(par, &par->state); + fb_set_cmap(&info->cmap, info); savagefb_set_fix(info); savagefb_set_clip(info); @@ -1378,12 +1533,12 @@ static int savagefb_set_par (struct fb_info *info) /* * Pan or Wrap the Display */ -static int savagefb_pan_display (struct fb_var_screeninfo *var, - struct fb_info *info) +static int savagefb_pan_display(struct fb_var_screeninfo *var, + struct fb_info *info) { struct savagefb_par *par = info->par; - savagefb_update_start (par, var); + savagefb_update_start(par, var); return 0; } @@ -1440,6 +1595,22 @@ static int savagefb_blank(int blank, struct fb_info *info) return (blank == FB_BLANK_NORMAL) ? 
1 : 0; } +static void savagefb_save_state(struct fb_info *info) +{ + struct savagefb_par *par = info->par; + + savage_get_default_par(par, &par->save); +} + +static void savagefb_restore_state(struct fb_info *info) +{ + struct savagefb_par *par = info->par; + + savagefb_blank(FB_BLANK_POWERDOWN, info); + savage_set_default_par(par, &par->save); + savagefb_blank(FB_BLANK_UNBLANK, info); +} + static struct fb_ops savagefb_ops = { .owner = THIS_MODULE, .fb_check_var = savagefb_check_var, @@ -1447,6 +1618,8 @@ static struct fb_ops savagefb_ops = { .fb_setcolreg = savagefb_setcolreg, .fb_pan_display = savagefb_pan_display, .fb_blank = savagefb_blank, + .fb_save_state = savagefb_save_state, + .fb_restore_state = savagefb_restore_state, #if defined(CONFIG_FB_SAVAGE_ACCEL) .fb_fillrect = savagefb_fillrect, .fb_copyarea = savagefb_copyarea, @@ -1479,59 +1652,59 @@ static struct fb_var_screeninfo __devinitdata savagefb_var800x600x8 = { .vmode = FB_VMODE_NONINTERLACED }; -static void savage_enable_mmio (struct savagefb_par *par) +static void savage_enable_mmio(struct savagefb_par *par) { unsigned char val; - DBG ("savage_enable_mmio\n"); + DBG("savage_enable_mmio\n"); - val = vga_in8 (0x3c3, par); - vga_out8 (0x3c3, val | 0x01, par); - val = vga_in8 (0x3cc, par); - vga_out8 (0x3c2, val | 0x01, par); + val = vga_in8(0x3c3, par); + vga_out8(0x3c3, val | 0x01, par); + val = vga_in8(0x3cc, par); + vga_out8(0x3c2, val | 0x01, par); if (par->chip >= S3_SAVAGE4) { - vga_out8 (0x3d4, 0x40, par); - val = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, val | 1, par); + vga_out8(0x3d4, 0x40, par); + val = vga_in8(0x3d5, par); + vga_out8(0x3d5, val | 1, par); } } -static void savage_disable_mmio (struct savagefb_par *par) +static void savage_disable_mmio(struct savagefb_par *par) { unsigned char val; - DBG ("savage_disable_mmio\n"); + DBG("savage_disable_mmio\n"); - if(par->chip >= S3_SAVAGE4 ) { - vga_out8 (0x3d4, 0x40, par); - val = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, val | 1, par); + if (par->chip >= S3_SAVAGE4) { + vga_out8(0x3d4, 0x40, par); + val = vga_in8(0x3d5, par); + vga_out8(0x3d5, val | 1, par); } } -static int __devinit savage_map_mmio (struct fb_info *info) +static int __devinit savage_map_mmio(struct fb_info *info) { struct savagefb_par *par = info->par; - DBG ("savage_map_mmio"); + DBG("savage_map_mmio"); - if (S3_SAVAGE3D_SERIES (par->chip)) - par->mmio.pbase = pci_resource_start (par->pcidev, 0) + + if (S3_SAVAGE3D_SERIES(par->chip)) + par->mmio.pbase = pci_resource_start(par->pcidev, 0) + SAVAGE_NEWMMIO_REGBASE_S3; else - par->mmio.pbase = pci_resource_start (par->pcidev, 0) + + par->mmio.pbase = pci_resource_start(par->pcidev, 0) + SAVAGE_NEWMMIO_REGBASE_S4; par->mmio.len = SAVAGE_NEWMMIO_REGSIZE; - par->mmio.vbase = ioremap (par->mmio.pbase, par->mmio.len); + par->mmio.vbase = ioremap(par->mmio.pbase, par->mmio.len); if (!par->mmio.vbase) { - printk ("savagefb: unable to map memory mapped IO\n"); + printk("savagefb: unable to map memory mapped IO\n"); return -ENOMEM; } else - printk (KERN_INFO "savagefb: mapped io at %p\n", + printk(KERN_INFO "savagefb: mapped io at %p\n", par->mmio.vbase); info->fix.mmio_start = par->mmio.pbase; @@ -1540,15 +1713,15 @@ static int __devinit savage_map_mmio (struct fb_info *info) par->bci_base = (u32 __iomem *)(par->mmio.vbase + BCI_BUFFER_OFFSET); par->bci_ptr = 0; - savage_enable_mmio (par); + savage_enable_mmio(par); return 0; } -static void savage_unmap_mmio (struct fb_info *info) +static void savage_unmap_mmio(struct fb_info *info) { struct savagefb_par 
*par = info->par; - DBG ("savage_unmap_mmio"); + DBG("savage_unmap_mmio"); savage_disable_mmio(par); @@ -1558,46 +1731,46 @@ static void savage_unmap_mmio (struct fb_info *info) } } -static int __devinit savage_map_video (struct fb_info *info, - int video_len) +static int __devinit savage_map_video(struct fb_info *info, + int video_len) { struct savagefb_par *par = info->par; int resource; DBG("savage_map_video"); - if (S3_SAVAGE3D_SERIES (par->chip)) + if (S3_SAVAGE3D_SERIES(par->chip)) resource = 0; else resource = 1; - par->video.pbase = pci_resource_start (par->pcidev, resource); + par->video.pbase = pci_resource_start(par->pcidev, resource); par->video.len = video_len; - par->video.vbase = ioremap (par->video.pbase, par->video.len); + par->video.vbase = ioremap(par->video.pbase, par->video.len); if (!par->video.vbase) { - printk ("savagefb: unable to map screen memory\n"); + printk("savagefb: unable to map screen memory\n"); return -ENOMEM; } else - printk (KERN_INFO "savagefb: mapped framebuffer at %p, " - "pbase == %x\n", par->video.vbase, par->video.pbase); + printk(KERN_INFO "savagefb: mapped framebuffer at %p, " + "pbase == %x\n", par->video.vbase, par->video.pbase); info->fix.smem_start = par->video.pbase; info->fix.smem_len = par->video.len - par->cob_size; info->screen_base = par->video.vbase; #ifdef CONFIG_MTRR - par->video.mtrr = mtrr_add (par->video.pbase, video_len, - MTRR_TYPE_WRCOMB, 1); + par->video.mtrr = mtrr_add(par->video.pbase, video_len, + MTRR_TYPE_WRCOMB, 1); #endif /* Clear framebuffer, it's all white in memory after boot */ - memset_io (par->video.vbase, 0, par->video.len); + memset_io(par->video.vbase, 0, par->video.len); return 0; } -static void savage_unmap_video (struct fb_info *info) +static void savage_unmap_video(struct fb_info *info) { struct savagefb_par *par = info->par; @@ -1605,16 +1778,16 @@ static void savage_unmap_video (struct fb_info *info) if (par->video.vbase) { #ifdef CONFIG_MTRR - mtrr_del (par->video.mtrr, par->video.pbase, par->video.len); + mtrr_del(par->video.mtrr, par->video.pbase, par->video.len); #endif - iounmap (par->video.vbase); + iounmap(par->video.vbase); par->video.vbase = NULL; info->screen_base = NULL; } } -static int savage_init_hw (struct savagefb_par *par) +static int savage_init_hw(struct savagefb_par *par) { unsigned char config1, m, n, n1, n2, sr8, cr3f, cr66 = 0, tmp; @@ -1656,7 +1829,7 @@ static int savage_init_hw (struct savagefb_par *par) switch (par->chip) { case S3_SAVAGE3D: - videoRam = RamSavage3D[ (config1 & 0xC0) >> 6 ] * 1024; + videoRam = RamSavage3D[(config1 & 0xC0) >> 6 ] * 1024; break; case S3_SAVAGE4: @@ -1667,22 +1840,22 @@ static int savage_init_hw (struct savagefb_par *par) * can do it different... 
*/ vga_out8(0x3d4, 0x68, par); /* memory control 1 */ - if( (vga_in8(0x3d5, par) & 0xC0) == (0x01 << 6) ) + if ((vga_in8(0x3d5, par) & 0xC0) == (0x01 << 6)) RamSavage4[1] = 8; /*FALLTHROUGH*/ case S3_SAVAGE2000: - videoRam = RamSavage4[ (config1 & 0xE0) >> 5 ] * 1024; + videoRam = RamSavage4[(config1 & 0xE0) >> 5] * 1024; break; case S3_SAVAGE_MX: case S3_SUPERSAVAGE: - videoRam = RamSavageMX[ (config1 & 0x0E) >> 1 ] * 1024; + videoRam = RamSavageMX[(config1 & 0x0E) >> 1] * 1024; break; case S3_PROSAVAGE: - videoRam = RamSavageNB[ (config1 & 0xE0) >> 5 ] * 1024; + videoRam = RamSavageNB[(config1 & 0xE0) >> 5] * 1024; break; default: @@ -1693,31 +1866,31 @@ static int savage_init_hw (struct savagefb_par *par) videoRambytes = videoRam * 1024; - printk (KERN_INFO "savagefb: probed videoram: %dk\n", videoRam); + printk(KERN_INFO "savagefb: probed videoram: %dk\n", videoRam); /* reset graphics engine to avoid memory corruption */ - vga_out8 (0x3d4, 0x66, par); - cr66 = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr66 | 0x02, par); - udelay (10000); + vga_out8(0x3d4, 0x66, par); + cr66 = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr66 | 0x02, par); + udelay(10000); - vga_out8 (0x3d4, 0x66, par); - vga_out8 (0x3d5, cr66 & ~0x02, par); /* clear reset flag */ - udelay (10000); + vga_out8(0x3d4, 0x66, par); + vga_out8(0x3d5, cr66 & ~0x02, par); /* clear reset flag */ + udelay(10000); /* * reset memory interface, 3D engine, AGP master, PCI master, * master engine unit, motion compensation/LPB */ - vga_out8 (0x3d4, 0x3f, par); - cr3f = vga_in8 (0x3d5, par); - vga_out8 (0x3d5, cr3f | 0x08, par); - udelay (10000); + vga_out8(0x3d4, 0x3f, par); + cr3f = vga_in8(0x3d5, par); + vga_out8(0x3d5, cr3f | 0x08, par); + udelay(10000); - vga_out8 (0x3d4, 0x3f, par); - vga_out8 (0x3d5, cr3f & ~0x08, par); /* clear reset flags */ - udelay (10000); + vga_out8(0x3d4, 0x3f, par); + vga_out8(0x3d5, cr3f & ~0x08, par); /* clear reset flags */ + udelay(10000); /* Savage ramdac speeds */ par->numClocks = 4; @@ -1740,7 +1913,7 @@ static int savage_init_hw (struct savagefb_par *par) n1 = n & 0x1f; n2 = (n >> 5) & 0x03; par->MCLK = ((1431818 * (m+2)) / (n1+2) / (1 << n2) + 50) / 100; - printk (KERN_INFO "savagefb: Detected current MCLK value of %d kHz\n", + printk(KERN_INFO "savagefb: Detected current MCLK value of %d kHz\n", par->MCLK); /* check for DVI/flat panel */ @@ -1769,12 +1942,12 @@ static int savage_init_hw (struct savagefb_par *par) /* Check LCD panel information */ if (par->display_type == DISP_LCD) { - unsigned char cr6b = VGArCR( 0x6b, par); + unsigned char cr6b = VGArCR(0x6b, par); - int panelX = (VGArSEQ (0x61, par) + - ((VGArSEQ (0x66, par) & 0x02) << 7) + 1) * 8; - int panelY = (VGArSEQ (0x69, par) + - ((VGArSEQ (0x6e, par) & 0x70) << 4) + 1); + int panelX = (VGArSEQ(0x61, par) + + ((VGArSEQ(0x66, par) & 0x02) << 7) + 1) * 8; + int panelY = (VGArSEQ(0x69, par) + + ((VGArSEQ(0x6e, par) & 0x70) << 4) + 1); char * sTechnology = "Unknown"; @@ -1796,26 +1969,26 @@ static int savage_init_hw (struct savagefb_par *par) ActiveDUO = 0x80 }; - if ((VGArSEQ (0x39, par) & 0x03) == 0) { + if ((VGArSEQ(0x39, par) & 0x03) == 0) { sTechnology = "TFT"; - } else if ((VGArSEQ (0x30, par) & 0x01) == 0) { + } else if ((VGArSEQ(0x30, par) & 0x01) == 0) { sTechnology = "DSTN"; } else { sTechnology = "STN"; } - printk (KERN_INFO "savagefb: %dx%d %s LCD panel detected %s\n", - panelX, panelY, sTechnology, - cr6b & ActiveLCD ?
"and active" : "but not active"); + printk(KERN_INFO "savagefb: %dx%d %s LCD panel detected %s\n", + panelX, panelY, sTechnology, + cr6b & ActiveLCD ? "and active" : "but not active"); - if( cr6b & ActiveLCD ) { + if (cr6b & ActiveLCD) { /* * If the LCD is active and panel expansion is enabled, * we probably want to kill the HW cursor. */ - printk (KERN_INFO "savagefb: Limiting video mode to " - "%dx%d\n", panelX, panelY ); + printk(KERN_INFO "savagefb: Limiting video mode to " + "%dx%d\n", panelX, panelY); par->SavagePanelWidth = panelX; par->SavagePanelHeight = panelY; @@ -1824,9 +1997,10 @@ static int savage_init_hw (struct savagefb_par *par) par->display_type = DISP_CRT; } - savage_get_default_par (par); + savage_get_default_par(par, &par->state); + par->save = par->state; - if( S3_SAVAGE4_SERIES(par->chip) ) { + if (S3_SAVAGE4_SERIES(par->chip)) { /* * The Savage4 and ProSavage have COB coherency bugs which * render the buffer useless. We disable it. @@ -1845,9 +2019,9 @@ static int savage_init_hw (struct savagefb_par *par) return videoRambytes; } -static int __devinit savage_init_fb_info (struct fb_info *info, - struct pci_dev *dev, - const struct pci_device_id *id) +static int __devinit savage_init_fb_info(struct fb_info *info, + struct pci_dev *dev, + const struct pci_device_id *id) { struct savagefb_par *par = info->par; int err = 0; @@ -1863,63 +2037,63 @@ static int __devinit savage_init_fb_info (struct fb_info *info, switch (info->fix.accel) { case FB_ACCEL_SUPERSAVAGE: par->chip = S3_SUPERSAVAGE; - snprintf (info->fix.id, 16, "SuperSavage"); + snprintf(info->fix.id, 16, "SuperSavage"); break; case FB_ACCEL_SAVAGE4: par->chip = S3_SAVAGE4; - snprintf (info->fix.id, 16, "Savage4"); + snprintf(info->fix.id, 16, "Savage4"); break; case FB_ACCEL_SAVAGE3D: par->chip = S3_SAVAGE3D; - snprintf (info->fix.id, 16, "Savage3D"); + snprintf(info->fix.id, 16, "Savage3D"); break; case FB_ACCEL_SAVAGE3D_MV: par->chip = S3_SAVAGE3D; - snprintf (info->fix.id, 16, "Savage3D-MV"); + snprintf(info->fix.id, 16, "Savage3D-MV"); break; case FB_ACCEL_SAVAGE2000: par->chip = S3_SAVAGE2000; - snprintf (info->fix.id, 16, "Savage2000"); + snprintf(info->fix.id, 16, "Savage2000"); break; case FB_ACCEL_SAVAGE_MX_MV: par->chip = S3_SAVAGE_MX; - snprintf (info->fix.id, 16, "Savage/MX-MV"); + snprintf(info->fix.id, 16, "Savage/MX-MV"); break; case FB_ACCEL_SAVAGE_MX: par->chip = S3_SAVAGE_MX; - snprintf (info->fix.id, 16, "Savage/MX"); + snprintf(info->fix.id, 16, "Savage/MX"); break; case FB_ACCEL_SAVAGE_IX_MV: par->chip = S3_SAVAGE_MX; - snprintf (info->fix.id, 16, "Savage/IX-MV"); + snprintf(info->fix.id, 16, "Savage/IX-MV"); break; case FB_ACCEL_SAVAGE_IX: par->chip = S3_SAVAGE_MX; - snprintf (info->fix.id, 16, "Savage/IX"); + snprintf(info->fix.id, 16, "Savage/IX"); break; case FB_ACCEL_PROSAVAGE_PM: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "ProSavagePM"); + snprintf(info->fix.id, 16, "ProSavagePM"); break; case FB_ACCEL_PROSAVAGE_KM: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "ProSavageKM"); + snprintf(info->fix.id, 16, "ProSavageKM"); break; case FB_ACCEL_S3TWISTER_P: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "TwisterP"); + snprintf(info->fix.id, 16, "TwisterP"); break; case FB_ACCEL_S3TWISTER_K: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "TwisterK"); + snprintf(info->fix.id, 16, "TwisterK"); break; case FB_ACCEL_PROSAVAGE_DDR: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "ProSavageDDR"); + snprintf(info->fix.id, 16, "ProSavageDDR"); 
break; case FB_ACCEL_PROSAVAGE_DDRK: par->chip = S3_PROSAVAGE; - snprintf (info->fix.id, 16, "ProSavage8"); + snprintf(info->fix.id, 16, "ProSavage8"); break; } @@ -1960,7 +2134,7 @@ static int __devinit savage_init_fb_info (struct fb_info *info, info->pixmap.buf_align = 4; info->pixmap.access_align = 32; - err = fb_alloc_cmap (&info->cmap, NR_PALETTE, 0); + err = fb_alloc_cmap(&info->cmap, NR_PALETTE, 0); if (!err) info->flags |= FBINFO_HWACCEL_COPYAREA | FBINFO_HWACCEL_FILLRECT | @@ -1972,8 +2146,8 @@ static int __devinit savage_init_fb_info (struct fb_info *info, /* --------------------------------------------------------------------- */ -static int __devinit savagefb_probe (struct pci_dev* dev, - const struct pci_device_id* id) +static int __devinit savagefb_probe(struct pci_dev* dev, + const struct pci_device_id* id) { struct fb_info *info; struct savagefb_par *par; @@ -2085,12 +2259,12 @@ static int __devinit savagefb_probe (struct pci_dev* dev, fb_destroy_modedb(info->monspecs.modedb); info->monspecs.modedb = NULL; - err = register_framebuffer (info); + err = register_framebuffer(info); if (err < 0) goto failed; - printk (KERN_INFO "fb: S3 %s frame buffer device\n", - info->fix.id); + printk(KERN_INFO "fb: S3 %s frame buffer device\n", + info->fix.id); /* * Our driver data @@ -2103,10 +2277,10 @@ static int __devinit savagefb_probe (struct pci_dev* dev, #ifdef CONFIG_FB_SAVAGE_I2C savagefb_delete_i2c_busses(info); #endif - fb_alloc_cmap (&info->cmap, 0, 0); + fb_alloc_cmap(&info->cmap, 0, 0); savage_unmap_video(info); failed_video: - savage_unmap_mmio (info); + savage_unmap_mmio(info); failed_mmio: kfree(info->pixmap.addr); failed_init: @@ -2117,7 +2291,7 @@ static int __devinit savagefb_probe (struct pci_dev* dev, return err; } -static void __devexit savagefb_remove (struct pci_dev *dev) +static void __devexit savagefb_remove(struct pci_dev *dev) { struct fb_info *info = pci_get_drvdata(dev); @@ -2129,16 +2303,16 @@ static void __devexit savagefb_remove (struct pci_dev *dev) * we will be leaving hooks that could cause * oopsen laying around. */ - if (unregister_framebuffer (info)) - printk (KERN_WARNING "savagefb: danger danger! " - "Oopsen imminent!\n"); + if (unregister_framebuffer(info)) + printk(KERN_WARNING "savagefb: danger danger! 
" + "Oopsen imminent!\n"); #ifdef CONFIG_FB_SAVAGE_I2C savagefb_delete_i2c_busses(info); #endif - fb_alloc_cmap (&info->cmap, 0, 0); - savage_unmap_video (info); - savage_unmap_mmio (info); + fb_alloc_cmap(&info->cmap, 0, 0); + savage_unmap_video(info); + savage_unmap_mmio(info); kfree(info->pixmap.addr); pci_release_regions(dev); framebuffer_release(info); @@ -2151,7 +2325,7 @@ static void __devexit savagefb_remove (struct pci_dev *dev) } } -static int savagefb_suspend (struct pci_dev* dev, pm_message_t state) +static int savagefb_suspend(struct pci_dev* dev, pm_message_t state) { struct fb_info *info = pci_get_drvdata(dev); struct savagefb_par *par = info->par; @@ -2177,6 +2351,7 @@ static int savagefb_suspend (struct pci_dev* dev, pm_message_t state) info->fbops->fb_sync(info); savagefb_blank(FB_BLANK_POWERDOWN, info); + savage_set_default_par(par, &par->save); savage_disable_mmio(par); pci_save_state(dev); pci_disable_device(dev); @@ -2186,7 +2361,7 @@ static int savagefb_suspend (struct pci_dev* dev, pm_message_t state) return 0; } -static int savagefb_resume (struct pci_dev* dev) +static int savagefb_resume(struct pci_dev* dev) { struct fb_info *info = pci_get_drvdata(dev); struct savagefb_par *par = info->par; @@ -2210,15 +2385,15 @@ static int savagefb_resume (struct pci_dev* dev) pci_set_power_state(dev, PCI_D0); pci_restore_state(dev); - if(pci_enable_device(dev)) + if (pci_enable_device(dev)) DBG("err"); pci_set_master(dev); savage_enable_mmio(par); savage_init_hw(par); - savagefb_set_par (info); + savagefb_set_par(info); + fb_set_suspend(info, 0); savagefb_blank(FB_BLANK_UNBLANK, info); - fb_set_suspend (info, 0); release_console_sem(); return 0; @@ -2311,10 +2486,10 @@ static struct pci_driver savagefb_driver = { /* **************************** exit-time only **************************** */ -static void __exit savage_done (void) +static void __exit savage_done(void) { DBG("savage_done"); - pci_unregister_driver (&savagefb_driver); + pci_unregister_driver(&savagefb_driver); } @@ -2345,7 +2520,7 @@ static int __init savagefb_init(void) return -ENODEV; savagefb_setup(option); - return pci_register_driver (&savagefb_driver); + return pci_register_driver(&savagefb_driver); } diff --git a/drivers/video/sis/sis_main.c b/drivers/video/sis/sis_main.c index 8adf5bf91ee..c63c0e721b8 100644 --- a/drivers/video/sis/sis_main.c +++ b/drivers/video/sis/sis_main.c @@ -275,7 +275,7 @@ sisfb_search_mode(char *name, BOOLEAN quiet) static void __devinit sisfb_get_vga_mode_from_kernel(void) { -#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_VIDEO_SELECT) +#ifdef CONFIG_X86 char mymode[32]; int mydepth = screen_info.lfb_depth; diff --git a/drivers/video/skeletonfb.c b/drivers/video/skeletonfb.c index 9b707771d75..67f429e9318 100644 --- a/drivers/video/skeletonfb.c +++ b/drivers/video/skeletonfb.c @@ -906,11 +906,6 @@ static void __exit xxxfb_exit(void) } #endif -MODULE_LICENSE("GPL"); -module_init(xxxfb_init); -module_exit(xxxfb_exit); - - /* * Setup */ diff --git a/drivers/video/tgafb.c b/drivers/video/tgafb.c index 7398bd48ba6..6c2c78ab982 100644 --- a/drivers/video/tgafb.c +++ b/drivers/video/tgafb.c @@ -26,7 +26,6 @@ #include <linux/selection.h> #include <asm/io.h> #include <video/tgafb.h> -#include <linux/selection.h> /* * Local functions. 
diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index b0b9acfdd43..5718924b677 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -51,7 +51,7 @@ static int inverse = 0; static int mtrr = 0; /* disable mtrr */ static int vram_remap __initdata = 0; /* Set amount of memory to be used */ static int vram_total __initdata = 0; /* Set total amount of memory */ -static int pmi_setpal = 0; /* pmi for palette changes ??? */ +static int pmi_setpal = 1; /* pmi for palette changes ??? */ static int ypan = 0; /* 0..nothing, 1..ypan, 2..ywrap */ static unsigned short *pmi_base = NULL; static void (*pmi_start)(void); @@ -80,15 +80,30 @@ static int vesafb_pan_display(struct fb_var_screeninfo *var, return 0; } -static void vesa_setpalette(int regno, unsigned red, unsigned green, +static int vesa_setpalette(int regno, unsigned red, unsigned green, unsigned blue) { int shift = 16 - depth; + int err = -EINVAL; + +/* + * Try VGA registers first... + */ + if (vga_compat) { + outb_p(regno, dac_reg); + outb_p(red >> shift, dac_val); + outb_p(green >> shift, dac_val); + outb_p(blue >> shift, dac_val); + err = 0; + } #ifdef __i386__ - struct { u_char blue, green, red, pad; } entry; +/* + * Fallback to the PMI.... + */ + if (err && pmi_setpal) { + struct { u_char blue, green, red, pad; } entry; - if (pmi_setpal) { entry.red = red >> shift; entry.green = green >> shift; entry.blue = blue >> shift; @@ -102,26 +117,19 @@ static void vesa_setpalette(int regno, unsigned red, unsigned green, "d" (regno), /* EDX */ "D" (&entry), /* EDI */ "S" (&pmi_pal)); /* ESI */ - return; + err = 0; } #endif -/* - * without protected mode interface and if VGA compatible, - * try VGA registers... - */ - if (vga_compat) { - outb_p(regno, dac_reg); - outb_p(red >> shift, dac_val); - outb_p(green >> shift, dac_val); - outb_p(blue >> shift, dac_val); - } + return err; } static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, unsigned blue, unsigned transp, struct fb_info *info) { + int err = 0; + /* * Set a single color register. 
The values supplied are * already rounded down to the hardware's capabilities @@ -133,7 +141,7 @@ static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, return 1; if (info->var.bits_per_pixel == 8) - vesa_setpalette(regno,red,green,blue); + err = vesa_setpalette(regno,red,green,blue); else if (regno < 16) { switch (info->var.bits_per_pixel) { case 16: @@ -164,7 +172,7 @@ static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, } } - return 0; + return err; } static struct fb_ops vesafb_ops = { @@ -460,9 +468,7 @@ static struct platform_driver vesafb_driver = { }, }; -static struct platform_device vesafb_device = { - .name = "vesafb", -}; +static struct platform_device *vesafb_device; static int __init vesafb_init(void) { @@ -475,10 +481,19 @@ static int __init vesafb_init(void) ret = platform_driver_register(&vesafb_driver); if (!ret) { - ret = platform_device_register(&vesafb_device); - if (ret) + vesafb_device = platform_device_alloc("vesafb", 0); + + if (vesafb_device) + ret = platform_device_add(vesafb_device); + else + ret = -ENOMEM; + + if (ret) { + platform_device_put(vesafb_device); platform_driver_unregister(&vesafb_driver); + } } + return ret; } module_init(vesafb_init); diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 77eed1fd994..d073ffb6e1f 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -398,12 +398,6 @@ static int __init vfb_setup(char *options) * Initialisation */ -static void vfb_platform_release(struct device *device) -{ - // This is called when the reference count goes to zero. - dev_err(device, "This driver is broken, please bug the authors so they will fix it.\n"); -} - static int __init vfb_probe(struct platform_device *dev) { struct fb_info *info; @@ -482,13 +476,7 @@ static struct platform_driver vfb_driver = { }, }; -static struct platform_device vfb_device = { - .name = "vfb", - .id = 0, - .dev = { - .release = vfb_platform_release, - } -}; +static struct platform_device *vfb_device; static int __init vfb_init(void) { @@ -508,10 +496,19 @@ static int __init vfb_init(void) ret = platform_driver_register(&vfb_driver); if (!ret) { - ret = platform_device_register(&vfb_device); - if (ret) + vfb_device = platform_device_alloc("vfb", 0); + + if (vfb_device) + ret = platform_device_add(vfb_device); + else + ret = -ENOMEM; + + if (ret) { + platform_device_put(vfb_device); platform_driver_unregister(&vfb_driver); + } } + return ret; } @@ -520,7 +517,7 @@ module_init(vfb_init); #ifdef MODULE static void __exit vfb_exit(void) { - platform_device_unregister(&vfb_device); + platform_device_unregister(vfb_device); platform_driver_unregister(&vfb_driver); } diff --git a/drivers/video/vga16fb.c b/drivers/video/vga16fb.c index 4fd2a272e03..3c404c9bd36 100644 --- a/drivers/video/vga16fb.c +++ b/drivers/video/vga16fb.c @@ -1334,9 +1334,8 @@ static int vga16fb_setup(char *options) } #endif -static int __init vga16fb_probe(struct device *device) +static int __init vga16fb_probe(struct platform_device *dev) { - struct platform_device *dev = to_platform_device(device); struct fb_info *info; struct vga16fb_par *par; int i; @@ -1403,7 +1402,7 @@ static int __init vga16fb_probe(struct device *device) printk(KERN_INFO "fb%d: %s frame buffer device\n", info->node, info->fix.id); - dev_set_drvdata(device, info); + platform_set_drvdata(dev, info); return 0; @@ -1417,9 +1416,9 @@ static int __init vga16fb_probe(struct device *device) return ret; } -static int vga16fb_remove(struct device *device) +static int vga16fb_remove(struct 
platform_device *dev) { - struct fb_info *info = dev_get_drvdata(device); + struct fb_info *info = platform_get_drvdata(dev); if (info) { unregister_framebuffer(info); @@ -1432,16 +1431,15 @@ static int vga16fb_remove(struct device *device) return 0; } -static struct device_driver vga16fb_driver = { - .name = "vga16fb", - .bus = &platform_bus_type, +static struct platform_driver vga16fb_driver = { .probe = vga16fb_probe, .remove = vga16fb_remove, + .driver = { + .name = "vga16fb", + }, }; -static struct platform_device vga16fb_device = { - .name = "vga16fb", -}; +static struct platform_device *vga16fb_device; static int __init vga16fb_init(void) { @@ -1454,12 +1452,20 @@ static int __init vga16fb_init(void) vga16fb_setup(option); #endif - ret = driver_register(&vga16fb_driver); + ret = platform_driver_register(&vga16fb_driver); if (!ret) { - ret = platform_device_register(&vga16fb_device); - if (ret) - driver_unregister(&vga16fb_driver); + vga16fb_device = platform_device_alloc("vga16fb", 0); + + if (vga16fb_device) + ret = platform_device_add(vga16fb_device); + else + ret = -ENOMEM; + + if (ret) { + platform_device_put(vga16fb_device); + platform_driver_unregister(&vga16fb_driver); + } } return ret; @@ -1467,8 +1473,8 @@ static int __init vga16fb_init(void) static void __exit vga16fb_exit(void) { - platform_device_unregister(&vga16fb_device); - driver_unregister(&vga16fb_driver); + platform_device_unregister(vga16fb_device); + platform_driver_unregister(&vga16fb_driver); } MODULE_LICENSE("GPL"); diff --git a/fs/Kconfig b/fs/Kconfig index 1cdc043922d..6c5051802bd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1490,7 +1490,12 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL + select NFSD_V2_ACL if NFSD_V3_ACL + select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFSD_TCP if NFSD_V4 + select CRYPTO_MD5 if NFSD_V4 + select CRYPTO if NFSD_V4 + select FS_POSIX_ACL if NFSD_V4 help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1528,7 +1533,6 @@ config NFSD_V3 config NFSD_V3_ACL bool "Provide server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 - select NFSD_V2_ACL help Implement the NFSv3 ACL protocol extension for manipulating POSIX Access Control Lists on exported file systems. NFS clients should @@ -1538,10 +1542,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL - select NFSD_TCP - select CRYPTO_MD5 - select CRYPTO - select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. 
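
A note on the vesafb, vfb and vga16fb conversions above: all three replace a statically allocated struct platform_device with platform_device_alloc() followed by platform_device_add(). A static platform_device has no proper release() callback (vfb even carried a dummy one that only printed a complaint), while the dynamically allocated device is refcounted and freed by the driver core. The registration and error-unwind pattern shared by the three diffs, reduced to a skeleton around a placeholder "demofb" driver (kernel module context assumed):

  #include <linux/module.h>
  #include <linux/platform_device.h>

  static struct platform_device *demofb_device;

  static int demofb_probe(struct platform_device *dev)
  {
          return 0;    /* a real driver would map memory, register an fb */
  }

  static int demofb_remove(struct platform_device *dev)
  {
          return 0;
  }

  static struct platform_driver demofb_driver = {
          .probe  = demofb_probe,
          .remove = demofb_remove,
          .driver = {
                  .name = "demofb",
          },
  };

  static int __init demofb_init(void)
  {
          int ret = platform_driver_register(&demofb_driver);

          if (ret)
                  return ret;

          demofb_device = platform_device_alloc("demofb", 0);
          ret = demofb_device ? platform_device_add(demofb_device) : -ENOMEM;

          if (ret) {
                  /* put, not kfree: drops the reference taken by _alloc() */
                  platform_device_put(demofb_device);
                  platform_driver_unregister(&demofb_driver);
          }
          return ret;
  }

  static void __exit demofb_exit(void)
  {
          platform_device_unregister(demofb_device);
          platform_driver_unregister(&demofb_driver);
  }

  module_init(demofb_init);
  module_exit(demofb_exit);
  MODULE_LICENSE("GPL");

platform_device_put() accepts NULL, which is why one error path covers both a failed allocation and a failed add; once platform_device_add() has succeeded, teardown must instead go through platform_device_unregister(), as the module exit paths in the diffs do.
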
This feature is experimental, and diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 009a9ae88d6..bfc1fd22d5b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer, /* we found it in the graveyard - resurrect it */ found_dead_server: - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_list); + list_move_tail(&server->link, &cell->sv_list); afs_get_server(server); afs_kafstimod_del_timer(&server->timeout); spin_unlock(&cell->sv_gylock); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 7ac07d0d47b..f09a794f248 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -136,8 +136,7 @@ static int kafsasyncd(void *arg) if (!list_empty(&kafsasyncd_async_attnq)) { op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link); - list_del(&op->link); - list_add_tail(&op->link, + list_move_tail(&op->link, &kafsasyncd_async_busyq); } @@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op) init_waitqueue_entry(&op->waiter, kafsasyncd_task); add_wait_queue(&op->call->waitq, &op->waiter); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_busyq); + list_move_tail(&op->link, &kafsasyncd_async_busyq); spin_unlock(&kafsasyncd_async_lock); @@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op) spin_lock(&kafsasyncd_async_lock); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_attnq); + list_move_tail(&op->link, &kafsasyncd_async_attnq); spin_unlock(&kafsasyncd_async_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 62b093aa41c..22afaae1a4c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, resurrect_server: _debug("resurrecting server"); - list_del(&zombie->link); - list_add_tail(&zombie->link, &cell->sv_list); + list_move_tail(&zombie->link, &cell->sv_list); afs_get_server(zombie); afs_kafstimod_del_timer(&zombie->timeout); spin_unlock(&cell->sv_gylock); @@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server) } spin_lock(&cell->sv_gylock); - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_graveyard); + list_move_tail(&server->link, &cell->sv_graveyard); /* time out in 10 secs */ afs_kafstimod_add_timer(&server->timeout, 10 * HZ); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index eced20618ec..331f730a1fb 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell, /* found in the graveyard - resurrect */ _debug("found in graveyard"); atomic_inc(&vlocation->usage); - list_del(&vlocation->link); - list_add_tail(&vlocation->link, &cell->vl_list); + list_move_tail(&vlocation->link, &cell->vl_list); spin_unlock(&cell->vl_gylock); afs_kafstimod_del_timer(&vlocation->timeout); @@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation) } /* move to graveyard queue */ - list_del(&vlocation->link); - list_add_tail(&vlocation->link,&cell->vl_graveyard); + list_move_tail(&vlocation->link,&cell->vl_graveyard); /* remove from pending timeout queue (refcounted if actually being * updated) */ diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 9867fef3261..cf62da5d782 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode, vnode->cb_expiry * HZ); spin_lock(&afs_cb_hash_lock); - list_del(&vnode->cb_hash_link); - 
list_add_tail(&vnode->cb_hash_link, + list_move_tail(&vnode->cb_hash_link, &afs_cb_hash(server, &vnode->fid)); spin_unlock(&afs_cb_hash_lock); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4456d1daa40..8dbd44f10e9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -376,8 +376,7 @@ next: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); - list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 6c6771db36d..7caee8d8ea3 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* If request was not a signal, enqueue and don't free */ if (!(req->uc_flags & REQ_ASYNC)) { req->uc_flags |= REQ_READ; - list_add(&(req->uc_chain), vcp->vc_processing.prev); + list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b040eba13a7..a5b5e631ba6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi, ((union inputArgs *)buffer)->ih.unique = req->uc_unique; /* Append msg to pending queue and poke Venus. */ - list_add(&(req->uc_chain), vcommp->vc_pending.prev); + list_add_tail(&(req->uc_chain), &vcommp->vc_pending); wake_up_interruptible(&vcommp->vc_waitq); /* We can be interrupted while we wait for Venus to process diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9eb9824dd33..d8ecfedef18 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -80,6 +80,7 @@ #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> +#include <linux/gigaset_dev.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f952187fc5..207f8006fd6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); + list_move(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/dcache.c b/fs/dcache.c index b85fda36053..48b44a714b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb) dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* @@ -638,7 +637,7 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } diff --git a/fs/dquot.c b/fs/dquot.c index 81d87a413c6..0122a279106 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { - list_add(&dquot->dq_free, 
free_dquots.prev); + list_add_tail(&dquot->dq_free, &free_dquots); dqstats.free_dquots++; } @@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ - list_add(&dquot->dq_inuse, inuse_list.prev); + list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats.allocated_dquots++; } diff --git a/fs/exec.c b/fs/exec.c index 0b88bf64614..c8494f513ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; @@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* @@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_threads (struct mm_struct *mm) +static void zap_process(struct task_struct *start) { - struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - int traced = 0; + struct task_struct *t; - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; - read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); } - while_each_thread(g,p); + } while ((t = next_thread(t)) != start); +} - read_unlock(&tasklist_lock); +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->group_exit_code = exit_code; + zap_process(tsk); + err = 0; + } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; - if 
(unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. - */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + + p = g; + do { + if (p->mm) { + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); + zap_process(p); + unlock_task_sighand(p, &flags); + } + break; } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); + } while ((p = next_thread(p)) != g); } + rcu_read_unlock(); +done: + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b2891cc29db..b7483360a2d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -630,7 +630,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -666,6 +666,7 @@ static match_table_t tokens = { {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, @@ -1014,6 +1015,9 @@ 
clear_qf_name: case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 1862e8bc101..b8886f048ea 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add_tail(&jeb->list, &c->erase_complete_list); + list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); @@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. 
*/ spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; - list_del(&jeb->list); - list_add(&jeb->list, &c->bad_list); + list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 8bedfd2ff68..ac0c350ed7d 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) struct jffs2_eraseblock *ejeb; ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); - list_del(&ejeb->list); - list_add_tail(&ejeb->list, &c->erase_pending_list); + list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a7f153f79ec..b9b700730df 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list */ if (first_raw == jeb->first_node) { D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); } diff --git a/fs/libfs.c b/fs/libfs.c index fc785d8befb..ac02ea602c3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) /* fallthrough */ default: spin_lock(&dcache_lock); - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &dentry->d_subdirs); - } + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); @@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; spin_lock(&dcache_lock); /* next is still alive */ - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/namespace.c b/fs/namespace.c index 866430bb024..b3ed212ea41 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; - for (p = mnt; p; p = next_mnt(p, mnt)) { - list_del(&p->mnt_hash); - list_add(&p->mnt_hash, kill); - } + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, kill); if (propagate) propagate_umount(kill); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 96c7578cbe1..1630b5670dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_del_init(&clp->cl_strhash); - list_del_init(&clp->cl_idhash); - list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = 
clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d852ebb538e..fdf7cf3dfad 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -103,8 +103,7 @@ nfsd_cache_shutdown(void) static void lru_put_end(struct svc_cacherep *rp) { - list_del(&rp->c_lru); - list_add_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &lru_head); } /* diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593dd8ef..87ee29cad50 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -381,8 +381,7 @@ do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); mlog(0, "ast: adding to granted list... type=%d, " "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228d9e3..70888b31e75 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -231,8 +231,7 @@ switch_queues: lock->ml.convert_type = type; /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); unlock_exit: spin_unlock(&lock->spinlock); @@ -248,8 +247,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); } @@ -294,8 +292,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); lock->convert_pending = 1; lock->ml.convert_type = type; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea28318d6..55cda25ae11 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -239,8 +239,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, mlog(0, "%s: $RECOVERY lock for this node (%u) is " "mastered by %u; got lock, manually granting (no ast)\n", dlm->name, dlm->node_num, res->owner); - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbabac05..9962190e741 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -905,13 +905,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "found UNKNOWN owner while doing recovery " "for node %u. 
sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } } spin_unlock(&dlm->spinlock); @@ -1529,8 +1527,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* move the lock to its proper place */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, queue); + list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); mlog(0, "just reordered a local lock!\n"); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14f12c..44d3b57ae8a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -318,8 +318,7 @@ converting: target->ml.type = target->ml.convert_type; target->ml.convert_type = LKM_IVMODE; - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -380,8 +379,7 @@ blocked: target->ml.type, target->ml.node); // target->ml.type is already correct - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a2754267..ac89c509daf 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res, void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock) { - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index eebc3cfa6be..3fe8781c22c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); OCFS2_I(inode)->ip_handle = handle; - list_del(&(OCFS2_I(inode)->ip_handle_list)); - list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); + list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); } static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) diff --git a/fs/pnode.c b/fs/pnode.c index 37b568ed0e0..da42ee61c1d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt) if (master) { list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; - list_del(&mnt->mnt_slave); - list_add(&mnt->mnt_slave, &master->mnt_slave_list); + list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); } else { @@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) { - list_del(&child->mnt_hash); - list_add_tail(&child->mnt_hash, &mnt->mnt_hash); - } + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 6afff725a8c..6ba7785319d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,6 +74,16 @@ #include <linux/poll.h> #include "internal.h" +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an 
error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + /* * For hysterical raisins we keep the same inumbers as in the old procfs. * Feel free to change the macro below - just keep the range distinct from @@ -121,6 +131,8 @@ enum pid_directory_inos { PROC_TGID_ATTR_PREV, PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, + PROC_TGID_ATTR_KEYCREATE, + PROC_TGID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, @@ -162,6 +174,8 @@ enum pid_directory_inos { PROC_TID_ATTR_PREV, PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, + PROC_TID_ATTR_KEYCREATE, + PROC_TID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, @@ -173,6 +187,9 @@ enum pid_directory_inos { PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 10 + struct pid_entry { int type; int len; @@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = { E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; static struct pid_entry tid_attr_stuff[] = { @@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = { E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; #endif @@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; + int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } -static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int get_nr_threads(struct task_struct *tsk) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); - result = 0; - put_fs_struct(fs); + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); } - return result; + return count; } -static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + 
struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf return result; } - -/* Same as proc_root_link, but this addionally tries to get fs from other - * threads in the group */ -static int proc_task_root_link(struct inode *inode, struct dentry **dentry, - struct vfsmount **mnt) +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs; + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; - struct task_struct *leader = proc_task(inode); - task_lock(leader); - fs = leader->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(leader); - } else { - /* Try to get fs from other threads */ - task_unlock(leader); - read_lock(&tasklist_lock); - if (pid_alive(leader)) { - struct task_struct *task = leader; - - while ((task = next_thread(task)) != leader) { - task_lock(task); - fs = task->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(task); - break; - } - task_unlock(task); - } - } - read_unlock(&tasklist_lock); + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); } - if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry, return result; } - #define MAY_PTRACE(task) \ (task == current || \ (task->parent == current && \ @@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /************************************************************************/ /* permission checks */ - -/* If the process being read is separated by chroot from the reading process, - * don't let the reader access the threads. - * - * note: this does dput(root) and mntput(vfsmnt) on exit. - */ -static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) -{ - struct dentry *de, *base; - struct vfsmount *our_vfsmnt, *mnt; - int res = 0; - - read_lock(¤t->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(¤t->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (mnt != our_vfsmnt) { - if (mnt == mnt->mnt_parent) - goto out; - de = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - - if (!is_subdir(de, base)) - goto out; - spin_unlock(&vfsmount_lock); - -exit: - dput(base); - mntput(our_vfsmnt); - dput(root); - mntput(vfsmnt); - return res; -out: - spin_unlock(&vfsmount_lock); - res = -EACCES; - goto exit; -} - -static int proc_check_root(struct inode *inode) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... 
*/ - return -ENOENT; - return proc_check_chroot(root, vfsmnt); -} - -static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - return proc_check_root(inode); -} - -static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) - return -ENOENT; - - return proc_check_chroot(root, vfsmnt); -} - -extern struct seq_operations proc_pid_maps_op; -static int maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MMU -extern struct seq_operations proc_pid_smaps_op; -static int smaps_open(struct inode *inode, struct file *file) +static int proc_fd_access_allowed(struct inode *inode) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_smaps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. 
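proc_fd_access_allowed(), completed just below, also shows the reference discipline used throughout this file now that proc inodes hold a struct pid instead of a task_struct: look the task up, use it, drop it. The idiom, sketched with the helpers this patch introduces:

        struct task_struct *task = get_proc_task(inode); /* takes a reference, may return NULL */
        if (!task)
                return -ESRCH;          /* the task has already exited */
        /* ... use task, e.g. ptrace_may_attach(task) ... */
        put_task_struct(task);          /* drop the reference */

The read/write handlers converted below return -ESRCH on the NULL case; proc_fd_access_allowed() itself simply reports the access as denied.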
+ */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_attach(task); + put_task_struct(task); } - return ret; + return allowed; } -static struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -679,16 +560,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -865,6 +765,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char 
*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8]; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; - len = sprintf(buffer, "%i\n", oom_adjust); + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; if (count > len-__ppos) @@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8], *end; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; int oom_adjust; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, 8); - if (count > 6) - count = 6; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; oom_adjust = simple_strtol(buffer, &end, 0); @@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -static struct inode_operations proc_mem_inode_operations = { - .permission = proc_permission, -}; - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) return -EPERM; if (count >= PAGE_SIZE) @@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct 
file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) /* We don't need a base pointer in the /proc filesystem */ path_release(nd); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? 
*/ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -#define NUMBUF 10 - static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; - char buf[NUMBUF]; + char buf[PROC_NUMBUF]; struct files_struct * files; struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; filp->f_pos++; case 1: - ino = fake_ino(tid, PROC_TID_INO); + ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) goto out; filp->f_pos++; @@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) continue; rcu_read_unlock(); - j = NUMBUF; + j = PROC_NUMBUF; i = fd; do { j--; @@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) } while (i); ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { + if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { rcu_read_lock(); break; } @@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); - ei->task = NULL; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; - ei->type = ino; + ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + if (!ei->pid) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; - if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { + if (task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -1379,7 +1306,6 @@ out: return inode; out_unlock: - ei->pde = NULL; iput(inode); return NULL; } @@ -1393,13 +1319,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. 
To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - int fd = proc_type(inode) - PROC_TID_FD_DIR; + struct task_struct *task = get_proc_task(inode); + int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; } -static void pid_base_iput(struct dentry *dentry, struct inode *inode) -{ - struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); - iput(inode); -} - static int pid_delete_dentry(struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. 
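The rewritten test just below asks the struct pid itself: once the task detaches, the PIDTYPE_PID chain hanging off the pid is empty, so no pid_alive() call on a possibly stale task pointer is needed. A struct pid is reference counted and safely outlives its task; roughly, per the pid rework this patch builds on (a sketch, not code from this diff):

        static inline struct pid *get_pid(struct pid *pid)
        {
                if (pid)
                        atomic_inc(&pid->count);        /* pin the struct pid */
                return pid;
        }

        /* put_pid() drops the count and frees the struct pid when it hits zero */

Because a proc inode now pins only the small struct pid, a dead task no longer stays in memory merely because /proc dentries still point at it.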
*/ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } static struct dentry_operations tid_fd_dentry_operations = @@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations = .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_base_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, -}; - /* Lookups */ static unsigned name_to_int(struct dentry *dentry) @@ -1508,22 +1451,24 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); struct file * file; struct files_struct * files; struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) goto out; ei = PROC_I(inode); + ei->fd = fd; files = get_files_struct(task); if (!files) goto out_unlock; @@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, ei->op.proc_get_link = proc_fd_link; dentry->d_op = &tid_fd_dentry_operations; d_add(dentry, inode); - return NULL; + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + result = NULL; +out: + put_task_struct(task); +out_no_task: + return result; out_unlock2: spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -out: - return ERR_PTR(-ENOENT); + goto out; } static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); static struct file_operations proc_fd_operations = { .read = generic_read_dir, @@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, - .permission = proc_permission, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, - .permission = proc_task_permission, + .getattr = proc_task_getattr, }; #ifdef CONFIG_SECURITY @@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + 
struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif -static int get_tid_list(int index, unsigned int *tids, struct inode *dir); - /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, struct pid_entry *ents) { struct inode *inode; - int error; - struct task_struct *task = proc_task(dir); + struct dentry *error; + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; - error = -ENOENT; + error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (!p->name) goto out; - error = -EINVAL; + error = ERR_PTR(-EINVAL); inode = proc_pid_make_inode(dir->i_sb, task, p->type); if (!inode) goto out; @@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, */ switch(p->type) { case PROC_TGID_TASK: - inode->i_nlink = 2 + get_tid_list(2, NULL, dir); + inode->i_nlink = 2; inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; break; @@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, #endif case PROC_TID_MEM: case PROC_TGID_MEM: - inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; #ifdef CONFIG_SECCOMP @@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_ATTR_EXEC: case PROC_TID_ATTR_FSCREATE: case PROC_TGID_ATTR_FSCREATE: + case PROC_TID_ATTR_KEYCREATE: + case PROC_TGID_ATTR_KEYCREATE: + case PROC_TID_ATTR_SOCKCREATE: + case PROC_TGID_ATTR_SOCKCREATE: inode->i_fop = &proc_pid_attr_operations; break; #endif @@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir, default: printk("procfs: impossible type (%d)",p->type); iput(inode); - return ERR_PTR(-EINVAL); + error = ERR_PTR(-EINVAL); + goto out; } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - return NULL; - + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; out: - return ERR_PTR(error); + put_task_struct(task); +out_no_task: + return error; } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ @@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1917,10 +1892,12 @@ static 
struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = { static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = { }; /** - * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. - * @p: task that should be flushed. + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * + * @task: task that should be flushed. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present, flushes it and all of its children + * from the dcache. * - * Drops the /proc/@pid dcache entry from the hash chains. + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is provided to flush those useless + * dcache entries at process exit time. * - * Dropping /proc/@pid entries and detach_pid must be synchroneous, - * otherwise e.g. /proc/@pid/exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time; it + * just makes it very unlikely that any will persist. */ - -struct dentry *proc_pid_unhash(struct task_struct *p) +void proc_flush_task(struct task_struct *task) { - struct dentry *proc_dentry; + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { + if (thread_group_leader(task)) + goto out; - spin_lock(&dcache_lock); - spin_lock(&proc_dentry->d_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - spin_unlock(&proc_dentry->d_lock); - } else { - spin_unlock(&proc_dentry->d_lock); - proc_dentry = NULL; - } - spin_unlock(&dcache_lock); - } - return proc_dentry; -} + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); + leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (!leader) + goto out; -/** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread.
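proc_flush_task() repeats one small dcache operation per name it wants to drop: '<pid>' above, and '<tgid>' plus 'task/<pid>' just below. Factored out, the step would look like this (a hypothetical helper for illustration; the patch open-codes it each time):

        /* Hypothetical: drop one /proc dcache subtree by name. */
        static void proc_flush_name(struct dentry *parent, const char *name)
        {
                struct qstr q;
                struct dentry *dentry;

                q.name = name;
                q.len = strlen(name);
                dentry = d_hash_and_lookup(parent, &q);
                if (dentry) {
                        shrink_dcache_parent(dentry);   /* flush the children */
                        d_drop(dentry);                 /* unhash the directory itself */
                        dput(dentry);
                }
        }

Unlike the proc_pid_unhash()/proc_pid_flush() pair being removed here, this needs no proc_lock and no ordering against detach_pid(): missing an entry is harmless, because pid_revalidate() drops stale dentries on the next lookup anyway.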
- */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - might_sleep(); - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; } /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; struct inode *inode; struct proc_inode *ei; unsigned tgid; - int died; if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { inode = new_inode(dir->i_sb); @@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tgid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + if (!inode) + goto out_put_task; - - if (!inode) { - put_task_struct(task); - goto out; - } inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; @@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct inode->i_nlink = 4; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; - died = 0; d_add(dentry, inode); - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; - if (!pid_alive(task)) { - dentry = proc_pid_unhash(task); - died = 1; - } - spin_unlock(&task->proc_lock); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; +out_put_task: put_task_struct(task); - if (died) { - proc_pid_flush(dentry); - goto out; - } - return NULL; out: - return ERR_PTR(-ENOENT); + return result; } /* SMP-safe */ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; if (leader->tgid != task->tgid) @@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry inode->i_nlink = 3; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; - put_task_struct(task); - return NULL; out_drop_task: put_task_struct(task); out: - return ERR_PTR(-ENOENT); + put_task_struct(leader); +out_no_task: + return result; } -#define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - /* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, 
and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the first tgid to return to user space. + * + * Usually this is just whatever follows &init_task, but if the users + * buffer was too small to hold the full list or there was a seek into + * the middle of the directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with &init_task and walk nr + * threads past it. */ -static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - p = NULL; - if (version) { - p = find_task_by_pid(version); - if (p && !thread_group_leader(p)) - p = NULL; +static struct task_struct *first_tgid(int tgid, unsigned int nr) +{ + struct task_struct *pos; + rcu_read_lock(); + if (tgid && nr) { + pos = find_task_by_pid(tgid); + if (pos && thread_group_leader(pos)) + goto found; } + /* If nr exceeds the number of processes get out quickly */ + pos = NULL; + if (nr && nr >= nr_processes()) + goto done; - if (p) - index = 0; - else - p = next_task(&init_task); - - for ( ; p != &init_task; p = next_task(p)) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; + /* If we haven't found our starting place yet start with + * the init_task and walk nr tasks forward. + */ + for (pos = next_task(&init_task); nr > 0; --nr) { + pos = next_task(pos); + if (pos == &init_task) { + pos = NULL; + goto done; + } } - read_unlock(&tasklist_lock); - return nr_tgids; +found: + get_task_struct(pos); +done: + rcu_read_unlock(); + return pos; } /* - * Get a few tid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the next task in the task list. + * Return NULL if we loop or there is any error. + * + * The reference to the input task_struct is released. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - /* - * The starting point task (leader_task) might be an already - * unlinked task, which cannot be used to access the task-list - * via next_thread(). 
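first_tgid() above and next_tgid() below form an iterator that hands a task reference from step to step: each call returns a task with its count raised, and next_tgid() drops the reference it was passed. The consuming loop in proc_pid_readdir() further down is, condensed:

        struct task_struct *task;

        for (task = first_tgid(tgid, nr);
             task;
             task = next_tgid(task), filp->f_pos++) {
                /* emit one directory entry for task->pid; on early
                 * exit the body must call put_task_struct(task) itself */
        }

Only rcu_read_lock() is held while the task list is walked, replacing the old read_lock(&tasklist_lock) sections, and no fixed-size PROC_MAXPIDS batch buffer is needed any more.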
- */ - if (pid_alive(task)) do { - int tid = task->pid; - - if (--index >= 0) - continue; - if (tids != NULL) - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; +static struct task_struct *next_tgid(struct task_struct *start) +{ + struct task_struct *pos; + rcu_read_lock(); + pos = start; + if (pid_alive(start)) + pos = next_task(start); + if (pid_alive(pos) && (pos != &init_task)) { + get_task_struct(pos); + goto done; + } + pos = NULL; +done: + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; - int next_tgid; + struct task_struct *task; + int tgid; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; nr++; } + nr -= 1; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - next_tgid = filp->f_version; + tgid = filp->f_version; filp->f_version = 0; - for (;;) { - nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); - if (!nr_tgids) { - /* no more entries ! */ + for (task = first_tgid(tgid, nr); + task; + task = next_tgid(task), filp->f_pos++) { + int len; + ino_t ino; + tgid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tgid); + ino = fake_ino(tgid, PROC_TGID_INO); + if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tgid; + put_task_struct(task); break; } - next_tgid = 0; + } + return 0; +} - /* do not use the last found pid, reserve it for next_tgid */ - if (nr_tgids == PROC_MAXPIDS) { - nr_tgids--; - next_tgid = tgid_array[nr_tgids]; - } + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the user's + * buffer was too small or there was a seek into the middle of the + * directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr) +{ + struct task_struct *pos; - for (i=0;i<nr_tgids;i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid(tid); + if (pos && (pos->group_leader == leader)) + goto found; + } - do - buf[--j] = '0' + (tgid % 10); - while ((tgid /= 10) != 0); + /* If nr exceeds the number of threads there is nothing to do */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid_array[i]; - goto out; - } - filp->f_pos++; - nr++; + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward.
+ */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; } } +found: + get_task_struct(pos); out: - return 0; + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *leader = get_proc_task(inode); + struct task_struct *task; int retval = -ENOENT; ino_t ino; + int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(proc_task(inode))) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); - inode->i_nlink = pos + nr_tids; - - for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; - int tid = tid_array[i]; - - ino = fake_ino(tid,PROC_TID_INO); - - do - buf[--j] = '0' + (tid % 10); - while ((tid /= 10) != 0); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. 
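The same f_version handshake is used here as in proc_pid_readdir(), except that what it caches is a tid rather than a tgid. The protocol around the loop just below, condensed:

        tid = filp->f_version;          /* non-zero only after a short read */
        filp->f_version = 0;
        ...
        if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) {
                filp->f_version = tid;  /* resume from this entry next call */
                put_task_struct(task);
                break;
        }

first_tid() then uses the cached tid to find its starting point via find_task_by_pid(), falling back to walking forward from the group leader when the caller seeked instead.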
+ */ + tid = filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, pos - 2); + task; + task = next_tid(task), pos++) { + int len; + tid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tid); + ino = fake_ino(tid, PROC_TID_INO); + if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) { + /* returning this tid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tid; + put_task_struct(task); break; - pos++; + } } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); + } + + return 0; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 722b9c46311..6dcef089e18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; - ei->type = 0; + ei->pid = NULL; + ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; inode = &ei->vfs_inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0502f17b860..146a434ba94 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,16 +37,26 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_type(struct inode *inode) +static inline int proc_fd(struct inode *inode) { - return PROC_I(inode)->type; + return PROC_I(inode)->fd; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 91b7c15ab37..0137ec4c136 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm =
get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -120,7 +124,8 @@ struct mem_size_stats static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. We have zero last_addr at @@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +363,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,38 +374,103 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? 
tail_vma: NULL; } -struct seq_operations proc_pid_maps_op = { +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; -struct seq_operations proc_pid_smaps_op = { +static struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -struct seq_operations proc_pid_numa_maps_op = { +static struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map }; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f68827ed10..af69f28277b 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; } -struct seq_operations proc_pid_maps_op = { +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; + +static int maps_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &proc_pid_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = NULL; + } + return ret; +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index cf6e1cf4035..752cea12e30 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - const struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .sendfile = 
generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b73529b809..49d1a53dbef 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock, get_bh(bh); if (test_set_buffer_locked(bh)) { if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); @@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock, ret = -EIO; } if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71dd2760d3..c8e96195b96 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req) if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) goto out; - list_del_init(&req->rq_queue); - list_add_tail(&req->rq_queue, &server->recvq); + list_move_tail(&req->rq_queue, &server->recvq); result = 1; out: return result; @@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server) result = smb_request_send_req(req); if (result < 0) { server->conn_error = result; - list_del_init(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); result = -EIO; goto out; } diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index 3f71384020c..24577e2c489 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server) if (req->rq_flags & SMB_REQ_RETRY) { /* must move the request to the xmitq */ VERBOSE("retrying request %p on recvq\n", req); - list_del(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); continue; } #endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 610b5bdbe75..61c42430cba 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); - } + if (filp->f_pos == 2) + list_move(q, &parent_sd->s_children); + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; @@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/include/asm-arm/hardware/locomo.h b/include/asm-arm/hardware/locomo.h index 5f10048ec54..22dfb173776 100644 --- a/include/asm-arm/hardware/locomo.h +++ b/include/asm-arm/hardware/locomo.h @@ -111,6 +111,8 @@ #define LOCOMO_ALS 0x00 /* Adjust light cycle */ #define LOCOMO_ALD 0x04 /* Adjust light duty */ +#define LOCOMO_ALC_EN 0x8000 + /* Backlight controller: TFT signal */ #define LOCOMO_BACKLIGHT 0x38 #define LOCOMO_TC 0x00 /* TFT control signal */ @@ -203,4 +205,7 @@ void locomo_gpio_write(struct locomo_dev *ldev, unsigned int bits, unsigned int /* M62332 control function */ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int channel); +/* Frontlight control 
*/ +void locomo_frontlight_set(struct locomo_dev *dev, int duty, int vr, int bpwf); + #endif diff --git a/include/asm-i386/delay.h b/include/asm-i386/delay.h index 456db8501c0..b1c7650dc7b 100644 --- a/include/asm-i386/delay.h +++ b/include/asm-i386/delay.h @@ -23,4 +23,6 @@ extern void __delay(unsigned long loops); ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +void use_tsc_delay(void); + #endif /* defined(_I386_DELAY_H) */ diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h index 96d0828ce09..d18cdb9fc9a 100644 --- a/include/asm-i386/kdebug.h +++ b/include/asm-i386/kdebug.h @@ -19,6 +19,8 @@ struct die_args { extern int register_die_notifier(struct notifier_block *); extern int unregister_die_notifier(struct notifier_block *); +extern int register_page_fault_notifier(struct notifier_block *); +extern int unregister_page_fault_notifier(struct notifier_block *); extern struct atomic_notifier_head i386die_chain; diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h index 57d157c5cf8..0730a20f6db 100644 --- a/include/asm-i386/kprobes.h +++ b/include/asm-i386/kprobes.h @@ -44,6 +44,7 @@ typedef u8 kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define ARCH_SUPPORTS_KRETPROBES +#define ARCH_INACTIVE_KPROBE_COUNT 0 void arch_remove_kprobe(struct kprobe *p); void kretprobe_trampoline(void); diff --git a/include/asm-i386/mach-default/mach_timer.h b/include/asm-i386/mach-default/mach_timer.h index 4b9703bb028..807992fd417 100644 --- a/include/asm-i386/mach-default/mach_timer.h +++ b/include/asm-i386/mach-default/mach_timer.h @@ -15,7 +15,9 @@ #ifndef _MACH_TIMER_H #define _MACH_TIMER_H -#define CALIBRATE_LATCH (5 * LATCH) +#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +#define CALIBRATE_LATCH \ + ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { diff --git a/include/asm-i386/mach-summit/mach_mpparse.h b/include/asm-i386/mach-summit/mach_mpparse.h index 1cce2b924a8..94268399170 100644 --- a/include/asm-i386/mach-summit/mach_mpparse.h +++ b/include/asm-i386/mach-summit/mach_mpparse.h @@ -2,6 +2,7 @@ #define __ASM_MACH_MPPARSE_H #include <mach_apic.h> +#include <asm/tsc.h> extern int use_cyclone; @@ -29,6 +30,7 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; @@ -42,6 +44,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ + mark_tsc_unstable(); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index aed16437479..d0ebd05f851 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -3,68 +3,11 @@ #include <linux/init.h> #include <linux/pm.h> -/** - * struct timer_ops - used to define a timer source - * - * @name: name of the timer. - * @init: Probes and initializes the timer. Takes clock= override - * string as an argument. Returns 0 on success, anything else - * on failure. - * @mark_offset: called by the timer interrupt. - * @get_offset: called by gettimeofday(). Returns the number of microseconds - * since the last timer interupt. 
- * @monotonic_clock: returns the number of nanoseconds since the init of the - * timer. - * @delay: delays this many clock cycles. - */ -struct timer_opts { - char* name; - void (*mark_offset)(void); - unsigned long (*get_offset)(void); - unsigned long long (*monotonic_clock)(void); - void (*delay)(unsigned long); - unsigned long (*read_timer)(void); - int (*suspend)(pm_message_t state); - int (*resume)(void); -}; - -struct init_timer_opts { - int (*init)(char *override); - struct timer_opts *opts; -}; - #define TICK_SIZE (tick_nsec / 1000) - -extern struct timer_opts* __init select_timer(void); -extern void clock_fallback(void); void setup_pit_timer(void); - /* Modifiers for buggy PIT handling */ - extern int pit_latch_buggy; - -extern struct timer_opts *cur_timer; extern int timer_ack; - -/* list of externed timers */ -extern struct timer_opts timer_none; -extern struct timer_opts timer_pit; -extern struct init_timer_opts timer_pit_init; -extern struct init_timer_opts timer_tsc_init; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct init_timer_opts timer_cyclone_init; -#endif - -extern unsigned long calibrate_tsc(void); -extern unsigned long read_timer_tsc(void); -extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_HPET_TIMER -extern struct init_timer_opts timer_hpet_init; -extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr); -#endif -#ifdef CONFIG_X86_PM_TIMER -extern struct init_timer_opts timer_pmtmr_init; -#endif #endif diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index d434984303c..3666044409f 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -7,6 +7,7 @@ #define _ASMi386_TIMEX_H #include <asm/processor.h> +#include <asm/tsc.h> #ifdef CONFIG_X86_ELAN # define CLOCK_TICK_RATE 1189200 /* AMD Elan has different frequency! */ @@ -15,39 +16,6 @@ #endif -/* - * Standard way to access the cycle counter on i586+ CPUs. - * Currently only used on SMP. - * - * If you really have a SMP machine with i486 chips or older, - * compile for that, and this will just always return zero. - * That's ok, it just means that the nicer scheduling heuristics - * won't work for you. - * - * We only use the low 32 bits, and we'd simply better make sure - * that we reschedule before that wraps. Scheduling at least every - * four billion cycles just basically sounds like a good idea, - * regardless of how fast the machine is. - */ -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret=0; - -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - -#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) - rdtscll(ret); -#endif - return ret; -} - -extern unsigned int cpu_khz; - extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h new file mode 100644 index 00000000000..97b828ce31e --- /dev/null +++ b/include/asm-i386/tsc.h @@ -0,0 +1,49 @@ +/* + * linux/include/asm-i386/tsc.h + * + * i386 TSC related functions + */ +#ifndef _ASM_i386_TSC_H +#define _ASM_i386_TSC_H + +#include <linux/config.h> +#include <asm/processor.h> + +/* + * Standard way to access the cycle counter on i586+ CPUs. + * Currently only used on SMP. + * + * If you really have a SMP machine with i486 chips or older, + * compile for that, and this will just always return zero. + * That's ok, it just means that the nicer scheduling heuristics + * won't work for you. 
+ * + * We only use the low 32 bits, and we'd simply better make sure + * that we reschedule before that wraps. Scheduling at least every + * four billion cycles just basically sounds like a good idea, + * regardless of how fast the machine is. + */ +typedef unsigned long long cycles_t; + +extern unsigned int cpu_khz; +extern unsigned int tsc_khz; + +static inline cycles_t get_cycles(void) +{ + unsigned long long ret = 0; + +#ifndef CONFIG_X86_TSC + if (!cpu_has_tsc) + return 0; +#endif + +#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); +#endif + return ret; +} + +extern void tsc_init(void); +extern void mark_tsc_unstable(void); + +#endif diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h index c195a9ad125..aed7142f9e4 100644 --- a/include/asm-ia64/kdebug.h +++ b/include/asm-ia64/kdebug.h @@ -40,6 +40,8 @@ struct die_args { extern int register_die_notifier(struct notifier_block *); extern int unregister_die_notifier(struct notifier_block *); +extern int register_page_fault_notifier(struct notifier_block *); +extern int unregister_page_fault_notifier(struct notifier_block *); extern struct atomic_notifier_head ia64die_chain; enum die_val { diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 8c0fc227f0f..2418a787c40 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -82,6 +82,7 @@ struct kprobe_ctlblk { #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define ARCH_SUPPORTS_KRETPROBES +#define ARCH_INACTIVE_KPROBE_COUNT 1 #define SLOT0_OPCODE_SHIFT (37) #define SLOT1_p1_OPCODE_SHIFT (37 - (64-46)) diff --git a/include/asm-powerpc/kdebug.h b/include/asm-powerpc/kdebug.h index c01786ab5fa..532bfee934f 100644 --- a/include/asm-powerpc/kdebug.h +++ b/include/asm-powerpc/kdebug.h @@ -18,6 +18,8 @@ struct die_args { extern int register_die_notifier(struct notifier_block *); extern int unregister_die_notifier(struct notifier_block *); +extern int register_page_fault_notifier(struct notifier_block *); +extern int unregister_page_fault_notifier(struct notifier_block *); extern struct atomic_notifier_head powerpc_die_chain; /* Grossly misnamed. 
*/ diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h index f466bc804f4..2d0af52c823 100644 --- a/include/asm-powerpc/kprobes.h +++ b/include/asm-powerpc/kprobes.h @@ -50,6 +50,8 @@ typedef unsigned int kprobe_opcode_t; IS_TWI(instr) || IS_TDI(instr)) #define ARCH_SUPPORTS_KRETPROBES +#define ARCH_INACTIVE_KPROBE_COUNT 1 + void kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h index 4040d127ac3..11251bdd00c 100644 --- a/include/asm-sparc64/kdebug.h +++ b/include/asm-sparc64/kdebug.h @@ -17,6 +17,8 @@ struct die_args { extern int register_die_notifier(struct notifier_block *); extern int unregister_die_notifier(struct notifier_block *); +extern int register_page_fault_notifier(struct notifier_block *); +extern int unregister_page_fault_notifier(struct notifier_block *); extern struct atomic_notifier_head sparc64die_chain; extern void bad_trap(struct pt_regs *, long); diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h index e9bb26f770e..15065af566c 100644 --- a/include/asm-sparc64/kprobes.h +++ b/include/asm-sparc64/kprobes.h @@ -12,6 +12,7 @@ typedef u32 kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define arch_remove_kprobe(p) do {} while (0) +#define ARCH_INACTIVE_KPROBE_COUNT 0 /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h index cf795631d9b..cd52c7f33bc 100644 --- a/include/asm-x86_64/kdebug.h +++ b/include/asm-x86_64/kdebug.h @@ -15,6 +15,8 @@ struct die_args { extern int register_die_notifier(struct notifier_block *); extern int unregister_die_notifier(struct notifier_block *); +extern int register_page_fault_notifier(struct notifier_block *); +extern int unregister_page_fault_notifier(struct notifier_block *); extern struct atomic_notifier_head die_chain; /* Grossly misnamed. */ diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h index 98a1e95ddb9..d36febd9bb1 100644 --- a/include/asm-x86_64/kprobes.h +++ b/include/asm-x86_64/kprobes.h @@ -43,6 +43,7 @@ typedef u8 kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define ARCH_SUPPORTS_KRETPROBES +#define ARCH_INACTIVE_KPROBE_COUNT 1 void kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); diff --git a/include/keys/user-type.h b/include/keys/user-type.h index a3dae1803f4..c37c34275a4 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -37,6 +37,7 @@ extern struct key_type key_type_user; extern int user_instantiate(struct key *key, const void *data, size_t datalen); extern int user_update(struct key *key, const void *data, size_t datalen); extern int user_match(const struct key *key, const void *criterion); +extern void user_revoke(struct key *key); extern void user_destroy(struct key *key); extern void user_describe(const struct key *user, struct seq_file *m); extern long user_read(const struct key *key, diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h new file mode 100644 index 00000000000..d852024ed09 --- /dev/null +++ b/include/linux/clocksource.h @@ -0,0 +1,185 @@ +/* linux/include/linux/clocksource.h + * + * This file contains the structure definitions for clocksources. + * + * If you are not a clocksource, or timekeeping code, you should + * not be including this file! 
+ */ +#ifndef _LINUX_CLOCKSOURCE_H +#define _LINUX_CLOCKSOURCE_H + +#include <linux/types.h> +#include <linux/timex.h> +#include <linux/time.h> +#include <linux/list.h> +#include <asm/div64.h> +#include <asm/io.h> + +/* clocksource cycle base type */ +typedef u64 cycle_t; + +/** + * struct clocksource - hardware abstraction for a free running counter + * Provides mostly state-free accessors to the underlying hardware. + * + * @name: ptr to clocksource name + * @list: list head for registration + * @rating: rating value for selection (higher is better) + * To avoid rating inflation the following + * list should give you a guide as to how + * to assign your clocksource a rating + * 1-99: Unfit for real use + * Only available for bootup and testing purposes. + * 100-199: Base level usability. + * Functional for real use, but not desired. + * 200-299: Good. + * A correct and usable clocksource. + * 300-399: Desired. + * A reasonably fast and accurate clocksource. + * 400-499: Perfect + * The ideal clocksource. A must-use where + * available. + * @read: returns a cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @update_callback: called when safe to alter clocksource values + * @is_continuous: defines if clocksource is free-running. + * @cycle_interval: Used internally by timekeeping core, please ignore. + * @xtime_interval: Used internally by timekeeping core, please ignore. + */ +struct clocksource { + char *name; + struct list_head list; + int rating; + cycle_t (*read)(void); + cycle_t mask; + u32 mult; + u32 shift; + int (*update_callback)(void); + int is_continuous; + + /* timekeeping specific data, ignore */ + cycle_t cycle_last, cycle_interval; + u64 xtime_nsec, xtime_interval; + s64 error; +}; + +/* simplify initialization of mask field */ +#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? 
((1ULL<<bits)-1) : -1) + +/** + * clocksource_khz2mult - calculates mult from khz and shift + * @khz: Clocksource frequency in KHz + * @shift_constant: Clocksource shift factor + * + * Helper function that converts a khz counter frequency to a clocksource + * multiplier, given the clocksource shift value + */ +static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant) +{ + /* khz = cyc/(Million ns) + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = 1Million/khz * 2^shift + * mult = 1000000 * 2^shift / khz + * mult = (1000000<<shift) / khz + */ + u64 tmp = ((u64)1000000) << shift_constant; + + tmp += khz/2; /* round for do_div */ + do_div(tmp, khz); + + return (u32)tmp; +} + +/** + * clocksource_hz2mult - calculates mult from hz and shift + * @hz: Clocksource frequency in Hz + * @shift_constant: Clocksource shift factor + * + * Helper function that converts a hz counter + * frequency to a clocksource multiplier, given the + * clocksource shift value + */ +static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) +{ + /* hz = cyc/(Billion ns) + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = 1Billion/hz * 2^shift + * mult = 1000000000 * 2^shift / hz + * mult = (1000000000<<shift) / hz + */ + u64 tmp = ((u64)1000000000) << shift_constant; + + tmp += hz/2; /* round for do_div */ + do_div(tmp, hz); + + return (u32)tmp; +} + +/** + * clocksource_read: - Access the clocksource's current cycle value + * @cs: pointer to clocksource being read + * + * Uses the clocksource to return the current cycle_t value + */ +static inline cycle_t clocksource_read(struct clocksource *cs) +{ + return cs->read(); +} + +/** + * cyc2ns - converts clocksource cycles to nanoseconds + * @cs: Pointer to clocksource + * @cycles: Cycles + * + * Uses the clocksource and ntp adjustment to convert cycle_t values to nanoseconds. + * + * XXX - This could use some mult_lxl_ll() asm optimization + */ +static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cs->mult) >> cs->shift; + return ret; +} + +/** + * clocksource_calculate_interval - Calculates a clocksource interval struct + * + * @c: Pointer to clocksource. + * @length_nsec: Desired interval length in nanoseconds. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this!
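
A worked example of the mult/shift arithmetic above, assuming a hypothetical free-running 1 MHz counter (example_read() and the hardware accessor are invented; only the helpers and struct fields come from this header):

	static cycle_t example_read(void)
	{
		return example_hw_counter_read();	/* hypothetical MMIO read */
	}

	static struct clocksource example_clocksource = {
		.name		= "example-1mhz",
		.rating		= 200,			/* "Good" on the scale above */
		.read		= example_read,
		.mask		= CLOCKSOURCE_MASK(32),
		.shift		= 20,
		.is_continuous	= 1,
	};

	static int __init example_clocksource_init(void)
	{
		/* (1000000 << 20) / 1000 = 1048576000, so cyc2ns() computes
		 * (cycles * 1048576000) >> 20 = cycles * 1000, i.e. 1000 ns
		 * per cycle, as expected for a 1 MHz clock. */
		example_clocksource.mult = clocksource_khz2mult(1000, 20);
		return clocksource_register(&example_clocksource);
	}
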
+ */ +static inline void clocksource_calculate_interval(struct clocksource *c, + unsigned long length_nsec) +{ + u64 tmp; + + /* XXX - All of this could use a whole lot of optimization */ + tmp = length_nsec; + tmp <<= c->shift; + tmp += c->mult/2; + do_div(tmp, c->mult); + + c->cycle_interval = (cycle_t)tmp; + if (c->cycle_interval == 0) + c->cycle_interval = 1; + + c->xtime_interval = (u64)c->cycle_interval * c->mult; +} + + +/* used to install a new clocksource */ +int clocksource_register(struct clocksource*); +void clocksource_reselect(void); +struct clocksource* clocksource_get_next(void); + +#endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 89ab677cb99..917d62e4148 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -673,6 +673,11 @@ COMPATIBLE_IOCTL(CAPI_SET_FLAGS) COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) +/* Siemens Gigaset */ +COMPATIBLE_IOCTL(GIGASET_REDIR) +COMPATIBLE_IOCTL(GIGASET_CONFIG) +COMPATIBLE_IOCTL(GIGASET_BRKCHARS) +COMPATIBLE_IOCTL(GIGASET_VERSION) /* Misc. */ COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ diff --git a/include/linux/console.h b/include/linux/console.h index d0f8a800949..3bdf2155e56 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -63,9 +63,11 @@ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ extern const struct consw prom_con; /* SPARC PROM console */ +int con_is_bound(const struct consw *csw); +int register_con_driver(const struct consw *csw, int first, int last); +int unregister_con_driver(const struct consw *csw); int take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); - /* scroll */ #define SM_UP (1) #define SM_DOWN (2) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index aee10b2ea4c..e3d1c33d155 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -8,9 +8,12 @@ #ifndef _LINUX_DEVICE_MAPPER_H #define _LINUX_DEVICE_MAPPER_H +#ifdef __KERNEL__ + struct dm_target; struct dm_table; struct dm_dev; +struct mapped_device; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; @@ -78,7 +81,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d); struct target_type { const char *name; struct module *module; - unsigned version[3]; + unsigned version[3]; dm_ctr_fn ctr; dm_dtr_fn dtr; dm_map_fn map; @@ -128,4 +131,108 @@ struct dm_target { int dm_register_target(struct target_type *t); int dm_unregister_target(struct target_type *t); -#endif /* _LINUX_DEVICE_MAPPER_H */ + +/*----------------------------------------------------------------- + * Functions for creating and manipulating mapped devices. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ + +/* + * DM_ANY_MINOR chooses the next available minor number. + */ +#define DM_ANY_MINOR (-1) +int dm_create(int minor, struct mapped_device **md); + +/* + * Reference counting for md. + */ +struct mapped_device *dm_get_md(dev_t dev); +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * An arbitrary pointer may be stored alongside a mapped device. 
+ */ +void dm_set_mdptr(struct mapped_device *md, void *ptr); +void *dm_get_mdptr(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md, int with_lockfs); +int dm_resume(struct mapped_device *md); + +/* + * Event functions. + */ +uint32_t dm_get_event_nr(struct mapped_device *md); +int dm_wait_event(struct mapped_device *md, int event_nr); + +/* + * Info functions. + */ +const char *dm_device_name(struct mapped_device *md); +struct gendisk *dm_disk(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/* + * Geometry functions. + */ +int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo); +int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); + + +/*----------------------------------------------------------------- + * Functions for manipulating device-mapper tables. + *---------------------------------------------------------------*/ + +/* + * First create an empty table. + */ +int dm_table_create(struct dm_table **result, int mode, + unsigned num_targets, struct mapped_device *md); + +/* + * Then call this once for each target. + */ +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params); + +/* + * Finally call this to make the table ready for use. + */ +int dm_table_complete(struct dm_table *t); + +/* + * Table reference counting. + */ +struct dm_table *dm_get_table(struct mapped_device *md); +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +/* + * Queries + */ +sector_t dm_table_get_size(struct dm_table *t); +unsigned int dm_table_get_num_targets(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +struct mapped_device *dm_table_get_md(struct dm_table *t); + +/* + * Trigger an event. + */ +void dm_table_event(struct dm_table *t); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Prepare a table for a device that will error all I/O. + * To make it active, call dm_suspend(), dm_swap_table() then dm_resume(). + */ +int dm_create_error_table(struct dm_table **result, struct mapped_device *md); + +#endif /* __KERNEL__ */ +#endif /* _LINUX_DEVICE_MAPPER_H */ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index c67c6786612..9623bb62509 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -285,9 +285,9 @@ typedef char ioctl_struct[308]; #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 6 +#define DM_VERSION_MINOR 7 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2006-02-17)" +#define DM_VERSION_EXTRA "-ioctl (2006-06-24)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -314,7 +314,7 @@ typedef char ioctl_struct[308]; #define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ /* - * Set this to improve performance when you aren't going to use open_count. + * This flag is now ignored. 
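
The device-mapper comments above prescribe a strict create/add/complete sequence, followed by suspend, swap and resume to install the table. A hedged in-kernel sketch of that sequence (the "linear" target string and its device parameters are illustrative, not taken from this patch):

	static int example_install_table(struct mapped_device *md, sector_t len)
	{
		struct dm_table *t;
		int r;

		r = dm_table_create(&t, FMODE_READ | FMODE_WRITE, 1, md);
		if (r)
			return r;

		/* one target mapping sectors 0..len-1 */
		r = dm_table_add_target(t, "linear", 0, len, "/dev/sdx 0");
		if (!r)
			r = dm_table_complete(t);
		if (r)
			goto out;

		/* the device must be suspended before the table is swapped */
		r = dm_suspend(md, 0);
		if (!r) {
			r = dm_swap_table(md, t);
			dm_resume(md);
		}
	out:
		dm_table_put(t);
		return r;
	}
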
*/ #define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ diff --git a/include/linux/fb.h b/include/linux/fb.h index f1281687e54..07a08e92bc7 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -504,23 +504,19 @@ struct fb_cursor_user { #define FB_EVENT_MODE_DELETE 0x04 /* A driver registered itself */ #define FB_EVENT_FB_REGISTERED 0x05 +/* A driver unregistered itself */ +#define FB_EVENT_FB_UNREGISTERED 0x06 /* CONSOLE-SPECIFIC: get console to framebuffer mapping */ -#define FB_EVENT_GET_CONSOLE_MAP 0x06 +#define FB_EVENT_GET_CONSOLE_MAP 0x07 /* CONSOLE-SPECIFIC: set console to framebuffer mapping */ -#define FB_EVENT_SET_CONSOLE_MAP 0x07 +#define FB_EVENT_SET_CONSOLE_MAP 0x08 /* A display blank is requested */ -#define FB_EVENT_BLANK 0x08 +#define FB_EVENT_BLANK 0x09 /* Private modelist is to be replaced */ -#define FB_EVENT_NEW_MODELIST 0x09 +#define FB_EVENT_NEW_MODELIST 0x0A /* The resolution of the passed in fb_info about to change and all vc's should be changed */ -#define FB_EVENT_MODE_CHANGE_ALL 0x0A -/* CONSOLE-SPECIFIC: set console rotation */ -#define FB_EVENT_SET_CON_ROTATE 0x0B -/* CONSOLE-SPECIFIC: get console rotation */ -#define FB_EVENT_GET_CON_ROTATE 0x0C -/* CONSOLE-SPECIFIC: rotate all consoles */ -#define FB_EVENT_SET_CON_ROTATE_ALL 0x0D +#define FB_EVENT_MODE_CHANGE_ALL 0x0B struct fb_event { struct fb_info *info; @@ -892,7 +888,6 @@ extern int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix); extern int fb_get_options(char *name, char **option); extern int fb_new_modelist(struct fb_info *info); -extern int fb_con_duit(struct fb_info *info, int event, void *data); extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h new file mode 100644 index 00000000000..21ea7610e17 --- /dev/null +++ b/include/linux/hw_random.h @@ -0,0 +1,50 @@ +/* + Hardware Random Number Generator + + Please read Documentation/hw_random.txt for details on use. + + ---------------------------------------------------------- + This software may be used and distributed according to the terms + of the GNU General Public License, incorporated herein by reference. + + */ + +#ifndef LINUX_HWRANDOM_H_ +#define LINUX_HWRANDOM_H_ +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <linux/list.h> + +/** + * struct hwrng - Hardware Random Number Generator driver + * @name: Unique RNG name. + * @init: Initialization callback (can be NULL). + * @cleanup: Cleanup callback (can be NULL). + * @data_present: Callback to determine if data is available + * on the RNG. If NULL, it is assumed that + * there is always data available. + * @data_read: Read data from the RNG device. + * Returns the number of lower random bytes in "data". + * Must not be NULL. + * @priv: Private data, for use by the RNG driver. + */ +struct hwrng { + const char *name; + int (*init)(struct hwrng *rng); + void (*cleanup)(struct hwrng *rng); + int (*data_present)(struct hwrng *rng); + int (*data_read)(struct hwrng *rng, u32 *data); + unsigned long priv; + + /* internal. */ + struct list_head list; +}; + +/** Register a new Hardware Random Number Generator driver. */ +extern int hwrng_register(struct hwrng *rng); +/** Unregister a Hardware Random Number Generator driver. 
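
A minimal sketch of a driver sitting behind this interface, assuming a hypothetical device whose current random word is readable from an ioremapped register stashed in ->priv:

	static int example_rng_data_read(struct hwrng *rng, u32 *data)
	{
		*data = readl((void __iomem *)rng->priv);	/* hypothetical register */
		return 4;					/* four random bytes in *data */
	}

	static struct hwrng example_rng = {
		.name		= "example-rng",
		.data_read	= example_rng_data_read,
		/* .data_present left NULL: data is assumed always available */
	};

	static int __init example_rng_init(void)
	{
		return hwrng_register(&example_rng);
	}

	static void __exit example_rng_exit(void)
	{
		hwrng_unregister(&example_rng);
	}
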
*/ +extern void hwrng_unregister(struct hwrng *rng); + +#endif /* __KERNEL__ */ +#endif /* LINUX_HWRANDOM_H_ */ diff --git a/include/linux/idr.h b/include/linux/idr.h index d37c8d808b0..f559a719dbe 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -78,6 +78,7 @@ void *idr_find(struct idr *idp, int id); int idr_pre_get(struct idr *idp, gfp_t gfp_mask); int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); +void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_destroy(struct idr *idp); void idr_init(struct idr *idp); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 41ecbb847f3..e127ef7e8da 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -119,7 +119,6 @@ extern struct group_info init_groups; .signal = {{0}}}, \ .blocked = {{0}}, \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ - .proc_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ diff --git a/include/linux/key.h b/include/linux/key.h index e81ebf910d0..e693e729bc9 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -248,7 +248,14 @@ extern struct key *key_alloc(struct key_type *type, const char *desc, uid_t uid, gid_t gid, struct task_struct *ctx, - key_perm_t perm, int not_in_quota); + key_perm_t perm, + unsigned long flags); + + +#define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ +#define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ +#define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ + extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, @@ -285,7 +292,7 @@ extern key_ref_t key_create_or_update(key_ref_t keyring, const char *description, const void *payload, size_t plen, - int not_in_quota); + unsigned long flags); extern int key_update(key_ref_t key, const void *payload, @@ -299,7 +306,7 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, struct task_struct *ctx, - int not_in_quota, + unsigned long flags, struct key *dest); extern int keyring_clear(struct key *keyring); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bc747e5d713..03cd7551a7a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -699,7 +699,6 @@ extern int dev_hard_start_xmit(struct sk_buff *skb, extern void dev_init(void); -extern int netdev_nit; extern int netdev_budget; /* Called by rtnetlink.c:rtnl_unlock() */ diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index ca5a8733000..1efe60c5c00 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -31,6 +31,7 @@ struct netpoll_info { int rx_flags; spinlock_t rx_lock; struct netpoll *rx_np; /* netpoll that registered an rx_hook */ + struct sk_buff_head arp_tx; /* list of arp requests to reply to */ }; void netpoll_poll(struct netpoll *np); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 5810d28fbed..17e75783e3a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -99,9 +99,8 @@ extern void proc_misc_init(void); struct mm_struct; +void proc_flush_task(struct task_struct *task); struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); -struct dentry *proc_pid_unhash(struct task_struct *p); -void 
proc_pid_flush(struct dentry *proc_dentry); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); int task_statm(struct mm_struct *, int *, int *, int *, int *); @@ -211,8 +210,7 @@ static inline void proc_net_remove(const char *name) #define proc_net_create(name, mode, info) ({ (void)(mode), NULL; }) static inline void proc_net_remove(const char *name) {} -static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; } -static inline void proc_pid_flush(struct dentry *proc_dentry) { } +static inline void proc_flush_task(struct task_struct *task) { } static inline struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent) { return NULL; } @@ -248,8 +246,8 @@ extern void kclist_add(struct kcore_list *, void *, size_t); #endif struct proc_inode { - struct task_struct *task; - int type; + struct pid *pid; + int fd; union { int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **); int (*proc_read)(struct task_struct *task, char *page); @@ -268,4 +266,10 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; + struct vm_area_struct *tail_vma; +}; + #endif /* _LINUX_PROC_FS_H */ diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index ee918bc6e18..8b2749a259d 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -88,7 +88,6 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern int ptrace_attach(struct task_struct *tsk); extern int ptrace_detach(struct task_struct *, unsigned int); -extern void __ptrace_detach(struct task_struct *, unsigned int); extern void ptrace_disable(struct task_struct *); extern int ptrace_check_attach(struct task_struct *task, int kill); extern int ptrace_request(struct task_struct *child, long request, long addr, long data); diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 899437802ae..63df898fe2e 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -140,6 +140,7 @@ typedef __u16 bitmap_counter_t; enum bitmap_state { BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ BITMAP_HOSTENDIAN = 0x8000, }; @@ -244,15 +245,9 @@ struct bitmap { unsigned long daemon_lastrun; /* jiffies of last run */ unsigned long daemon_sleep; /* how many seconds between updates? */ - /* - * bitmap_writeback_daemon waits for file-pages that have been written, - * as there is no way to get a call-back when a page write completes. 
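
The hunk below replaces that daemon with plain reference counting on the new pending_writes field. A hedged sketch of the resulting pattern (helper names invented; the real logic lives in drivers/md/bitmap.c):

	static void example_submit_page_write(struct bitmap *bitmap)
	{
		atomic_inc(&bitmap->pending_writes);
		/* ... submit the page I/O here ... */
	}

	/* called from the I/O completion path */
	static void example_end_page_write(struct bitmap *bitmap)
	{
		if (atomic_dec_and_test(&bitmap->pending_writes))
			wake_up(&bitmap->write_wait);
	}

	/* flush: no daemon needed, just wait for the count to drain */
	static void example_wait_for_writes(struct bitmap *bitmap)
	{
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
	}
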
- */ - mdk_thread_t *writeback_daemon; - spinlock_t write_lock; + atomic_t pending_writes; /* pending writes to the bitmap file */ wait_queue_head_t write_wait; - struct list_head complete_pages; - mempool_t *write_pool; + }; /* the bitmap API */ diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index 7eaf290e10e..ba15469daf1 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -13,8 +13,10 @@ typedef struct dev_info dev_info_t; struct linear_private_data { + struct linear_private_data *prev; /* earlier version */ dev_info_t **hash_table; sector_t hash_spacing; + sector_t array_size; int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 66b44e5e0d6..eb3e547c8fe 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -85,8 +85,6 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev); extern void md_unplug_mddev(mddev_t *mddev); -extern void md_print_devices (void); - extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); @@ -97,7 +95,5 @@ extern void md_new_event(mddev_t *mddev); extern void md_update_sb(mddev_t * mddev); -#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } - #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index e2df61f5b09..c1e0ac55bab 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -40,7 +40,8 @@ typedef struct mdk_rdev_s mdk_rdev_t; * options passed in raidrun: */ -#define MAX_CHUNK_SIZE (4096*1024) +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) /* * MD's 'extended' device @@ -57,6 +58,7 @@ struct mdk_rdev_s struct page *sb_page; int sb_loaded; + __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t sb_offset; int sb_size; /* bytes in the superblock */ @@ -87,6 +89,10 @@ struct mdk_rdev_s * array and could again if we did a partial * resync from the bitmap */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that @@ -182,6 +188,8 @@ struct mddev_s #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 #define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + unsigned long recovery; int in_sync; /* known to not need resync */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index f1fbae7e390..b6ebc69bae5 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -265,9 +265,12 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ #define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 5 +#define MD_FEATURE_ALL (1|2|4) #endif diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h index b1103298a8c..c41e56a7c09 100644 --- a/include/linux/raid/raid10.h +++ b/include/linux/raid/raid10.h @@ -24,11 +24,16 @@ struct r10_private_data_s { int far_copies; /* number of copies laid out * at large strides across drives */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ int copies; /* near_copies * far_copies.
* must be <= raid_disks */ sector_t stride; /* distance between far copies. - * This is size / far_copies + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. */ int chunk_shift; /* shift from chunks to sectors */ diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 914af667044..20ed4c99763 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -212,6 +212,7 @@ struct raid5_private_data { mddev_t *mddev; struct disk_info *spare; int chunk_size, level, algorithm; + int max_degraded; int raid_disks, working_disks, failed_disks; int max_nr_stripes; diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d11d9310db..122a25c1b99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -842,8 +842,6 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; -/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ - spinlock_t proc_lock; #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ @@ -856,7 +854,6 @@ struct task_struct { /* VM state */ struct reclaim_state *reclaim_state; - struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; struct io_context *io_context; diff --git a/include/linux/security.h b/include/linux/security.h index d2c17bd91a2..51805806f97 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -862,6 +862,7 @@ struct swap_info_struct; * Permit allocation of a key and assign security data. Note that key does * not have a serial number assigned at this point. * @key points to the key. + * @flags is the allocation flags * Return 0 if permission is granted, -ve error otherwise. * @key_free: * Notification of destruction; free security data. 
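
Callers now state quota behaviour explicitly through the KEY_ALLOC_* flags introduced in the key.h hunk above, rather than passing a not_in_quota boolean. A hedged sketch of creating a kernel-internal keyring that should bypass the user's quota (the keyring name is invented):

	static int example_make_keyring(void)
	{
		struct key *keyring;

		keyring = keyring_alloc("_example", 0, 0, current,
					KEY_ALLOC_NOT_IN_QUOTA, NULL);
		if (IS_ERR(keyring))
			return PTR_ERR(keyring);
		key_put(keyring);
		return 0;
	}
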
@@ -1324,7 +1325,7 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc)(struct key *key, struct task_struct *tsk); + int (*key_alloc)(struct key *key, struct task_struct *tsk, unsigned long flags); void (*key_free)(struct key *key); int (*key_permission)(key_ref_t key_ref, struct task_struct *context, @@ -3040,9 +3041,10 @@ static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY static inline int security_key_alloc(struct key *key, - struct task_struct *tsk) + struct task_struct *tsk, + unsigned long flags) { - return security_ops->key_alloc(key, tsk); + return security_ops->key_alloc(key, tsk, flags); } static inline void security_key_free(struct key *key) @@ -3060,7 +3062,8 @@ static inline int security_key_permission(key_ref_t key_ref, #else static inline int security_key_alloc(struct key *key, - struct task_struct *tsk) + struct task_struct *tsk, + unsigned long flags) { return 0; } diff --git a/include/linux/time.h b/include/linux/time.h index 0cd696cee99..65dd85b2105 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -77,6 +77,8 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; +void timekeeping_init(void); + static inline unsigned long get_seconds(void) { return xtime.tv_sec; @@ -100,6 +102,7 @@ extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); extern struct timespec timespec_trunc(struct timespec t, unsigned gran); +extern int timekeeping_is_continuous(void); /** * timespec_to_ns - Convert timespec to nanoseconds @@ -142,6 +145,20 @@ extern struct timespec ns_to_timespec(const s64 nsec); */ extern struct timeval ns_to_timeval(const s64 nsec); +/** + * timespec_add_ns - Adds nanoseconds to a timespec + * @a: pointer to timespec to be incremented + * @ns: unsigned nanoseconds value to be added + */ +static inline void timespec_add_ns(struct timespec *a, u64 ns) +{ + ns += a->tv_nsec; + while(unlikely(ns >= NSEC_PER_SEC)) { + ns -= NSEC_PER_SEC; + a->tv_sec++; + } + a->tv_nsec = ns; +} #endif /* __KERNEL__ */ #define NFDBITS __NFDBITS diff --git a/include/linux/timex.h b/include/linux/timex.h index 34d3ccff7bb..19bb6538b49 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -303,6 +303,8 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ +#define TICK_LENGTH_SHIFT 32 + /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ extern u64 current_tick_length(void); diff --git a/include/net/tipc/tipc_bearer.h b/include/net/tipc/tipc_bearer.h index 098607cd4b7..e07136d74c2 100644 --- a/include/net/tipc/tipc_bearer.h +++ b/include/net/tipc/tipc_bearer.h @@ -49,10 +49,18 @@ #define TIPC_MEDIA_TYPE_ETH 1 +/* + * Destination address structure used by TIPC bearers when sending messages + * + * IMPORTANT: The fields of this structure MUST be stored using the specified + * byte order indicated below, as the structure is exchanged between nodes + * as part of a link setup process. 
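
A sketch of filling in the structure defined just below while honouring that byte ordering; the union member name dev_addr is an assumption taken from the full header, since only type and eth_addr are visible in this hunk:

	static void example_fill_eth_addr(struct tipc_media_addr *a, const u8 *mac)
	{
		memset(a, 0, sizeof(*a));
		a->type = htonl(TIPC_MEDIA_TYPE_ETH);	/* network byte order */
		memcpy(a->dev_addr.eth_addr, mac, 6);	/* 48-bit MAC, plain byte array */
	}
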
+ */ + struct tipc_media_addr { - __u32 type; + __u32 type; /* bearer type (network byte order) */ union { - __u8 eth_addr[6]; /* Ethernet bearer */ + __u8 eth_addr[6]; /* 48 bit Ethernet addr (byte array) */ #if 0 /* Prototypes for other possible bearer types */ diff --git a/init/initramfs.c b/init/initramfs.c index f81cfa40a71..d28c1094d7e 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -30,6 +30,7 @@ static void __init free(void *where) static __initdata struct hash { int ino, minor, major; + mode_t mode; struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; @@ -41,7 +42,8 @@ static inline int hash(int major, int minor, int ino) return tmp & 31; } -static char __init *find_link(int major, int minor, int ino, char *name) +static char __init *find_link(int major, int minor, int ino, + mode_t mode, char *name) { struct hash **p, *q; for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { @@ -51,14 +53,17 @@ static char __init *find_link(int major, int minor, int ino, char *name) continue; if ((*p)->major != major) continue; + if (((*p)->mode ^ mode) & S_IFMT) + continue; return (*p)->name; } q = (struct hash *)malloc(sizeof(struct hash)); if (!q) panic("can't allocate link hash entry"); - q->ino = ino; - q->minor = minor; q->major = major; + q->minor = minor; + q->ino = ino; + q->mode = mode; strcpy(q->name, name); q->next = NULL; *p = q; @@ -229,13 +234,25 @@ static int __init do_reset(void) static int __init maybe_link(void) { if (nlink >= 2) { - char *old = find_link(major, minor, ino, collected); + char *old = find_link(major, minor, ino, mode, collected); if (old) return (sys_link(old, collected) < 0) ? -1 : 1; } return 0; } +static void __init clean_path(char *path, mode_t mode) +{ + struct stat st; + + if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { + if (S_ISDIR(st.st_mode)) + sys_rmdir(path); + else + sys_unlink(path); + } +} + static __initdata int wfd; static int __init do_name(void) @@ -248,9 +265,15 @@ static int __init do_name(void) } if (dry_run) return 0; + clean_path(collected, mode); if (S_ISREG(mode)) { - if (maybe_link() >= 0) { - wfd = sys_open(collected, O_WRONLY|O_CREAT, mode); + int ml = maybe_link(); + if (ml >= 0) { + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; + wfd = sys_open(collected, openflags, mode); + if (wfd >= 0) { sys_fchown(wfd, uid, gid); sys_fchmod(wfd, mode); @@ -291,6 +314,7 @@ static int __init do_copy(void) static int __init do_symlink(void) { collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); sys_symlink(collected + N_ALIGN(name_len), collected); sys_lchown(collected, uid, gid); state = SkipIt; diff --git a/init/main.c b/init/main.c index f715b9b8975..9a970d317ea 100644 --- a/init/main.c +++ b/init/main.c @@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void) hrtimers_init(); softirq_init(); time_init(); + timekeeping_init(); /* * HACK ALERT! This is early. 
We're enabling the console before diff --git a/kernel/Makefile b/kernel/Makefile index f6ef00f4f90..bc4b8a7161f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4..03dcd981846 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -13,10 +13,10 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/stop_machine.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> /* This protects CPUs going up and down... */ -static DECLARE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpucontrol); static BLOCKING_NOTIFIER_HEAD(cpu_chain); @@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) if (lock_cpu_hotplug_owner != current) { if (interruptible) - ret = down_interruptible(&cpucontrol); + ret = mutex_lock_interruptible(&cpucontrol); else - down(&cpucontrol); + mutex_lock(&cpucontrol); } /* @@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) { if (--lock_cpu_hotplug_depth == 0) { lock_cpu_hotplug_owner = NULL; - up(&cpucontrol); + mutex_unlock(&cpucontrol); } } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b602f73fb38..1535af3a912 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2442,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { + struct pid *pid; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; - tsk = m->private; + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); } struct file_operations proc_cpuset_operations = { diff --git a/kernel/exit.c b/kernel/exit.c index e76bd02e930..304ef637be6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -137,12 +137,8 @@ void release_task(struct task_struct * p) { int zap_leader; task_t *leader; - struct dentry *proc_dentry; - repeat: atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); @@ -171,8 +167,7 @@ repeat: sched_exit(p); write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + proc_flush_task(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); diff --git a/kernel/fork.c b/kernel/fork.c index dfd10cb370c..9b4e54ef022 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -993,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags, if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup; - p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->children); 
INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); @@ -1159,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29a..64aab081153 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,11 +47,17 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; +static atomic_t kprobe_count; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct notifier_block kprobe_page_fault_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { - struct kprobe *kp; - if (p->break_handler) { - list_for_each_entry_rcu(kp, &old_p->list, list) { - if (kp->break_handler) - return -EEXIST; - } + if (old_p->break_handler) + return -EEXIST; list_add_tail_rcu(&p->list, &old_p->list); + old_p->break_handler = aggr_break_handler; } else list_add_rcu(&p->list, &old_p->list); + if (p->post_handler && !old_p->post_handler) + old_p->post_handler = aggr_post_handler; return 0; } @@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; - ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; - ap->break_handler = aggr_break_handler; + if (p->post_handler) + ap->post_handler = aggr_post_handler; + if (p->break_handler) + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); @@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); + if (!ret) + atomic_inc(&kprobe_count); goto out; } @@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + if (atomic_add_return(1, &kprobe_count) == \ + (ARCH_INACTIVE_KPROBE_COUNT + 1)) + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); out: @@ -536,14 +549,40 @@ valid_p: kfree(old_p); } arch_remove_kprobe(p); + } else { + mutex_lock(&kprobe_mutex); + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler){ + list_for_each_entry_rcu(list_p, &old_p->list, list){ + if (list_p->post_handler){ + cleanup_p = 2; + break; + } + } + if (cleanup_p == 0) + old_p->post_handler = NULL; + } + mutex_unlock(&kprobe_mutex); } + + /* Call 
unregister_page_fault_notifier() + * if no probes are active + */ + mutex_lock(&kprobe_mutex); + if (atomic_add_return(-1, &kprobe_count) == \ + ARCH_INACTIVE_KPROBE_COUNT) + unregister_page_fault_notifier(&kprobe_page_fault_nb); + mutex_unlock(&kprobe_mutex); + return; } static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to notified first */ + .priority = 0x7fffffff /* we need to be notified first */ }; + int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ @@ -652,6 +691,7 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + atomic_set(&kprobe_count, 0); err = arch_init_kprobes(); if (!err) diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c37695..036b6285b15 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -153,13 +153,13 @@ next: continue; count++; cursor = curr->next; - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n#%03d: ", count); printk_lock(lock, filter ? 0 : 1); goto next; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n"); } @@ -316,7 +316,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, lock held at task exit time!\n", task->comm, task->pid); @@ -325,7 +325,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) printk("exiting task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* @@ -352,7 +352,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", current->comm, current->pid, lock, from, to); @@ -362,7 +362,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) printk("freeing task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb..a5196c36a5f 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name); -#define debug_spin_lock(lock) \ - do { \ - local_irq_disable(); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock(lock) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_enable(); \ - preempt_check_resched(); \ - } while (0) - #define debug_spin_lock_save(lock, flags) \ do { \ local_irq_save(flags); \ @@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); spin_lock(lock); \ } while (0) -#define debug_spin_lock_restore(lock, flags) \ +#define debug_spin_unlock_restore(lock, flags) \ do { \ if (debug_mutex_on) \ spin_unlock(lock); \ @@ -76,20 +61,20 @@ extern void 
debug_mutex_init(struct mutex *lock, const char *name); preempt_check_resched(); \ } while (0) -#define spin_lock_mutex(lock) \ +#define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ DEBUG_WARN_ON(in_interrupt()); \ - debug_spin_lock(&debug_mutex_lock); \ + debug_spin_lock_save(&debug_mutex_lock, flags); \ spin_lock(lock); \ DEBUG_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock) \ +#define spin_unlock_mutex(lock, flags) \ do { \ spin_unlock(lock); \ - debug_spin_unlock(&debug_mutex_lock); \ + debug_spin_unlock_restore(&debug_mutex_lock, flags); \ } while (0) #define DEBUG_OFF() \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9e..7043db21bbc 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; + unsigned long flags; debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); @@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); return -EINTR; @@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); schedule(); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); } /* got the lock - rejoice! 
*/ @@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -203,10 +204,11 @@ static fastcall noinline void __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; DEBUG_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); /* * some architectures leave the lock unlocked in the fastpath failure @@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); } /* @@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) static inline int __mutex_trylock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; int prev; - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) @@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); return prev == 1; } diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b67..06918994725 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -9,8 +9,10 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock) spin_lock(lock) -#define spin_unlock_mutex(lock) spin_unlock(lock) +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e..335c5b932e1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) static int may_attach(struct task_struct *task) { - if (!task->mm) - return -EPERM; + /* May we inspect the given task? + * This check is used both for attaching with ptrace + * and for allowing access to sensitive information in /proc. + * + * ptrace_attach denies several cases that /proc allows + * because setting up the necessary parent/child relationship + * or halting the specified task is impossible. 
+ */ + int dumpable = 0; + /* Don't let security modules deny introspection */ + if (task == current) + return 0; if (((current->uid != task->euid) || (current->uid != task->suid) || (current->uid != task->uid) || @@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm) + dumpable = task->mm->dumpable; + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; return security_ptrace(current, task); @@ -176,6 +188,8 @@ repeat: goto repeat; } + if (!task->mm) + goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; @@ -200,7 +214,7 @@ out: return retval; } -void __ptrace_detach(struct task_struct *child, unsigned int data) +static inline void __ptrace_detach(struct task_struct *child, unsigned int data) { child->exit_code = data; /* .. re-parent .. */ @@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index f06d059edef..cfaf3fabeec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4247,7 +4247,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? + jiffies_to_timespec(p->policy == SCHED_FIFO ? 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; diff --git a/kernel/signal.c b/kernel/signal.c index 1b3c921737e..52adf53929f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) spin_unlock_irqrestore(&sighand->siglock, flags); } +static inline int may_ptrace_stop(void) +{ + if (!likely(current->ptrace & PT_PTRACED)) + return 0; + + if (unlikely(current->parent == current->real_parent && + (current->ptrace & PT_ATTACHED))) + return 0; + + if (unlikely(current->signal == current->parent->signal) && + unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) + return 0; + + /* + * Are we in the middle of do_coredump? + * If so and our tracer is also part of the coredump stopping + * is a deadlock situation, and pointless because our tracer + * is dead so don't allow us to stop. + * If SIGKILL was already sent before the caller unlocked + * ->siglock we must see ->core_waiters != 0. Otherwise it + * is safe to enter schedule(). + */ + if (unlikely(current->mm->core_waiters) && + unlikely(current->mm == current->parent->mm)) + return 0; + + return 1; +} + /* * This must be called with current->sighand->siglock held. 
* @@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (likely(current->ptrace & PT_PTRACED) && - likely(current->parent != current->real_parent || - !(current->ptrace & PT_ATTACHED)) && - (likely(current->parent->signal != current->signal) || - !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/time.c b/kernel/time.c index b00ddc71ced..5bd48974764 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 00000000000..e1dfd8e86cc --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1 @@ +obj-y += clocksource.o jiffies.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 00000000000..74eca5939bd --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include <linux/clocksource.h> +#include <linux/sysdev.h> +#include <linux/init.h> +#include <linux/module.h> + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. 
+ */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * clocksource_get_next - Returns the selected clocksource + * + */ +struct clocksource *clocksource_get_next(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. If there is a clocksource + * name that matches the override string, it returns that clocksource. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * clocksource_register - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. + */ +int clocksource_register(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. " + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} +EXPORT_SYMBOL(clocksource_register); + +/** + * clocksource_reselect - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scanned for the best + * clocksource. 
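For illustration, a hypothetical driver registers with the interface above by filling in a struct clocksource and calling clocksource_register(). This is only a sketch: the counter, its 1 MHz frequency, and the rating and shift values are invented, and a real driver derives mult from its actual counter frequency:

    #include <linux/clocksource.h>
    #include <linux/time.h>
    #include <linux/init.h>

    static cycle_t example_read(void)
    {
            return (cycle_t)0;	/* would read a free-running hardware counter */
    }

    static struct clocksource clocksource_example = {
            .name		= "example",
            .rating		= 200,		/* well above jiffies (rating 0) */
            .read		= example_read,
            .mask		= 0xffffffff,	/* 32-bit counter */
            .shift		= 22,
            .is_continuous	= 1,		/* free running */
    };

    static int __init example_clocksource_init(void)
    {
            u32 hz = 1000000;	/* assumed 1 MHz counter */

            /* mult = (NSEC_PER_SEC << shift) / hz, so cycles convert to ns */
            clocksource_example.mult =
                    (u32)((((u64)NSEC_PER_SEC) << clocksource_example.shift) / hz);
            return clocksource_register(&clocksource_example);
    }
    module_init(example_clocksource_init);

Once registered, select_clocksource() will prefer it over jiffies on the next rescan because of its higher rating.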
+ */ +void clocksource_reselect(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} +EXPORT_SYMBOL(clocksource_reselect); + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selection. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip off \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name.
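Once init_clocksource_sysfs() above has registered the sysdev, the override can also be exercised from userspace. An assumed userspace sketch (sysfs mounted at /sys) that does the equivalent of booting with clocksource=acpi_pm:

    #include <stdio.h>

    int main(void)
    {
            const char *attr =
                    "/sys/devices/system/clocksource/clocksource0/current_clocksource";
            FILE *f = fopen(attr, "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("acpi_pm\n", f);	/* picked up by sysfs_override_clocksource() */
            return fclose(f) ? 1 : 0;
    }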
+ */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 00000000000..126bb30c4af --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/init.h> + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability of the timer + * interrupt hardware to accurately tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the numerator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32 bits when + * HZ=100.
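The mult/shift pair implements the fixed-point conversion ns = (cycles * mult) >> shift, so with .mult = NSEC_PER_JIFFY << 8 and .shift = 8 each jiffy converts back to exactly NSEC_PER_JIFFY nanoseconds while leaving eight fractional bits for NTP to steer. A standalone userspace sketch of the arithmetic, assuming HZ=1000 (one jiffy = 1,000,000 ns):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const unsigned shift = 8;			/* JIFFIES_SHIFT */
            const uint32_t nsec_per_jiffy = 1000000;	/* HZ=1000 */
            const uint32_t mult = nsec_per_jiffy << shift;	/* the .mult value */

            uint64_t cycles = 3;	/* three timer ticks */
            uint64_t ns = (cycles * mult) >> shift;

            /* prints "3 ticks = 3000000 ns" */
            printf("%llu ticks = %llu ns\n",
                   (unsigned long long)cycles, (unsigned long long)ns);
            return 0;
    }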
+ */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); diff --git a/kernel/timer.c b/kernel/timer.c index eb97371b87d..5bb6b7976ee 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -597,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -747,27 +746,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. */ if (time_next_adjust != 0) { @@ -780,36 +766,378 @@ static void update_wall_time_one_tick(void) * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ u64 current_tick_length(void) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. 
+ * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); + + return ret; } -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include <linux/clocksource.h> +static struct clocksource *clock; /* pointer to current clocksource */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). */ -static void update_wall_time(unsigned long ticks) +static inline void __get_realtime_clock_ts(struct timespec *ts) { + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. 
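__get_realtime_clock_ts() above is the classic lockless seqlock reader: sample the sequence count, copy the shared data, and retry if a writer got in between. The same pattern in isolation, with hypothetical names:

    #include <linux/seqlock.h>
    #include <linux/time.h>

    static seqlock_t example_lock = SEQLOCK_UNLOCKED;
    static struct timespec example_time;

    static struct timespec example_snapshot(void)
    {
            struct timespec ts;
            unsigned long seq;

            do {
                    seq = read_seqbegin(&example_lock);	/* sample counter */
                    ts = example_time;			/* copy shared data */
            } while (read_seqretry(&example_lock, seq));	/* writer raced? retry */

            return ts;
    }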
+ */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time, updates NTP and notifies hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + new = clocksource_get_next(); + if (clock != new) { + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { - ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + clock->cycle_last = clocksource_read(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code.
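A hypothetical in-kernel caller of the new do_settimeofday() only needs to supply a normalized timespec, since values with tv_nsec >= NSEC_PER_SEC are rejected up front:

    #include <linux/time.h>

    static int example_set_time(time_t sec, long nsec)
    {
            struct timespec ts = { .tv_sec = sec, .tv_nsec = nsec };

            /* 0 on success, -EINVAL if tv_nsec is out of range */
            return do_settimeofday(&ts);
    }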
+ */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + clock->cycle_last = clocksource_read(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead another tick, + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +{ + int adj; + + /* + * As soon as the machine is synchronized to the external time + * source this should be the common case. + */ + error >>= 2; + if (likely(sign > 0 ? error <= *interval : error >= *interval)) + return sign; + + /* + * An extra look ahead dampens the effect of the current error, + * which can grow quite large with continously late updates, as + * it would dominate the adjustment value and can lead to + * oscillation. + */ + error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + error -= clock->xtime_interval >> 1; + + adj = 0; + while (1) { + error >>= 1; + if (sign > 0 ? error <= *interval : error >= *interval) + break; + adj++; + } + + /* + * Add the current adjustments to the error and take the offset + * into account, the latter can cause the error to be hardly + * reduced at the next tick. Check the error again if there's + * room for another adjustment, thus further reducing the error + * which otherwise had to be corrected at the next update. + */ + error = (error << 1) - *interval + *offset; + if (sign > 0 ? error > *interval : error < *interval) + adj++; + + *interval <<= adj; + *offset <<= adj; + return sign << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + adj = clocksource_bigadjust(1, error, &interval, &offset); + } else if (error < -interval) { + interval = -interval; + offset = -offset; + adj = clocksource_bigadjust(-1, error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); +} + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + cycle_t offset; + + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. 
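The accumulation loop that follows can be demonstrated in isolation: when the timer fires late, the cycle offset covers several intervals and the loop consumes whole intervals until less than one remains. A userspace sketch with invented numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const unsigned shift = 8;
            const uint64_t cycle_interval = 1000;		/* cycles per tick */
            const uint64_t xtime_interval = 1000000ULL << shift; /* ns per tick, shifted */

            uint64_t offset = 3500;		/* 3.5 ticks elapsed: timer was late */
            uint64_t xtime_nsec = 0;

            while (offset >= cycle_interval) {	/* consume whole intervals */
                    xtime_nsec += xtime_interval;
                    offset -= cycle_interval;
            }
            /* prints "accumulated 3000000 ns, 500 cycles left over" */
            printf("accumulated %llu ns, %llu cycles left over\n",
                   (unsigned long long)(xtime_nsec >> shift),
                   (unsigned long long)offset);
            return 0;
    }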
+ */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } - } while (ticks); + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, tick_nsec); + } } /* @@ -915,10 +1243,8 @@ static inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } diff --git a/lib/idr.c b/lib/idr.c index de19030a999..4d096819511 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/module.h> #endif +#include <linux/err.h> #include <linux/string.h> #include <linux/idr.h> @@ -398,6 +399,48 @@ void *idr_find(struct idr *idp, int id) } EXPORT_SYMBOL(idr_find); +/** + * idr_replace - replace pointer for given id + * @idp: idr handle + * @ptr: pointer you want associated with the id + * @id: lookup key + * + * Replace the pointer registered with an id and return the old value. + * A -ENOENT return indicates that @id was not found. + * A -EINVAL return indicates that @id was not within valid constraints. + * + * The caller must serialize vs idr_find(), idr_get_new(), and idr_remove(). 
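A hypothetical caller of idr_replace() (defined just below) must test the return value with IS_ERR(), because lookup failures come back as encoded error pointers rather than NULL:

    #include <linux/idr.h>
    #include <linux/err.h>

    static int example_swap(struct idr *idp, int id, void *new_obj)
    {
            void *old = idr_replace(idp, new_obj, id);

            if (IS_ERR(old))
                    return PTR_ERR(old);	/* -ENOENT or -EINVAL */

            /* 'old' is the pointer previously registered under 'id' */
            return 0;
    }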
+ */ +void *idr_replace(struct idr *idp, void *ptr, int id) +{ + int n; + struct idr_layer *p, *old_p; + + n = idp->layers * IDR_BITS; + p = idp->top; + + id &= MAX_ID_MASK; + + if (id >= (1 << n)) + return ERR_PTR(-EINVAL); + + n -= IDR_BITS; + while ((n > 0) && p) { + p = p->ary[(id >> n) & IDR_MASK]; + n -= IDR_BITS; + } + + n = id & IDR_MASK; + if (unlikely(p == NULL || !test_bit(n, &p->bitmap))) + return ERR_PTR(-ENOENT); + + old_p = p->ary[n]; + p->ary[n] = ptr; + + return old_p; +} +EXPORT_SYMBOL(idr_replace); + static void idr_cache_ctor(void * idr_layer, kmem_cache_t *idr_layer_cache, unsigned long flags) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73e0f23b7f5..6b9740bbf4c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; struct file *file = vma->vm_file; @@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file *m, void *v) return 0; mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + get_vma_policy(priv->task, vma, vma->vm_start)); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1891,7 +1891,7 @@ out: kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } diff --git a/mm/swap.c b/mm/swap.c index 03ae2076f92..990868afc1c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -86,8 +86,7 @@ int rotate_reclaimable_page(struct page *page) zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { - list_del(&page->lru); - list_add_tail(&page->lru, &zone->inactive_list); + list_move_tail(&page->lru, &zone->inactive_list); inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) diff --git a/net/atm/mpc.c b/net/atm/mpc.c index a48a5d58040..5fe77df0018 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1113,10 +1113,9 @@ static void check_qos_and_open_shortcut(struct k_message *msg, struct mpoa_clien static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) { - unsigned char *ip; - uint32_t dst_ip = msg->content.in_info.in_dst_ip; in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc); + dprintk("mpoa: (%s) MPOA_res_reply_rcvd: ip %u.%u.%u.%u\n", mpc->dev->name, NIPQUAD(dst_ip)); ddprintk("mpoa: (%s) MPOA_res_reply_rcvd() entry = %p", mpc->dev->name, entry); if(entry == NULL){ diff --git a/net/core/dev.c b/net/core/dev.c index ea2469398bd..f1c52cbd6ef 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -230,7 +230,7 @@ extern void netdev_unregister_sysfs(struct net_device *); * For efficiency */ -int netdev_nit; +static int netdev_nit; /* * Add a protocol ID to the list. 
Now that the input handler is @@ -1325,9 +1325,12 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) nskb->next = NULL; rc = dev->hard_start_xmit(nskb, dev); if (unlikely(rc)) { + nskb->next = skb->next; skb->next = nskb; return rc; } + if (unlikely(netif_queue_stopped(dev) && skb->next)) + return NETDEV_TX_BUSY; } while (skb->next); skb->destructor = DEV_GSO_CB(skb)->destructor; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9cb78183038..471da451cd4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -54,6 +54,7 @@ static atomic_t trapped; sizeof(struct iphdr) + sizeof(struct ethhdr)) static void zap_completion_queue(void); +static void arp_reply(struct sk_buff *skb); static void queue_process(void *p) { @@ -153,6 +154,22 @@ static void poll_napi(struct netpoll *np) } } +static void service_arp_queue(struct netpoll_info *npi) +{ + struct sk_buff *skb; + + if (unlikely(!npi)) + return; + + skb = skb_dequeue(&npi->arp_tx); + + while (skb != NULL) { + arp_reply(skb); + skb = skb_dequeue(&npi->arp_tx); + } + return; +} + void netpoll_poll(struct netpoll *np) { if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller) @@ -163,6 +180,8 @@ void netpoll_poll(struct netpoll *np) if (np->dev->poll) poll_napi(np); + service_arp_queue(np->dev->npinfo); + zap_completion_queue(); } @@ -279,14 +298,10 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) * network drivers do not expect to be called if the queue is * stopped. */ - if (netif_queue_stopped(np->dev)) { - netif_tx_unlock(np->dev); - netpoll_poll(np); - udelay(50); - continue; - } + status = NETDEV_TX_BUSY; + if (!netif_queue_stopped(np->dev)) + status = np->dev->hard_start_xmit(skb, np->dev); - status = np->dev->hard_start_xmit(skb, np->dev); netif_tx_unlock(np->dev); /* success */ @@ -446,7 +461,9 @@ int __netpoll_rx(struct sk_buff *skb) int proto, len, ulen; struct iphdr *iph; struct udphdr *uh; - struct netpoll *np = skb->dev->npinfo->rx_np; + struct netpoll_info *npi = skb->dev->npinfo; + struct netpoll *np = npi->rx_np; + if (!np) goto out; @@ -456,7 +473,7 @@ int __netpoll_rx(struct sk_buff *skb) /* check if netpoll clients need ARP */ if (skb->protocol == __constant_htons(ETH_P_ARP) && atomic_read(&trapped)) { - arp_reply(skb); + skb_queue_tail(&npi->arp_tx, skb); return 1; } @@ -651,6 +668,7 @@ int netpoll_setup(struct netpoll *np) npinfo->poll_owner = -1; npinfo->tries = MAX_RETRIES; spin_lock_init(&npinfo->rx_lock); + skb_queue_head_init(&npinfo->arp_tx); } else npinfo = ndev->npinfo; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8e5044ba3ab..6edbb90cbce 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1739,12 +1739,15 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config, struct ts_state *state) { + unsigned int ret; + config->get_next_block = skb_ts_get_next_block; config->finish = skb_ts_finish; skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); - return textsearch_find(config, state); + ret = textsearch_find(config, state); + return (ret <= to - from ? 
ret : UINT_MAX); } /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0e029c4e290..c04176be7ed 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2166,7 +2166,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) if (!pskb_may_pull(skb, thlen)) goto out; - oldlen = ~htonl(skb->len); + oldlen = (u16)~skb->len; __skb_pull(skb, thlen); segs = skb_segment(skb, sg); @@ -2174,7 +2174,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) goto out; len = skb_shinfo(skb)->gso_size; - delta = csum_add(oldlen, htonl(thlen + len)); + delta = htonl(oldlen + (thlen + len)); skb = segs; th = skb->h.th; @@ -2183,10 +2183,10 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) do { th->fin = th->psh = 0; - if (skb->ip_summed == CHECKSUM_NONE) { - th->check = csum_fold(csum_partial( - skb->h.raw, thlen, csum_add(skb->csum, delta))); - } + th->check = ~csum_fold(th->check + delta); + if (skb->ip_summed != CHECKSUM_HW) + th->check = csum_fold(csum_partial(skb->h.raw, thlen, + skb->csum)); seq += len; skb = skb->next; @@ -2196,11 +2196,11 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) th->cwr = 0; } while (skb->next); - if (skb->ip_summed == CHECKSUM_NONE) { - delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw)); - th->check = csum_fold(csum_partial( - skb->h.raw, thlen, csum_add(skb->csum, delta))); - } + delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); + th->check = ~csum_fold(th->check + delta); + if (skb->ip_summed != CHECKSUM_HW) + th->check = csum_fold(csum_partial(skb->h.raw, thlen, + skb->csum)); out: return segs; diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index b3b9097c87c..c11737f472d 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -725,15 +725,17 @@ void nr_link_failed(ax25_cb *ax25, int reason) struct nr_node *nr_node = NULL; spin_lock_bh(&nr_neigh_list_lock); - nr_neigh_for_each(s, node, &nr_neigh_list) + nr_neigh_for_each(s, node, &nr_neigh_list) { if (s->ax25 == ax25) { nr_neigh_hold(s); nr_neigh = s; break; } + } spin_unlock_bh(&nr_neigh_list_lock); - if (nr_neigh == NULL) return; + if (nr_neigh == NULL) + return; nr_neigh->ax25 = NULL; ax25_cb_put(ax25); @@ -743,11 +745,13 @@ void nr_link_failed(ax25_cb *ax25, int reason) return; } spin_lock_bh(&nr_node_list_lock); - nr_node_for_each(nr_node, node, &nr_node_list) + nr_node_for_each(nr_node, node, &nr_node_list) { nr_node_lock(nr_node); - if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) + if (nr_node->which < nr_node->count && + nr_node->routes[nr_node->which].neighbour == nr_neigh) nr_node->which++; nr_node_unlock(nr_node); + } spin_unlock_bh(&nr_node_list_lock); nr_neigh_put(nr_neigh); } diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c index c4aeb7d4026..d07122b57e0 100644 --- a/net/rxrpc/call.c +++ b/net/rxrpc/call.c @@ -1098,8 +1098,7 @@ static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, call->app_ready_seq = pmsg->seq; call->app_ready_qty += pmsg->dsize; - list_del_init(&pmsg->link); - list_add_tail(&pmsg->link, &call->app_readyq); + list_move_tail(&pmsg->link, &call->app_readyq); } /* see if we've got the last packet yet */ diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c index 0e0a4553499..573b572f8f9 100644 --- a/net/rxrpc/connection.c +++ b/net/rxrpc/connection.c @@ -402,8 +402,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn) /* move to graveyard queue */ _debug("burying connection: {%08x}", ntohl(conn->conn_id)); - list_del(&conn->link); - 
list_add_tail(&conn->link, &peer->conn_graveyard); + list_move_tail(&conn->link, &peer->conn_graveyard); rxrpc_krxtimod_add_timer(&conn->timeout, rxrpc_conn_timeout * HZ); diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c index 1aadd026d35..cea4eb5e249 100644 --- a/net/rxrpc/krxsecd.c +++ b/net/rxrpc/krxsecd.c @@ -160,8 +160,7 @@ void rxrpc_krxsecd_clear_transport(struct rxrpc_transport *trans) list_for_each_safe(_p, _n, &rxrpc_krxsecd_initmsgq) { msg = list_entry(_p, struct rxrpc_message, link); if (msg->trans == trans) { - list_del(&msg->link); - list_add_tail(&msg->link, &tmp); + list_move_tail(&msg->link, &tmp); atomic_dec(&rxrpc_krxsecd_qcount); } } diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 2c4ecbe5008..54128040a12 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -49,13 +49,19 @@ #include "name_table.h" #include "bcast.h" - #define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ #define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ #define BCLINK_LOG_BUF_SIZE 0 +/* + * Loss rate for incoming broadcast frames; used to test retransmission code. + * Set to N to cause every N'th frame to be discarded; 0 => don't discard any. + */ + +#define TIPC_BCAST_LOSS_RATE 0 + /** * struct bcbearer_pair - a pair of bearers used by broadcast link * @primary: pointer to primary bearer @@ -75,7 +81,14 @@ struct bcbearer_pair { * @bearer: (non-standard) broadcast bearer structure * @media: (non-standard) broadcast media structure * @bpairs: array of bearer pairs - * @bpairs_temp: array of bearer pairs used during creation of "bpairs" + * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort() + * @remains: temporary node map used by tipc_bcbearer_send() + * @remains_new: temporary node map used tipc_bcbearer_send() + * + * Note: The fields labelled "temporary" are incorporated into the bearer + * to avoid consuming potentially limited stack space through the use of + * large local variables within multicast routines. Concurrent access is + * prevented through use of the spinlock "bc_lock". 
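The rxrpc and mm/swap.c hunks in this series all make the same substitution: an open-coded list_del() followed by list_add_tail() collapses into a single list_move_tail(). A minimal sketch with hypothetical names:

    #include <linux/list.h>

    struct example_item {
            struct list_head link;
    };

    static void example_requeue(struct example_item *item,
                                struct list_head *graveyard)
    {
            /* was:
             *	list_del(&item->link);
             *	list_add_tail(&item->link, graveyard);
             */
            list_move_tail(&item->link, graveyard);
    }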
*/ struct bcbearer { @@ -83,6 +96,8 @@ struct bcbearer { struct media media; struct bcbearer_pair bpairs[MAX_BEARERS]; struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1]; + struct node_map remains; + struct node_map remains_new; }; /** @@ -165,21 +180,18 @@ static int bclink_ack_allowed(u32 n) * @after: sequence number of last packet to *not* retransmit * @to: sequence number of last packet to retransmit * - * Called with 'node' locked, bc_lock unlocked + * Called with bc_lock locked */ static void bclink_retransmit_pkt(u32 after, u32 to) { struct sk_buff *buf; - spin_lock_bh(&bc_lock); buf = bcl->first_out; while (buf && less_eq(buf_seqno(buf), after)) { buf = buf->next; } - if (buf != NULL) - tipc_link_retransmit(bcl, buf, mod(to - after)); - spin_unlock_bh(&bc_lock); + tipc_link_retransmit(bcl, buf, mod(to - after)); } /** @@ -346,8 +358,10 @@ static void tipc_bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 g for (; buf; buf = buf->next) { u32 seqno = buf_seqno(buf); - if (mod(seqno - prev) != 1) + if (mod(seqno - prev) != 1) { buf = NULL; + break; + } if (seqno == gap_after) break; prev = seqno; @@ -399,7 +413,10 @@ int tipc_bclink_send_msg(struct sk_buff *buf) */ void tipc_bclink_recv_pkt(struct sk_buff *buf) -{ +{ +#if (TIPC_BCAST_LOSS_RATE) + static int rx_count = 0; +#endif struct tipc_msg *msg = buf_msg(buf); struct node* node = tipc_node_find(msg_prevnode(msg)); u32 next_in; @@ -420,9 +437,13 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf) tipc_node_lock(node); tipc_bclink_acknowledge(node, msg_bcast_ack(msg)); tipc_node_unlock(node); + spin_lock_bh(&bc_lock); bcl->stats.recv_nacks++; + bcl->owner->next = node; /* remember requestor */ bclink_retransmit_pkt(msg_bcgap_after(msg), msg_bcgap_to(msg)); + bcl->owner->next = NULL; + spin_unlock_bh(&bc_lock); } else { tipc_bclink_peek_nack(msg_destnode(msg), msg_bcast_tag(msg), @@ -433,6 +454,14 @@ void tipc_bclink_recv_pkt(struct sk_buff *buf) return; } +#if (TIPC_BCAST_LOSS_RATE) + if (++rx_count == TIPC_BCAST_LOSS_RATE) { + rx_count = 0; + buf_discard(buf); + return; + } +#endif + tipc_node_lock(node); receive: deferred = node->bclink.deferred_head; @@ -531,12 +560,8 @@ static int tipc_bcbearer_send(struct sk_buff *buf, { static int send_count = 0; - struct node_map *remains; - struct node_map *remains_new; - struct node_map *remains_tmp; int bp_index; int swap_time; - int err; /* Prepare buffer for broadcasting (if first time trying to send it) */ @@ -557,9 +582,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, /* Send buffer over bearers until all targets reached */ - remains = kmalloc(sizeof(struct node_map), GFP_ATOMIC); - remains_new = kmalloc(sizeof(struct node_map), GFP_ATOMIC); - *remains = tipc_cltr_bcast_nodes; + bcbearer->remains = tipc_cltr_bcast_nodes; for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) { struct bearer *p = bcbearer->bpairs[bp_index].primary; @@ -568,8 +591,8 @@ static int tipc_bcbearer_send(struct sk_buff *buf, if (!p) break; /* no more bearers to try */ - tipc_nmap_diff(remains, &p->nodes, remains_new); - if (remains_new->count == remains->count) + tipc_nmap_diff(&bcbearer->remains, &p->nodes, &bcbearer->remains_new); + if (bcbearer->remains_new.count == bcbearer->remains.count) continue; /* bearer pair doesn't add anything */ if (!p->publ.blocked && @@ -587,27 +610,17 @@ swap: bcbearer->bpairs[bp_index].primary = s; bcbearer->bpairs[bp_index].secondary = p; update: - if (remains_new->count == 0) { - err = TIPC_OK; - goto out; - } + if (bcbearer->remains_new.count == 0) + 
return TIPC_OK; - /* swap map */ - remains_tmp = remains; - remains = remains_new; - remains_new = remains_tmp; + bcbearer->remains = bcbearer->remains_new; } /* Unable to reach all targets */ bcbearer->bearer.publ.blocked = 1; bcl->stats.bearer_congs++; - err = ~TIPC_OK; - - out: - kfree(remains_new); - kfree(remains); - return err; + return ~TIPC_OK; } /** @@ -765,7 +778,7 @@ int tipc_bclink_init(void) bclink = kmalloc(sizeof(*bclink), GFP_ATOMIC); if (!bcbearer || !bclink) { nomem: - warn("Memory squeeze; Failed to create multicast link\n"); + warn("Multicast link creation failed, no memory\n"); kfree(bcbearer); bcbearer = NULL; kfree(bclink); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 0e3be2ab330..b243d9d495f 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -180,7 +180,7 @@ static inline void tipc_port_list_add(struct port_list *pl_ptr, u32 port) if (!item->next) { item->next = kmalloc(sizeof(*item), GFP_ATOMIC); if (!item->next) { - warn("Memory squeeze: multicast destination port list is incomplete\n"); + warn("Incomplete multicast delivery, no memory\n"); return; } item->next->next = NULL; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index e213a8e5485..4fa24b5e891 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -112,39 +112,42 @@ int tipc_register_media(u32 media_type, goto exit; if (!media_name_valid(name)) { - warn("Media registration error: illegal name <%s>\n", name); + warn("Media <%s> rejected, illegal name\n", name); goto exit; } if (!bcast_addr) { - warn("Media registration error: no broadcast address supplied\n"); + warn("Media <%s> rejected, no broadcast address\n", name); goto exit; } if ((bearer_priority < TIPC_MIN_LINK_PRI) && (bearer_priority > TIPC_MAX_LINK_PRI)) { - warn("Media registration error: priority %u\n", bearer_priority); + warn("Media <%s> rejected, illegal priority (%u)\n", name, + bearer_priority); goto exit; } if ((link_tolerance < TIPC_MIN_LINK_TOL) || (link_tolerance > TIPC_MAX_LINK_TOL)) { - warn("Media registration error: tolerance %u\n", link_tolerance); + warn("Media <%s> rejected, illegal tolerance (%u)\n", name, + link_tolerance); goto exit; } media_id = media_count++; if (media_id >= MAX_MEDIA) { - warn("Attempt to register more than %u media\n", MAX_MEDIA); + warn("Media <%s> rejected, media limit reached (%u)\n", name, + MAX_MEDIA); media_count--; goto exit; } for (i = 0; i < media_id; i++) { if (media_list[i].type_id == media_type) { - warn("Attempt to register second media with type %u\n", + warn("Media <%s> rejected, duplicate type (%u)\n", name, media_type); media_count--; goto exit; } if (!strcmp(name, media_list[i].name)) { - warn("Attempt to re-register media name <%s>\n", name); + warn("Media <%s> rejected, duplicate name\n", name); media_count--; goto exit; } @@ -283,6 +286,9 @@ static struct bearer *bearer_find(const char *name) struct bearer *b_ptr; u32 i; + if (tipc_mode != TIPC_NET_MODE) + return NULL; + for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { if (b_ptr->active && (!strcmp(b_ptr->publ.name, name))) return b_ptr; @@ -475,26 +481,33 @@ int tipc_enable_bearer(const char *name, u32 bcast_scope, u32 priority) u32 i; int res = -EINVAL; - if (tipc_mode != TIPC_NET_MODE) + if (tipc_mode != TIPC_NET_MODE) { + warn("Bearer <%s> rejected, not supported in standalone mode\n", + name); return -ENOPROTOOPT; - - if (!bearer_name_validate(name, &b_name) || - !tipc_addr_domain_valid(bcast_scope) || - !in_scope(bcast_scope, tipc_own_addr)) + } + if (!bearer_name_validate(name, &b_name)) 
{ + warn("Bearer <%s> rejected, illegal name\n", name); return -EINVAL; - + } + if (!tipc_addr_domain_valid(bcast_scope) || + !in_scope(bcast_scope, tipc_own_addr)) { + warn("Bearer <%s> rejected, illegal broadcast scope\n", name); + return -EINVAL; + } if ((priority < TIPC_MIN_LINK_PRI || priority > TIPC_MAX_LINK_PRI) && - (priority != TIPC_MEDIA_LINK_PRI)) + (priority != TIPC_MEDIA_LINK_PRI)) { + warn("Bearer <%s> rejected, illegal priority\n", name); return -EINVAL; + } write_lock_bh(&tipc_net_lock); - if (!tipc_bearers) - goto failed; m_ptr = media_find(b_name.media_name); if (!m_ptr) { - warn("No media <%s>\n", b_name.media_name); + warn("Bearer <%s> rejected, media <%s> not registered\n", name, + b_name.media_name); goto failed; } @@ -510,23 +523,24 @@ restart: continue; } if (!strcmp(name, tipc_bearers[i].publ.name)) { - warn("Bearer <%s> already enabled\n", name); + warn("Bearer <%s> rejected, already enabled\n", name); goto failed; } if ((tipc_bearers[i].priority == priority) && (++with_this_prio > 2)) { if (priority-- == 0) { - warn("Third bearer <%s> with priority %u, unable to lower to %u\n", - name, priority + 1, priority); + warn("Bearer <%s> rejected, duplicate priority\n", + name); goto failed; } - warn("Third bearer <%s> with priority %u, lowering to %u\n", + warn("Bearer <%s> priority adjustment required %u->%u\n", name, priority + 1, priority); goto restart; } } if (bearer_id >= MAX_BEARERS) { - warn("Attempt to enable more than %d bearers\n", MAX_BEARERS); + warn("Bearer <%s> rejected, bearer limit reached (%u)\n", + name, MAX_BEARERS); goto failed; } @@ -536,7 +550,7 @@ restart: strcpy(b_ptr->publ.name, name); res = m_ptr->enable_bearer(&b_ptr->publ); if (res) { - warn("Failed to enable bearer <%s>\n", name); + warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); goto failed; } @@ -573,9 +587,6 @@ int tipc_block_bearer(const char *name) struct link *l_ptr; struct link *temp_l_ptr; - if (tipc_mode != TIPC_NET_MODE) - return -ENOPROTOOPT; - read_lock_bh(&tipc_net_lock); b_ptr = bearer_find(name); if (!b_ptr) { @@ -584,6 +595,7 @@ int tipc_block_bearer(const char *name) return -EINVAL; } + info("Blocking bearer <%s>\n", name); spin_lock_bh(&b_ptr->publ.lock); b_ptr->publ.blocked = 1; list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { @@ -595,7 +607,6 @@ int tipc_block_bearer(const char *name) } spin_unlock_bh(&b_ptr->publ.lock); read_unlock_bh(&tipc_net_lock); - info("Blocked bearer <%s>\n", name); return TIPC_OK; } @@ -611,15 +622,13 @@ static int bearer_disable(const char *name) struct link *l_ptr; struct link *temp_l_ptr; - if (tipc_mode != TIPC_NET_MODE) - return -ENOPROTOOPT; - b_ptr = bearer_find(name); if (!b_ptr) { warn("Attempt to disable unknown bearer <%s>\n", name); return -EINVAL; } + info("Disabling bearer <%s>\n", name); tipc_disc_stop_link_req(b_ptr->link_req); spin_lock_bh(&b_ptr->publ.lock); b_ptr->link_req = NULL; @@ -635,7 +644,6 @@ static int bearer_disable(const char *name) tipc_link_delete(l_ptr); } spin_unlock_bh(&b_ptr->publ.lock); - info("Disabled bearer <%s>\n", name); memset(b_ptr, 0, sizeof(struct bearer)); return TIPC_OK; } diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c index 1aed81584e9..1dcb6940e33 100644 --- a/net/tipc/cluster.c +++ b/net/tipc/cluster.c @@ -60,8 +60,10 @@ struct cluster *tipc_cltr_create(u32 addr) int alloc; c_ptr = (struct cluster *)kmalloc(sizeof(*c_ptr), GFP_ATOMIC); - if (c_ptr == NULL) + if (c_ptr == NULL) { + warn("Cluster creation failure, no memory\n"); return NULL; + } 
memset(c_ptr, 0, sizeof(*c_ptr)); c_ptr->addr = tipc_addr(tipc_zone(addr), tipc_cluster(addr), 0); @@ -70,30 +72,32 @@ struct cluster *tipc_cltr_create(u32 addr) else max_nodes = tipc_max_nodes + 1; alloc = sizeof(void *) * (max_nodes + 1); + c_ptr->nodes = (struct node **)kmalloc(alloc, GFP_ATOMIC); if (c_ptr->nodes == NULL) { + warn("Cluster creation failure, no memory for node area\n"); kfree(c_ptr); return NULL; } - memset(c_ptr->nodes, 0, alloc); + memset(c_ptr->nodes, 0, alloc); + if (in_own_cluster(addr)) tipc_local_nodes = c_ptr->nodes; c_ptr->highest_slave = LOWEST_SLAVE - 1; c_ptr->highest_node = 0; z_ptr = tipc_zone_find(tipc_zone(addr)); - if (z_ptr == NULL) { + if (!z_ptr) { z_ptr = tipc_zone_create(addr); } - if (z_ptr != NULL) { - tipc_zone_attach_cluster(z_ptr, c_ptr); - c_ptr->owner = z_ptr; - } - else { + if (!z_ptr) { + kfree(c_ptr->nodes); kfree(c_ptr); - c_ptr = NULL; + return NULL; } + tipc_zone_attach_cluster(z_ptr, c_ptr); + c_ptr->owner = z_ptr; return c_ptr; } diff --git a/net/tipc/config.c b/net/tipc/config.c index 48b5de2dbe6..3ec502fac8c 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -291,13 +291,22 @@ static struct sk_buff *cfg_set_own_addr(void) if (!tipc_addr_node_valid(addr)) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (node address)"); - if (tipc_own_addr) + if (tipc_mode == TIPC_NET_MODE) return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED " (cannot change node address once assigned)"); + tipc_own_addr = addr; + + /* + * Must release all spinlocks before calling start_net() because + * Linux version of TIPC calls eth_media_start() which calls + * register_netdevice_notifier() which may block! + * + * Temporarily releasing the lock should be harmless for non-Linux TIPC, + * but Linux version of eth_media_start() should really be reworked + * so that it can be called with spinlocks held. 
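+ *
+ * The resulting call pattern in cfg_set_own_addr() is therefore
+ * (a sketch of the code that follows):
+ *
+ *	spin_unlock_bh(&config_lock);
+ *	tipc_core_start_net();		/* may block */
+ *	spin_lock_bh(&config_lock);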
+ */ spin_unlock_bh(&config_lock); - tipc_core_stop_net(); - tipc_own_addr = addr; tipc_core_start_net(); spin_lock_bh(&config_lock); return tipc_cfg_reply_none(); @@ -350,50 +359,21 @@ static struct sk_buff *cfg_set_max_subscriptions(void) static struct sk_buff *cfg_set_max_ports(void) { - int orig_mode; u32 value; if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED)) return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); value = *(u32 *)TLV_DATA(req_tlv_area); value = ntohl(value); + if (value == tipc_max_ports) + return tipc_cfg_reply_none(); if (value != delimit(value, 127, 65535)) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (max ports must be 127-65535)"); - - if (value == tipc_max_ports) - return tipc_cfg_reply_none(); - - if (atomic_read(&tipc_user_count) > 2) + if (tipc_mode != TIPC_NOT_RUNNING) return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED - " (cannot change max ports while TIPC users exist)"); - - spin_unlock_bh(&config_lock); - orig_mode = tipc_get_mode(); - if (orig_mode == TIPC_NET_MODE) - tipc_core_stop_net(); - tipc_core_stop(); + " (cannot change max ports while TIPC is active)"); tipc_max_ports = value; - tipc_core_start(); - if (orig_mode == TIPC_NET_MODE) - tipc_core_start_net(); - spin_lock_bh(&config_lock); - return tipc_cfg_reply_none(); -} - -static struct sk_buff *set_net_max(int value, int *parameter) -{ - int orig_mode; - - if (value != *parameter) { - orig_mode = tipc_get_mode(); - if (orig_mode == TIPC_NET_MODE) - tipc_core_stop_net(); - *parameter = value; - if (orig_mode == TIPC_NET_MODE) - tipc_core_start_net(); - } - return tipc_cfg_reply_none(); } @@ -405,10 +385,16 @@ static struct sk_buff *cfg_set_max_zones(void) return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); value = *(u32 *)TLV_DATA(req_tlv_area); value = ntohl(value); + if (value == tipc_max_zones) + return tipc_cfg_reply_none(); if (value != delimit(value, 1, 255)) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (max zones must be 1-255)"); - return set_net_max(value, &tipc_max_zones); + if (tipc_mode == TIPC_NET_MODE) + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change max zones once TIPC has joined a network)"); + tipc_max_zones = value; + return tipc_cfg_reply_none(); } static struct sk_buff *cfg_set_max_clusters(void) @@ -419,8 +405,8 @@ static struct sk_buff *cfg_set_max_clusters(void) return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); value = *(u32 *)TLV_DATA(req_tlv_area); value = ntohl(value); - if (value != 1) - return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + if (value != delimit(value, 1, 1)) + return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (max clusters fixed at 1)"); return tipc_cfg_reply_none(); } @@ -433,10 +419,16 @@ static struct sk_buff *cfg_set_max_nodes(void) return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); value = *(u32 *)TLV_DATA(req_tlv_area); value = ntohl(value); + if (value == tipc_max_nodes) + return tipc_cfg_reply_none(); if (value != delimit(value, 8, 2047)) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (max nodes must be 8-2047)"); - return set_net_max(value, &tipc_max_nodes); + if (tipc_mode == TIPC_NET_MODE) + return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED + " (cannot change max nodes once TIPC has joined a network)"); + tipc_max_nodes = value; + return tipc_cfg_reply_none(); } static struct sk_buff *cfg_set_max_slaves(void) @@ -461,15 +453,16 @@ static struct sk_buff *cfg_set_netid(void) return 
tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR); value = *(u32 *)TLV_DATA(req_tlv_area); value = ntohl(value); + if (value == tipc_net_id) + return tipc_cfg_reply_none(); if (value != delimit(value, 1, 9999)) return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE " (network id must be 1-9999)"); - - if (tipc_own_addr) + if (tipc_mode == TIPC_NET_MODE) return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED - " (cannot change network id once part of network)"); - - return set_net_max(value, &tipc_net_id); + " (cannot change network id once TIPC has joined a network)"); + tipc_net_id = value; + return tipc_cfg_reply_none(); } struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area, @@ -649,7 +642,7 @@ static void cfg_named_msg_event(void *userdata, if ((size < sizeof(*req_hdr)) || (size != TCM_ALIGN(ntohl(req_hdr->tcm_len))) || (ntohs(req_hdr->tcm_flags) != TCM_F_REQUEST)) { - warn("discarded invalid configuration message\n"); + warn("Invalid configuration message discarded\n"); return; } diff --git a/net/tipc/core.c b/net/tipc/core.c index 3d0a8ee4e1d..5003acb1591 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -2,7 +2,7 @@ * net/tipc/core.c: TIPC module code * * Copyright (c) 2003-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems + * Copyright (c) 2005-2006, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -57,7 +57,7 @@ void tipc_socket_stop(void); int tipc_netlink_start(void); void tipc_netlink_stop(void); -#define MOD_NAME "tipc_start: " +#define TIPC_MOD_VER "1.6.1" #ifndef CONFIG_TIPC_ZONES #define CONFIG_TIPC_ZONES 3 @@ -198,7 +198,7 @@ static int __init tipc_init(void) tipc_max_publications = 10000; tipc_max_subscriptions = 2000; tipc_max_ports = delimit(CONFIG_TIPC_PORTS, 127, 65536); - tipc_max_zones = delimit(CONFIG_TIPC_ZONES, 1, 511); + tipc_max_zones = delimit(CONFIG_TIPC_ZONES, 1, 255); tipc_max_clusters = delimit(CONFIG_TIPC_CLUSTERS, 1, 1); tipc_max_nodes = delimit(CONFIG_TIPC_NODES, 8, 2047); tipc_max_slaves = delimit(CONFIG_TIPC_SLAVE_NODES, 0, 2047); @@ -224,6 +224,7 @@ module_exit(tipc_exit); MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(TIPC_MOD_VER); /* Native TIPC API for kernel-space applications (see tipc.h) */ diff --git a/net/tipc/core.h b/net/tipc/core.h index 1f2e8b27a13..86f54f3512f 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -2,7 +2,7 @@ * net/tipc/core.h: Include file for TIPC global declarations * * Copyright (c) 2005-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems + * Copyright (c) 2005-2006, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -111,10 +111,6 @@ void tipc_dump(struct print_buf*,const char *fmt, ...); #else -#ifndef DBG_OUTPUT -#define DBG_OUTPUT NULL -#endif - /* * TIPC debug support not included: * - system messages are printed to system console @@ -129,6 +125,19 @@ void tipc_dump(struct print_buf*,const char *fmt, ...); #define msg_dbg(msg,txt) do {} while (0) #define dump(fmt,arg...) do {} while (0) + +/* + * TIPC_OUTPUT is defined to be the system console, while DBG_OUTPUT is + * the null print buffer. This ensures that any system or debug messages + * that are generated without using the above macros are handled correctly.
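+ *
+ * For example, with debug support not included (a sketch of the effect):
+ *
+ *	tipc_printf(TIPC_OUTPUT, "...");	/* appears on the system console */
+ *	tipc_printf(DBG_OUTPUT, "...");		/* discarded via the null buffer */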
+ */ + +#undef TIPC_OUTPUT +#define TIPC_OUTPUT TIPC_CONS + +#undef DBG_OUTPUT +#define DBG_OUTPUT NULL + #endif @@ -309,7 +318,7 @@ static inline struct sk_buff *buf_acquire(u32 size) * buf_discard - frees a TIPC message buffer * @skb: message buffer * - * Frees a new buffer. If passed NULL, just returns. + * Frees a message buffer. If passed NULL, just returns. */ static inline void buf_discard(struct sk_buff *skb) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 92601385e5f..2b844120312 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -2,7 +2,7 @@ * net/tipc/discover.c * * Copyright (c) 2003-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems + * Copyright (c) 2005-2006, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -176,7 +176,6 @@ void tipc_disc_recv_msg(struct sk_buff *buf) n_ptr = tipc_node_create(orig); } if (n_ptr == NULL) { - warn("Memory squeeze; Failed to create node\n"); return; } spin_lock_bh(&n_ptr->lock); @@ -191,10 +190,8 @@ void tipc_disc_recv_msg(struct sk_buff *buf) } addr = &link->media_addr; if (memcmp(addr, &media_addr, sizeof(*addr))) { - char addr_string[16]; - - warn("New bearer address for %s\n", - addr_string_fill(addr_string, orig)); + warn("Resetting link <%s>, peer interface address changed\n", + link->name); memcpy(addr, &media_addr, sizeof(*addr)); tipc_link_reset(link); } @@ -270,8 +267,8 @@ static void disc_timeout(struct link_req *req) /* leave timer interval "as is" if already at a "normal" rate */ } else { req->timer_intv *= 2; - if (req->timer_intv > TIPC_LINK_REQ_SLOW) - req->timer_intv = TIPC_LINK_REQ_SLOW; + if (req->timer_intv > TIPC_LINK_REQ_FAST) + req->timer_intv = TIPC_LINK_REQ_FAST; if ((req->timer_intv == TIPC_LINK_REQ_FAST) && (req->bearer->nodes.count)) req->timer_intv = TIPC_LINK_REQ_SLOW; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 7a252785f72..682da4a2804 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -2,7 +2,7 @@ * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2006, Ericsson AB - * Copyright (c) 2005, Wind River Systems + * Copyright (c) 2005-2006, Wind River Systems * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -98,17 +98,19 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, u32 size; if (likely(eb_ptr->bearer)) { - size = msg_size((struct tipc_msg *)buf->data); - skb_trim(buf, size); - if (likely(buf->len == size)) { - buf->next = NULL; - tipc_recv_msg(buf, eb_ptr->bearer); - } else { - kfree_skb(buf); + if (likely(!dev->promiscuity) || + !memcmp(buf->mac.raw,dev->dev_addr,ETH_ALEN) || + !memcmp(buf->mac.raw,dev->broadcast,ETH_ALEN)) { + size = msg_size((struct tipc_msg *)buf->data); + skb_trim(buf, size); + if (likely(buf->len == size)) { + buf->next = NULL; + tipc_recv_msg(buf, eb_ptr->bearer); + return TIPC_OK; + } } - } else { - kfree_skb(buf); } + kfree_skb(buf); return TIPC_OK; } @@ -125,8 +127,7 @@ static int enable_bearer(struct tipc_bearer *tb_ptr) /* Find device with specified name */ - while (dev && dev->name && - (memcmp(dev->name, driver_name, strlen(dev->name)))) { + while (dev && dev->name && strncmp(dev->name, driver_name, IFNAMSIZ)) { dev = dev->next; } if (!dev) @@ -252,7 +253,9 @@ int tipc_eth_media_start(void) if (eth_started) return -EINVAL; - memset(&bcast_addr, 0xff, sizeof(bcast_addr)); + bcast_addr.type = htonl(TIPC_MEDIA_TYPE_ETH); + memset(&bcast_addr.dev_addr, 0xff, ETH_ALEN); + memset(eth_bearers, 0, sizeof(eth_bearers)); res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth", diff --git a/net/tipc/link.c b/net/tipc/link.c index 784b24b6d10..d6465805374 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -419,7 +419,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer, l_ptr = (struct link *)kmalloc(sizeof(*l_ptr), GFP_ATOMIC); if (!l_ptr) { - warn("Memory squeeze; Failed to create link\n"); + warn("Link creation failed, no memory\n"); return NULL; } memset(l_ptr, 0, sizeof(*l_ptr)); @@ -469,7 +469,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer, if (!pb) { kfree(l_ptr); - warn("Memory squeeze; Failed to create link\n"); + warn("Link creation failed, no memory for print buffer\n"); return NULL; } tipc_printbuf_init(&l_ptr->print_buf, pb, LINK_LOG_BUF_SIZE); @@ -574,7 +574,6 @@ void tipc_link_wakeup_ports(struct link *l_ptr, int all) break; list_del_init(&p_ptr->wait_list); p_ptr->congested_link = NULL; - assert(p_ptr->wakeup); spin_lock_bh(p_ptr->publ.lock); p_ptr->publ.congested = 0; p_ptr->wakeup(&p_ptr->publ); @@ -691,6 +690,7 @@ void tipc_link_reset(struct link *l_ptr) struct sk_buff *buf; u32 prev_state = l_ptr->state; u32 checkpoint = l_ptr->next_in_no; + int was_active_link = tipc_link_is_active(l_ptr); msg_set_session(l_ptr->pmsg, msg_session(l_ptr->pmsg) + 1); @@ -712,7 +712,7 @@ void tipc_link_reset(struct link *l_ptr) tipc_printf(TIPC_CONS, "\nReset link <%s>\n", l_ptr->name); dbg_link_dump(); #endif - if (tipc_node_has_active_links(l_ptr->owner) && + if (was_active_link && tipc_node_has_active_links(l_ptr->owner) && l_ptr->owner->permit_changeover) { l_ptr->reset_checkpoint = checkpoint; l_ptr->exp_msg_count = START_CHANGEOVER; @@ -755,7 +755,7 @@ void tipc_link_reset(struct link *l_ptr) static void link_activate(struct link *l_ptr) { - l_ptr->next_in_no = 1; + l_ptr->next_in_no = l_ptr->stats.recv_info = 1; tipc_node_link_up(l_ptr->owner, l_ptr); tipc_bearer_add_dest(l_ptr->b_ptr, l_ptr->addr); link_send_event(tipc_cfg_link_event, l_ptr, 1); @@ -820,6 +820,8 @@ static void link_state_event(struct link *l_ptr, unsigned event) break; case RESET_MSG: dbg_link("RES -> RR\n"); + info("Resetting link <%s>, requested by peer\n", 
+ l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; @@ -844,6 +846,8 @@ static void link_state_event(struct link *l_ptr, unsigned event) break; case RESET_MSG: dbg_link("RES -> RR\n"); + info("Resetting link <%s>, requested by peer " + "while probing\n", l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_RESET; l_ptr->fsm_msg_cnt = 0; @@ -875,6 +879,8 @@ static void link_state_event(struct link *l_ptr, unsigned event) } else { /* Link has failed */ dbg_link("-> RU (%u probes unanswered)\n", l_ptr->fsm_msg_cnt); + warn("Resetting link <%s>, peer not responding\n", + l_ptr->name); tipc_link_reset(l_ptr); l_ptr->state = RESET_UNKNOWN; l_ptr->fsm_msg_cnt = 0; @@ -1050,7 +1056,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf) msg_dbg(msg, "TIPC: Congestion, throwing away\n"); buf_discard(buf); if (imp > CONN_MANAGER) { - warn("Resetting <%s>, send queue full", l_ptr->name); + warn("Resetting link <%s>, send queue full", l_ptr->name); tipc_link_reset(l_ptr); } return dsz; @@ -1135,9 +1141,13 @@ int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector) if (n_ptr) { tipc_node_lock(n_ptr); l_ptr = n_ptr->active_links[selector & 1]; - dbg("tipc_link_send: found link %x for dest %x\n", l_ptr, dest); if (l_ptr) { + dbg("tipc_link_send: found link %x for dest %x\n", l_ptr, dest); res = tipc_link_send_buf(l_ptr, buf); + } else { + dbg("Attempt to send msg to unreachable node:\n"); + msg_dbg(buf_msg(buf),">>>"); + buf_discard(buf); } tipc_node_unlock(n_ptr); } else { @@ -1242,8 +1252,6 @@ int tipc_link_send_sections_fast(struct port *sender, int res; u32 selector = msg_origport(hdr) & 1; - assert(destaddr != tipc_own_addr); - again: /* * Try building message using port's max_pkt hint. @@ -1604,40 +1612,121 @@ void tipc_link_push_queue(struct link *l_ptr) tipc_bearer_schedule(l_ptr->b_ptr, l_ptr); } +static void link_reset_all(unsigned long addr) +{ + struct node *n_ptr; + char addr_string[16]; + u32 i; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find((u32)addr); + if (!n_ptr) { + read_unlock_bh(&tipc_net_lock); + return; /* node no longer exists */ + } + + tipc_node_lock(n_ptr); + + warn("Resetting all links to %s\n", + addr_string_fill(addr_string, n_ptr->addr)); + + for (i = 0; i < MAX_BEARERS; i++) { + if (n_ptr->links[i]) { + link_print(n_ptr->links[i], TIPC_OUTPUT, + "Resetting link\n"); + tipc_link_reset(n_ptr->links[i]); + } + } + + tipc_node_unlock(n_ptr); + read_unlock_bh(&tipc_net_lock); +} + +static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf) +{ + struct tipc_msg *msg = buf_msg(buf); + + warn("Retransmission failure on link <%s>\n", l_ptr->name); + tipc_msg_print(TIPC_OUTPUT, msg, ">RETR-FAIL>"); + + if (l_ptr->addr) { + + /* Handle failure on standard link */ + + link_print(l_ptr, TIPC_OUTPUT, "Resetting link\n"); + tipc_link_reset(l_ptr); + + } else { + + /* Handle failure on broadcast link */ + + struct node *n_ptr; + char addr_string[16]; + + tipc_printf(TIPC_OUTPUT, "Msg seq number: %u, ", msg_seqno(msg)); + tipc_printf(TIPC_OUTPUT, "Outstanding acks: %u\n", (u32)TIPC_SKB_CB(buf)->handle); + + n_ptr = l_ptr->owner->next; + tipc_node_lock(n_ptr); + + addr_string_fill(addr_string, n_ptr->addr); + tipc_printf(TIPC_OUTPUT, "Multicast link info for %s\n", addr_string); + tipc_printf(TIPC_OUTPUT, "Supported: %d, ", n_ptr->bclink.supported); + tipc_printf(TIPC_OUTPUT, "Acked: %u\n", n_ptr->bclink.acked); + tipc_printf(TIPC_OUTPUT, "Last in: %u, ", n_ptr->bclink.last_in); + 
tipc_printf(TIPC_OUTPUT, "Gap after: %u, ", n_ptr->bclink.gap_after); + tipc_printf(TIPC_OUTPUT, "Gap to: %u\n", n_ptr->bclink.gap_to); + tipc_printf(TIPC_OUTPUT, "Nack sync: %u\n\n", n_ptr->bclink.nack_sync); + + tipc_k_signal((Handler)link_reset_all, (unsigned long)n_ptr->addr); + + tipc_node_unlock(n_ptr); + + l_ptr->stale_count = 0; + } +} + void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *buf, u32 retransmits) { struct tipc_msg *msg; + if (!buf) + return; + + msg = buf_msg(buf); + dbg("Retransmitting %u in link %x\n", retransmits, l_ptr); - if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr) && buf && !skb_cloned(buf)) { - msg_dbg(buf_msg(buf), ">NO_RETR->BCONG>"); - dbg_print_link(l_ptr, " "); - l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf)); - l_ptr->retransm_queue_size = retransmits; - return; + if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) { + if (!skb_cloned(buf)) { + msg_dbg(msg, ">NO_RETR->BCONG>"); + dbg_print_link(l_ptr, " "); + l_ptr->retransm_queue_head = msg_seqno(msg); + l_ptr->retransm_queue_size = retransmits; + return; + } else { + /* Don't retransmit if driver already has the buffer */ + } + } else { + /* Detect repeated retransmit failures on uncongested bearer */ + + if (l_ptr->last_retransmitted == msg_seqno(msg)) { + if (++l_ptr->stale_count > 100) { + link_retransmit_failure(l_ptr, buf); + return; + } + } else { + l_ptr->last_retransmitted = msg_seqno(msg); + l_ptr->stale_count = 1; + } } + while (retransmits && (buf != l_ptr->next_out) && buf && !skb_cloned(buf)) { msg = buf_msg(buf); msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) { - /* Catch if retransmissions fail repeatedly: */ - if (l_ptr->last_retransmitted == msg_seqno(msg)) { - if (++l_ptr->stale_count > 100) { - tipc_msg_print(TIPC_CONS, buf_msg(buf), ">RETR>"); - info("...Retransmitted %u times\n", - l_ptr->stale_count); - link_print(l_ptr, TIPC_CONS, "Resetting Link\n"); - tipc_link_reset(l_ptr); - break; - } - } else { - l_ptr->stale_count = 0; - } - l_ptr->last_retransmitted = msg_seqno(msg); - msg_dbg(buf_msg(buf), ">RETR>"); buf = buf->next; retransmits--; @@ -1650,6 +1739,7 @@ void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *buf, return; } } + l_ptr->retransm_queue_head = l_ptr->retransm_queue_size = 0; } @@ -1720,6 +1810,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) link_recv_non_seq(buf); continue; } + + if (unlikely(!msg_short(msg) && + (msg_destnode(msg) != tipc_own_addr))) + goto cont; + n_ptr = tipc_node_find(msg_prevnode(msg)); if (unlikely(!n_ptr)) goto cont; @@ -2140,7 +2235,7 @@ static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf) if (msg_linkprio(msg) && (msg_linkprio(msg) != l_ptr->priority)) { - warn("Changing prio <%s>: %u->%u\n", + warn("Resetting link <%s>, priority change %u->%u\n", l_ptr->name, l_ptr->priority, msg_linkprio(msg)); l_ptr->priority = msg_linkprio(msg); tipc_link_reset(l_ptr); /* Enforce change to take effect */ @@ -2209,17 +2304,22 @@ void tipc_link_tunnel(struct link *l_ptr, u32 length = msg_size(msg); tunnel = l_ptr->owner->active_links[selector & 1]; - if (!tipc_link_is_up(tunnel)) + if (!tipc_link_is_up(tunnel)) { + warn("Link changeover error, " + "tunnel link no longer available\n"); return; + } msg_set_size(tunnel_hdr, length + INT_H_SIZE); buf = buf_acquire(length + INT_H_SIZE); - if (!buf) + if (!buf) { + warn("Link changeover error, " + "unable to send tunnel 
msg\n"); return; + } memcpy(buf->data, (unchar *)tunnel_hdr, INT_H_SIZE); memcpy(buf->data + INT_H_SIZE, (unchar *)msg, length); dbg("%c->%c:", l_ptr->b_ptr->net_plane, tunnel->b_ptr->net_plane); msg_dbg(buf_msg(buf), ">SEND>"); - assert(tunnel); tipc_link_send_buf(tunnel, buf); } @@ -2235,23 +2335,27 @@ void tipc_link_changeover(struct link *l_ptr) u32 msgcount = l_ptr->out_queue_size; struct sk_buff *crs = l_ptr->first_out; struct link *tunnel = l_ptr->owner->active_links[0]; - int split_bundles = tipc_node_has_redundant_links(l_ptr->owner); struct tipc_msg tunnel_hdr; + int split_bundles; if (!tunnel) return; - if (!l_ptr->owner->permit_changeover) + if (!l_ptr->owner->permit_changeover) { + warn("Link changeover error, " + "peer did not permit changeover\n"); return; + } msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL, ORIGINAL_MSG, TIPC_OK, INT_H_SIZE, l_ptr->addr); msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); msg_set_msgcnt(&tunnel_hdr, msgcount); + dbg("Link changeover requires %u tunnel messages\n", msgcount); + if (!l_ptr->first_out) { struct sk_buff *buf; - assert(!msgcount); buf = buf_acquire(INT_H_SIZE); if (buf) { memcpy(buf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); @@ -2261,10 +2365,15 @@ void tipc_link_changeover(struct link *l_ptr) msg_dbg(&tunnel_hdr, "EMPTY>SEND>"); tipc_link_send_buf(tunnel, buf); } else { - warn("Memory squeeze; link changeover failed\n"); + warn("Link changeover error, " + "unable to send changeover msg\n"); } return; } + + split_bundles = (l_ptr->owner->active_links[0] != + l_ptr->owner->active_links[1]); + while (crs) { struct tipc_msg *msg = buf_msg(crs); @@ -2310,7 +2419,8 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel) msg_set_size(&tunnel_hdr, length + INT_H_SIZE); outbuf = buf_acquire(length + INT_H_SIZE); if (outbuf == NULL) { - warn("Memory squeeze; buffer duplication failed\n"); + warn("Link changeover error, " + "unable to send duplicate msg\n"); return; } memcpy(outbuf->data, (unchar *)&tunnel_hdr, INT_H_SIZE); @@ -2364,11 +2474,15 @@ static int link_recv_changeover_msg(struct link **l_ptr, u32 msg_count = msg_msgcnt(tunnel_msg); dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; - assert(dest_link != *l_ptr); if (!dest_link) { msg_dbg(tunnel_msg, "NOLINK/<REC<"); goto exit; } + if (dest_link == *l_ptr) { + err("Unexpected changeover message on link <%s>\n", + (*l_ptr)->name); + goto exit; + } dbg("%c<-%c:", dest_link->b_ptr->net_plane, (*l_ptr)->b_ptr->net_plane); *l_ptr = dest_link; @@ -2381,7 +2495,7 @@ static int link_recv_changeover_msg(struct link **l_ptr, } *buf = buf_extract(tunnel_buf,INT_H_SIZE); if (*buf == NULL) { - warn("Memory squeeze; failed to extract msg\n"); + warn("Link changeover error, duplicate msg dropped\n"); goto exit; } msg_dbg(tunnel_msg, "TNL<REC<"); @@ -2393,13 +2507,17 @@ static int link_recv_changeover_msg(struct link **l_ptr, if (tipc_link_is_up(dest_link)) { msg_dbg(tunnel_msg, "UP/FIRST/<REC<"); + info("Resetting link <%s>, changeover initiated by peer\n", + dest_link->name); tipc_link_reset(dest_link); dest_link->exp_msg_count = msg_count; + dbg("Expecting %u tunnelled messages\n", msg_count); if (!msg_count) goto exit; } else if (dest_link->exp_msg_count == START_CHANGEOVER) { msg_dbg(tunnel_msg, "BLK/FIRST/<REC<"); dest_link->exp_msg_count = msg_count; + dbg("Expecting %u tunnelled messages\n", msg_count); if (!msg_count) goto exit; } @@ -2407,6 +2525,8 @@ static int link_recv_changeover_msg(struct link **l_ptr, /* Receive original message */ if 
(dest_link->exp_msg_count == 0) { + warn("Link changeover error, " + "got too many tunnelled messages\n"); msg_dbg(tunnel_msg, "OVERDUE/DROP/<REC<"); dbg_print_link(dest_link, "LINK:"); goto exit; @@ -2422,7 +2542,7 @@ static int link_recv_changeover_msg(struct link **l_ptr, buf_discard(tunnel_buf); return 1; } else { - warn("Memory squeeze; dropped incoming msg\n"); + warn("Link changeover error, original msg dropped\n"); } } exit: @@ -2444,13 +2564,8 @@ void tipc_link_recv_bundle(struct sk_buff *buf) while (msgcount--) { obuf = buf_extract(buf, pos); if (obuf == NULL) { - char addr_string[16]; - - warn("Buffer allocation failure;\n"); - warn(" incoming message(s) from %s lost\n", - addr_string_fill(addr_string, - msg_orignode(buf_msg(buf)))); - return; + warn("Link unable to unbundle message(s)\n"); + break; }; pos += align(msg_size(buf_msg(obuf))); msg_dbg(buf_msg(obuf), " /"); @@ -2508,7 +2623,7 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) } fragm = buf_acquire(fragm_sz + INT_H_SIZE); if (fragm == NULL) { - warn("Memory squeeze; failed to fragment msg\n"); + warn("Link unable to fragment message\n"); dsz = -ENOMEM; goto exit; } @@ -2623,7 +2738,7 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, set_fragm_size(pbuf,fragm_sz); set_expected_frags(pbuf,exp_fragm_cnt - 1); } else { - warn("Memory squeeze; got no defragmenting buffer\n"); + warn("Link unable to reassemble fragmented message\n"); } buf_discard(fbuf); return 0; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index a3bbc891f95..f0b063bcc2a 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -127,7 +127,7 @@ void tipc_named_publish(struct publication *publ) buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0); if (!buf) { - warn("Memory squeeze; failed to distribute publication\n"); + warn("Publication distribution failure\n"); return; } @@ -151,7 +151,7 @@ void tipc_named_withdraw(struct publication *publ) buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0); if (!buf) { - warn("Memory squeeze; failed to distribute withdrawal\n"); + warn("Withdrawal distribution failure\n"); return; } @@ -174,7 +174,6 @@ void tipc_named_node_up(unsigned long node) u32 rest; u32 max_item_buf; - assert(in_own_cluster(node)); read_lock_bh(&tipc_nametbl_lock); max_item_buf = TIPC_MAX_USER_MSG_SIZE / ITEM_SIZE; max_item_buf *= ITEM_SIZE; @@ -185,8 +184,8 @@ void tipc_named_node_up(unsigned long node) left = (rest <= max_item_buf) ?
rest : max_item_buf; rest -= left; buf = named_prepare_buf(PUBLICATION, left, node); - if (buf == NULL) { - warn("Memory Squeeze; could not send publication\n"); + if (!buf) { + warn("Bulk publication distribution failure\n"); goto exit; } item = (struct distr_item *)msg_data(buf_msg(buf)); @@ -221,15 +220,24 @@ exit: static void node_is_down(struct publication *publ) { struct publication *p; + write_lock_bh(&tipc_nametbl_lock); dbg("node_is_down: withdrawing %u, %u, %u\n", publ->type, publ->lower, publ->upper); publ->key += 1222345; p = tipc_nametbl_remove_publ(publ->type, publ->lower, publ->node, publ->ref, publ->key); - assert(p == publ); write_unlock_bh(&tipc_nametbl_lock); - kfree(publ); + + if (p != publ) { + err("Unable to remove publication from failed node\n" + "(type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, publ->ref, publ->key); + } + + if (p) { + kfree(p); + } } /** @@ -275,9 +283,15 @@ void tipc_named_recv(struct sk_buff *buf) if (publ) { tipc_nodesub_unsubscribe(&publ->subscr); kfree(publ); + } else { + err("Unable to remove publication by node 0x%x\n" + "(type=%u, lower=%u, ref=%u, key=%u)\n", + msg_orignode(msg), + ntohl(item->type), ntohl(item->lower), + ntohl(item->ref), ntohl(item->key)); } } else { - warn("tipc_named_recv: unknown msg\n"); + warn("Unrecognized name table message received\n"); } item++; } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index d129422fc5c..38571306aba 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -71,7 +71,7 @@ struct sub_seq { * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type'; * sub-sequences are sorted in ascending order * @alloc: number of sub-sequences currently in array - * @first_free: upper bound of highest sub-sequence + 1 + * @first_free: array index of first unused sub-sequence entry * @ns_list: links to adjacent name sequences in hash chain * @subscriptions: list of subscriptions for this 'type' * @lock: spinlock controlling access to name sequence structure @@ -120,7 +120,7 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper, struct publication *publ = (struct publication *)kmalloc(sizeof(*publ), GFP_ATOMIC); if (publ == NULL) { - warn("Memory squeeze; failed to create publication\n"); + warn("Publication creation failure, no memory\n"); return NULL; } @@ -165,7 +165,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea struct sub_seq *sseq = tipc_subseq_alloc(1); if (!nseq || !sseq) { - warn("Memory squeeze; failed to create name sequence\n"); + warn("Name sequence creation failed, no memory\n"); kfree(nseq); kfree(sseq); return NULL; @@ -175,7 +175,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea nseq->lock = SPIN_LOCK_UNLOCKED; nseq->type = type; nseq->sseqs = sseq; - dbg("tipc_nameseq_create() nseq = %x type %u, ssseqs %x, ff: %u\n", + dbg("tipc_nameseq_create(): nseq = %p, type %u, ssseqs %p, ff: %u\n", nseq, type, nseq->sseqs, nseq->first_free); nseq->alloc = 1; INIT_HLIST_NODE(&nseq->ns_list); @@ -253,16 +253,16 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, struct sub_seq *sseq; int created_subseq = 0; - assert(nseq->first_free <= nseq->alloc); sseq = nameseq_find_subseq(nseq, lower); - dbg("nameseq_ins: for seq %x,<%u,%u>, found sseq %x\n", + dbg("nameseq_ins: for seq %p, {%u,%u}, found sseq %p\n", nseq, type, lower, sseq); if (sseq) { /* Lower end overlaps existing entry => need an exact match */ if 
((sseq->lower != lower) || (sseq->upper != upper)) { - warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper); + warn("Cannot publish {%u,%u,%u}, overlap error\n", + type, lower, upper); return NULL; } } else { @@ -277,25 +277,27 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, if ((inspos < nseq->first_free) && (upper >= nseq->sseqs[inspos].lower)) { - warn("Overlapping publ <%u,%u,%u>\n", type, lower, upper); + warn("Cannot publish {%u,%u,%u}, overlap error\n", + type, lower, upper); return NULL; } /* Ensure there is space for new sub-sequence */ if (nseq->first_free == nseq->alloc) { - struct sub_seq *sseqs = nseq->sseqs; - nseq->sseqs = tipc_subseq_alloc(nseq->alloc * 2); - if (nseq->sseqs != NULL) { - memcpy(nseq->sseqs, sseqs, - nseq->alloc * sizeof (struct sub_seq)); - kfree(sseqs); - dbg("Allocated %u sseqs\n", nseq->alloc); - nseq->alloc *= 2; - } else { - warn("Memory squeeze; failed to create sub-sequence\n"); + struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2); + + if (!sseqs) { + warn("Cannot publish {%u,%u,%u}, no memory\n", + type, lower, upper); return NULL; } + dbg("Allocated %u more sseqs\n", nseq->alloc); + memcpy(sseqs, nseq->sseqs, + nseq->alloc * sizeof(struct sub_seq)); + kfree(nseq->sseqs); + nseq->sseqs = sseqs; + nseq->alloc *= 2; } dbg("Have %u sseqs for type %u\n", nseq->alloc, type); @@ -311,7 +313,7 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, sseq->upper = upper; created_subseq = 1; } - dbg("inserting (%u %u %u) from %x:%u into sseq %x(%u,%u) of seq %x\n", + dbg("inserting {%u,%u,%u} from <0x%x:%u> into sseq %p(%u,%u) of seq %p\n", type, lower, upper, node, port, sseq, sseq->lower, sseq->upper, nseq); @@ -320,7 +322,7 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, publ = publ_create(type, lower, upper, scope, node, port, key); if (!publ) return NULL; - dbg("inserting publ %x, node=%x publ->node=%x, subscr->node=%x\n", + dbg("inserting publ %p, node=0x%x publ->node=0x%x, subscr->node=%p\n", publ, node, publ->node, publ->subscr.node); if (!sseq->zone_list) @@ -367,45 +369,47 @@ static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq, /** * tipc_nameseq_remove_publ - + * + * NOTE: There may be cases where TIPC is asked to remove a publication + * that is not in the name table. For example, if another node issues a + * publication for a name sequence that overlaps an existing name sequence + * the publication will not be recorded, which means the publication won't + * be found when the name sequence is later withdrawn by that node. + * A failed withdraw request simply returns a failure indication and lets the + * caller issue any error or warning messages associated with such a problem. 
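+ *
+ * A sketch of that scenario, using hypothetical name sequence values:
+ *
+ *	tipc_nametbl_insert_publ(100, 0, 99, ...);	/* recorded */
+ *	tipc_nametbl_insert_publ(100, 50, 149, ...);	/* overlaps => NULL */
+ *	tipc_nametbl_remove_publ(100, 50, ...);		/* not found => NULL */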
*/ static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 inst, u32 node, u32 ref, u32 key) { struct publication *publ; + struct publication *curr; struct publication *prev; struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); struct sub_seq *free; struct subscription *s, *st; int removed_subseq = 0; - assert(nseq); - - if (!sseq) { - int i; - - warn("Withdraw unknown <%u,%u>?\n", nseq->type, inst); - assert(nseq->sseqs); - dbg("Dumping subseqs %x for %x, alloc = %u,ff=%u\n", - nseq->sseqs, nseq, nseq->alloc, - nseq->first_free); - for (i = 0; i < nseq->first_free; i++) { - dbg("Subseq %u(%x): lower = %u,upper = %u\n", - i, &nseq->sseqs[i], nseq->sseqs[i].lower, - nseq->sseqs[i].upper); - } + if (!sseq) return NULL; - } - dbg("nameseq_remove: seq: %x, sseq %x, <%u,%u> key %u\n", + + dbg("tipc_nameseq_remove_publ: seq: %p, sseq %p, {%u,%u}, key %u\n", nseq, sseq, nseq->type, inst, key); + /* Remove publication from zone scope list */ + prev = sseq->zone_list; publ = sseq->zone_list->zone_list_next; while ((publ->key != key) || (publ->ref != ref) || (publ->node && (publ->node != node))) { prev = publ; publ = publ->zone_list_next; - assert(prev != sseq->zone_list); + if (prev == sseq->zone_list) { + + /* Prevent endless loop if publication not found */ + + return NULL; + } } if (publ != sseq->zone_list) prev->zone_list_next = publ->zone_list_next; @@ -416,14 +420,24 @@ static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 i sseq->zone_list = NULL; } + /* Remove publication from cluster scope list, if present */ + if (in_own_cluster(node)) { prev = sseq->cluster_list; - publ = sseq->cluster_list->cluster_list_next; - while ((publ->key != key) || (publ->ref != ref) || - (publ->node && (publ->node != node))) { - prev = publ; - publ = publ->cluster_list_next; - assert(prev != sseq->cluster_list); + curr = sseq->cluster_list->cluster_list_next; + while (curr != publ) { + prev = curr; + curr = curr->cluster_list_next; + if (prev == sseq->cluster_list) { + + /* Prevent endless loop for malformed list */ + + err("Unable to de-list cluster publication\n" + "({%u,%u}, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, + publ->ref, publ->key); + goto end_cluster; + } } if (publ != sseq->cluster_list) prev->cluster_list_next = publ->cluster_list_next; @@ -434,15 +448,26 @@ static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 i sseq->cluster_list = NULL; } } +end_cluster: + + /* Remove publication from node scope list, if present */ if (node == tipc_own_addr) { prev = sseq->node_list; - publ = sseq->node_list->node_list_next; - while ((publ->key != key) || (publ->ref != ref) || - (publ->node && (publ->node != node))) { - prev = publ; - publ = publ->node_list_next; - assert(prev != sseq->node_list); + curr = sseq->node_list->node_list_next; + while (curr != publ) { + prev = curr; + curr = curr->node_list_next; + if (prev == sseq->node_list) { + + /* Prevent endless loop for malformed list */ + + err("Unable to de-list node publication\n" + "({%u,%u}, node=0x%x, ref=%u, key=%u)\n", + publ->type, publ->lower, publ->node, + publ->ref, publ->key); + goto end_node; + } } if (publ != sseq->node_list) prev->node_list_next = publ->node_list_next; @@ -453,22 +478,18 @@ static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 i sseq->node_list = NULL; } } - assert(!publ->node || (publ->node == node)); - assert(publ->ref == ref); - assert(publ->key == key); +end_node: - /* - * Contract subseq list if no
more publications: - */ - if (!sseq->node_list && !sseq->cluster_list && !sseq->zone_list) { + /* Contract subseq list if no more publications for that subseq */ + + if (!sseq->zone_list) { free = &nseq->sseqs[nseq->first_free--]; memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof (*sseq)); removed_subseq = 1; } - /* - * Any subscriptions waiting ? - */ + /* Notify any waiting subscriptions */ + list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscr_report_overlap(s, publ->lower, @@ -478,6 +499,7 @@ static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 i publ->node, removed_subseq); } + return publ; } @@ -530,7 +552,7 @@ static struct name_seq *nametbl_find_seq(u32 type) seq_head = &table.types[hash(type)]; hlist_for_each_entry(ns, seq_node, seq_head, ns_list) { if (ns->type == type) { - dbg("found %x\n", ns); + dbg("found %p\n", ns); return ns; } } @@ -543,22 +565,21 @@ struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper, { struct name_seq *seq = nametbl_find_seq(type); - dbg("ins_publ: <%u,%x,%x> found %x\n", type, lower, upper, seq); + dbg("tipc_nametbl_insert_publ: {%u,%u,%u} found %p\n", type, lower, upper, seq); if (lower > upper) { - warn("Failed to publish illegal <%u,%u,%u>\n", + warn("Failed to publish illegal {%u,%u,%u}\n", type, lower, upper); return NULL; } - dbg("Publishing <%u,%u,%u> from %x\n", type, lower, upper, node); + dbg("Publishing {%u,%u,%u} from 0x%x\n", type, lower, upper, node); if (!seq) { seq = tipc_nameseq_create(type, &table.types[hash(type)]); - dbg("tipc_nametbl_insert_publ: created %x\n", seq); + dbg("tipc_nametbl_insert_publ: created %p\n", seq); } if (!seq) return NULL; - assert(seq->type == type); return tipc_nameseq_insert_publ(seq, type, lower, upper, scope, node, port, key); } @@ -572,7 +593,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, if (!seq) return NULL; - dbg("Withdrawing <%u,%u> from %x\n", type, lower, node); + dbg("Withdrawing {%u,%u} from 0x%x\n", type, lower, node); publ = tipc_nameseq_remove_publ(seq, lower, node, ref, key); if (!seq->first_free && list_empty(&seq->subscriptions)) { @@ -738,12 +759,12 @@ struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper, struct publication *publ; if (table.local_publ_count >= tipc_max_publications) { - warn("Failed publish: max %u local publication\n", + warn("Publication failed, local publication limit reached (%u)\n", tipc_max_publications); return NULL; } if ((type < TIPC_RESERVED_TYPES) && !atomic_read(&rsv_publ_ok)) { - warn("Failed to publish reserved name <%u,%u,%u>\n", + warn("Publication failed, reserved name {%u,%u,%u}\n", type, lower, upper); return NULL; } @@ -767,10 +788,10 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) { struct publication *publ; - dbg("tipc_nametbl_withdraw:<%d,%d,%d>\n", type, lower, key); + dbg("tipc_nametbl_withdraw: {%u,%u}, key=%u\n", type, lower, key); write_lock_bh(&tipc_nametbl_lock); publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key); - if (publ) { + if (likely(publ)) { table.local_publ_count--; if (publ->scope != TIPC_NODE_SCOPE) tipc_named_withdraw(publ); @@ -780,6 +801,9 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) return 1; } write_unlock_bh(&tipc_nametbl_lock); + err("Unable to remove local publication\n" + "(type=%u, lower=%u, ref=%u, key=%u)\n", + type, lower, ref, key); return 0; } @@ -787,8 +811,7 @@ int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key) * 
tipc_nametbl_subscribe - add a subscription object to the name table */ -void -tipc_nametbl_subscribe(struct subscription *s) +void tipc_nametbl_subscribe(struct subscription *s) { u32 type = s->seq.type; struct name_seq *seq; @@ -800,11 +823,13 @@ tipc_nametbl_subscribe(struct subscription *s) } if (seq){ spin_lock_bh(&seq->lock); - dbg("tipc_nametbl_subscribe:found %x for <%u,%u,%u>\n", + dbg("tipc_nametbl_subscribe:found %p for {%u,%u,%u}\n", seq, type, s->seq.lower, s->seq.upper); - assert(seq->type == type); tipc_nameseq_subscribe(seq, s); spin_unlock_bh(&seq->lock); + } else { + warn("Failed to create subscription for {%u,%u,%u}\n", + s->seq.type, s->seq.lower, s->seq.upper); } write_unlock_bh(&tipc_nametbl_lock); } @@ -813,8 +838,7 @@ tipc_nametbl_subscribe(struct subscription *s) * tipc_nametbl_unsubscribe - remove a subscription object from name table */ -void -tipc_nametbl_unsubscribe(struct subscription *s) +void tipc_nametbl_unsubscribe(struct subscription *s) { struct name_seq *seq; @@ -1049,35 +1073,20 @@ int tipc_nametbl_init(void) void tipc_nametbl_stop(void) { - struct hlist_head *seq_head; - struct hlist_node *seq_node; - struct hlist_node *tmp; - struct name_seq *seq; u32 i; if (!table.types) return; + /* Verify name table is empty, then release it */ + write_lock_bh(&tipc_nametbl_lock); for (i = 0; i < tipc_nametbl_size; i++) { - seq_head = &table.types[i]; - hlist_for_each_entry_safe(seq, seq_node, tmp, seq_head, ns_list) { - struct sub_seq *sseq = seq->sseqs; - - for (; sseq != &seq->sseqs[seq->first_free]; sseq++) { - struct publication *publ = sseq->zone_list; - assert(publ); - do { - struct publication *next = - publ->zone_list_next; - kfree(publ); - publ = next; - } - while (publ != sseq->zone_list); - } - } + if (!hlist_empty(&table.types[i])) + err("tipc_nametbl_stop(): hash chain %u is non-null\n", i); } kfree(table.types); table.types = NULL; write_unlock_bh(&tipc_nametbl_lock); } + diff --git a/net/tipc/node.c b/net/tipc/node.c index 0d5db06e203..ce9678efa98 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -61,34 +61,37 @@ struct node *tipc_node_create(u32 addr) struct node **curr_node; n_ptr = kmalloc(sizeof(*n_ptr),GFP_ATOMIC); - if (n_ptr != NULL) { - memset(n_ptr, 0, sizeof(*n_ptr)); - n_ptr->addr = addr; - n_ptr->lock = SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&n_ptr->nsub); - - c_ptr = tipc_cltr_find(addr); - if (c_ptr == NULL) - c_ptr = tipc_cltr_create(addr); - if (c_ptr != NULL) { - n_ptr->owner = c_ptr; - tipc_cltr_attach_node(c_ptr, n_ptr); - n_ptr->last_router = -1; - - /* Insert node into ordered list */ - for (curr_node = &tipc_nodes; *curr_node; - curr_node = &(*curr_node)->next) { - if (addr < (*curr_node)->addr) { - n_ptr->next = *curr_node; - break; - } - } - (*curr_node) = n_ptr; - } else { - kfree(n_ptr); - n_ptr = NULL; - } - } + if (!n_ptr) { + warn("Node creation failed, no memory\n"); + return NULL; + } + + c_ptr = tipc_cltr_find(addr); + if (!c_ptr) { + c_ptr = tipc_cltr_create(addr); + } + if (!c_ptr) { + kfree(n_ptr); + return NULL; + } + + memset(n_ptr, 0, sizeof(*n_ptr)); + n_ptr->addr = addr; + n_ptr->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&n_ptr->nsub); + n_ptr->owner = c_ptr; + tipc_cltr_attach_node(c_ptr, n_ptr); + n_ptr->last_router = -1; + + /* Insert node into ordered list */ + for (curr_node = &tipc_nodes; *curr_node; + curr_node = &(*curr_node)->next) { + if (addr < (*curr_node)->addr) { + n_ptr->next = *curr_node; + break; + } + } + (*curr_node) = n_ptr; return n_ptr; } @@ -122,6 +125,8 @@ void 
tipc_node_link_up(struct node *n_ptr, struct link *l_ptr) { struct link **active = &n_ptr->active_links[0]; + n_ptr->working_links++; + info("Established link <%s> on network plane %c\n", l_ptr->name, l_ptr->b_ptr->net_plane); @@ -132,7 +137,7 @@ void tipc_node_link_up(struct node *n_ptr, struct link *l_ptr) return; } if (l_ptr->priority < active[0]->priority) { - info("Link is standby\n"); + info("New link <%s> becomes standby\n", l_ptr->name); return; } tipc_link_send_duplicate(active[0], l_ptr); @@ -140,8 +145,9 @@ void tipc_node_link_up(struct node *n_ptr, struct link *l_ptr) active[0] = l_ptr; return; } - info("Link <%s> on network plane %c becomes standby\n", - active[0]->name, active[0]->b_ptr->net_plane); + info("Old link <%s> becomes standby\n", active[0]->name); + if (active[1] != active[0]) + info("Old link <%s> becomes standby\n", active[1]->name); active[0] = active[1] = l_ptr; } @@ -181,6 +187,8 @@ void tipc_node_link_down(struct node *n_ptr, struct link *l_ptr) { struct link **active; + n_ptr->working_links--; + if (!tipc_link_is_active(l_ptr)) { info("Lost standby link <%s> on network plane %c\n", l_ptr->name, l_ptr->b_ptr->net_plane); @@ -210,8 +218,7 @@ int tipc_node_has_active_links(struct node *n_ptr) int tipc_node_has_redundant_links(struct node *n_ptr) { - return (tipc_node_has_active_links(n_ptr) && - (n_ptr->active_links[0] != n_ptr->active_links[1])); + return (n_ptr->working_links > 1); } static int tipc_node_has_active_routes(struct node *n_ptr) @@ -234,7 +241,6 @@ struct node *tipc_node_attach_link(struct link *l_ptr) u32 bearer_id = l_ptr->b_ptr->identity; char addr_string[16]; - assert(bearer_id < MAX_BEARERS); if (n_ptr->link_cnt >= 2) { char addr_string[16]; @@ -249,7 +255,7 @@ struct node *tipc_node_attach_link(struct link *l_ptr) n_ptr->link_cnt++; return n_ptr; } - err("Attempt to establish second link on <%s> to <%s> \n", + err("Attempt to establish second link on <%s> to %s \n", l_ptr->b_ptr->publ.name, addr_string_fill(addr_string, l_ptr->addr)); } @@ -314,7 +320,7 @@ static void node_established_contact(struct node *n_ptr) struct cluster *c_ptr; dbg("node_established_contact:-> %x\n", n_ptr->addr); - if (!tipc_node_has_active_routes(n_ptr)) { + if (!tipc_node_has_active_routes(n_ptr) && in_own_cluster(n_ptr->addr)) { tipc_k_signal((Handler)tipc_named_node_up, n_ptr->addr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index 781126e084a..a07cc79ea63 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -51,6 +51,7 @@ * @nsub: list of "node down" subscriptions monitoring node * @active_links: pointers to active links to node * @links: pointers to all links to node + * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @permit_changeover: non-zero if node has redundant links to this system * @routers: bitmap (used for multicluster communication) @@ -76,6 +77,7 @@ struct node { struct link *active_links[2]; struct link *links[MAX_BEARERS]; int link_cnt; + int working_links; int permit_changeover; u32 routers[512/32]; int last_router; diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c index cff4068cc75..cc3fff3dec4 100644 --- a/net/tipc/node_subscr.c +++ b/net/tipc/node_subscr.c @@ -47,18 +47,19 @@ void tipc_nodesub_subscribe(struct node_subscr *node_sub, u32 addr, void *usr_handle, net_ev_handler handle_down) { - node_sub->node = NULL; - if (addr == tipc_own_addr) + if (addr == tipc_own_addr) { + node_sub->node = NULL; return; - if (!tipc_addr_node_valid(addr)) { - 
warn("node_subscr with illegal %x\n", addr); + } + + node_sub->node = tipc_node_find(addr); + if (!node_sub->node) { + warn("Node subscription rejected, unknown node 0x%x\n", addr); return; } - node_sub->handle_node_down = handle_down; node_sub->usr_handle = usr_handle; - node_sub->node = tipc_node_find(addr); - assert(node_sub->node); + tipc_node_lock(node_sub->node); list_add_tail(&node_sub->nodesub_list, &node_sub->node->nsub); tipc_node_unlock(node_sub->node); diff --git a/net/tipc/port.c b/net/tipc/port.c index 67e96cb1e82..47d97404e3e 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -168,7 +168,6 @@ void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp) struct port_list *item = dp; int cnt = 0; - assert(buf); msg = buf_msg(buf); /* Create destination port list, if one wasn't supplied */ @@ -196,7 +195,7 @@ void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp) struct sk_buff *b = skb_clone(buf, GFP_ATOMIC); if (b == NULL) { - warn("Buffer allocation failure\n"); + warn("Unable to deliver multicast message(s)\n"); msg_dbg(msg, "LOST:"); goto exit; } @@ -228,14 +227,14 @@ u32 tipc_createport_raw(void *usr_handle, u32 ref; p_ptr = kmalloc(sizeof(*p_ptr), GFP_ATOMIC); - if (p_ptr == NULL) { - warn("Memory squeeze; failed to create port\n"); + if (!p_ptr) { + warn("Port creation failed, no memory\n"); return 0; } memset(p_ptr, 0, sizeof(*p_ptr)); ref = tipc_ref_acquire(p_ptr, &p_ptr->publ.lock); if (!ref) { - warn("Reference Table Exhausted\n"); + warn("Port creation failed, reference table exhausted\n"); kfree(p_ptr); return 0; } @@ -810,18 +809,20 @@ static void port_dispatcher_sigh(void *dummy) void *usr_handle; int connected; int published; + u32 message_type; struct sk_buff *next = buf->next; struct tipc_msg *msg = buf_msg(buf); u32 dref = msg_destport(msg); + message_type = msg_type(msg); + if (message_type > TIPC_DIRECT_MSG) + goto reject; /* Unsupported message type */ + p_ptr = tipc_port_lock(dref); - if (!p_ptr) { - /* Port deleted while msg in queue */ - tipc_reject_msg(buf, TIPC_ERR_NO_PORT); - buf = next; - continue; - } + if (!p_ptr) + goto reject; /* Port deleted while msg in queue */ + orig.ref = msg_origport(msg); orig.node = msg_orignode(msg); up_ptr = p_ptr->user_port; @@ -832,7 +833,7 @@ static void port_dispatcher_sigh(void *dummy) if (unlikely(msg_errcode(msg))) goto err; - switch (msg_type(msg)) { + switch (message_type) { case TIPC_CONN_MSG:{ tipc_conn_msg_event cb = up_ptr->conn_msg_cb; @@ -874,6 +875,7 @@ static void port_dispatcher_sigh(void *dummy) &orig); break; } + case TIPC_MCAST_MSG: case TIPC_NAMED_MSG:{ tipc_named_msg_event cb = up_ptr->named_msg_cb; @@ -886,7 +888,8 @@ static void port_dispatcher_sigh(void *dummy) goto reject; dseq.type = msg_nametype(msg); dseq.lower = msg_nameinst(msg); - dseq.upper = dseq.lower; + dseq.upper = (message_type == TIPC_NAMED_MSG) + ? 
dseq.lower : msg_nameupper(msg); skb_pull(buf, msg_hdr_sz(msg)); cb(usr_handle, dref, &buf, msg_data(msg), msg_data_sz(msg), msg_importance(msg), @@ -899,7 +902,7 @@ static void port_dispatcher_sigh(void *dummy) buf = next; continue; err: - switch (msg_type(msg)) { + switch (message_type) { case TIPC_CONN_MSG:{ tipc_conn_shutdown_event cb = @@ -931,6 +934,7 @@ err: msg_data_sz(msg), msg_errcode(msg), &orig); break; } + case TIPC_MCAST_MSG: case TIPC_NAMED_MSG:{ tipc_named_msg_err_event cb = up_ptr->named_err_cb; @@ -940,7 +944,8 @@ err: break; dseq.type = msg_nametype(msg); dseq.lower = msg_nameinst(msg); - dseq.upper = dseq.lower; + dseq.upper = (message_type == TIPC_NAMED_MSG) + ? dseq.lower : msg_nameupper(msg); skb_pull(buf, msg_hdr_sz(msg)); cb(usr_handle, dref, &buf, msg_data(msg), msg_data_sz(msg), msg_errcode(msg), &dseq); @@ -1054,7 +1059,8 @@ int tipc_createport(u32 user_ref, u32 ref; up_ptr = (struct user_port *)kmalloc(sizeof(*up_ptr), GFP_ATOMIC); - if (up_ptr == NULL) { + if (!up_ptr) { + warn("Port creation failed, no memory\n"); return -ENOMEM; } ref = tipc_createport_raw(NULL, port_dispatcher, port_wakeup, importance); @@ -1165,8 +1171,6 @@ int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) p_ptr = tipc_port_lock(ref); if (!p_ptr) return -EINVAL; - if (!p_ptr->publ.published) - goto exit; if (!seq) { list_for_each_entry_safe(publ, tpubl, &p_ptr->publications, pport_list) { @@ -1193,7 +1197,6 @@ int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq) } if (list_empty(&p_ptr->publications)) p_ptr->publ.published = 0; -exit: tipc_port_unlock(p_ptr); return res; } diff --git a/net/tipc/ref.c b/net/tipc/ref.c index 33bbf509509..d2f0cce10e2 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -127,7 +127,14 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock) u32 next_plus_upper; u32 reference = 0; - assert(tipc_ref_table.entries && object); + if (!object) { + err("Attempt to acquire reference to non-existent object\n"); + return 0; + } + if (!tipc_ref_table.entries) { + err("Reference table not found during acquisition attempt\n"); + return 0; + } write_lock_bh(&ref_table_lock); if (tipc_ref_table.first_free) { @@ -162,15 +169,28 @@ void tipc_ref_discard(u32 ref) u32 index; u32 index_mask; - assert(tipc_ref_table.entries); - assert(ref != 0); + if (!ref) { + err("Attempt to discard reference 0\n"); + return; + } + if (!tipc_ref_table.entries) { + err("Reference table not found during discard attempt\n"); + return; + } write_lock_bh(&ref_table_lock); index_mask = tipc_ref_table.index_mask; index = ref & index_mask; entry = &(tipc_ref_table.entries[index]); - assert(entry->object != 0); - assert(entry->data.reference == ref); + + if (!entry->object) { + err("Attempt to discard reference to non-existent object\n"); + goto exit; + } + if (entry->data.reference != ref) { + err("Attempt to discard non-existent reference\n"); + goto exit; + } /* mark entry as unused */ entry->object = NULL; @@ -184,6 +204,7 @@ void tipc_ref_discard(u32 ref) /* increment upper bits of entry to invalidate subsequent references */ entry->data.next_plus_upper = (ref & ~index_mask) + (index_mask + 1); +exit: write_unlock_bh(&ref_table_lock); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 648a734e604..32d778448a0 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -169,12 +169,6 @@ static int tipc_create(struct socket *sock, int protocol) struct sock *sk; u32 ref; - if ((sock->type != SOCK_STREAM) && - (sock->type != SOCK_SEQPACKET) 
&& - (sock->type != SOCK_DGRAM) && - (sock->type != SOCK_RDM)) - return -EPROTOTYPE; - if (unlikely(protocol != 0)) return -EPROTONOSUPPORT; @@ -199,6 +193,9 @@ static int tipc_create(struct socket *sock, int protocol) sock->ops = &msg_ops; sock->state = SS_READY; break; + default: + tipc_deleteport(ref); + return -EPROTOTYPE; } sk = sk_alloc(AF_TIPC, GFP_KERNEL, &tipc_proto, 1); @@ -426,7 +423,7 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) return -EFAULT; - if ((ntohs(hdr.tcm_type) & 0xC000) & (!capable(CAP_NET_ADMIN))) + if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) return -EACCES; return 0; @@ -437,7 +434,7 @@ static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m) * @iocb: (unused) * @sock: socket structure * @m: message to send - * @total_len: (unused) + * @total_len: length of message * * Message must have an destination specified explicitly. * Used for SOCK_RDM and SOCK_DGRAM messages, @@ -458,7 +455,8 @@ static int send_msg(struct kiocb *iocb, struct socket *sock, if (unlikely(!dest)) return -EDESTADDRREQ; - if (unlikely(dest->family != AF_TIPC)) + if (unlikely((m->msg_namelen < sizeof(*dest)) || + (dest->family != AF_TIPC))) return -EINVAL; needs_conn = (sock->state != SS_READY); @@ -470,6 +468,10 @@ static int send_msg(struct kiocb *iocb, struct socket *sock, if ((tsock->p->published) || ((sock->type == SOCK_STREAM) && (total_len != 0))) return -EOPNOTSUPP; + if (dest->addrtype == TIPC_ADDR_NAME) { + tsock->p->conn_type = dest->addr.name.name.type; + tsock->p->conn_instance = dest->addr.name.name.instance; + } } if (down_interruptible(&tsock->sem)) @@ -538,7 +540,7 @@ exit: * @iocb: (unused) * @sock: socket structure * @m: message to send - * @total_len: (unused) + * @total_len: length of message * * Used for SOCK_SEQPACKET messages and SOCK_STREAM data. * @@ -561,15 +563,15 @@ static int send_packet(struct kiocb *iocb, struct socket *sock, return -ERESTARTSYS; } - if (unlikely(sock->state != SS_CONNECTED)) { - if (sock->state == SS_DISCONNECTING) - res = -EPIPE; - else - res = -ENOTCONN; - goto exit; - } - do { + if (unlikely(sock->state != SS_CONNECTED)) { + if (sock->state == SS_DISCONNECTING) + res = -EPIPE; + else + res = -ENOTCONN; + goto exit; + } + res = tipc_send(tsock->p->ref, m->msg_iovlen, m->msg_iov); if (likely(res != -ELINKCONG)) { exit: @@ -597,7 +599,8 @@ exit: * * Used for SOCK_STREAM data. * - * Returns the number of bytes sent on success, or errno otherwise + * Returns the number of bytes sent on success (or partial success), + * or errno if no data sent */ @@ -611,6 +614,7 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, char __user *curr_start; int curr_left; int bytes_to_send; + int bytes_sent; int res; if (likely(total_len <= TIPC_MAX_USER_MSG_SIZE)) @@ -633,11 +637,11 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, * of small iovec entries into send_packet(). */ - my_msg = *m; - curr_iov = my_msg.msg_iov; - curr_iovlen = my_msg.msg_iovlen; + curr_iov = m->msg_iov; + curr_iovlen = m->msg_iovlen; my_msg.msg_iov = &my_iov; my_msg.msg_iovlen = 1; + bytes_sent = 0; while (curr_iovlen--) { curr_start = curr_iov->iov_base; @@ -648,16 +652,18 @@ static int send_stream(struct kiocb *iocb, struct socket *sock, ? 
curr_left : TIPC_MAX_USER_MSG_SIZE; my_iov.iov_base = curr_start; my_iov.iov_len = bytes_to_send; - if ((res = send_packet(iocb, sock, &my_msg, 0)) < 0) - return res; + if ((res = send_packet(iocb, sock, &my_msg, 0)) < 0) { + return bytes_sent ? bytes_sent : res; + } curr_left -= bytes_to_send; curr_start += bytes_to_send; + bytes_sent += bytes_to_send; } curr_iov++; } - return total_len; + return bytes_sent; } /** @@ -727,6 +733,7 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, u32 anc_data[3]; u32 err; u32 dest_type; + int has_name; int res; if (likely(m->msg_controllen == 0)) @@ -738,10 +745,10 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, if (unlikely(err)) { anc_data[0] = err; anc_data[1] = msg_data_sz(msg); - if ((res = put_cmsg(m, SOL_SOCKET, TIPC_ERRINFO, 8, anc_data))) + if ((res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data))) return res; if (anc_data[1] && - (res = put_cmsg(m, SOL_SOCKET, TIPC_RETDATA, anc_data[1], + (res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], msg_data(msg)))) return res; } @@ -751,25 +758,28 @@ static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg, dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; switch (dest_type) { case TIPC_NAMED_MSG: + has_name = 1; anc_data[0] = msg_nametype(msg); anc_data[1] = msg_namelower(msg); anc_data[2] = msg_namelower(msg); break; case TIPC_MCAST_MSG: + has_name = 1; anc_data[0] = msg_nametype(msg); anc_data[1] = msg_namelower(msg); anc_data[2] = msg_nameupper(msg); break; case TIPC_CONN_MSG: + has_name = (tport->conn_type != 0); anc_data[0] = tport->conn_type; anc_data[1] = tport->conn_instance; anc_data[2] = tport->conn_instance; break; default: - anc_data[0] = 0; + has_name = 0; } - if (anc_data[0] && - (res = put_cmsg(m, SOL_SOCKET, TIPC_DESTNAME, 12, anc_data))) + if (has_name && + (res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data))) return res; return 0; @@ -960,7 +970,7 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, restart: if (unlikely((skb_queue_len(&sock->sk->sk_receive_queue) == 0) && (flags & MSG_DONTWAIT))) { - res = (sz_copied == 0) ? -EWOULDBLOCK : 0; + res = -EWOULDBLOCK; goto exit; } @@ -1051,7 +1061,7 @@ restart: exit: up(&tsock->sem); - return res ? res : sz_copied; + return sz_copied ? sz_copied : res; } /** @@ -1236,7 +1246,8 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, if (sock->state == SS_READY) return -EOPNOTSUPP; - /* MOVE THE REST OF THIS ERROR CHECKING TO send_msg()? 
*/ + /* Issue Posix-compliant error code if socket is in the wrong state */ + if (sock->state == SS_LISTENING) return -EOPNOTSUPP; if (sock->state == SS_CONNECTING) @@ -1244,13 +1255,20 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, if (sock->state != SS_UNCONNECTED) return -EISCONN; - if ((dst->family != AF_TIPC) || - ((dst->addrtype != TIPC_ADDR_NAME) && (dst->addrtype != TIPC_ADDR_ID))) + /* + * Reject connection attempt using multicast address + * + * Note: send_msg() validates the rest of the address fields, + * so there's no need to do it here + */ + + if (dst->addrtype == TIPC_ADDR_MCAST) return -EINVAL; /* Send a 'SYN-' to destination */ m.msg_name = dest; + m.msg_namelen = destlen; if ((res = send_msg(NULL, sock, &m, 0)) < 0) { sock->state = SS_DISCONNECTING; return res; @@ -1269,10 +1287,6 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, msg = buf_msg(buf); res = auto_connect(sock, tsock, msg); if (!res) { - if (dst->addrtype == TIPC_ADDR_NAME) { - tsock->p->conn_type = dst->addr.name.name.type; - tsock->p->conn_instance = dst->addr.name.name.instance; - } if (!msg_data_sz(msg)) advance_queue(tsock); } @@ -1386,7 +1400,7 @@ exit: /** * shutdown - shutdown socket connection * @sock: socket structure - * @how: direction to close (always treated as read + write) + * @how: direction to close (unused; always treated as read + write) * * Terminates connection (if necessary), then purges socket's receive queue. * @@ -1469,7 +1483,8 @@ restart: * Returns 0 on success, errno otherwise */ -static int setsockopt(struct socket *sock, int lvl, int opt, char *ov, int ol) +static int setsockopt(struct socket *sock, + int lvl, int opt, char __user *ov, int ol) { struct tipc_sock *tsock = tipc_sk(sock->sk); u32 value; @@ -1525,7 +1540,8 @@ static int setsockopt(struct socket *sock, int lvl, int opt, char *ov, int ol) * Returns 0 on success, errno otherwise */ -static int getsockopt(struct socket *sock, int lvl, int opt, char *ov, int *ol) +static int getsockopt(struct socket *sock, + int lvl, int opt, char __user *ov, int *ol) { struct tipc_sock *tsock = tipc_sk(sock->sk); int len; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index c5f026c7fd3..fc171875660 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -266,7 +266,8 @@ static void subscr_subscribe(struct tipc_subscr *s, /* Refuse subscription if global limit exceeded */ if (atomic_read(&topsrv.subscription_count) >= tipc_max_subscriptions) { - warn("Failed: max %u subscriptions\n", tipc_max_subscriptions); + warn("Subscription rejected, subscription limit reached (%u)\n", + tipc_max_subscriptions); subscr_terminate(subscriber); return; } @@ -274,8 +275,8 @@ static void subscr_subscribe(struct tipc_subscr *s, /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); - if (sub == NULL) { - warn("Memory squeeze; ignoring subscription\n"); + if (!sub) { + warn("Subscription rejected, no memory\n"); subscr_terminate(subscriber); return; } @@ -298,8 +299,7 @@ static void subscr_subscribe(struct tipc_subscr *s, if ((((sub->filter != TIPC_SUB_PORTS) && (sub->filter != TIPC_SUB_SERVICE))) || (sub->seq.lower > sub->seq.upper)) { - warn("Rejecting illegal subscription %u,%u,%u\n", - sub->seq.type, sub->seq.lower, sub->seq.upper); + warn("Subscription rejected, illegal request\n"); kfree(sub); subscr_terminate(subscriber); return; @@ -387,7 +387,7 @@ static void subscr_named_msg_event(void *usr_handle, dbg("subscr_named_msg_event: orig = %x own = %x,\n", 
orig->node, tipc_own_addr); if (size && (size != sizeof(struct tipc_subscr))) { - warn("Received tipc_subscr of invalid size\n"); + warn("Subscriber rejected, invalid subscription size\n"); return; } @@ -395,7 +395,7 @@ static void subscr_named_msg_event(void *usr_handle, subscriber = kmalloc(sizeof(struct subscriber), GFP_ATOMIC); if (subscriber == NULL) { - warn("Memory squeeze; ignoring subscriber setup\n"); + warn("Subscriber rejected, no memory\n"); return; } memset(subscriber, 0, sizeof(struct subscriber)); @@ -403,7 +403,7 @@ static void subscr_named_msg_event(void *usr_handle, INIT_LIST_HEAD(&subscriber->subscriber_list); subscriber->ref = tipc_ref_acquire(subscriber, &subscriber->lock); if (subscriber->ref == 0) { - warn("Failed to acquire subscriber reference\n"); + warn("Subscriber rejected, reference table exhausted\n"); kfree(subscriber); return; } @@ -422,7 +422,7 @@ static void subscr_named_msg_event(void *usr_handle, NULL, &subscriber->port_ref); if (subscriber->port_ref == 0) { - warn("Memory squeeze; failed to create subscription port\n"); + warn("Subscriber rejected, unable to create port\n"); tipc_ref_discard(subscriber->ref); kfree(subscriber); return; diff --git a/net/tipc/zone.c b/net/tipc/zone.c index 2803e1b4f17..316c4872ff5 100644 --- a/net/tipc/zone.c +++ b/net/tipc/zone.c @@ -44,19 +44,24 @@ struct _zone *tipc_zone_create(u32 addr) { - struct _zone *z_ptr = NULL; + struct _zone *z_ptr; u32 z_num; - if (!tipc_addr_domain_valid(addr)) + if (!tipc_addr_domain_valid(addr)) { + err("Zone creation failed, invalid domain 0x%x\n", addr); return NULL; + } z_ptr = (struct _zone *)kmalloc(sizeof(*z_ptr), GFP_ATOMIC); - if (z_ptr != NULL) { - memset(z_ptr, 0, sizeof(*z_ptr)); - z_num = tipc_zone(addr); - z_ptr->addr = tipc_addr(z_num, 0, 0); - tipc_net.zones[z_num] = z_ptr; + if (!z_ptr) { + warn("Zone creation failed, insufficient memory\n"); + return NULL; } + + memset(z_ptr, 0, sizeof(*z_ptr)); + z_num = tipc_zone(addr); + z_ptr->addr = tipc_addr(z_num, 0, 0); + tipc_net.zones[z_num] = z_ptr; return z_ptr; } diff --git a/security/Kconfig b/security/Kconfig index 34f593410d5..67785df264e 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -22,16 +22,22 @@ config KEYS If you are unsure as to whether this is required, answer N. config KEYS_DEBUG_PROC_KEYS - bool "Enable the /proc/keys file by which all keys may be viewed" + bool "Enable the /proc/keys file by which keys may be viewed" depends on KEYS help - This option turns on support for the /proc/keys file through which - all the keys on the system can be listed. + This option turns on support for the /proc/keys file, through which + all the keys on the system that are viewable by the reading process + can be listed. - This option is a slight security risk in that it makes it possible - for anyone to see all the keys on the system. Normally the manager - pretends keys that are inaccessible to a process don't exist as far - as that process is concerned. + The only keys included in the list are those that grant View + permission to the reading process, whether or not it possesses them. + Note that LSM security checks are still performed, and may further + filter out keys that the current process is not authorised to view. + + Only key attributes are listed here; key payloads are not included in + the resulting table. + + If you are unsure as to whether this is required, answer N. 
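The visibility rule this help text describes is the one enforced by the proc_keys_show() change later in this patch (security/keys/proc.c). A minimal sketch of that filter, assuming only the key_task_permission(), make_key_ref() and KEY_VIEW interfaces the patch itself uses; the function name and the seq_file plumbing around it are illustrative:

	static int proc_keys_show_sketch(struct seq_file *m, struct key *key)
	{
		int rc;

		/* Treat the key as not possessed (possession flag 0); only
		 * keys granting VIEW to the reading task are listed, and the
		 * LSM hook inside key_task_permission() may filter further.
		 */
		rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW);
		if (rc < 0)
			return 0;	/* silently skip this key */

		/* ... print the key's attributes here; payloads are never
		 * shown ... */
		return 0;
	}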
config SECURITY bool "Enable different security models" diff --git a/security/dummy.c b/security/dummy.c index c3c5493581e..310fcdf7b74 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -870,7 +870,8 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz } #ifdef CONFIG_KEYS -static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx) +static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx, + unsigned long flags) { return 0; } diff --git a/security/keys/internal.h b/security/keys/internal.h index e066e605795..3c2877f0663 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -99,7 +99,8 @@ extern int install_process_keyring(struct task_struct *tsk); extern struct key *request_key_and_link(struct key_type *type, const char *description, const char *callout_info, - struct key *dest_keyring); + struct key *dest_keyring, + unsigned long flags); /* * request_key authorisation diff --git a/security/keys/key.c b/security/keys/key.c index 51f85155738..43295ca37b5 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -15,11 +15,11 @@ #include <linux/slab.h> #include <linux/security.h> #include <linux/workqueue.h> +#include <linux/random.h> #include <linux/err.h> #include "internal.h" static kmem_cache_t *key_jar; -static key_serial_t key_serial_next = 3; struct rb_root key_serial_tree; /* tree of keys indexed by serial */ DEFINE_SPINLOCK(key_serial_lock); @@ -169,22 +169,23 @@ static void __init __key_insert_serial(struct key *key) /*****************************************************************************/ /* * assign a key the next unique serial number - * - we work through all the serial numbers between 2 and 2^31-1 in turn and - * then wrap + * - these are assigned randomly to avoid security issues through covert + * channel problems */ static inline void key_alloc_serial(struct key *key) { struct rb_node *parent, **p; struct key *xkey; - spin_lock(&key_serial_lock); - - /* propose a likely serial number and look for a hole for it in the + /* propose a random serial number and look for a hole for it in the * serial number tree */ - key->serial = key_serial_next; - if (key->serial < 3) - key->serial = 3; - key_serial_next = key->serial + 1; + do { + get_random_bytes(&key->serial, sizeof(key->serial)); + + key->serial >>= 1; /* negative numbers are not permitted */ + } while (key->serial < 3); + + spin_lock(&key_serial_lock); parent = NULL; p = &key_serial_tree.rb_node; @@ -204,12 +205,11 @@ static inline void key_alloc_serial(struct key *key) /* we found a key with the proposed serial number - walk the tree from * that point looking for the next unused serial number */ - serial_exists: +serial_exists: for (;;) { - key->serial = key_serial_next; + key->serial++; if (key->serial < 2) key->serial = 2; - key_serial_next = key->serial + 1; if (!rb_parent(parent)) p = &key_serial_tree.rb_node; @@ -228,7 +228,7 @@ static inline void key_alloc_serial(struct key *key) } /* we've found a suitable hole - arrange for this key to occupy it */ - insert_here: +insert_here: rb_link_node(&key->serial_node, parent, p); rb_insert_color(&key->serial_node, &key_serial_tree); @@ -248,7 +248,7 @@ static inline void key_alloc_serial(struct key *key) */ struct key *key_alloc(struct key_type *type, const char *desc, uid_t uid, gid_t gid, struct task_struct *ctx, - key_perm_t perm, int not_in_quota) + key_perm_t perm, unsigned long flags) { struct key_user *user = NULL; struct key *key; @@ -269,12 +269,14 @@ struct key 
*key_alloc(struct key_type *type, const char *desc, /* check that the user's quota permits allocation of another key and * its description */ - if (!not_in_quota) { + if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock(&user->lock); - if (user->qnkeys + 1 >= KEYQUOTA_MAX_KEYS || - user->qnbytes + quotalen >= KEYQUOTA_MAX_BYTES - ) - goto no_quota; + if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { + if (user->qnkeys + 1 >= KEYQUOTA_MAX_KEYS || + user->qnbytes + quotalen >= KEYQUOTA_MAX_BYTES + ) + goto no_quota; + } user->qnkeys++; user->qnbytes += quotalen; @@ -308,7 +310,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->payload.data = NULL; key->security = NULL; - if (!not_in_quota) + if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; memset(&key->type_data, 0, sizeof(key->type_data)); @@ -318,7 +320,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, #endif /* let the security module know about the key */ - ret = security_key_alloc(key, ctx); + ret = security_key_alloc(key, ctx, flags); if (ret < 0) goto security_error; @@ -332,7 +334,7 @@ error: security_error: kfree(key->description); kmem_cache_free(key_jar, key); - if (!not_in_quota) { + if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock(&user->lock); user->qnkeys--; user->qnbytes -= quotalen; @@ -345,7 +347,7 @@ security_error: no_memory_3: kmem_cache_free(key_jar, key); no_memory_2: - if (!not_in_quota) { + if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock(&user->lock); user->qnkeys--; user->qnbytes -= quotalen; @@ -761,7 +763,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *description, const void *payload, size_t plen, - int not_in_quota) + unsigned long flags) { struct key_type *ktype; struct key *keyring, *key = NULL; @@ -822,7 +824,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* allocate a new key */ key = key_alloc(ktype, description, current->fsuid, current->fsgid, - current, perm, not_in_quota); + current, perm, flags); if (IS_ERR(key)) { key_ref = ERR_PTR(PTR_ERR(key)); goto error_3; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index ed71d86d2ce..329411cf876 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -102,7 +102,7 @@ asmlinkage long sys_add_key(const char __user *_type, /* create or update the requested key and add it to the target * keyring */ key_ref = key_create_or_update(keyring_ref, type, description, - payload, plen, 0); + payload, plen, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key_ref)) { ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); @@ -184,7 +184,8 @@ asmlinkage long sys_request_key(const char __user *_type, /* do the search */ key = request_key_and_link(ktype, description, callout_info, - key_ref_to_ptr(dest_ref)); + key_ref_to_ptr(dest_ref), + KEY_ALLOC_IN_QUOTA); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error5; @@ -672,6 +673,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) */ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) { + struct key_user *newowner, *zapowner = NULL; struct key *key; key_ref_t key_ref; long ret; @@ -695,19 +697,50 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) if (!capable(CAP_SYS_ADMIN)) { /* only the sysadmin can chown a key to some other UID */ if (uid != (uid_t) -1 && key->uid != uid) - goto no_access; + goto error_put; /* only the sysadmin can set the key's GID to a group other * than one of those that the current process subscribes to */ if (gid != (gid_t) -1 && gid != key->gid 
&& !in_group_p(gid)) - goto no_access; + goto error_put; } - /* change the UID (have to update the quotas) */ + /* change the UID */ if (uid != (uid_t) -1 && uid != key->uid) { - /* don't support UID changing yet */ - ret = -EOPNOTSUPP; - goto no_access; + ret = -ENOMEM; + newowner = key_user_lookup(uid); + if (!newowner) + goto error_put; + + /* transfer the quota burden to the new user */ + if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { + spin_lock(&newowner->lock); + if (newowner->qnkeys + 1 >= KEYQUOTA_MAX_KEYS || + newowner->qnbytes + key->quotalen >= + KEYQUOTA_MAX_BYTES) + goto quota_overrun; + + newowner->qnkeys++; + newowner->qnbytes += key->quotalen; + spin_unlock(&newowner->lock); + + spin_lock(&key->user->lock); + key->user->qnkeys--; + key->user->qnbytes -= key->quotalen; + spin_unlock(&key->user->lock); + } + + atomic_dec(&key->user->nkeys); + atomic_inc(&newowner->nkeys); + + if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + atomic_dec(&key->user->nikeys); + atomic_inc(&newowner->nikeys); + } + + zapowner = key->user; + key->user = newowner; + key->uid = uid; } /* change the GID */ @@ -716,12 +749,20 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid) ret = 0; - no_access: +error_put: up_write(&key->sem); key_put(key); - error: + if (zapowner) + key_user_put(zapowner); +error: return ret; +quota_overrun: + spin_unlock(&newowner->lock); + zapowner = newowner; + ret = -EDQUOT; + goto error_put; + } /* end keyctl_chown_key() */ /*****************************************************************************/ diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 1357207fc9d..e8d02acc51e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -49,6 +49,7 @@ static inline unsigned keyring_hash(const char *desc) static int keyring_instantiate(struct key *keyring, const void *data, size_t datalen); static int keyring_match(const struct key *keyring, const void *criterion); +static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, @@ -59,6 +60,7 @@ struct key_type key_type_keyring = { .def_datalen = sizeof(struct keyring_list), .instantiate = keyring_instantiate, .match = keyring_match, + .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, @@ -240,7 +242,7 @@ static long keyring_read(const struct key *keyring, * allocate a keyring and link into the destination keyring */ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - struct task_struct *ctx, int not_in_quota, + struct task_struct *ctx, unsigned long flags, struct key *dest) { struct key *keyring; @@ -249,7 +251,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, keyring = key_alloc(&key_type_keyring, description, uid, gid, ctx, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, - not_in_quota); + flags); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); @@ -953,3 +955,22 @@ int keyring_clear(struct key *keyring) } /* end keyring_clear() */ EXPORT_SYMBOL(keyring_clear); + +/*****************************************************************************/ +/* + * dispose of the links from a revoked keyring + * - called with the key sem write-locked + */ +static void keyring_revoke(struct key *keyring) +{ + struct keyring_list *klist = keyring->payload.subscriptions; + + /* adjust the quota 
*/ + key_payload_reserve(keyring, 0); + + if (klist) { + rcu_assign_pointer(keyring->payload.subscriptions, NULL); + call_rcu(&klist->rcu, keyring_clear_rcu_disposal); + } + +} /* end keyring_revoke() */ diff --git a/security/keys/proc.c b/security/keys/proc.c index 12b750e51fb..686a9ee0c5d 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -137,6 +137,13 @@ static int proc_keys_show(struct seq_file *m, void *v) struct timespec now; unsigned long timo; char xbuf[12]; + int rc; + + /* check whether the current task is allowed to view the key (assuming + * non-possession) */ + rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW); + if (rc < 0) + return 0; now = current_kernel_time(); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 4d9825f9962..32150cf7c37 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -77,7 +77,8 @@ int alloc_uid_keyring(struct user_struct *user, /* concoct a default session keyring */ sprintf(buf, "_uid_ses.%u", user->uid); - session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, NULL); + session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error; @@ -87,8 +88,8 @@ int alloc_uid_keyring(struct user_struct *user, * keyring */ sprintf(buf, "_uid.%u", user->uid); - uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, - session_keyring); + uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, + KEY_ALLOC_IN_QUOTA, session_keyring); if (IS_ERR(uid_keyring)) { key_put(session_keyring); ret = PTR_ERR(uid_keyring); @@ -144,7 +145,8 @@ int install_thread_keyring(struct task_struct *tsk) sprintf(buf, "_tid.%u", tsk->pid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -178,7 +180,8 @@ int install_process_keyring(struct task_struct *tsk) if (!tsk->signal->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -209,6 +212,7 @@ error: static int install_session_keyring(struct task_struct *tsk, struct key *keyring) { + unsigned long flags; struct key *old; char buf[20]; @@ -218,7 +222,12 @@ static int install_session_keyring(struct task_struct *tsk, if (!keyring) { sprintf(buf, "_ses.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); + flags = KEY_ALLOC_QUOTA_OVERRUN; + if (tsk->signal->session_keyring) + flags = KEY_ALLOC_IN_QUOTA; + + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, + flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } @@ -728,7 +737,8 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, 0); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, 0, NULL); + keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index eab66a06ca5..58d1efd4fc2 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -48,8 +48,8 @@ static int 
call_sbin_request_key(struct key *key, /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); - keyring = keyring_alloc(desc, current->fsuid, current->fsgid, - current, 1, NULL); + keyring = keyring_alloc(desc, current->fsuid, current->fsgid, current, + KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; @@ -126,7 +126,8 @@ error_alloc: */ static struct key *__request_key_construction(struct key_type *type, const char *description, - const char *callout_info) + const char *callout_info, + unsigned long flags) { request_key_actor_t actor; struct key_construction cons; @@ -134,12 +135,12 @@ static struct key *__request_key_construction(struct key_type *type, struct key *key, *authkey; int ret, negated; - kenter("%s,%s,%s", type->name, description, callout_info); + kenter("%s,%s,%s,%lx", type->name, description, callout_info, flags); /* create a key and add it to the queue */ key = key_alloc(type, description, - current->fsuid, current->fsgid, - current, KEY_POS_ALL, 0); + current->fsuid, current->fsgid, current, KEY_POS_ALL, + flags); if (IS_ERR(key)) goto alloc_failed; @@ -258,15 +259,16 @@ alloc_failed: static struct key *request_key_construction(struct key_type *type, const char *description, struct key_user *user, - const char *callout_info) + const char *callout_info, + unsigned long flags) { struct key_construction *pcons; struct key *key, *ckey; DECLARE_WAITQUEUE(myself, current); - kenter("%s,%s,{%d},%s", - type->name, description, user->uid, callout_info); + kenter("%s,%s,{%d},%s,%lx", + type->name, description, user->uid, callout_info, flags); /* see if there's such a key under construction already */ down_write(&key_construction_sem); @@ -282,7 +284,8 @@ static struct key *request_key_construction(struct key_type *type, } /* see about getting userspace to construct the key */ - key = __request_key_construction(type, description, callout_info); + key = __request_key_construction(type, description, callout_info, + flags); error: kleave(" = %p", key); return key; @@ -389,14 +392,15 @@ static void request_key_link(struct key *key, struct key *dest_keyring) struct key *request_key_and_link(struct key_type *type, const char *description, const char *callout_info, - struct key *dest_keyring) + struct key *dest_keyring, + unsigned long flags) { struct key_user *user; struct key *key; key_ref_t key_ref; - kenter("%s,%s,%s,%p", - type->name, description, callout_info, dest_keyring); + kenter("%s,%s,%s,%p,%lx", + type->name, description, callout_info, dest_keyring, flags); /* search all the process keyrings for a key */ key_ref = search_process_keyrings(type, description, type->match, @@ -429,7 +433,8 @@ struct key *request_key_and_link(struct key_type *type, /* ask userspace (returns NULL if it waited on a key * being constructed) */ key = request_key_construction(type, description, - user, callout_info); + user, callout_info, + flags); if (key) break; @@ -485,7 +490,8 @@ struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { - return request_key_and_link(type, description, callout_info, NULL); + return request_key_and_link(type, description, callout_info, NULL, + KEY_ALLOC_IN_QUOTA); } /* end request_key() */ diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index cb9817ced3f..cbf58a91b00 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -187,7 +187,7 @@ struct key *request_key_auth_new(struct key *target, const 
char *callout_info) authkey = key_alloc(&key_type_request_key_auth, desc, current->fsuid, current->fsgid, current, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | - KEY_USR_VIEW, 1); + KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error_alloc; diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index 8e71895b97a..5bbfdebb7ac 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -28,6 +28,7 @@ struct key_type key_type_user = { .instantiate = user_instantiate, .update = user_update, .match = user_match, + .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, @@ -67,6 +68,7 @@ error: return ret; } /* end user_instantiate() */ + EXPORT_SYMBOL_GPL(user_instantiate); /*****************************************************************************/ @@ -141,7 +143,28 @@ EXPORT_SYMBOL_GPL(user_match); /*****************************************************************************/ /* - * dispose of the data dangling from the corpse of a user + * dispose of the payload of a revoked user-defined key + * - called with the key sem write-locked + */ +void user_revoke(struct key *key) +{ + struct user_key_payload *upayload = key->payload.data; + + /* clear the quota */ + key_payload_reserve(key, 0); + + if (upayload) { + rcu_assign_pointer(key->payload.data, NULL); + call_rcu(&upayload->rcu, user_update_rcu_disposal); + } + +} /* end user_revoke() */ + +EXPORT_SYMBOL(user_revoke); + +/*****************************************************************************/ +/* + * dispose of the data dangling from the corpse of a user key */ void user_destroy(struct key *key) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 79c16e31c88..ac7f2b2e392 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1099,6 +1099,17 @@ static int may_create(struct inode *dir, FILESYSTEM__ASSOCIATE, &ad); } +/* Check whether a task can create a key. */ +static int may_create_key(u32 ksid, + struct task_struct *ctx) +{ + struct task_security_struct *tsec; + + tsec = ctx->security; + + return avc_has_perm(tsec->sid, ksid, SECCLASS_KEY, KEY__CREATE, NULL); +} + #define MAY_LINK 0 #define MAY_UNLINK 1 #define MAY_RMDIR 2 @@ -1521,8 +1532,9 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm) /* Default to the current task SID. */ bsec->sid = tsec->sid; - /* Reset create SID on execve. */ + /* Reset create and sockcreate SID on execve. */ tsec->create_sid = 0; + tsec->sockcreate_sid = 0; if (tsec->exec_sid) { newsid = tsec->exec_sid; @@ -2574,9 +2586,10 @@ static int selinux_task_alloc_security(struct task_struct *tsk) tsec2->osid = tsec1->osid; tsec2->sid = tsec1->sid; - /* Retain the exec and create SIDs across fork */ + /* Retain the exec, create, and sock SIDs across fork */ tsec2->exec_sid = tsec1->exec_sid; tsec2->create_sid = tsec1->create_sid; + tsec2->sockcreate_sid = tsec1->sockcreate_sid; /* Retain ptracer SID across fork, if any. This will be reset by the ptrace hook upon any @@ -2926,12 +2939,14 @@ static int selinux_socket_create(int family, int type, { int err = 0; struct task_security_struct *tsec; + u32 newsid; if (kern) goto out; tsec = current->security; - err = avc_has_perm(tsec->sid, tsec->sid, + newsid = tsec->sockcreate_sid ? 
: tsec->sid; + err = avc_has_perm(tsec->sid, newsid, socket_type_to_security_class(family, type, protocol), SOCKET__CREATE, NULL); @@ -2944,12 +2959,14 @@ static void selinux_socket_post_create(struct socket *sock, int family, { struct inode_security_struct *isec; struct task_security_struct *tsec; + u32 newsid; isec = SOCK_INODE(sock)->i_security; tsec = current->security; + newsid = tsec->sockcreate_sid ? : tsec->sid; isec->sclass = socket_type_to_security_class(family, type, protocol); - isec->sid = kern ? SECINITSID_KERNEL : tsec->sid; + isec->sid = kern ? SECINITSID_KERNEL : newsid; isec->initialized = 1; return; @@ -4150,6 +4167,10 @@ static int selinux_getprocattr(struct task_struct *p, sid = tsec->exec_sid; else if (!strcmp(name, "fscreate")) sid = tsec->create_sid; + else if (!strcmp(name, "keycreate")) + sid = tsec->keycreate_sid; + else if (!strcmp(name, "sockcreate")) + sid = tsec->sockcreate_sid; else return -EINVAL; @@ -4182,6 +4203,10 @@ static int selinux_setprocattr(struct task_struct *p, error = task_has_perm(current, p, PROCESS__SETEXEC); else if (!strcmp(name, "fscreate")) error = task_has_perm(current, p, PROCESS__SETFSCREATE); + else if (!strcmp(name, "keycreate")) + error = task_has_perm(current, p, PROCESS__SETKEYCREATE); + else if (!strcmp(name, "sockcreate")) + error = task_has_perm(current, p, PROCESS__SETSOCKCREATE); else if (!strcmp(name, "current")) error = task_has_perm(current, p, PROCESS__SETCURRENT); else @@ -4211,6 +4236,13 @@ static int selinux_setprocattr(struct task_struct *p, tsec->exec_sid = sid; else if (!strcmp(name, "fscreate")) tsec->create_sid = sid; + else if (!strcmp(name, "keycreate")) { + error = may_create_key(sid, p); + if (error) + return error; + tsec->keycreate_sid = sid; + } else if (!strcmp(name, "sockcreate")) + tsec->sockcreate_sid = sid; else if (!strcmp(name, "current")) { struct av_decision avd; @@ -4264,7 +4296,8 @@ static int selinux_setprocattr(struct task_struct *p, #ifdef CONFIG_KEYS -static int selinux_key_alloc(struct key *k, struct task_struct *tsk) +static int selinux_key_alloc(struct key *k, struct task_struct *tsk, + unsigned long flags) { struct task_security_struct *tsec = tsk->security; struct key_security_struct *ksec; @@ -4274,7 +4307,10 @@ static int selinux_key_alloc(struct key *k, struct task_struct *tsk) return -ENOMEM; ksec->obj = k; - ksec->sid = tsec->sid; + if (tsec->keycreate_sid) + ksec->sid = tsec->keycreate_sid; + else + ksec->sid = tsec->sid; k->security = ksec; return 0; @@ -4513,8 +4549,10 @@ static __init int selinux_init(void) #ifdef CONFIG_KEYS /* Add security information to initial keyrings */ - security_key_alloc(&root_user_keyring, current); - security_key_alloc(&root_session_keyring, current); + selinux_key_alloc(&root_user_keyring, current, + KEY_ALLOC_NOT_IN_QUOTA); + selinux_key_alloc(&root_session_keyring, current, + KEY_ALLOC_NOT_IN_QUOTA); #endif return 0; diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index bc020bde6c8..7c9b5838083 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -72,6 +72,8 @@ S_(SECCLASS_PROCESS, PROCESS__EXECMEM, "execmem") S_(SECCLASS_PROCESS, PROCESS__EXECSTACK, "execstack") S_(SECCLASS_PROCESS, PROCESS__EXECHEAP, "execheap") + S_(SECCLASS_PROCESS, PROCESS__SETKEYCREATE, "setkeycreate") + S_(SECCLASS_PROCESS, PROCESS__SETSOCKCREATE, "setsockcreate") S_(SECCLASS_MSGQ, MSGQ__ENQUEUE, "enqueue") S_(SECCLASS_MSG, MSG__SEND, "send") S_(SECCLASS_MSG, 
MSG__RECEIVE, "receive") @@ -248,3 +250,4 @@ S_(SECCLASS_KEY, KEY__SEARCH, "search") S_(SECCLASS_KEY, KEY__LINK, "link") S_(SECCLASS_KEY, KEY__SETATTR, "setattr") + S_(SECCLASS_KEY, KEY__CREATE, "create") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index 1205227a3a3..69fd4b48202 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -467,6 +467,8 @@ #define PROCESS__EXECMEM 0x02000000UL #define PROCESS__EXECSTACK 0x04000000UL #define PROCESS__EXECHEAP 0x08000000UL +#define PROCESS__SETKEYCREATE 0x10000000UL +#define PROCESS__SETSOCKCREATE 0x20000000UL #define IPC__CREATE 0x00000001UL #define IPC__DESTROY 0x00000002UL @@ -966,4 +968,4 @@ #define KEY__SEARCH 0x00000008UL #define KEY__LINK 0x00000010UL #define KEY__SETATTR 0x00000020UL - +#define KEY__CREATE 0x00000040UL diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 8f5547ad185..cf54a304169 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -32,6 +32,8 @@ struct task_security_struct { u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ u32 create_sid; /* fscreate SID */ + u32 keycreate_sid; /* keycreate SID */ + u32 sockcreate_sid; /* sockcreate SID */ u32 ptrace_sid; /* SID of ptrace parent */ };
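Taken together, the hooks.c, av_permissions.h and objsec.h hunks above give each task optional keycreate and sockcreate SIDs: selinux_setprocattr() accepts the new "keycreate" and "sockcreate" attributes (guarded by the setkeycreate/setsockcreate process permissions, plus a key:create check on the requested SID), and selinux_key_alloc() / selinux_socket_create() then label new objects with the stored SID instead of the task SID. A hedged userspace sketch of driving the keycreate attribute through procfs; the helper name and error handling are illustrative, and the attribute is assumed to be exposed as /proc/self/attr/keycreate:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Ask SELinux to label keys created from now on with 'context'.
	 * The kernel side checks process:setkeycreate on the writer and
	 * key:create between the requested SID and the key class. */
	static int set_keycreate_context(const char *context)
	{
		int fd = open("/proc/self/attr/keycreate", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, context, strlen(context)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

When keycreate_sid is left at zero, selinux_key_alloc() falls back to the task's own SID, as the hooks.c hunk shows.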