diff options
Diffstat (limited to 'Documentation/ia64')
| -rw-r--r-- | Documentation/ia64/.gitignore | 1 | ||||
| -rw-r--r-- | Documentation/ia64/Makefile | 8 | ||||
| -rw-r--r-- | Documentation/ia64/aliasing-test.c | 263 | ||||
| -rw-r--r-- | Documentation/ia64/aliasing.txt | 221 | ||||
| -rw-r--r-- | Documentation/ia64/efirtc.txt | 2 | ||||
| -rw-r--r-- | Documentation/ia64/err_inject.txt | 1068 | ||||
| -rw-r--r-- | Documentation/ia64/fsys.txt | 2 | ||||
| -rw-r--r-- | Documentation/ia64/kvm.txt | 83 | ||||
| -rw-r--r-- | Documentation/ia64/mca.txt | 4 | ||||
| -rw-r--r-- | Documentation/ia64/paravirt_ops.txt | 137 | ||||
| -rw-r--r-- | Documentation/ia64/serial.txt | 9 | ||||
| -rw-r--r-- | Documentation/ia64/xen.txt | 183 |
12 files changed, 1976 insertions, 5 deletions
diff --git a/Documentation/ia64/.gitignore b/Documentation/ia64/.gitignore new file mode 100644 index 00000000000..ab806edc873 --- /dev/null +++ b/Documentation/ia64/.gitignore @@ -0,0 +1 @@ +aliasing-test diff --git a/Documentation/ia64/Makefile b/Documentation/ia64/Makefile new file mode 100644 index 00000000000..b75db69ec48 --- /dev/null +++ b/Documentation/ia64/Makefile @@ -0,0 +1,8 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := aliasing-test + +# Tell kbuild to always build the programs +always := $(hostprogs-y) diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c new file mode 100644 index 00000000000..62a190d45f3 --- /dev/null +++ b/Documentation/ia64/aliasing-test.c @@ -0,0 +1,263 @@ +/* + * Exercise /dev/mem mmap cases that have been troublesome in the past + * + * (c) Copyright 2007 Hewlett-Packard Development Company, L.P. + * Bjorn Helgaas <bjorn.helgaas@hp.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <dirent.h> +#include <fcntl.h> +#include <fnmatch.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <linux/pci.h> + +int sum; + +static int map_mem(char *path, off_t offset, size_t length, int touch) +{ + int fd, rc; + void *addr; + int *c; + + fd = open(path, O_RDWR); + if (fd == -1) { + perror(path); + return -1; + } + + if (fnmatch("/proc/bus/pci/*", path, 0) == 0) { + rc = ioctl(fd, PCIIOC_MMAP_IS_MEM); + if (rc == -1) + perror("PCIIOC_MMAP_IS_MEM ioctl"); + } + + addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset); + if (addr == MAP_FAILED) + return 1; + + if (touch) { + c = (int *) addr; + while (c < (int *) (addr + length)) + sum += *c++; + } + + rc = munmap(addr, length); + if (rc == -1) { + perror("munmap"); + return -1; + } + + close(fd); + return 0; +} + +static int scan_tree(char *path, char *file, off_t offset, size_t length, int touch) +{ + struct dirent **namelist; + char *name, *path2; + int i, n, r, rc = 0, result = 0; + struct stat buf; + + n = scandir(path, &namelist, 0, alphasort); + if (n < 0) { + perror("scandir"); + return -1; + } + + for (i = 0; i < n; i++) { + name = namelist[i]->d_name; + + if (fnmatch(".", name, 0) == 0) + goto skip; + if (fnmatch("..", name, 0) == 0) + goto skip; + + path2 = malloc(strlen(path) + strlen(name) + 3); + strcpy(path2, path); + strcat(path2, "/"); + strcat(path2, name); + + if (fnmatch(file, name, 0) == 0) { + rc = map_mem(path2, offset, length, touch); + if (rc == 0) + fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, offset, offset + length, touch ? "readable" : "mappable"); + else if (rc > 0) + fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, offset, offset + length); + else { + fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, offset, offset + length); + return rc; + } + } else { + r = lstat(path2, &buf); + if (r == 0 && S_ISDIR(buf.st_mode)) { + rc = scan_tree(path2, file, offset, length, touch); + if (rc < 0) + return rc; + } + } + + result |= rc; + free(path2); + +skip: + free(namelist[i]); + } + free(namelist); + return result; +} + +char buf[1024]; + +static int read_rom(char *path) +{ + int fd, rc; + size_t size = 0; + + fd = open(path, O_RDWR); + if (fd == -1) { + perror(path); + return -1; + } + + rc = write(fd, "1", 2); + if (rc <= 0) { + close(fd); + perror("write"); + return -1; + } + + do { + rc = read(fd, buf, sizeof(buf)); + if (rc > 0) + size += rc; + } while (rc > 0); + + close(fd); + return size; +} + +static int scan_rom(char *path, char *file) +{ + struct dirent **namelist; + char *name, *path2; + int i, n, r, rc = 0, result = 0; + struct stat buf; + + n = scandir(path, &namelist, 0, alphasort); + if (n < 0) { + perror("scandir"); + return -1; + } + + for (i = 0; i < n; i++) { + name = namelist[i]->d_name; + + if (fnmatch(".", name, 0) == 0) + goto skip; + if (fnmatch("..", name, 0) == 0) + goto skip; + + path2 = malloc(strlen(path) + strlen(name) + 3); + strcpy(path2, path); + strcat(path2, "/"); + strcat(path2, name); + + if (fnmatch(file, name, 0) == 0) { + rc = read_rom(path2); + + /* + * It's OK if the ROM is unreadable. Maybe there + * is no ROM, or some other error occurred. The + * important thing is that no MCA happened. + */ + if (rc > 0) + fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc); + else { + fprintf(stderr, "PASS: %s not readable\n", path2); + return rc; + } + } else { + r = lstat(path2, &buf); + if (r == 0 && S_ISDIR(buf.st_mode)) { + rc = scan_rom(path2, file); + if (rc < 0) + return rc; + } + } + + result |= rc; + free(path2); + +skip: + free(namelist[i]); + } + free(namelist); + return result; +} + +int main(void) +{ + int rc; + + if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n"); + + /* + * It's not safe to blindly read the VGA frame buffer. If you know + * how to poke the card the right way, it should respond, but it's + * not safe in general. Many machines, e.g., Intel chipsets, cover + * up a non-responding card by just returning -1, but others will + * report the failure as a machine check. + */ + if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0) + fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n"); + + if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0) + fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n"); + + /* + * Often you can map all the individual pieces above (0-0xA0000, + * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole + * thing at once. This is because the individual pieces use different + * attributes, and there's no single attribute supported over the + * whole region. + */ + rc = map_mem("/dev/mem", 0, 1024*1024, 0); + if (rc == 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n"); + else if (rc > 0) + fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n"); + else + fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n"); + + scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1); + scan_tree("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0); + scan_tree("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1); + scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0); + + scan_rom("/sys/devices", "rom"); + + scan_tree("/proc/bus/pci", "??.?", 0, 0xA0000, 1); + scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0); + scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1); + scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0); + + return rc; +} diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt new file mode 100644 index 00000000000..5a4dea6abeb --- /dev/null +++ b/Documentation/ia64/aliasing.txt @@ -0,0 +1,221 @@ + MEMORY ATTRIBUTE ALIASING ON IA-64 + + Bjorn Helgaas + <bjorn.helgaas@hp.com> + May 4, 2006 + + +MEMORY ATTRIBUTES + + Itanium supports several attributes for virtual memory references. + The attribute is part of the virtual translation, i.e., it is + contained in the TLB entry. The ones of most interest to the Linux + kernel are: + + WB Write-back (cacheable) + UC Uncacheable + WC Write-coalescing + + System memory typically uses the WB attribute. The UC attribute is + used for memory-mapped I/O devices. The WC attribute is uncacheable + like UC is, but writes may be delayed and combined to increase + performance for things like frame buffers. + + The Itanium architecture requires that we avoid accessing the same + page with both a cacheable mapping and an uncacheable mapping[1]. + + The design of the chipset determines which attributes are supported + on which regions of the address space. For example, some chipsets + support either WB or UC access to main memory, while others support + only WB access. + +MEMORY MAP + + Platform firmware describes the physical memory map and the + supported attributes for each region. At boot-time, the kernel uses + the EFI GetMemoryMap() interface. ACPI can also describe memory + devices and the attributes they support, but Linux/ia64 currently + doesn't use this information. + + The kernel uses the efi_memmap table returned from GetMemoryMap() to + learn the attributes supported by each region of physical address + space. Unfortunately, this table does not completely describe the + address space because some machines omit some or all of the MMIO + regions from the map. + + The kernel maintains another table, kern_memmap, which describes the + memory Linux is actually using and the attribute for each region. + This contains only system memory; it does not contain MMIO space. + + The kern_memmap table typically contains only a subset of the system + memory described by the efi_memmap. Linux/ia64 can't use all memory + in the system because of constraints imposed by the identity mapping + scheme. + + The efi_memmap table is preserved unmodified because the original + boot-time information is required for kexec. + +KERNEL IDENTITY MAPPINGS + + Linux/ia64 identity mappings are done with large pages, currently + either 16MB or 64MB, referred to as "granules." Cacheable mappings + are speculative[2], so the processor can read any location in the + page at any time, independent of the programmer's intentions. This + means that to avoid attribute aliasing, Linux can create a cacheable + identity mapping only when the entire granule supports cacheable + access. + + Therefore, kern_memmap contains only full granule-sized regions that + can referenced safely by an identity mapping. + + Uncacheable mappings are not speculative, so the processor will + generate UC accesses only to locations explicitly referenced by + software. This allows UC identity mappings to cover granules that + are only partially populated, or populated with a combination of UC + and WB regions. + +USER MAPPINGS + + User mappings are typically done with 16K or 64K pages. The smaller + page size allows more flexibility because only 16K or 64K has to be + homogeneous with respect to memory attributes. + +POTENTIAL ATTRIBUTE ALIASING CASES + + There are several ways the kernel creates new mappings: + + mmap of /dev/mem + + This uses remap_pfn_range(), which creates user mappings. These + mappings may be either WB or UC. If the region being mapped + happens to be in kern_memmap, meaning that it may also be mapped + by a kernel identity mapping, the user mapping must use the same + attribute as the kernel mapping. + + If the region is not in kern_memmap, the user mapping should use + an attribute reported as being supported in the EFI memory map. + + Since the EFI memory map does not describe MMIO on some + machines, this should use an uncacheable mapping as a fallback. + + mmap of /sys/class/pci_bus/.../legacy_mem + + This is very similar to mmap of /dev/mem, except that legacy_mem + only allows mmap of the one megabyte "legacy MMIO" area for a + specific PCI bus. Typically this is the first megabyte of + physical address space, but it may be different on machines with + several VGA devices. + + "X" uses this to access VGA frame buffers. Using legacy_mem + rather than /dev/mem allows multiple instances of X to talk to + different VGA cards. + + The /dev/mem mmap constraints apply. + + mmap of /proc/bus/pci/.../??.? + + This is an MMIO mmap of PCI functions, which additionally may or + may not be requested as using the WC attribute. + + If WC is requested, and the region in kern_memmap is either WC + or UC, and the EFI memory map designates the region as WC, then + the WC mapping is allowed. + + Otherwise, the user mapping must use the same attribute as the + kernel mapping. + + read/write of /dev/mem + + This uses copy_from_user(), which implicitly uses a kernel + identity mapping. This is obviously safe for things in + kern_memmap. + + There may be corner cases of things that are not in kern_memmap, + but could be accessed this way. For example, registers in MMIO + space are not in kern_memmap, but could be accessed with a UC + mapping. This would not cause attribute aliasing. But + registers typically can be accessed only with four-byte or + eight-byte accesses, and the copy_from_user() path doesn't allow + any control over the access size, so this would be dangerous. + + ioremap() + + This returns a mapping for use inside the kernel. + + If the region is in kern_memmap, we should use the attribute + specified there. + + If the EFI memory map reports that the entire granule supports + WB, we should use that (granules that are partially reserved + or occupied by firmware do not appear in kern_memmap). + + If the granule contains non-WB memory, but we can cover the + region safely with kernel page table mappings, we can use + ioremap_page_range() as most other architectures do. + + Failing all of the above, we have to fall back to a UC mapping. + +PAST PROBLEM CASES + + mmap of various MMIO regions from /dev/mem by "X" on Intel platforms + + The EFI memory map may not report these MMIO regions. + + These must be allowed so that X will work. This means that + when the EFI memory map is incomplete, every /dev/mem mmap must + succeed. It may create either WB or UC user mappings, depending + on whether the region is in kern_memmap or the EFI memory map. + + mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled + + The EFI memory map reports the following attributes: + 0x00000-0x9FFFF WB only + 0xA0000-0xBFFFF UC only (VGA frame buffer) + 0xC0000-0xFFFFF WB only + + This mmap is done with user pages, not kernel identity mappings, + so it is safe to use WB mappings. + + The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000, + which uses a granule-sized UC mapping. This granule will cover some + WB-only memory, but since UC is non-speculative, the processor will + never generate an uncacheable reference to the WB-only areas unless + the driver explicitly touches them. + + mmap of 0x0-0xFFFFF legacy_mem by "X" + + If the EFI memory map reports that the entire range supports the + same attributes, we can allow the mmap (and we will prefer WB if + supported, as is the case with HP sx[12]000 machines with VGA + disabled). + + If EFI reports the range as partly WB and partly UC (as on sx[12]000 + machines with VGA enabled), we must fail the mmap because there's no + safe attribute to use. + + If EFI reports some of the range but not all (as on Intel firmware + that doesn't report the VGA frame buffer at all), we should fail the + mmap and force the user to map just the specific region of interest. + + mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled + + The EFI memory map reports the following attributes: + 0x00000-0xFFFFF WB only (no VGA MMIO hole) + + This is a special case of the previous case, and the mmap should + fail for the same reason as above. + + read of /sys/devices/.../rom + + For VGA devices, this may cause an ioremap() of 0xC0000. This + used to be done with a UC mapping, because the VGA frame buffer + at 0xA0000 prevents use of a WB granule. The UC mapping causes + an MCA on HP sx[12]000 chipsets. + + We should use WB page table mappings to avoid covering the VGA + frame buffer. + +NOTES + + [1] SDM rev 2.2, vol 2, sec 4.4.1. + [2] SDM rev 2.2, vol 2, sec 4.4.6. diff --git a/Documentation/ia64/efirtc.txt b/Documentation/ia64/efirtc.txt index ede2c1e51cd..057e6bebda8 100644 --- a/Documentation/ia64/efirtc.txt +++ b/Documentation/ia64/efirtc.txt @@ -26,7 +26,7 @@ to initialize the system view of the time during boot. Because we wanted to minimize the impact on existing user-level apps using the CMOS clock, we decided to expose an API that was very similar to the one used today with the legacy RTC driver (driver/char/rtc.c). However, because -EFI provides a simpler services, not all all ioctl() are available. Also +EFI provides a simpler services, not all ioctl() are available. Also new ioctl()s have been introduced for things that EFI provides but not the legacy. diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt new file mode 100644 index 00000000000..9f651c18142 --- /dev/null +++ b/Documentation/ia64/err_inject.txt @@ -0,0 +1,1068 @@ + +IPF Machine Check (MC) error inject tool +======================================== + +IPF Machine Check (MC) error inject tool is used to inject MC +errors from Linux. The tool is a test bed for IPF MC work flow including +hardware correctable error handling, OS recoverable error handling, MC +event logging, etc. + +The tool includes two parts: a kernel driver and a user application +sample. The driver provides interface to PAL to inject error +and query error injection capabilities. The driver code is in +arch/ia64/kernel/err_inject.c. The application sample (shown below) +provides a combination of various errors and calls the driver's interface +(sysfs interface) to inject errors or query error injection capabilities. + +The tool can be used to test Intel IPF machine MC handling capabilities. +It's especially useful for people who can not access hardware MC injection +tool to inject error. It's also very useful to integrate with other +software test suits to do stressful testing on IPF. + +Below is a sample application as part of the whole tool. The sample +can be used as a working test tool. Or it can be expanded to include +more features. It also can be a integrated into a library or other user +application to have more thorough test. + +The sample application takes err.conf as error configuration input. GCC +compiles the code. After you install err_inject driver, you can run +this sample application to inject errors. + +Errata: Itanium 2 Processors Specification Update lists some errata against +the pal_mc_error_inject PAL procedure. The following err.conf has been tested +on latest Montecito PAL. + +err.conf: + +#This is configuration file for err_inject_tool. +#The format of the each line is: +#cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer +#where +# cpu: logical cpu number the error will be inject in. +# loop: times the error will be injected. +# interval: In second. every so often one error is injected. +# err_type_info, err_struct_info: PAL parameters. +# +#Note: All values are hex w/o or w/ 0x prefix. + + +#On cpu2, inject only total 0x10 errors, interval 5 seconds +#corrected, data cache, hier-2, physical addr(assigned by tool code). +#working on Montecito latest PAL. +2, 10, 5, 4101, 95 + +#On cpu4, inject and consume total 0x10 errors, interval 5 seconds +#corrected, data cache, hier-2, physical addr(assigned by tool code). +#working on Montecito latest PAL. +4, 10, 5, 4109, 95 + +#On cpu15, inject and consume total 0x10 errors, interval 5 seconds +#recoverable, DTR0, hier-2. +#working on Montecito latest PAL. +0xf, 0x10, 5, 4249, 15 + +The sample application source code: + +err_injection_tool.c: + +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Copyright (C) 2006 Intel Co + * Fenghua Yu <fenghua.yu@intel.com> + * + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdio.h> +#include <sched.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/ipc.h> +#include <sys/sem.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <sys/shm.h> + +#define MAX_FN_SIZE 256 +#define MAX_BUF_SIZE 256 +#define DATA_BUF_SIZE 256 +#define NR_CPUS 512 +#define MAX_TASK_NUM 2048 +#define MIN_INTERVAL 5 // seconds +#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte. +#define PARA_FIELD_NUM 5 +#define MASK_SIZE (NR_CPUS/64) +#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/" + +int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask); + +int verbose; +#define vbprintf if (verbose) printf + +int log_info(int cpu, const char *fmt, ...) +{ + FILE *log; + char fn[MAX_FN_SIZE]; + char buf[MAX_BUF_SIZE]; + va_list args; + + sprintf(fn, "%d.log", cpu); + log=fopen(fn, "a+"); + if (log==NULL) { + perror("Error open:"); + return -1; + } + + va_start(args, fmt); + vprintf(fmt, args); + memset(buf, 0, MAX_BUF_SIZE); + vsprintf(buf, fmt, args); + va_end(args); + + fwrite(buf, sizeof(buf), 1, log); + fclose(log); + + return 0; +} + +typedef unsigned long u64; +typedef unsigned int u32; + +typedef union err_type_info_u { + struct { + u64 mode : 3, /* 0-2 */ + err_inj : 3, /* 3-5 */ + err_sev : 2, /* 6-7 */ + err_struct : 5, /* 8-12 */ + struct_hier : 3, /* 13-15 */ + reserved : 48; /* 16-63 */ + } err_type_info_u; + u64 err_type_info; +} err_type_info_t; + +typedef union err_struct_info_u { + struct { + u64 siv : 1, /* 0 */ + c_t : 2, /* 1-2 */ + cl_p : 3, /* 3-5 */ + cl_id : 3, /* 6-8 */ + cl_dp : 1, /* 9 */ + reserved1 : 22, /* 10-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_cache; + struct { + u64 siv : 1, /* 0 */ + tt : 2, /* 1-2 */ + tc_tr : 2, /* 3-4 */ + tr_slot : 8, /* 5-12 */ + reserved1 : 19, /* 13-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_tlb; + struct { + u64 siv : 1, /* 0 */ + regfile_id : 4, /* 1-4 */ + reg_num : 7, /* 5-11 */ + reserved1 : 20, /* 12-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_register; + struct { + u64 reserved; + } err_struct_info_bus_processor_interconnect; + u64 err_struct_info; +} err_struct_info_t; + +typedef union err_data_buffer_u { + struct { + u64 trigger_addr; /* 0-63 */ + u64 inj_addr; /* 64-127 */ + u64 way : 5, /* 128-132 */ + index : 20, /* 133-152 */ + : 39; /* 153-191 */ + } err_data_buffer_cache; + struct { + u64 trigger_addr; /* 0-63 */ + u64 inj_addr; /* 64-127 */ + u64 way : 5, /* 128-132 */ + index : 20, /* 133-152 */ + reserved : 39; /* 153-191 */ + } err_data_buffer_tlb; + struct { + u64 trigger_addr; /* 0-63 */ + } err_data_buffer_register; + struct { + u64 reserved; /* 0-63 */ + } err_data_buffer_bus_processor_interconnect; + u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; +} err_data_buffer_t; + +typedef union capabilities_u { + struct { + u64 i : 1, + d : 1, + rv : 1, + tag : 1, + data : 1, + mesi : 1, + dp : 1, + reserved1 : 3, + pa : 1, + va : 1, + wi : 1, + reserved2 : 20, + trigger : 1, + trigger_pl : 1, + reserved3 : 30; + } capabilities_cache; + struct { + u64 d : 1, + i : 1, + rv : 1, + tc : 1, + tr : 1, + reserved1 : 27, + trigger : 1, + trigger_pl : 1, + reserved2 : 30; + } capabilities_tlb; + struct { + u64 gr_b0 : 1, + gr_b1 : 1, + fr : 1, + br : 1, + pr : 1, + ar : 1, + cr : 1, + rr : 1, + pkr : 1, + dbr : 1, + ibr : 1, + pmc : 1, + pmd : 1, + reserved1 : 3, + regnum : 1, + reserved2 : 15, + trigger : 1, + trigger_pl : 1, + reserved3 : 30; + } capabilities_register; + struct { + u64 reserved; + } capabilities_bus_processor_interconnect; +} capabilities_t; + +typedef struct resources_s { + u64 ibr0 : 1, + ibr2 : 1, + ibr4 : 1, + ibr6 : 1, + dbr0 : 1, + dbr2 : 1, + dbr4 : 1, + dbr6 : 1, + reserved : 48; +} resources_t; + + +long get_page_size(void) +{ + long page_size=sysconf(_SC_PAGESIZE); + return page_size; +} + +#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size()) +#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS) +#define SHM_VA 0x2000000100000000 + +int shmid; +void *shmaddr; + +int create_shm(void) +{ + key_t key; + char fn[MAX_FN_SIZE]; + + /* cpu0 is always existing */ + sprintf(fn, PATH_FORMAT, 0); + if ((key = ftok(fn, 's')) == -1) { + perror("ftok"); + return -1; + } + + shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT); + if (shmid == -1) { + if (errno==EEXIST) { + shmid = shmget(key, SHM_SIZE, 0); + if (shmid == -1) { + perror("shmget"); + return -1; + } + } + else { + perror("shmget"); + return -1; + } + } + vbprintf("shmid=%d", shmid); + + /* connect to the segment: */ + shmaddr = shmat(shmid, (void *)SHM_VA, 0); + if (shmaddr == (void*)-1) { + perror("shmat"); + return -1; + } + + memset(shmaddr, 0, SHM_SIZE); + mlock(shmaddr, SHM_SIZE); + + return 0; +} + +int free_shm() +{ + munlock(shmaddr, SHM_SIZE); + shmdt(shmaddr); + semctl(shmid, 0, IPC_RMID); + + return 0; +} + +#ifdef _SEM_SEMUN_UNDEFINED +union semun +{ + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; +}; +#endif + +u32 mode=1; /* 1: physical mode; 2: virtual mode. */ +int one_lock=1; +key_t key[NR_CPUS]; +int semid[NR_CPUS]; + +int create_sem(int cpu) +{ + union semun arg; + char fn[MAX_FN_SIZE]; + int sid; + + sprintf(fn, PATH_FORMAT, cpu); + sprintf(fn, "%s/%s", fn, "err_type_info"); + if ((key[cpu] = ftok(fn, 'e')) == -1) { + perror("ftok"); + return -1; + } + + if (semid[cpu]!=0) + return 0; + + /* clear old semaphore */ + if ((sid = semget(key[cpu], 1, 0)) != -1) + semctl(sid, 0, IPC_RMID); + + /* get one semaphore */ + if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) { + perror("semget"); + printf("Please remove semaphore with key=0x%lx, then run the tool.\n", + (u64)key[cpu]); + return -1; + } + + vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu, + (u64)key[cpu]); + /* initialize the semaphore to 1: */ + arg.val = 1; + if (semctl(semid[cpu], 0, SETVAL, arg) == -1) { + perror("semctl"); + return -1; + } + + return 0; +} + +static int lock(int cpu) +{ + struct sembuf lock; + + lock.sem_num = cpu; + lock.sem_op = 1; + semop(semid[cpu], &lock, 1); + + return 0; +} + +static int unlock(int cpu) +{ + struct sembuf unlock; + + unlock.sem_num = cpu; + unlock.sem_op = -1; + semop(semid[cpu], &unlock, 1); + + return 0; +} + +void free_sem(int cpu) +{ + semctl(semid[cpu], 0, IPC_RMID); +} + +int wr_multi(char *fn, unsigned long *data, int size) +{ + int fd; + char buf[MAX_BUF_SIZE]; + int ret; + + if (size==1) + sprintf(buf, "%lx", *data); + else if (size==3) + sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]); + else { + fprintf(stderr,"write to file with wrong size!\n"); + return -1; + } + + fd=open(fn, O_RDWR); + if (!fd) { + perror("Error:"); + return -1; + } + ret=write(fd, buf, sizeof(buf)); + close(fd); + return ret; +} + +int wr(char *fn, unsigned long data) +{ + return wr_multi(fn, &data, 1); +} + +int rd(char *fn, unsigned long *data) +{ + int fd; + char buf[MAX_BUF_SIZE]; + + fd=open(fn, O_RDONLY); + if (fd<0) { + perror("Error:"); + return -1; + } + read(fd, buf, MAX_BUF_SIZE); + *data=strtoul(buf, NULL, 16); + close(fd); + return 0; +} + +int rd_status(char *path, int *status) +{ + char fn[MAX_FN_SIZE]; + sprintf(fn, "%s/status", path); + if (rd(fn, (u64*)status)<0) { + perror("status reading error.\n"); + return -1; + } + + return 0; +} + +int rd_capabilities(char *path, u64 *capabilities) +{ + char fn[MAX_FN_SIZE]; + sprintf(fn, "%s/capabilities", path); + if (rd(fn, capabilities)<0) { + perror("capabilities reading error.\n"); + return -1; + } + + return 0; +} + +int rd_all(char *path) +{ + unsigned long err_type_info, err_struct_info, err_data_buffer; + int status; + unsigned long capabilities, resources; + char fn[MAX_FN_SIZE]; + + sprintf(fn, "%s/err_type_info", path); + if (rd(fn, &err_type_info)<0) { + perror("err_type_info reading error.\n"); + return -1; + } + printf("err_type_info=%lx\n", err_type_info); + + sprintf(fn, "%s/err_struct_info", path); + if (rd(fn, &err_struct_info)<0) { + perror("err_struct_info reading error.\n"); + return -1; + } + printf("err_struct_info=%lx\n", err_struct_info); + + sprintf(fn, "%s/err_data_buffer", path); + if (rd(fn, &err_data_buffer)<0) { + perror("err_data_buffer reading error.\n"); + return -1; + } + printf("err_data_buffer=%lx\n", err_data_buffer); + + sprintf(fn, "%s/status", path); + if (rd("status", (u64*)&status)<0) { + perror("status reading error.\n"); + return -1; + } + printf("status=%d\n", status); + + sprintf(fn, "%s/capabilities", path); + if (rd(fn,&capabilities)<0) { + perror("capabilities reading error.\n"); + return -1; + } + printf("capabilities=%lx\n", capabilities); + + sprintf(fn, "%s/resources", path); + if (rd(fn, &resources)<0) { + perror("resources reading error.\n"); + return -1; + } + printf("resources=%lx\n", resources); + + return 0; +} + +int query_capabilities(char *path, err_type_info_t err_type_info, + u64 *capabilities) +{ + char fn[MAX_FN_SIZE]; + err_struct_info_t err_struct_info; + err_data_buffer_t err_data_buffer; + + err_struct_info.err_struct_info=0; + memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8); + + sprintf(fn, "%s/err_type_info", path); + wr(fn, err_type_info.err_type_info); + sprintf(fn, "%s/err_struct_info", path); + wr(fn, 0x0); + sprintf(fn, "%s/err_data_buffer", path); + wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); + + // Fire pal_mc_error_inject procedure. + sprintf(fn, "%s/call_start", path); + wr(fn, mode); + + if (rd_capabilities(path, capabilities)<0) + return -1; + + return 0; +} + +int query_all_capabilities() +{ + int status; + err_type_info_t err_type_info; + int err_sev, err_struct, struct_hier; + int cap=0; + u64 capabilities; + char path[MAX_FN_SIZE]; + + err_type_info.err_type_info=0; // Initial + err_type_info.err_type_info_u.mode=0; // Query mode; + err_type_info.err_type_info_u.err_inj=0; + + printf("All capabilities implemented in pal_mc_error_inject:\n"); + sprintf(path, PATH_FORMAT ,0); + for (err_sev=0;err_sev<3;err_sev++) + for (err_struct=0;err_struct<5;err_struct++) + for (struct_hier=0;struct_hier<5;struct_hier++) + { + status=-1; + capabilities=0; + err_type_info.err_type_info_u.err_sev=err_sev; + err_type_info.err_type_info_u.err_struct=err_struct; + err_type_info.err_type_info_u.struct_hier=struct_hier; + + if (query_capabilities(path, err_type_info, &capabilities)<0) + continue; + + if (rd_status(path, &status)<0) + continue; + + if (status==0) { + cap=1; + printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ", + err_sev, err_struct, struct_hier); + printf("capabilities 0x%lx\n", capabilities); + } + } + if (!cap) { + printf("No capabilities supported.\n"); + return 0; + } + + return 0; +} + +int err_inject(int cpu, char *path, err_type_info_t err_type_info, + err_struct_info_t err_struct_info, + err_data_buffer_t err_data_buffer) +{ + int status; + char fn[MAX_FN_SIZE]; + + log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ", + err_type_info.err_type_info, + err_struct_info.err_struct_info); + log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n", + err_data_buffer.err_data_buffer[0], + err_data_buffer.err_data_buffer[1], + err_data_buffer.err_data_buffer[2]); + sprintf(fn, "%s/err_type_info", path); + wr(fn, err_type_info.err_type_info); + sprintf(fn, "%s/err_struct_info", path); + wr(fn, err_struct_info.err_struct_info); + sprintf(fn, "%s/err_data_buffer", path); + wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); + + // Fire pal_mc_error_inject procedure. + sprintf(fn, "%s/call_start", path); + wr(fn,mode); + + if (rd_status(path, &status)<0) { + vbprintf("fail: read status\n"); + return -100; + } + + if (status!=0) { + log_info(cpu, "fail: status=%d\n", status); + return status; + } + + return status; +} + +static int construct_data_buf(char *path, err_type_info_t err_type_info, + err_struct_info_t err_struct_info, + err_data_buffer_t *err_data_buffer, + void *va1) +{ + char fn[MAX_FN_SIZE]; + u64 virt_addr=0, phys_addr=0; + + vbprintf("va1=%lx\n", (u64)va1); + memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8); + + switch (err_type_info.err_type_info_u.err_struct) { + case 1: // Cache + switch (err_struct_info.err_struct_info_cache.cl_id) { + case 1: //Virtual addr + err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1; + break; + case 2: //Phys addr + sprintf(fn, "%s/virtual_to_phys", path); + virt_addr=(u64)va1; + if (wr(fn,virt_addr)<0) + return -1; + rd(fn, &phys_addr); + err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr; + break; + default: + printf("Not supported cl_id\n"); + break; + } + break; + case 2: // TLB + break; + case 3: // Register file + break; + case 4: // Bus/system interconnect + default: + printf("Not supported err_struct\n"); + break; + } + + return 0; +} + +typedef struct { + u64 cpu; + u64 loop; + u64 interval; + u64 err_type_info; + u64 err_struct_info; + u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; +} parameters_t; + +parameters_t line_para; +int para; + +static int empty_data_buffer(u64 *err_data_buffer) +{ + int empty=1; + int i; + + for (i=0;i<ERR_DATA_BUFFER_SIZE; i++) + if (err_data_buffer[i]!=-1) + empty=0; + + return empty; +} + +int err_inj() +{ + err_type_info_t err_type_info; + err_struct_info_t err_struct_info; + err_data_buffer_t err_data_buffer; + int count; + FILE *fp; + unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf; + u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE]; + int num; + int i; + char path[MAX_FN_SIZE]; + parameters_t parameters[MAX_TASK_NUM]={}; + pid_t child_pid[MAX_TASK_NUM]; + time_t current_time; + int status; + + if (!para) { + fp=fopen("err.conf", "r"); + if (fp==NULL) { + perror("Error open err.conf"); + return -1; + } + + num=0; + while (!feof(fp)) { + char buf[256]; + memset(buf,0,256); + fgets(buf, 256, fp); + count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n", + &cpu, &loop, &interval,&err_type_info_conf, + &err_struct_info_conf, + &err_data_buffer_conf[0], + &err_data_buffer_conf[1], + &err_data_buffer_conf[2]); + if (count!=PARA_FIELD_NUM+3) { + err_data_buffer_conf[0]=-1; + err_data_buffer_conf[1]=-1; + err_data_buffer_conf[2]=-1; + count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n", + &cpu, &loop, &interval,&err_type_info_conf, + &err_struct_info_conf); + if (count!=PARA_FIELD_NUM) + continue; + } + + parameters[num].cpu=cpu; + parameters[num].loop=loop; + parameters[num].interval= interval>MIN_INTERVAL + ?interval:MIN_INTERVAL; + parameters[num].err_type_info=err_type_info_conf; + parameters[num].err_struct_info=err_struct_info_conf; + memcpy(parameters[num++].err_data_buffer, + err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ; + + if (num>=MAX_TASK_NUM) + break; + } + } + else { + parameters[0].cpu=line_para.cpu; + parameters[0].loop=line_para.loop; + parameters[0].interval= line_para.interval>MIN_INTERVAL + ?line_para.interval:MIN_INTERVAL; + parameters[0].err_type_info=line_para.err_type_info; + parameters[0].err_struct_info=line_para.err_struct_info; + memcpy(parameters[0].err_data_buffer, + line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ; + + num=1; + } + + /* Create semaphore: If one_lock, one semaphore for all processors. + Otherwise, one semaphore for each processor. */ + if (one_lock) { + if (create_sem(0)) { + printf("Can not create semaphore...exit\n"); + free_sem(0); + return -1; + } + } + else { + for (i=0;i<num;i++) { + if (create_sem(parameters[i].cpu)) { + printf("Can not create semaphore for cpu%d...exit\n",i); + free_sem(parameters[num].cpu); + return -1; + } + } + } + + /* Create a shm segment which will be used to inject/consume errors on.*/ + if (create_shm()==-1) { + printf("Error to create shm...exit\n"); + return -1; + } + + for (i=0;i<num;i++) { + pid_t pid; + + current_time=time(NULL); + log_info(parameters[i].cpu, "\nBegine at %s", ctime(¤t_time)); + log_info(parameters[i].cpu, "Configurations:\n"); + log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)", + parameters[i].cpu, + parameters[i].loop, + parameters[i].interval); + log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n", + parameters[i].err_type_info, + parameters[i].err_struct_info); + + sprintf(path, PATH_FORMAT, (int)parameters[i].cpu); + err_type_info.err_type_info=parameters[i].err_type_info; + err_struct_info.err_struct_info=parameters[i].err_struct_info; + memcpy(err_data_buffer.err_data_buffer, + parameters[i].err_data_buffer, + ERR_DATA_BUFFER_SIZE*8); + + pid=fork(); + if (pid==0) { + unsigned long mask[MASK_SIZE]; + int j, k; + + void *va1, *va2; + + /* Allocate two memory areas va1 and va2 in shm */ + va1=shmaddr+parameters[i].cpu*PAGE_SIZE; + va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE; + + vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2); + memset(va1, 0x1, PAGE_SIZE); + memset(va2, 0x2, PAGE_SIZE); + + if (empty_data_buffer(err_data_buffer.err_data_buffer)) + /* If not specified yet, construct data buffer + * with va1 + */ + construct_data_buf(path, err_type_info, + err_struct_info, &err_data_buffer,va1); + + for (j=0;j<MASK_SIZE;j++) + mask[j]=0; + + cpu=parameters[i].cpu; + k = cpu%64; + j = cpu/64; + mask[j] = 1UL << k; + + if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) { + perror("Error sched_setaffinity:"); + return -1; + } + + for (j=0; j<parameters[i].loop; j++) { + log_info(parameters[i].cpu,"Injection "); + log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ", + + parameters[i].cpu,j+1, parameters[i].loop); + + /* Hold the lock */ + if (one_lock) + lock(0); + else + /* Hold lock on this cpu */ + lock(parameters[i].cpu); + + if ((status=err_inject(parameters[i].cpu, + path, err_type_info, + err_struct_info, err_data_buffer)) + ==0) { + /* consume the error for "inject only"*/ + memcpy(va2, va1, PAGE_SIZE); + memcpy(va1, va2, PAGE_SIZE); + log_info(parameters[i].cpu, + "successful\n"); + } + else { + log_info(parameters[i].cpu,"fail:"); + log_info(parameters[i].cpu, + "status=%d\n", status); + unlock(parameters[i].cpu); + break; + } + if (one_lock) + /* Release the lock */ + unlock(0); + /* Release lock on this cpu */ + else + unlock(parameters[i].cpu); + + if (j < parameters[i].loop-1) + sleep(parameters[i].interval); + } + current_time=time(NULL); + log_info(parameters[i].cpu, "Done at %s", ctime(¤t_time)); + return 0; + } + else if (pid<0) { + perror("Error fork:"); + continue; + } + child_pid[i]=pid; + } + for (i=0;i<num;i++) + waitpid(child_pid[i], NULL, 0); + + if (one_lock) + free_sem(0); + else + for (i=0;i<num;i++) + free_sem(parameters[i].cpu); + + printf("All done.\n"); + + return 0; +} + +void help() +{ + printf("err_inject_tool:\n"); + printf("\t-q: query all capabilities. default: off\n"); + printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n"); + printf("\t-i: inject errors. default: off\n"); + printf("\t-l: one lock per cpu. default: one lock for all\n"); + printf("\t-e: error parameters:\n"); + printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n"); + printf("\t\t cpu: logical cpu number the error will be inject in.\n"); + printf("\t\t loop: times the error will be injected.\n"); + printf("\t\t interval: In second. every so often one error is injected.\n"); + printf("\t\t err_type_info, err_struct_info: PAL parameters.\n"); + printf("\t\t err_data_buffer: PAL parameter. Optional. If not present,\n"); + printf("\t\t it's constructed by tool automatically. Be\n"); + printf("\t\t careful to provide err_data_buffer and make\n"); + printf("\t\t sure it's working with the environment.\n"); + printf("\t Note:no space between error parameters.\n"); + printf("\t default: Take error parameters from err.conf instead of command line.\n"); + printf("\t-v: verbose. default: off\n"); + printf("\t-h: help\n\n"); + printf("The tool will take err.conf file as "); + printf("input to inject single or multiple errors "); + printf("on one or multiple cpus in parallel.\n"); +} + +int main(int argc, char **argv) +{ + char c; + int do_err_inj=0; + int do_query_all=0; + int count; + u32 m; + + /* Default one lock for all cpu's */ + one_lock=1; + while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF) + switch (c) { + case 'm': /* Procedure mode. 1: phys 2: virt */ + count=sscanf(optarg, "%x", &m); + if (count!=1 || (m!=1 && m!=2)) { + printf("Wrong mode number.\n"); + help(); + return -1; + } + mode=m; + break; + case 'i': /* Inject errors */ + do_err_inj=1; + break; + case 'q': /* Query */ + do_query_all=1; + break; + case 'v': /* Verbose */ + verbose=1; + break; + case 'l': /* One lock per cpu */ + one_lock=0; + break; + case 'e': /* error arguments */ + /* Take parameters: + * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer] + * err_data_buffer is optional. Recommend not to specify + * err_data_buffer. Better to use tool to generate it. + */ + count=sscanf(optarg, + "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n", + &line_para.cpu, + &line_para.loop, + &line_para.interval, + &line_para.err_type_info, + &line_para.err_struct_info, + &line_para.err_data_buffer[0], + &line_para.err_data_buffer[1], + &line_para.err_data_buffer[2]); + if (count!=PARA_FIELD_NUM+3) { + line_para.err_data_buffer[0]=-1, + line_para.err_data_buffer[1]=-1, + line_para.err_data_buffer[2]=-1; + count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n", + &line_para.cpu, + &line_para.loop, + &line_para.interval, + &line_para.err_type_info, + &line_para.err_struct_info); + if (count!=PARA_FIELD_NUM) { + printf("Wrong error arguments.\n"); + help(); + return -1; + } + } + para=1; + break; + continue; + break; + case 'h': + help(); + return 0; + default: + break; + } + + if (do_query_all) + query_all_capabilities(); + if (do_err_inj) + err_inj(); + + if (!do_query_all && !do_err_inj) + help(); + + return 0; +} + diff --git a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt index 28da181f996..59dd689d9b8 100644 --- a/Documentation/ia64/fsys.txt +++ b/Documentation/ia64/fsys.txt @@ -165,7 +165,7 @@ complicated cases. * Signal handling The delivery of (asynchronous) signals must be delayed until fsys-mode -is exited. This is acomplished with the help of the lower-privilege +is exited. This is accomplished with the help of the lower-privilege transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user() checks whether the interrupted task was in fsys-mode and, if so, sets PSR.lp and returns immediately. When fsys-mode is exited via the diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt new file mode 100644 index 00000000000..ffb5c80bec3 --- /dev/null +++ b/Documentation/ia64/kvm.txt @@ -0,0 +1,83 @@ +Currently, kvm module is in EXPERIMENTAL stage on IA64. This means that +interfaces are not stable enough to use. So, please don't run critical +applications in virtual machine. +We will try our best to improve it in future versions! + + Guide: How to boot up guests on kvm/ia64 + +This guide is to describe how to enable kvm support for IA-64 systems. + +1. Get the kvm source from git.kernel.org. + Userspace source: + git clone git://git.kernel.org/pub/scm/virt/kvm/kvm-userspace.git + Kernel Source: + git clone git://git.kernel.org/pub/scm/linux/kernel/git/xiantao/kvm-ia64.git + +2. Compile the source code. + 2.1 Compile userspace code: + (1)cd ./kvm-userspace + (2)./configure + (3)cd kernel + (4)make sync LINUX= $kernel_dir (kernel_dir is the directory of kernel source.) + (5)cd .. + (6)make qemu + (7)cd qemu; make install + + 2.2 Compile kernel source code: + (1) cd ./$kernel_dir + (2) Make menuconfig + (3) Enter into virtualization option, and choose kvm. + (4) make + (5) Once (4) done, make modules_install + (6) Make initrd, and use new kernel to reboot up host machine. + (7) Once (6) done, cd $kernel_dir/arch/ia64/kvm + (8) insmod kvm.ko; insmod kvm-intel.ko + +Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qemu, otherwise, may fail. + +3. Get Guest Firmware named as Flash.fd, and put it under right place: + (1) If you have the guest firmware (binary) released by Intel Corp for Xen, use it directly. + + (2) If you have no firmware at hand, Please download its source from + hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg + you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. + + (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu + +4. Boot up Linux or Windows guests: + 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. + + 4.2 Boot up guests use the following command. + /usr/local/bin/qemu-system-ia64 -smp xx -m 512 -hda $your_image + (xx is the number of virtual processors for the guest, now the maximum value is 4) + +5. Known possible issue on some platforms with old Firmware. + +In the event of strange host crash issues, try to solve it through either of the following ways: + +(1): Upgrade your Firmware to the latest one. + +(2): Applying the below patch to kernel source. +diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S +index 0b53344..f02b0f7 100644 +--- a/arch/ia64/kernel/pal.S ++++ b/arch/ia64/kernel/pal.S +@@ -84,7 +84,8 @@ GLOBAL_ENTRY(ia64_pal_call_static) + mov ar.pfs = loc1 + mov rp = loc0 + ;; +- srlz.d // serialize restoration of psr.l ++ srlz.i // serialize restoration of psr.l ++ ;; + br.ret.sptk.many b0 + END(ia64_pal_call_static) + +6. Bug report: + If you found any issues when use kvm/ia64, Please post the bug info to kvm-ia64-devel mailing list. + https://lists.sourceforge.net/lists/listinfo/kvm-ia64-devel/ + +Thanks for your interest! Let's work together, and make kvm/ia64 stronger and stronger! + + + Xiantao Zhang <xiantao.zhang@intel.com> + 2008.3.10 diff --git a/Documentation/ia64/mca.txt b/Documentation/ia64/mca.txt index a71cc6a67ef..f097c60cba1 100644 --- a/Documentation/ia64/mca.txt +++ b/Documentation/ia64/mca.txt @@ -12,7 +12,7 @@ by locks is indeterminate, including linked lists. --- The complicated ia64 MCA process. All of this is mandated by Intel's -specification for ia64 SAL, error recovery and and unwind, it is not as +specification for ia64 SAL, error recovery and unwind, it is not as if we have a choice here. * MCA occurs on one cpu, usually due to a double bit memory error. @@ -94,7 +94,7 @@ if we have a choice here. INIT is less complicated than MCA. Pressing the nmi button or using the equivalent command on the management console sends INIT to all -cpus. SAL picks one one of the cpus as the monarch and the rest are +cpus. SAL picks one of the cpus as the monarch and the rest are slaves. All the OS INIT handlers are entered at approximately the same time. The OS monarch prints the state of all tasks and returns, after which the slaves return and the system resumes. diff --git a/Documentation/ia64/paravirt_ops.txt b/Documentation/ia64/paravirt_ops.txt new file mode 100644 index 00000000000..39ded02ec33 --- /dev/null +++ b/Documentation/ia64/paravirt_ops.txt @@ -0,0 +1,137 @@ +Paravirt_ops on IA64 +==================== + 21 May 2008, Isaku Yamahata <yamahata@valinux.co.jp> + + +Introduction +------------ +The aim of this documentation is to help with maintainability and/or to +encourage people to use paravirt_ops/IA64. + +paravirt_ops (pv_ops in short) is a way for virtualization support of +Linux kernel on x86. Several ways for virtualization support were +proposed, paravirt_ops is the winner. +On the other hand, now there are also several IA64 virtualization +technologies like kvm/IA64, xen/IA64 and many other academic IA64 +hypervisors so that it is good to add generic virtualization +infrastructure on Linux/IA64. + + +What is paravirt_ops? +--------------------- +It has been developed on x86 as virtualization support via API, not ABI. +It allows each hypervisor to override operations which are important for +hypervisors at API level. And it allows a single kernel binary to run on +all supported execution environments including native machine. +Essentially paravirt_ops is a set of function pointers which represent +operations corresponding to low level sensitive instructions and high +level functionalities in various area. But one significant difference +from usual function pointer table is that it allows optimization with +binary patch. It is because some of these operations are very +performance sensitive and indirect call overhead is not negligible. +With binary patch, indirect C function call can be transformed into +direct C function call or in-place execution to eliminate the overhead. + +Thus, operations of paravirt_ops are classified into three categories. +- simple indirect call + These operations correspond to high level functionality so that the + overhead of indirect call isn't very important. + +- indirect call which allows optimization with binary patch + Usually these operations correspond to low level instructions. They + are called frequently and performance critical. So the overhead is + very important. + +- a set of macros for hand written assembly code + Hand written assembly codes (.S files) also need paravirtualization + because they include sensitive instructions or some of code paths in + them are very performance critical. + + +The relation to the IA64 machine vector +--------------------------------------- +Linux/IA64 has the IA64 machine vector functionality which allows the +kernel to switch implementations (e.g. initialization, ipi, dma api...) +depending on executing platform. +We can replace some implementations very easily defining a new machine +vector. Thus another approach for virtualization support would be +enhancing the machine vector functionality. +But paravirt_ops approach was taken because +- virtualization support needs wider support than machine vector does. + e.g. low level instruction paravirtualization. It must be + initialized very early before platform detection. + +- virtualization support needs more functionality like binary patch. + Probably the calling overhead might not be very large compared to the + emulation overhead of virtualization. However in the native case, the + overhead should be eliminated completely. + A single kernel binary should run on each environment including native, + and the overhead of paravirt_ops on native environment should be as + small as possible. + +- for full virtualization technology, e.g. KVM/IA64 or + Xen/IA64 HVM domain, the result would be + (the emulated platform machine vector. probably dig) + (pv_ops). + This means that the virtualization support layer should be under + the machine vector layer. + +Possibly it might be better to move some function pointers from +paravirt_ops to machine vector. In fact, Xen domU case utilizes both +pv_ops and machine vector. + + +IA64 paravirt_ops +----------------- +In this section, the concrete paravirt_ops will be discussed. +Because of the architecture difference between ia64 and x86, the +resulting set of functions is very different from x86 pv_ops. + +- C function pointer tables +They are not very performance critical so that simple C indirect +function call is acceptable. The following structures are defined at +this moment. For details see linux/include/asm-ia64/paravirt.h + - struct pv_info + This structure describes the execution environment. + - struct pv_init_ops + This structure describes the various initialization hooks. + - struct pv_iosapic_ops + This structure describes hooks to iosapic operations. + - struct pv_irq_ops + This structure describes hooks to irq related operations + - struct pv_time_op + This structure describes hooks to steal time accounting. + +- a set of indirect calls which need optimization +Currently this class of functions correspond to a subset of IA64 +intrinsics. At this moment the optimization with binary patch isn't +implemented yet. +struct pv_cpu_op is defined. For details see +linux/include/asm-ia64/paravirt_privop.h +Mostly they correspond to ia64 intrinsics 1-to-1. +Caveat: Now they are defined as C indirect function pointers, but in +order to support binary patch optimization, they will be changed +using GCC extended inline assembly code. + +- a set of macros for hand written assembly code (.S files) +For maintenance purpose, the taken approach for .S files is single +source code and compile multiple times with different macros definitions. +Each pv_ops instance must define those macros to compile. +The important thing here is that sensitive, but non-privileged +instructions must be paravirtualized and that some privileged +instructions also need paravirtualization for reasonable performance. +Developers who modify .S files must be aware of that. At this moment +an easy checker is implemented to detect paravirtualization breakage. +But it doesn't cover all the cases. + +Sometimes this set of macros is called pv_cpu_asm_op. But there is no +corresponding structure in the source code. +Those macros mostly 1:1 correspond to a subset of privileged +instructions. See linux/include/asm-ia64/native/inst.h. +And some functions written in assembly also need to be overrided so +that each pv_ops instance have to define some macros. Again see +linux/include/asm-ia64/native/inst.h. + + +Those structures must be initialized very early before start_kernel. +Probably initialized in head.S using multi entry point or some other trick. +For native case implementation see linux/arch/ia64/kernel/paravirt.c. diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt index f51eb4bc2ff..6869c73de4e 100644 --- a/Documentation/ia64/serial.txt +++ b/Documentation/ia64/serial.txt @@ -124,9 +124,16 @@ TROUBLESHOOTING SERIAL CONSOLE PROBLEMS - Add entry to /etc/securetty for console tty. + No ACPI serial devices found in 2.6.17 or later: + - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI + serial devices were discovered by 8250_acpi. In 2.6.17, + 8250_acpi was replaced by the combination of 8250_pnp and + CONFIG_PNPACPI. -[1] http://www.dig64.org/specifications/DIG64_PCDPv20.pdf + + +[1] http://www.dig64.org/specifications/agreement The table was originally defined as the "HCDP" for "Headless Console/Debug Port." The current version is the "PCDP" for "Primary Console and Debug Port Devices." diff --git a/Documentation/ia64/xen.txt b/Documentation/ia64/xen.txt new file mode 100644 index 00000000000..c61a99f7c8b --- /dev/null +++ b/Documentation/ia64/xen.txt @@ -0,0 +1,183 @@ + Recipe for getting/building/running Xen/ia64 with pv_ops + -------------------------------------------------------- + +This recipe describes how to get xen-ia64 source and build it, +and run domU with pv_ops. + +============ +Requirements +============ + + - python + - mercurial + it (aka "hg") is an open-source source code + management software. See the below. + http://www.selenic.com/mercurial/wiki/ + - git + - bridge-utils + +================================= +Getting and Building Xen and Dom0 +================================= + + My environment is; + Machine : Tiger4 + Domain0 OS : RHEL5 + DomainU OS : RHEL5 + + 1. Download source + # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg + # cd xen-unstable.hg + # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg + + 2. # make world + + 3. # make install-tools + + 4. copy kernels and xen + # cp xen/xen.gz /boot/efi/efi/redhat/ + # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \ + /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen + + 5. make initrd for Dom0/DomU + # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \ + O=$(/bin/pwd)/build-linux-2.6.18-xen_ia64 + # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \ + 2.6.18.8-xen --builtin mptspi --builtin mptbase \ + --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \ + --builtin ehci-hcd + +================================ +Making a disk image for guest OS +================================ + + 1. make file + # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0 + # mke2fs -F -j /root/rhel5.img + # mount -o loop /root/rhel5.img /mnt + # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt + # mkdir /mnt/{root,proc,sys,home,tmp} + + Note: You may miss some device files. If so, please create them + with mknod. Or you can use tar instead of cp. + + 2. modify DomU's fstab + # vi /mnt/etc/fstab + /dev/xvda1 / ext3 defaults 1 1 + none /dev/pts devpts gid=5,mode=620 0 0 + none /dev/shm tmpfs defaults 0 0 + none /proc proc defaults 0 0 + none /sys sysfs defaults 0 0 + + 3. modify inittab + set runlevel to 3 to avoid X trying to start + # vi /mnt/etc/inittab + id:3:initdefault: + Start a getty on the hvc0 console + X0:2345:respawn:/sbin/mingetty hvc0 + tty1-6 mingetty can be commented out + + 4. add hvc0 into /etc/securetty + # vi /mnt/etc/securetty (add hvc0) + + 5. umount + # umount /mnt + +FYI, virt-manager can also make a disk image for guest OS. +It's GUI tools and easy to make it. + +================== +Boot Xen & Domain0 +================== + + 1. replace elilo + elilo of RHEL5 can boot Xen and Dom0. + If you use old elilo (e.g RHEL4), please download from the below + http://elilo.sourceforge.net/cgi-bin/blosxom + and copy into /boot/efi/efi/redhat/ + # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi + + 2. modify elilo.conf (like the below) + # vi /boot/efi/efi/redhat/elilo.conf + prompt + timeout=20 + default=xen + relocatable + + image=vmlinuz-2.6.18.8-xen + label=xen + vmm=xen.gz + initrd=initrd-2.6.18.8-xen.img + read-only + append=" -- rhgb root=/dev/sda2" + +The append options before "--" are for xen hypervisor, +the options after "--" are for dom0. + +FYI, your machine may need console options like +"com1=19200,8n1 console=vga,com1". For example, +append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \ +console=ttyS0 root=/dev/sda2" + +===================================== +Getting and Building domU with pv_ops +===================================== + + 1. get pv_ops tree + # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/ + + 2. git branch (if necessary) + # cd linux-2.6-xen-ia64/ + # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19 + (Note: The current branch is xen-ia64-domu-minimal-2008may19. + But you would find the new branch. You can see with + "git branch -r" to get the branch lists. + http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/ + is also available. The tree is based on + git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test) + + + 3. copy .config for pv_ops of domU + # cp arch/ia64/configs/xen_domu_wip_defconfig .config + + 4. make kernel with pv_ops + # make oldconfig + # make + + 5. install the kernel and initrd + # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU + # make modules_install + # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \ + 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \ + --builtin mptbase --builtin mptscsih --builtin uhci-hcd \ + --builtin ohci-hcd --builtin ehci-hcd + +======================== +Boot DomainU with pv_ops +======================== + + 1. make config of DomU + # vi /etc/xen/rhel5 + kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU" + ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img" + vcpus = 1 + memory = 512 + name = "rhel5" + disk = [ 'file:/root/rhel5.img,xvda1,w' ] + root = "/dev/xvda1 ro" + extra= "rhgb console=hvc0" + + 2. After boot xen and dom0, start xend + # /etc/init.d/xend start + ( In the debugging case, # XEND_DEBUG=1 xend trace_start ) + + 3. start domU + # xm create -c rhel5 + +========= +Reference +========= +- Wiki of Xen/IA64 upstream merge + http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge + +Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008 |
