Merge commit 'v2.6.37-rc8' into x86/apic

Conflicts: arch/x86/include/asm/io_apic.h Merge reason: move to a fresh -rc, resolve the conflict. Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2011-01-04 09:43:42 +0100
committer: Ingo Molnar <mingo@elte.hu> 2011-01-04 09:43:42 +0100
commit: bc030d6cb9532877c1c5a3f5e7123344fa24a285 (patch)
tree: d223d410b868b80d4c0deec192d354a5d06b201a /Documentation/filesystems
parent: d3bd058826aa8b79590cca6c8e6d1557bf576ada (diff)
parent: 387c31c7e5c9805b0aef8833d1731a5fe7bdea14 (diff)
14 files changed, 219 insertions, 48 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 4303614b5ad..8c624a18f67 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -96,8 +96,6 @@ seq_file.txt
 	- how to use the seq_file API
 sharedsubtree.txt
 	- a description of shared subtrees for namespaces.
-smbfs.txt
-	- info on using filesystems with the SMB protocol (Win 3.11 and NT).
 spufs.txt
 	- info and mount options for the SPU filesystem used on Cell.
 sysfs-pci.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index f9765e8cf08..b22abba78fe 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -111,7 +111,7 @@ OPTIONS
   		This can be used to share devices/named pipes/sockets between
 		hosts.  This functionality will be expanded in later versions.
 
-  access	there are three access modes.
+  access	there are four access modes.
 			user  = if a user tries to access a file on v9fs
 			        filesystem for the first time, v9fs sends an
 			        attach command (Tattach) for that user.
@@ -120,6 +120,8 @@ OPTIONS
 				the files on the mounted filesystem
 			any   = v9fs does single attach and performs all
 				operations as one user
+			client = ACL based access check on the 9p client
+			         side for access validation
 
   cachetag	cache tag to use the specified persistent cache.
 		cache tags for existing cache sessions can be listed at
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2db4283efa8..b6426f15b4a 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -173,12 +173,13 @@ prototypes:
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
+	void (*freepage)(struct page *);
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
 	int (*launder_page) (struct page *);
 
 locking rules:
-	All except set_page_dirty may block
+	All except set_page_dirty and freepage may block
 
 			BKL	PageLocked(page)	i_mutex
 writepage:		no	yes, unlocks (see below)
@@ -193,6 +194,7 @@ perform_write:		no	n/a			yes
 bmap:			no
 invalidatepage:		no	yes
 releasepage:		no	yes
+freepage:		no	yes
 direct_IO:		no
 launder_page:		no	yes
 
@@ -288,6 +290,9 @@ buffers from the page in preparation for freeing it.  It returns zero to
 indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
 the kernel assumes that the fs has no private interest in the buffers.
 
+	->freepage() is called when the kernel is done dropping the page
+from the page cache.
+
 	->launder_page() may be called prior to releasing a page if
 it is still found to be dirty. It returns zero if the page was successfully
 cleaned, or an error value if not. Note that in order to prevent the page
@@ -322,7 +327,6 @@ fl_release_private:	yes	yes
 prototypes:
 	int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
 	void (*fl_notify)(struct file_lock *);  /* unblock callback */
-	void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
 	void (*fl_release_private)(struct file_lock *);
 	void (*fl_break)(struct file_lock *); /* break_lease callback */
 
@@ -330,7 +334,6 @@ locking rules:
 			BKL	may block
 fl_compare_owner:	yes	no
 fl_notify:		yes	no
-fl_copy_lock:		yes	no
 fl_release_private:	yes	yes
 fl_break:		yes	no
 
@@ -349,21 +352,36 @@ call this method upon the IO completion.
 
 --------------------------- block_device_operations -----------------------
 prototypes:
-	int (*open) (struct inode *, struct file *);
-	int (*release) (struct inode *, struct file *);
-	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+	int (*open) (struct block_device *, fmode_t);
+	int (*release) (struct gendisk *, fmode_t);
+	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+	int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
 	int (*media_changed) (struct gendisk *);
+	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
+	int (*getgeo)(struct block_device *, struct hd_geometry *);
+	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 
 locking rules:
-			BKL	bd_sem
-open:			yes	yes
-release:		yes	yes
-ioctl:			yes	no
+			BKL	bd_mutex
+open:			no	yes
+release:		no	yes
+ioctl:			no	no
+compat_ioctl:		no	no
+direct_access:		no	no
 media_changed:		no	no
+unlock_native_capacity:	no	no
 revalidate_disk:	no	no
+getgeo:			no	no
+swap_slot_free_notify:	no	no	(see below)
+
+media_changed, unlock_native_capacity and revalidate_disk are called only from
+check_disk_change().
+
+swap_slot_free_notify is called with swap_lock and sometimes the page lock
+held.
 
-The last two are called only from check_disk_change().
 
 --------------------------- file_operations -------------------------------
 prototypes:
diff --git a/Documentation/filesystems/configfs/configfs_example_explicit.c b/Documentation/filesystems/configfs/configfs_example_explicit.c
index d428cc9f07f..fd53869f563 100644
--- a/Documentation/filesystems/configfs/configfs_example_explicit.c
+++ b/Documentation/filesystems/configfs/configfs_example_explicit.c
@@ -89,7 +89,7 @@ static ssize_t childless_storeme_write(struct childless *childless,
 	char *p = (char *) page;
 
 	tmp = simple_strtoul(p, &p, 10);
-	if (!p || (*p && (*p != '\n')))
+	if ((*p != '\0') && (*p != '\n'))
 		return -EINVAL;
 
 	if (tmp > INT_MAX)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index e1def1786e5..6ab9442d7ee 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -353,6 +353,20 @@ noauto_da_alloc		replacing existing files via patterns such as
 			system crashes before the delayed allocation
 			blocks are forced to disk.
 
+noinit_itable		Do not initialize any uninitialized inode table
+			blocks in the background.  This feature may be
+			used by installation CD's so that the install
+			process can complete as quickly as possible; the
+			inode table initialization process would then be
+			deferred until the next time the  file system
+			is unmounted.
+
+init_itable=n		The lazy itable init code will wait n times the
+			number of milliseconds it took to zero out the
+			previous block group's inode table.  This
+			minimizes the impact on the systme performance
+			while file system's inode table is being initialized.
+
 discard		Controls whether ext4 should issue discard/TRIM
 nodiscard(*)		commands to the underlying block device when
 			blocks are freed.  This is useful for SSD devices
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 2f68cd68876..a57e12411d2 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,5 +12,9 @@ nfs-rdma.txt
 	- how to install and setup the Linux NFS/RDMA client and server software
 nfsroot.txt
 	- short guide on setting up a diskless box with NFS root filesystem.
+pnfs.txt
+	- short explanation of some of the internals of the pnfs client code
 rpc-cache.txt
 	- introduction to the caching mechanisms in the sunrpc layer.
+idmapper.txt
+	- information for configuring request-keys to be used by idmapper
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt
new file mode 100644
index 00000000000..b9b4192ea8b
--- /dev/null
+++ b/Documentation/filesystems/nfs/idmapper.txt
@@ -0,0 +1,67 @@
+
+=========
+ID Mapper
+=========
+Id mapper is used by NFS to translate user and group ids into names, and to
+translate user and group names into ids.  Part of this translation involves
+performing an upcall to userspace to request the information.  Id mapper will
+user request-key to perform this upcall and cache the result.  The program
+/usr/sbin/nfs.idmap should be called by request-key, and will perform the
+translation and initialize a key with the resulting information.
+
+ NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this
+ feature.
+
+===========
+Configuring
+===========
+The file /etc/request-key.conf will need to be modified so /sbin/request-key can
+direct the upcall.  The following line should be added:
+
+#OP	TYPE	DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ARG3 ...
+#======	=======	===============	===============	===============================
+create	id_resolver	*	*		/usr/sbin/nfs.idmap %k %d 600
+
+This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
+The last parameter, 600, defines how many seconds into the future the key will
+expire.  This parameter is optional for /usr/sbin/nfs.idmap.  When the timeout
+is not specified, nfs.idmap will default to 600 seconds.
+
+id mapper uses for key descriptions:
+	  uid:  Find the UID for the given user
+	  gid:  Find the GID for the given group
+	 user:  Find the user  name for the given UID
+	group:  Find the group name for the given GID
+
+You can handle any of these individually, rather than using the generic upcall
+program.  If you would like to use your own program for a uid lookup then you
+would edit your request-key.conf so it look similar to this:
+
+#OP	TYPE	DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ARG3 ...
+#======	=======	===============	===============	===============================
+create	id_resolver	uid:*	*		/some/other/program %k %d 600
+create	id_resolver	*	*		/usr/sbin/nfs.idmap %k %d 600
+
+Notice that the new line was added above the line for the generic program.
+request-key will find the first matching line and corresponding program.  In
+this case, /some/other/program will handle all uid lookups and
+/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
+
+See <file:Documentation/keys-request-keys.txt> for more information about the
+request-key function.
+
+
+=========
+nfs.idmap
+=========
+nfs.idmap is designed to be called by request-key, and should not be run "by
+hand".  This program takes two arguments, a serialized key and a key
+description.  The serialized key is first converted into a key_serial_t, and
+then passed as an argument to keyctl_instantiate (both are part of keyutils.h).
+
+The actual lookups are performed by functions found in nfsidmap.h.  nfs.idmap
+determines the correct function to call by looking at the first part of the
+description string.  For example, a uid lookup description will appear as
+"uid:user@domain".
+
+nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise.
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index f2430a7974e..90c71c6f0d0 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -159,6 +159,28 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
                 Default: any
 
 
+nfsrootdebug
+
+  This parameter enables debugging messages to appear in the kernel
+  log at boot time so that administrators can verify that the correct
+  NFS mount options, server address, and root path are passed to the
+  NFS client.
+
+
+rdinit=<executable file>
+
+  To specify which file contains the program that starts system
+  initialization, administrators can use this command line parameter.
+  The default value of this parameter is "/init".  If the specified
+  file exists and the kernel can execute it, root filesystem related
+  kernel command line parameters, including `nfsroot=', are ignored.
+
+  A description of the process of mounting the root file system can be
+  found in:
+
+    Documentation/early-userspace/README
+
+
 
 
 3.) Boot Loader
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
new file mode 100644
index 00000000000..bc0b9cfe095
--- /dev/null
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -0,0 +1,48 @@
+Reference counting in pnfs:
+==========================
+
+The are several inter-related caches.  We have layouts which can
+reference multiple devices, each of which can reference multiple data servers.
+Each data server can be referenced by multiple devices.  Each device
+can be referenced by multiple layouts.  To keep all of this straight,
+we need to reference count.
+
+
+struct pnfs_layout_hdr
+----------------------
+The on-the-wire command LAYOUTGET corresponds to struct
+pnfs_layout_segment, usually referred to by the variable name lseg.
+Each nfs_inode may hold a pointer to a cache of of these layout
+segments in nfsi->layout, of type struct pnfs_layout_hdr.
+
+We reference the header for the inode pointing to it, across each
+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
+LAYOUTCOMMIT), and for each lseg held within.
+
+Each header is also (when non-empty) put on a list associated with
+struct nfs_client (cl_layouts).  Being put on this list does not bump
+the reference count, as the layout is kept around by the lseg that
+keeps it in the list.
+
+deviceid_cache
+--------------
+lsegs reference device ids, which are resolved per nfs_client and
+layout driver type.  The device ids are held in a RCU cache (struct
+nfs4_deviceid_cache).  The cache itself is referenced across each
+mount.  The entries (struct nfs4_deviceid) themselves are held across
+the lifetime of each lseg referencing them.
+
+RCU is used because the deviceid is basically a write once, read many
+data structure.  The hlist size of 32 buckets needs better
+justification, but seems reasonable given that we can have multiple
+deviceid's per filesystem, and multiple filesystems per nfs_client.
+
+The hash code is copied from the nfsd code base.  A discussion of
+hashing and variations of this algorithm can be found at:
+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
+
+data server cache
+-----------------
+file driver devices refer to data servers, which are kept in a module
+level cache.  Its reference is held over the lifetime of the deviceid
+pointing to it.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a6aca874088..e73df2722ff 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -136,6 +136,7 @@ Table 1-1: Process specific entries in /proc
  statm		Process memory status information
  status		Process status in human readable form
  wchan		If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ pagemap	Page table
  stack		Report full stack trace, enable via CONFIG_STACKTRACE
  smaps		a extension based on maps, showing the memory consumption of
 		each mapping
@@ -370,17 +371,24 @@ Shared_Dirty:          0 kB
 Private_Clean:         0 kB
 Private_Dirty:         0 kB
 Referenced:          892 kB
+Anonymous:             0 kB
 Swap:                  0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 
-The first  of these lines shows  the same information  as is displayed for the
-mapping in /proc/PID/maps.  The remaining lines show  the size of the mapping,
-the amount of the mapping that is currently resident in RAM, the "proportional
-set size” (divide each shared page by the number of processes sharing it), the
-number of clean and dirty shared pages in the mapping, and the number of clean
-and dirty private pages in the mapping.  The "Referenced" indicates the amount
-of memory currently marked as referenced or accessed.
+The first of these lines shows the same information as is displayed for the
+mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
+(size), the amount of the mapping that is currently resident in RAM (RSS), the
+process' proportional share of this mapping (PSS), the number of clean and
+dirty private pages in the mapping.  Note that even a page which is part of a
+MAP_SHARED mapping, but has only a single pte mapped, i.e.  is currently used
+by only one process, is accounted as private and not as shared.  "Referenced"
+indicates the amount of memory currently marked as referenced or accessed.
+"Anonymous" shows the amount of memory that does not belong to any file.  Even
+a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
+and a page is modified, the file page is replaced by a private anonymous copy.
+"Swap" shows how much would-be-anonymous memory is also used, but out on
+swap.
 
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
@@ -397,6 +405,9 @@ To clear the bits for the file mapped pages associated with the process
     > echo 3 > /proc/PID/clear_refs
 Any other value written to /proc/PID/clear_refs will have no effect.
 
+The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
+using /proc/kpageflags and number of times a page is mapped using
+/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
 
 1.2 Kernel data
 ---------------
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index fc0e39af43c..4ede421c968 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -62,10 +62,10 @@ replicas continue to be exactly same.
 	# mount /dev/sd0  /tmp/a
 
 	#ls /tmp/a
-	t1 t2 t2
+	t1 t2 t3
 
 	#ls /mnt/a
-	t1 t2 t2
+	t1 t2 t3
 
 	Note that the mount has propagated to the mount at /mnt as well.
 
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt
deleted file mode 100644
index 194fb0decd2..00000000000
--- a/Documentation/filesystems/smbfs.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Smbfs is a filesystem that implements the SMB protocol, which is the
-protocol used by Windows for Workgroups, Windows 95 and Windows NT.
-Smbfs was inspired by Samba, the program written by Andrew Tridgell
-that turns any Unix host into a file server for DOS or Windows clients.
-
-Smbfs is a SMB client, but uses parts of samba for its operation. For
-more info on samba, including documentation, please go to
-http://www.samba.org/ and then on to your nearest mirror.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ed7e5efc06d..20899e095e7 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -534,6 +534,7 @@ struct address_space_operations {
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
+	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
@@ -660,11 +661,10 @@ struct address_space_operations {
   releasepage: releasepage is called on PagePrivate pages to indicate
         that the page should be freed if possible.  ->releasepage
         should remove any private data from the page and clear the
-        PagePrivate flag.  It may also remove the page from the
-        address_space.  If this fails for some reason, it may indicate
-        failure with a 0 return value.
-	This is used in two distinct though related cases.  The first
-        is when the VM finds a clean page with no active users and
+        PagePrivate flag. If releasepage() fails for some reason, it must
+	indicate failure with a 0 return value.
+	releasepage() is used in two distinct though related cases.  The
+	first is when the VM finds a clean page with no active users and
         wants to make it a free page.  If ->releasepage succeeds, the
         page will be removed from the address_space and become free.
 
@@ -679,6 +679,12 @@ struct address_space_operations {
         need to ensure this.  Possibly it can clear the PageUptodate
         bit if it cannot free private data yet.
 
+  freepage: freepage is called once the page is no longer visible in
+        the page cache in order to allow the cleanup of any private
+	data. Since it may be called by the memory reclaimer, it
+	should not assume that the original address_space mapping still
+	exists, and it should not block.
+
   direct_IO: called by the generic read/write routines to perform
         direct_IO - that is IO requests which bypass the page cache
         and transfer data directly between the storage and the
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
index 96d0df28bed..7445bf335da 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.txt
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -794,17 +794,6 @@ designed.
 
 Roadmap:
 
-2.6.37 Remove experimental tag from mount option
-	=> should be roughly 6 months after initial merge
-	=> enough time to:
-		=> gain confidence and fix problems reported by early
-		   adopters (a.k.a. guinea pigs)
-		=> address worst performance regressions and undesired
-		   behaviours
-		=> start tuning/optimising code for parallelism
-		=> start tuning/optimising algorithms consuming
-		   excessive CPU time
-
 2.6.39 Switch default mount option to use delayed logging
 	=> should be roughly 12 months after initial merge
 	=> enough time to shake out remaining problems before next round of
author	Ingo Molnar <mingo@elte.hu>	2011-01-04 09:43:42 +0100
committer	Ingo Molnar <mingo@elte.hu>	2011-01-04 09:43:42 +0100
commit	bc030d6cb9532877c1c5a3f5e7123344fa24a285 (patch)
tree	d223d410b868b80d4c0deec192d354a5d06b201a /Documentation/filesystems
parent	d3bd058826aa8b79590cca6c8e6d1557bf576ada (diff)
parent	387c31c7e5c9805b0aef8833d1731a5fe7bdea14 (diff)