aboutsummaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX60
-rw-r--r--Documentation/filesystems/9p.txt17
-rw-r--r--Documentation/filesystems/Locking70
-rw-r--r--Documentation/filesystems/affs.txt9
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/btrfs.txt225
-rw-r--r--Documentation/filesystems/caching/backend-api.txt9
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt110
-rw-r--r--Documentation/filesystems/cifs.txt51
-rw-r--r--Documentation/filesystems/cifs/AUTHORS56
-rw-r--r--Documentation/filesystems/cifs/CHANGES1065
-rw-r--r--Documentation/filesystems/cifs/README753
-rw-r--r--Documentation/filesystems/cifs/TODO129
-rw-r--r--Documentation/filesystems/cifs/cifs.txt31
-rwxr-xr-xDocumentation/filesystems/cifs/winucase_convert.pl62
-rw-r--r--Documentation/filesystems/directory-locking31
-rw-r--r--Documentation/filesystems/ext3.txt7
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/f2fs.txt142
-rw-r--r--Documentation/filesystems/hfsplus.txt2
-rw-r--r--Documentation/filesystems/jfs.txt2
-rw-r--r--Documentation/filesystems/nfs/00-INDEX4
-rw-r--r--Documentation/filesystems/nfs/Exporting2
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt44
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt2
-rw-r--r--Documentation/filesystems/nfs/rpc-server-gss.txt91
-rw-r--r--Documentation/filesystems/nilfs2.txt68
-rw-r--r--Documentation/filesystems/ntfs.txt2
-rw-r--r--Documentation/filesystems/porting28
-rw-r--r--Documentation/filesystems/proc.txt71
-rw-r--r--Documentation/filesystems/qnx6.txt4
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt4
-rw-r--r--Documentation/filesystems/relay.txt2
-rw-r--r--Documentation/filesystems/seq_file.txt9
-rw-r--r--Documentation/filesystems/sharedsubtree.txt2
-rw-r--r--Documentation/filesystems/sysfs-tagging.txt2
-rw-r--r--Documentation/filesystems/sysfs.txt6
-rw-r--r--Documentation/filesystems/vfat.txt31
-rw-r--r--Documentation/filesystems/vfs.txt150
-rw-r--r--Documentation/filesystems/xfs-self-describing-metadata.txt350
-rw-r--r--Documentation/filesystems/xfs.txt322
41 files changed, 3650 insertions, 407 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8042050eb26..ac28149aede 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -2,6 +2,8 @@
- this file (info on some of the filesystems supported by linux).
Locking
- info on locking rules as they pertain to Linux VFS.
+Makefile
+ - Makefile for building the filsystems-part of DocBook.
9p.txt
- 9p (v9fs) is an implementation of the Plan 9 remote fs protocol.
adfs.txt
@@ -10,24 +12,32 @@ afs.txt
- info and examples for the distributed AFS (Andrew File System) fs.
affs.txt
- info and mount options for the Amiga Fast File System.
+autofs4-mount-control.txt
+ - info on device control operations for autofs4 module.
automount-support.txt
- information about filesystem automount support.
befs.txt
- information about the BeOS filesystem for Linux.
bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS).
+btrfs.txt
+ - info for the BTRFS filesystem.
+caching/
+ - directory containing filesystem cache documentation.
ceph.txt
- - info for the Ceph Distributed File System
-cifs.txt
- - description of the CIFS filesystem.
+ - info for the Ceph Distributed File System.
+cifs/
+ - directory containing CIFS filesystem documentation and example code.
coda.txt
- description of the CODA filesystem.
configfs/
- directory containing configfs documentation and example code.
cramfs.txt
- info on the cram filesystem for small storage (ROMs etc).
-dentry-locking.txt
- - info on the RCU-based dcache locking model.
+debugfs.txt
+ - info on the debugfs filesystem.
+devpts.txt
+ - info on the devpts filesystem.
directory-locking
- info about the locking scheme used for directory operations.
dlmfs.txt
@@ -35,7 +45,7 @@ dlmfs.txt
dnotify.txt
- info about directory notification in Linux.
dnotify_test.c
- - example program for dnotify
+ - example program for dnotify.
ecryptfs.txt
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
efivarfs.txt
@@ -48,12 +58,18 @@ ext3.txt
- info, mount options and specifications for the Ext3 filesystem.
ext4.txt
- info, mount options and specifications for the Ext4 filesystem.
-files.txt
- - info on file management in the Linux kernel.
f2fs.txt
- info and mount options for the F2FS filesystem.
+fiemap.txt
+ - info on fiemap ioctl.
+files.txt
+ - info on file management in the Linux kernel.
fuse.txt
- info on the Filesystem in User SpacE including mount options.
+gfs2-glocks.txt
+ - info on the Global File System 2 - Glock internal locking rules.
+gfs2-uevents.txt
+ - info on the Global File System 2 - uevents.
gfs2.txt
- info on the Global File System 2.
hfs.txt
@@ -84,40 +100,58 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
+omfs.txt
+ - info on the Optimized MPEG FileSystem.
+path-lookup.txt
+ - info on path walking and name lookup locking.
+pohmelfs/
+ - directory containing pohmelfs filesystem documentation.
porting
- various information on filesystem porting.
proc.txt
- info on Linux's /proc filesystem.
+qnx6.txt
+ - info on the QNX6 filesystem.
+quota.txt
+ - info on Quota subsystem.
ramfs-rootfs-initramfs.txt
- info on the 'in memory' filesystems ramfs, rootfs and initramfs.
-reiser4.txt
- - info on the Reiser4 filesystem based on dancing tree algorithms.
relay.txt
- info on relay, for efficient streaming from kernel to user space.
romfs.txt
- description of the ROMFS filesystem.
seq_file.txt
- - how to use the seq_file API
+ - how to use the seq_file API.
sharedsubtree.txt
- a description of shared subtrees for namespaces.
spufs.txt
- info and mount options for the SPU filesystem used on Cell.
+squashfs.txt
+ - info on the squashfs filesystem.
sysfs-pci.txt
- info on accessing PCI device resources through sysfs.
+sysfs-tagging.txt
+ - info on sysfs tagging to avoid duplicates.
sysfs.txt
- info on sysfs, a ram-based filesystem for exporting kernel objects.
sysv-fs.txt
- info on the SystemV/V7/Xenix/Coherent filesystem.
tmpfs.txt
- info on tmpfs, a filesystem that holds all files in virtual memory.
+ubifs.txt
+ - info on the Unsorted Block Images FileSystem.
udf.txt
- info and mount options for the UDF filesystem.
ufs.txt
- info on the ufs filesystem.
vfat.txt
- - info on using the VFAT filesystem used in Windows NT and Windows 95
+ - info on using the VFAT filesystem used in Windows NT and Windows 95.
vfs.txt
- - overview of the Virtual File System
+ - overview of the Virtual File System.
+xfs-delayed-logging-design.txt
+ - info on the XFS Delayed Logging Design.
+xfs-self-describing-metadata.txt
+ - info on XFS Self Describing Metadata.
xfs.txt
- info and mount options for the XFS filesystem.
xip.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index 2c032144284..fec7144e817 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -69,10 +69,14 @@ OPTIONS
offering several exported file systems.
cache=mode specifies a caching policy. By default, no caches are used.
+ none = default no cache policy, metadata and data
+ alike are synchronous.
loose = no attempts are made at consistency,
intended for exclusive, read-only mounts
- fscache = use FS-Cache for a persistent, read-only
+ fscache = use FS-Cache for a persistent, read-only
cache backend.
+ mmap = minimal cache that is only used for read-write
+ mmap. Northing else is cached, like cache=none
debug=n specifies debug level. The debug level is a bitmask.
0x01 = display verbose error messages
@@ -147,8 +151,7 @@ on sourceforge (http://sourceforge.net/projects/v9fs).
News and other information is maintained on a Wiki.
(http://sf.net/apps/mediawiki/v9fs/index.php).
-Bug reports may be issued through the kernel.org bugzilla
-(http://bugzilla.kernel.org)
+Bug reports are best issued via the mailing list.
For more information on the Plan 9 Operating System check out
http://plan9.bell-labs.com/plan9
@@ -156,11 +159,3 @@ http://plan9.bell-labs.com/plan9
For information on Plan 9 from User Space (Plan 9 applications and libraries
ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9
-
-STATUS
-======
-
-The 2.6 kernel support is working on PPC and x86.
-
-PLEASE USE THE KERNEL BUGZILLA TO REPORT PROBLEMS. (http://bugzilla.kernel.org)
-
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f48e0c6b4c4..b18dd177902 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -10,10 +10,9 @@ be able to use diff(1).
--------------------------- dentry_operations --------------------------
prototypes:
int (*d_revalidate)(struct dentry *, unsigned int);
- int (*d_hash)(const struct dentry *, const struct inode *,
- struct qstr *);
- int (*d_compare)(const struct dentry *, const struct inode *,
- const struct dentry *, const struct inode *,
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
@@ -25,6 +24,7 @@ prototypes:
locking rules:
rename_lock ->d_lock may block rcu-walk
d_revalidate: no no yes (ref-walk) maybe
+d_weak_revalidate:no no yes no
d_hash no no no maybe
d_compare: yes no no maybe
d_delete: no yes no no
@@ -47,6 +47,8 @@ prototypes:
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
+ int (*rename2) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
void * (*follow_link) (struct dentry *, struct nameidata *);
void (*put_link) (struct dentry *, struct nameidata *, void *);
@@ -64,6 +66,7 @@ prototypes:
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
locking rules:
all may block
@@ -77,6 +80,7 @@ mkdir: yes
unlink: yes (both)
rmdir: yes (both) (see below)
rename: yes (all) (see below)
+rename2: yes (all) (see below)
readlink: no
follow_link: no
put_link: no
@@ -91,10 +95,12 @@ removexattr: yes
fiemap: no
update_time: no
atomic_open: yes
+tmpfile: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
- cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
+ cross-directory ->rename() and rename2() has (per-superblock)
+->s_vfs_rename_sem.
See Documentation/filesystems/directory-locking for more detailed discussion
of the locking scheme for directory operations.
@@ -187,16 +193,15 @@ prototypes:
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
- int (*invalidatepage) (struct page *, unsigned long);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
- int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
+ int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
unsigned long *);
int (*migratepage)(struct address_space *, struct page *, struct page *);
int (*launder_page)(struct page *);
- int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+ int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
int (*swap_activate)(struct file *);
int (*swap_deactivate)(struct file *);
@@ -308,8 +313,8 @@ filesystems and by the swapper. The latter will eventually go away. Please,
keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated. It
-returns zero on success. If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
block_invalidatepage() instead.
->releasepage() is called when the kernel is about to try to drop the
@@ -342,25 +347,38 @@ prototypes:
locking rules:
- file_lock_lock may block
+ inode->i_lock may block
fl_copy_lock: yes no
fl_release_private: maybe no
----------------------- lock_manager_operations ---------------------------
prototypes:
int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+ unsigned long (*lm_owner_key)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
int (*lm_grant)(struct file_lock *, struct file_lock *, int);
void (*lm_break)(struct file_lock *); /* break_lease callback */
int (*lm_change)(struct file_lock **, int);
locking rules:
- file_lock_lock may block
-lm_compare_owner: yes no
-lm_notify: yes no
-lm_grant: no no
-lm_break: yes no
-lm_change yes no
+
+ inode->i_lock blocked_lock_lock may block
+lm_compare_owner: yes[1] maybe no
+lm_owner_key yes[1] yes no
+lm_notify: yes yes no
+lm_grant: no no no
+lm_break: yes no no
+lm_change yes no no
+
+[1]: ->lm_compare_owner and ->lm_owner_key are generally called with
+*an* inode->i_lock held. It may not be the i_lock of the inode
+associated with either file_lock argument! This is the case with deadlock
+detection, since the code has to chase down the owners of locks that may
+be entirely unrelated to the one on which the lock is being acquired.
+For deadlock detection however, the blocked_lock_lock is also held. The
+fact that these locks are held ensures that the file_locks do not
+disappear out from under you while doing the comparison or generating an
+owner key.
--------------------------- buffer_head -----------------------------------
prototypes:
@@ -412,7 +430,9 @@ prototypes:
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
- int (*readdir) (struct file *, void *, filldir_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -510,6 +530,7 @@ locking rules:
open: yes
close: yes
fault: yes can return with page locked
+map_pages: yes
page_mkwrite: yes can return with page locked
access: yes
@@ -521,6 +542,15 @@ the page, then ensure it is not already truncated (the page lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
+ ->map_pages() is called when VM asks to map easy accessible pages.
+Filesystem should find and map pages associated with offsets from "pgoff"
+till "max_pgoff". ->map_pages() is called with page table locked and must
+not block. If it's not possible to reach a page without blocking,
+filesystem should skip it. Filesystem should use do_set_pte() to setup
+page table entry. Pointer to entry associated with offset "pgoff" is
+passed in "pte" field in vm_fault structure. Pointers to entries for other
+offsets should be calculated relative to "pte".
+
->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
no truncate/invalidate races, and then return with the page locked. If
@@ -529,7 +559,7 @@ like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
will cause the VM to retry the fault.
->access() is called when get_user_pages() fails in
-acces_process_vm(), typically used to debug a process through
+access_process_vm(), typically used to debug a process through
/proc/pid/mem or ptrace. This function is needed only for
VM_IO | VM_PFNMAP VMAs.
diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt
index 81ac488e375..71b63c2b984 100644
--- a/Documentation/filesystems/affs.txt
+++ b/Documentation/filesystems/affs.txt
@@ -49,6 +49,10 @@ mode=mode Sets the mode flags to the given (octal) value, regardless
This is useful since most of the plain AmigaOS files
will map to 600.
+nofilenametruncate
+ The file system will return an error when filename exceeds
+ standard maximum filename length (30 characters).
+
reserved=num Sets the number of reserved blocks at the start of the
partition to num. You should never need this option.
Default is 2.
@@ -181,9 +185,8 @@ tested, though several hundred MB have been read and written using
this fs. For a most up-to-date list of bugs please consult
fs/affs/Changes.
-Filenames are truncated to 30 characters without warning (this
-can be changed by setting the compile-time option AFFS_NO_TRUNCATE
-in include/linux/amigaffs.h).
+By default, filenames are truncated to 30 characters without warning.
+'nofilenametruncate' mount option can change that behavior.
Case is ignored by the affs in filename matching, but Linux shells
do care about the case. Example (with /wb being an affs mounted fs):
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index 4c95935cbcf..aff22113a98 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -255,7 +255,7 @@ AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
Obtain and release a file descriptor for an autofs managed mount point
path. The open call requires an initialized struct autofs_dev_ioctl with
-the the path field set and the size field adjusted appropriately as well
+the path field set and the size field adjusted appropriately as well
as the arg1 field set to the device number of the autofs mount. The
device number can be obtained from the mount options shown in
/proc/mounts. The close call requires an initialized struct
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index 7671352216f..d11cc2f8077 100644
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -1,8 +1,8 @@
- BTRFS
- =====
+BTRFS
+=====
-Btrfs is a new copy on write filesystem for Linux aimed at
+Btrfs is a copy on write filesystem for Linux aimed at
implementing advanced features while focusing on fault tolerance,
repair and easy administration. Initially developed by Oracle, Btrfs
is licensed under the GPL and open for contribution from anyone.
@@ -34,9 +34,198 @@ The main Btrfs features include:
* Online filesystem defragmentation
+Mount Options
+=============
- MAILING LIST
- ============
+When mounting a btrfs filesystem, the following option are accepted.
+Options with (*) are default options and will not show in the mount options.
+
+ alloc_start=<bytes>
+ Debugging option to force all block allocations above a certain
+ byte threshold on each block device. The value is specified in
+ bytes, optionally with a K, M, or G suffix, case insensitive.
+ Default is 1MB.
+
+ noautodefrag(*)
+ autodefrag
+ Disable/enable auto defragmentation.
+ Auto defragmentation detects small random writes into files and queue
+ them up for the defrag process. Works best for small files;
+ Not well suited for large database workloads.
+
+ check_int
+ check_int_data
+ check_int_print_mask=<value>
+ These debugging options control the behavior of the integrity checking
+ module (the BTRFS_FS_CHECK_INTEGRITY config option required).
+
+ check_int enables the integrity checker module, which examines all
+ block write requests to ensure on-disk consistency, at a large
+ memory and CPU cost.
+
+ check_int_data includes extent data in the integrity checks, and
+ implies the check_int option.
+
+ check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values
+ as defined in fs/btrfs/check-integrity.c, to control the integrity
+ checker module behavior.
+
+ See comments at the top of fs/btrfs/check-integrity.c for more info.
+
+ commit=<seconds>
+ Set the interval of periodic commit, 30 seconds by default. Higher
+ values defer data being synced to permanent storage with obvious
+ consequences when the system crashes. The upper bound is not forced,
+ but a warning is printed if it's more than 300 seconds (5 minutes).
+
+ compress
+ compress=<type>
+ compress-force
+ compress-force=<type>
+ Control BTRFS file data compression. Type may be specified as "zlib"
+ "lzo" or "no" (for no compression, used for remounting). If no type
+ is specified, zlib is used. If compress-force is specified,
+ all files will be compressed, whether or not they compress well.
+ If compression is enabled, nodatacow and nodatasum are disabled.
+
+ degraded
+ Allow mounts to continue with missing devices. A read-write mount may
+ fail with too many devices missing, for example if a stripe member
+ is completely missing.
+
+ device=<devicepath>
+ Specify a device during mount so that ioctls on the control device
+ can be avoided. Especially useful when trying to mount a multi-device
+ setup as root. May be specified multiple times for multiple devices.
+
+ nodiscard(*)
+ discard
+ Disable/enable discard mount option.
+ Discard issues frequent commands to let the block device reclaim space
+ freed by the filesystem.
+ This is useful for SSD devices, thinly provisioned
+ LUNs and virtual machine images, but may have a significant
+ performance impact. (The fstrim command is also available to
+ initiate batch trims from userspace).
+
+ noenospc_debug(*)
+ enospc_debug
+ Disable/enable debugging option to be more verbose in some ENOSPC conditions.
+
+ fatal_errors=<action>
+ Action to take when encountering a fatal error:
+ "bug" - BUG() on a fatal error. This is the default.
+ "panic" - panic() on a fatal error.
+
+ noflushoncommit(*)
+ flushoncommit
+ The 'flushoncommit' mount option forces any data dirtied by a write in a
+ prior transaction to commit as part of the current commit. This makes
+ the committed state a fully consistent view of the file system from the
+ application's perspective (i.e., it includes all completed file system
+ operations). This was previously the behavior only when a snapshot is
+ created.
+
+ inode_cache
+ Enable free inode number caching. Defaults to off due to an overflow
+ problem when the free space crcs don't fit inside a single page.
+
+ max_inline=<bytes>
+ Specify the maximum amount of space, in bytes, that can be inlined in
+ a metadata B-tree leaf. The value is specified in bytes, optionally
+ with a K, M, or G suffix, case insensitive. In practice, this value
+ is limited by the root sector size, with some space unavailable due
+ to leaf headers. For a 4k sectorsize, max inline data is ~3900 bytes.
+
+ metadata_ratio=<value>
+ Specify that 1 metadata chunk should be allocated after every <value>
+ data chunks. Off by default.
+
+ acl(*)
+ noacl
+ Enable/disable support for Posix Access Control Lists (ACLs). See the
+ acl(5) manual page for more information about ACLs.
+
+ barrier(*)
+ nobarrier
+ Enable/disable the use of block layer write barriers. Write barriers
+ ensure that certain IOs make it through the device cache and are on
+ persistent storage. If disabled on a device with a volatile
+ (non-battery-backed) write-back cache, nobarrier option will lead to
+ filesystem corruption on a system crash or power loss.
+
+ datacow(*)
+ nodatacow
+ Enable/disable data copy-on-write for newly created files.
+ Nodatacow implies nodatasum, and disables all compression.
+
+ datasum(*)
+ nodatasum
+ Enable/disable data checksumming for newly created files.
+ Datasum implies datacow.
+
+ treelog(*)
+ notreelog
+ Enable/disable the tree logging used for fsync and O_SYNC writes.
+
+ recovery
+ Enable autorecovery attempts if a bad tree root is found at mount time.
+ Currently this scans a list of several previous tree roots and tries to
+ use the first readable.
+
+ rescan_uuid_tree
+ Force check and rebuild procedure of the UUID tree. This should not
+ normally be needed.
+
+ skip_balance
+ Skip automatic resume of interrupted balance operation after mount.
+ May be resumed with "btrfs balance resume."
+
+ space_cache (*)
+ Enable the on-disk freespace cache.
+ nospace_cache
+ Disable freespace cache loading without clearing the cache.
+ clear_cache
+ Force clearing and rebuilding of the disk space cache if something
+ has gone wrong.
+
+ ssd
+ nossd
+ ssd_spread
+ Options to control ssd allocation schemes. By default, BTRFS will
+ enable or disable ssd allocation heuristics depending on whether a
+ rotational or nonrotational disk is in use. The ssd and nossd options
+ can override this autodetection.
+
+ The ssd_spread mount option attempts to allocate into big chunks
+ of unused space, and may perform better on low-end ssds. ssd_spread
+ implies ssd, enabling all other ssd heuristics as well.
+
+ subvol=<path>
+ Mount subvolume at <path> rather than the root subvolume. <path> is
+ relative to the top level subvolume.
+
+ subvolid=<ID>
+ Mount subvolume specified by an ID number rather than the root subvolume.
+ This allows mounting of subvolumes which are not in the root of the mounted
+ filesystem.
+ You can use "btrfs subvolume list" to see subvolume ID numbers.
+
+ subvolrootid=<objectid> (deprecated)
+ Mount subvolume specified by <objectid> rather than the root subvolume.
+ This allows mounting of subvolumes which are not in the root of the mounted
+ filesystem.
+ You can use "btrfs subvolume show " to see the object ID for a subvolume.
+
+ thread_pool=<number>
+ The number of worker threads to allocate. The default number is equal
+ to the number of CPUs + 2, or 8, whichever is smaller.
+
+ user_subvol_rm_allowed
+ Allow subvolumes to be deleted by a non-root user. Use with caution.
+
+MAILING LIST
+============
There is a Btrfs mailing list hosted on vger.kernel.org. You can
find details on how to subscribe here:
@@ -49,8 +238,8 @@ http://dir.gmane.org/gmane.comp.file-systems.btrfs
- IRC
- ===
+IRC
+===
Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
IRC network.
@@ -68,24 +257,14 @@ available from the git repository at the following location:
These include the following tools:
-mkfs.btrfs: create a filesystem
-
-btrfsctl: control program to create snapshots and subvolumes:
+* mkfs.btrfs: create a filesystem
- mount /dev/sda2 /mnt
- btrfsctl -s new_subvol_name /mnt
- btrfsctl -s snapshot_of_default /mnt/default
- btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
- btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
- ls /mnt
- default snapshot_of_a_snapshot snapshot_of_new_subvol
- new_subvol_name snapshot_of_default
+* btrfs: a single tool to manage the filesystems, refer to the manpage for more details
- Snapshots and subvolumes cannot be deleted right now, but you can
- rm -rf all the files and directories inside them.
+* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem
-btrfsck: do a limited check of the FS extent trees.
+Other tools for specific tasks:
-btrfs-debug-tree: print all of the FS metadata in text form. Example:
+* btrfs-convert: in-place conversion from ext2/3/4 filesystems
- btrfs-debug-tree /dev/sda2 >& big_output_file
+* btrfs-image: dump filesystem metadata for debugging
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
index d78bab9622c..277d1e81067 100644
--- a/Documentation/filesystems/caching/backend-api.txt
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -299,6 +299,15 @@ performed on the denizens of the cache. These are held in a structure of type:
enough space in the cache to permit this.
+ (*) Check coherency state of an object [mandatory]:
+
+ int (*check_consistency)(struct fscache_object *object)
+
+ This method is called to have the cache check the saved auxiliary data of
+ the object against the netfs's idea of the state. 0 should be returned
+ if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS
+ may also be returned.
+
(*) Update object [mandatory]:
int (*update_object)(struct fscache_object *object)
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 97e6c0ecc5e..aed6b94160b 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -29,15 +29,16 @@ This document contains the following sections:
(6) Index registration
(7) Data file registration
(8) Miscellaneous object registration
- (9) Setting the data file size
+ (9) Setting the data file size
(10) Page alloc/read/write
(11) Page uncaching
- (12) Index and data file update
- (13) Miscellaneous cookie operations
- (14) Cookie unregistration
- (15) Index invalidation
- (16) Data file invalidation
- (17) FS-Cache specific page flags.
+ (12) Index and data file consistency
+ (13) Cookie enablement
+ (14) Miscellaneous cookie operations
+ (15) Cookie unregistration
+ (16) Index invalidation
+ (17) Data file invalidation
+ (18) FS-Cache specific page flags.
=============================
@@ -334,7 +335,8 @@ the path to the file:
struct fscache_cookie *
fscache_acquire_cookie(struct fscache_cookie *parent,
const struct fscache_object_def *def,
- void *netfs_data);
+ void *netfs_data,
+ bool enable);
This function creates an index entry in the index represented by parent,
filling in the index entry by calling the operations pointed to by def.
@@ -350,6 +352,10 @@ object needs to be created somewhere down the hierarchy. Furthermore, an index
may be created in several different caches independently at different times.
This is all handled transparently, and the netfs doesn't see any of it.
+A cookie will be created in the disabled state if enabled is false. A cookie
+must be enabled to do anything with it. A disabled cookie can be enabled by
+calling fscache_enable_cookie() (see below).
+
For example, with AFS, a cell would be added to the primary index. This index
entry would have a dependent inode containing a volume location index for the
volume mappings within this cell:
@@ -357,7 +363,7 @@ volume mappings within this cell:
cell->cache =
fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell);
+ cell, true);
Then when a volume location was accessed, it would be entered into the cell's
index and an inode would be allocated that acts as a volume type and hash chain
@@ -366,7 +372,7 @@ combination:
vlocation->cache =
fscache_acquire_cookie(cell->cache,
&afs_vlocation_cache_index_def,
- vlocation);
+ vlocation, true);
And then a particular flavour of volume (R/O for example) could be added to
that index, creating another index for vnodes (AFS inode equivalents):
@@ -374,7 +380,7 @@ that index, creating another index for vnodes (AFS inode equivalents):
volume->cache =
fscache_acquire_cookie(vlocation->cache,
&afs_volume_cache_index_def,
- volume);
+ volume, true);
======================
@@ -388,7 +394,7 @@ the object definition should be something other than index type.
vnode->cache =
fscache_acquire_cookie(volume->cache,
&afs_vnode_cache_object_def,
- vnode);
+ vnode, true);
=================================
@@ -404,7 +410,7 @@ it would be some other type of object such as a data file.
xattr->cache =
fscache_acquire_cookie(vnode->cache,
&afs_xattr_cache_object_def,
- xattr);
+ xattr, true);
Miscellaneous objects might be used to store extended attributes or directory
entries for example.
@@ -433,7 +439,7 @@ to the caller. The attribute adjustment excludes read and write operations.
=====================
-PAGE READ/ALLOC/WRITE
+PAGE ALLOC/READ/WRITE
=====================
And the sixth step is to store and retrieve pages in the cache. There are
@@ -499,7 +505,7 @@ Else if there's a copy of the page resident in the cache:
(*) An argument that's 0 on success or negative for an error code.
If an error occurs, it should be assumed that the page contains no usable
- data.
+ data. fscache_readpages_cancel() may need to be called.
end_io_func() will be called in process context if the read is results in
an error, but it might be called in interrupt context if the read is
@@ -623,6 +629,22 @@ some of the pages being read and some being allocated. Those pages will have
been marked appropriately and will need uncaching.
+CANCELLATION OF UNREAD PAGES
+----------------------------
+
+If one or more pages are passed to fscache_read_or_alloc_pages() but not then
+read from the cache and also not read from the underlying filesystem then
+those pages will need to have any marks and reservations removed. This can be
+done by calling:
+
+ void fscache_readpages_cancel(struct fscache_cookie *cookie,
+ struct list_head *pages);
+
+prior to returning to the caller. The cookie argument should be as passed to
+fscache_read_or_alloc_pages(). Every page in the pages list will be examined
+and any that have PG_fscache set will be uncached.
+
+
==============
PAGE UNCACHING
==============
@@ -690,9 +712,18 @@ written to the cache and for the cache to finish with the page generally. No
error is returned.
-==========================
-INDEX AND DATA FILE UPDATE
-==========================
+===============================
+INDEX AND DATA FILE CONSISTENCY
+===============================
+
+To find out whether auxiliary data for an object is up to data within the
+cache, the following function can be called:
+
+ int fscache_check_consistency(struct fscache_cookie *cookie)
+
+This will call back to the netfs to check whether the auxiliary data associated
+with a cookie is correct. It returns 0 if it is and -ESTALE if it isn't; it
+may also return -ENOMEM and -ERESTARTSYS.
To request an update of the index data for an index or other object, the
following function should be called:
@@ -708,6 +739,47 @@ Note that partial updates may happen automatically at other times, such as when
data blocks are added to a data file object.
+=================
+COOKIE ENABLEMENT
+=================
+
+Cookies exist in one of two states: enabled and disabled. If a cookie is
+disabled, it ignores all attempts to acquire child cookies; check, update or
+invalidate its state; allocate, read or write backing pages - though it is
+still possible to uncache pages and relinquish the cookie.
+
+The initial enablement state is set by fscache_acquire_cookie(), but the cookie
+can be enabled or disabled later. To disable a cookie, call:
+
+ void fscache_disable_cookie(struct fscache_cookie *cookie,
+ bool invalidate);
+
+If the cookie is not already disabled, this locks the cookie against other
+enable and disable ops, marks the cookie as being disabled, discards or
+invalidates any backing objects and waits for cessation of activity on any
+associated object before unlocking the cookie.
+
+All possible failures are handled internally. The caller should consider
+calling fscache_uncache_all_inode_pages() afterwards to make sure all page
+markings are cleared up.
+
+Cookies can be enabled or reenabled with:
+
+ void fscache_enable_cookie(struct fscache_cookie *cookie,
+ bool (*can_enable)(void *data),
+ void *data)
+
+If the cookie is not already enabled, this locks the cookie against other
+enable and disable ops, invokes can_enable() and, if the cookie is not an index
+cookie, will begin the procedure of acquiring backing objects.
+
+The optional can_enable() function is passed the data argument and returns a
+ruling as to whether or not enablement should actually be permitted to begin.
+
+All possible failures are handled internally. The cookie will only be marked
+as enabled if provisional backing objects are allocated.
+
+
===============================
MISCELLANEOUS COOKIE OPERATIONS
===============================
@@ -753,7 +825,7 @@ COOKIE UNREGISTRATION
To get rid of a cookie, this function should be called.
void fscache_relinquish_cookie(struct fscache_cookie *cookie,
- int retire);
+ bool retire);
If retire is non-zero, then the object will be marked for recycling, and all
copies of it will be removed from all active caches in which it is present.
diff --git a/Documentation/filesystems/cifs.txt b/Documentation/filesystems/cifs.txt
deleted file mode 100644
index 49cc923a93e..00000000000
--- a/Documentation/filesystems/cifs.txt
+++ /dev/null
@@ -1,51 +0,0 @@
- This is the client VFS module for the Common Internet File System
- (CIFS) protocol which is the successor to the Server Message Block
- (SMB) protocol, the native file sharing mechanism for most early
- PC operating systems. CIFS is fully supported by current network
- file servers such as Windows 2000, Windows 2003 (including
- Windows XP) as well by Samba (which provides excellent CIFS
- server support for Linux and many other operating systems), so
- this network filesystem client can mount to a wide variety of
- servers. The smbfs module should be used instead of this cifs module
- for mounting to older SMB servers such as OS/2. The smbfs and cifs
- modules can coexist and do not conflict. The CIFS VFS filesystem
- module is designed to work well with servers that implement the
- newer versions (dialects) of the SMB/CIFS protocol such as Samba,
- the program written by Andrew Tridgell that turns any Unix host
- into a SMB/CIFS file server.
-
- The intent of this module is to provide the most advanced network
- file system function for CIFS compliant servers, including better
- POSIX compliance, secure per-user session establishment, high
- performance safe distributed caching (oplock), optional packet
- signing, large files, Unicode support and other internationalization
- improvements. Since both Samba server and this filesystem client support
- the CIFS Unix extensions, the combination can provide a reasonable
- alternative to NFSv4 for fileserving in some Linux to Linux environments,
- not just in Linux to Windows environments.
-
- This filesystem has an optional mount utility (mount.cifs) that can
- be obtained from the project page and installed in the path in the same
- directory with the other mount helpers (such as mount.smbfs).
- Mounting using the cifs filesystem without installing the mount helper
- requires specifying the server's ip address.
-
- For Linux 2.4:
- mount //anything/here /mnt_target -o
- user=username,pass=password,unc=//ip_address_of_server/sharename
-
- For Linux 2.5:
- mount //ip_address_of_server/sharename /mnt_target -o user=username, pass=password
-
-
- For more information on the module see the project page at
-
- http://us1.samba.org/samba/Linux_CIFS_client.html
-
- For more information on CIFS see:
-
- http://www.snia.org/tech_activities/CIFS
-
- or the Samba site:
-
- http://www.samba.org
diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS
new file mode 100644
index 00000000000..ca4a67a0bb1
--- /dev/null
+++ b/Documentation/filesystems/cifs/AUTHORS
@@ -0,0 +1,56 @@
+Original Author
+===============
+Steve French (sfrench@samba.org)
+
+The author wishes to express his appreciation and thanks to:
+Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS
+improvements. Thanks to IBM for allowing me time and test resources to pursue
+this project, to Jim McDonough from IBM (and the Samba Team) for his help, to
+the IBM Linux JFS team for explaining many esoteric Linux filesystem features.
+Jeremy Allison of the Samba team has done invaluable work in adding the server
+side of the original CIFS Unix extensions and reviewing and implementing
+portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank
+Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client)
+for proving years ago that very good smb/cifs clients could be done on Unix-like
+operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John
+Newbigin and others for their work on the Linux smbfs module. Thanks to
+the other members of the Storage Network Industry Association CIFS Technical
+Workgroup for their work specifying this highly complex protocol and finally
+thanks to the Samba team for their technical advice and encouragement.
+
+Patch Contributors
+------------------
+Zwane Mwaikambo
+Andi Kleen
+Amrut Joshi
+Shobhit Dayal
+Sergey Vlasov
+Richard Hughes
+Yury Umanets
+Mark Hamzy (for some of the early cifs IPv6 work)
+Domen Puncer
+Jesper Juhl (in particular for lots of whitespace/formatting cleanup)
+Vince Negri and Dave Stahl (for finding an important caching bug)
+Adrian Bunk (kcalloc cleanups)
+Miklos Szeredi
+Kazeon team for various fixes especially for 2.4 version.
+Asser Ferno (Change Notify support)
+Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
+Gunter Kukkukk (testing and suggestions for support of old servers)
+Igor Mammedov (DFS support)
+Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
+Scott Lovenberg
+
+Test case and Bug Report contributors
+-------------------------------------
+Thanks to those in the community who have submitted detailed bug reports
+and debug of problems they have found: Jochen Dolze, David Blaine,
+Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori,
+Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen,
+Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special
+mention to the Stanford Checker (SWAT) which pointed out many minor
+bugs in error paths. Valuable suggestions also have come from Al Viro
+and Dave Miller.
+
+And thanks to the IBM LTC and Power test teams and SuSE testers for
+finding multiple bugs during excellent stress test runs.
diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES
new file mode 100644
index 00000000000..bc0025cdd1c
--- /dev/null
+++ b/Documentation/filesystems/cifs/CHANGES
@@ -0,0 +1,1065 @@
+Version 1.62
+------------
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
+
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries works). Fix oops with dfs in
+cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
+for OpenOffice when on forcedirectio mount e.g.)
+
+Version 1.60
+-------------
+Fix memory leak in reconnect. Fix oops in DFS mount error path.
+Set s_maxbytes to smaller (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again. Add noforcegid
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
+
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client generated ones by default (mount option "serverino" turned
+on by default if server supports it). Add forceuid and forcegid
+mount options (so that when negotiating unix extensions specifying
+which uid mounted does not immediately force the server's reported
+uids to be overridden). Add support for scope mount parm. Improve
+hard link detection to use same inode for both. Do not set
+read-only dos attribute on directories (for chmod) since Windows
+explorer special cases this attribute bit for directories for
+a different purpose.
+
+Version 1.58
+------------
+Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
+when the UTF-8 string is composed of unusually long (more than 4 byte) converted
+characters. Add support for mounting root of a share which redirects immediately
+to DFS target. Convert string conversion functions from Unicode to more
+accurately mark string length before allocating memory (which may help the
+rare cases where a UTF-8 string is much larger than the UCS2 string that
+we converted from). Fix endianness of the vcnum field used during
+session setup to distinguish multiple mounts to same server from different
+userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
+flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
+
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session. This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts. fsync now sends SMB Flush operation
+to better ensure that we wait for server to write all of the data to
+server disk (not just write it over the network). Add new mount
+parameter to allow user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if server
+fails to support it properly, as with Samba server versions prior to 3.3.2)
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount). Endian convert inode numbers if necessary (makes it easier
+to compare inode numbers on network files from big endian systems).
+
+Version 1.56
+------------
+Add "forcemandatorylock" mount option to allow user to use mandatory
+rather than posix (advisory) byte range locks, even though server would
+support posix byte range locks. Fix query of root inode when prefixpath
+specified and user does not have access to query information about the
+top of the share. Fix problem in 2.6.28 resolving DFS paths to
+Samba servers (worked to Windows). Fix rmdir so that pending search
+(readdir) requests do not get invalid results which include the now
+removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
+when using DFS. Add better file create support to servers which support
+the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
+
+Version 1.55
+------------
+Various fixes to make delete of open files behavior more predictable
+(when delete of an open file fails we mark the file as "delete-on-close"
+in a way that more servers accept, but only if we can first rename the
+file to a temporary name). Add experimental support for more safely
+handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests). Fix case in which a portion of
+data can in some cases not get written to the file on the server before the
+file is closed. Fix DFS parsing to properly handle path consumed field,
+and to handle certain codepage conversions better. Fix mount and
+umount race that can cause oops in mount or umount or reconnect.
+
+Version 1.54
+------------
+Fix premature write failure on congested networks (we would give up
+on EAGAIN from the socket too quickly on large writes).
+Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
+Fix endian problems in acl (mode from/to cifs acl) on bigendian
+architectures. Fix problems with preserving timestamps on copying open
+files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
+on parent directory when server supports Unix Extensions but not POSIX
+create. Update cifs.upcall version to handle new Kerberos sec flags
+(this requires update of cifs.upcall program from Samba). Fix memory leak
+on dns_upcall (resolving DFS referralls). Fix plain text password
+authentication (requires setting SecurityFlags to 0x30030 to enable
+lanman and plain text though). Fix writes to be at correct offset when
+file is open with O_APPEND and file is on a directio (forcediretio) mount.
+Fix bug in rewinding readdir directory searches. Add nodfs mount option.
+
+Version 1.53
+------------
+DFS support added (Microsoft Distributed File System client support needed
+for referrals which enable a hierarchical name space among servers).
+Disable temporary caching of mode bits to servers which do not support
+storing of mode (e.g. Windows servers, when client mounts without cifsacl
+mount option) and add new "dynperm" mount option to enable temporary caching
+of mode (enable old behavior). Fix hang on mount caused when server crashes
+tcp session during negotiate protocol.
+
+Version 1.52
+------------
+Fix oops on second mount to server when null auth is used.
+Enable experimental Kerberos support. Return writebehind errors on flush
+and sync so that events like out of disk space get reported properly on
+cached files. Fix setxattr failure to certain Samba versions. Fix mount
+of second share to disconnected server session (autoreconnect on this).
+Add ability to modify cifs acls for handling chmod (when mounted with
+cifsacl flag). Fix prefixpath path separator so we can handle mounts
+with prefixpaths longer than one directory (one path component) when
+mounted to Windows servers. Fix slow file open when cifsacl
+enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
+
+
+Version 1.51
+------------
+Fix memory leak in statfs when mounted to very old servers (e.g.
+Windows 9x). Add new feature "POSIX open" which allows servers
+which support the current POSIX Extensions to provide better semantics
+(e.g. delete for open files opened with posix open). Take into
+account umask on posix mkdir not just older style mkdir. Add
+ability to mount to IPC$ share (which allows CIFS named pipes to be
+opened, read and written as if they were files). When 1st tree
+connect fails (e.g. due to signing negotiation failure) fix
+leak that causes cifsd not to stop and rmmod to fail to cleanup
+cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
+bigendian architectures. Fix possible memory corruption when
+EAGAIN returned on kern_recvmsg. Return better error if server
+requires packet signing but client has disabled it. When mounted
+with cifsacl mount option - mode bits are approximated based
+on the contents of the ACL of the file or directory. When cifs
+mount helper is missing convert make sure that UNC name
+has backslash (not forward slash) between ip address of server
+and the share name.
+
+Version 1.50
+------------
+Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
+done with "serverino" mount option). Add support for POSIX Unlink
+(helps with certain sharing violation cases when server such as
+Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
+mount option to allow disabling the CIFS Unix Extensions for just
+that mount. Fix hang on spinlock in find_writable_file (race when
+reopening file after session crash). Byte range unlock request to
+windows server could unlock more bytes (on server copy of file)
+than intended if start of unlock request is well before start of
+a previous byte range lock that we issued.
+
+Version 1.49
+------------
+IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
+address after the "ip=" mount option, at least until mount.cifs is fixed to
+handle DNS host to ipv6 name translation). Accept override of uid or gid
+on mount even when Unix Extensions are negotiated (it used to be ignored
+when Unix Extensions were ignored). This allows users to override the
+default uid and gid for files when they are certain that the uids or
+gids on the server do not match those of the client. Make "sec=none"
+mount override username (so that null user connection is attempted)
+to match what documentation said. Support for very large reads, over 127K,
+available to some newer servers (such as Samba 3.0.26 and later but
+note that it also requires setting CIFSMaxBufSize at module install
+time to a larger value which may hurt performance in some cases).
+Make sign option force signing (or fail if server does not support it).
+
+Version 1.48
+------------
+Fix mtime bouncing around from local idea of last write times to remote time.
+Fix hang (in i_size_read) when simultaneous size update of same remote file
+on smp system corrupts sequence number. Do not reread unnecessarily partial page
+(which we are about to overwrite anyway) when writing out file opened rw.
+When DOS attribute of file on non-Unix server's file changes on the server side
+from read-only back to read-write, reflect this change in default file mode
+(we had been leaving a file's mode read-only until the inode were reloaded).
+Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
+when archive dos attribute not set and we are changing mode back to writeable
+on server which does not support the Unix Extensions). Remove read only dos
+attribute on chmod when adding any write permission (ie on any of
+user/group/other (not all of user/group/other ie 0222) when
+mounted to windows. Add support for POSIX MkDir (slight performance
+enhancement and eliminates the network race between the mkdir and set
+path info of the mode).
+
+
+Version 1.47
+------------
+Fix oops in list_del during mount caused by unaligned string.
+Fix file corruption which could occur on some large file
+copies caused by writepages page i/o completion bug.
+Seek to SEEK_END forces check for update of file size for non-cached
+files. Allow file size to be updated on remote extend of locally open,
+non-cached file. Fix reconnect to newer Samba servers (or other servers
+which support the CIFS Unix/POSIX extensions) so that we again tell the
+server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
+Add experimental support for new POSIX Open/Mkdir (which returns
+stat information on the open, and allows setting the mode).
+
+Version 1.46
+------------
+Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps.
+Allow null user to be specified on mount ("username="). Do not return
+EINVAL on readdir when filldir fails due to overwritten blocksize
+(fixes FC problem). Return error in rename 2nd attempt retry (ie report
+if rename by handle also fails, after rename by path fails, we were
+not reporting whether the retry worked or not). Fix NTLMv2 to
+work to Windows servers (mount with option "sec=ntlmv2").
+
+Version 1.45
+------------
+Do not time out lockw calls when using posix extensions. Do not
+time out requests if server still responding reasonably fast
+on requests on other threads. Improve POSIX locking emulation,
+(lock cancel now works, and unlock of merged range works even
+to Windows servers now). Fix oops on mount to lanman servers
+(win9x, os/2 etc.) when null password. Do not send listxattr
+(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux
+problem (instantiate inodes/dentries in right order for readdir).
+
+Version 1.44
+------------
+Rewritten sessionsetup support, including support for legacy SMB
+session setup needed for OS/2 and older servers such as Windows 95 and 98.
+Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst
+so we can do search (ls etc.) to OS/2. Do not send NTCreateX
+or recent levels of FindFirst unless server says it supports NT SMBs
+(instead use legacy equivalents from LANMAN dialect). Fix to allow
+NTLMv2 authentication support (now can use stronger password hashing
+on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
+Allow override of global cifs security flags on mount via "sec=" option(s).
+
+Version 1.43
+------------
+POSIX locking to servers which support CIFS POSIX Extensions
+(disabled by default controlled by proc/fs/cifs/Experimental).
+Handle conversion of long share names (especially Asian languages)
+to Unicode during mount. Fix memory leak in sess struct on reconnect.
+Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on
+cifs open which helps rare case when setpathinfo fails or server does
+not support it.
+
+Version 1.42
+------------
+Fix slow oplock break when mounted to different servers at the same time and
+the tids match and we try to find matching fid on wrong server. Fix read
+looping when signing required by server (2.6.16 kernel only). Fix readdir
+vs. rename race which could cause each to hang. Return . and .. even
+if server does not. Allow searches to skip first three entries and
+begin at any location. Fix oops in find_writeable_file.
+
+Version 1.41
+------------
+Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
+configure stronger authentication. Fix sfu symlinks so they can
+be followed (not just recognized). Fix wraparound of bcc on
+read responses when buffer size over 64K and also fix wrap of
+max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in
+cifs_user_read and cifs_readpages (when EAGAIN on send of smb
+on socket is returned over and over). Add POSIX (advisory) byte range
+locking support (requires server with newest CIFS UNIX Extensions
+to the protocol implemented). Slow down negprot slightly in port 139
+RFC1001 case to give session_init time on buggy servers.
+
+Version 1.40
+------------
+Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
+of readpages by eliminating one extra memcpy. Allow update of file size
+from remote server even if file is open for write as long as mount is
+directio. Recognize share mode security and send NTLM encrypted password
+on tree connect if share mode negotiated.
+
+Version 1.39
+------------
+Defer close of a file handle slightly if pending writes depend on that handle
+(this reduces the EBADF bad file handle errors that can be logged under heavy
+stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2
+Fix SFU style symlinks and mknod needed for servers which do not support the
+CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
+dentries so files that the client sees as deleted but that later get created
+on the server will be recognized. Add client side permission check on setattr.
+Timeout stuck requests better (where server has never responded or sent corrupt
+responses)
+
+Version 1.38
+------------
+Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
+to be smaller at first (but increasing) so large write performance performance
+over GigE is better. Do not hang thread on illegal byte range lock response
+from Windows (Windows can send an RFC1001 size which does not match smb size) by
+allowing an SMBs TCP length to be up to a few bytes longer than it should be.
+wsize and rsize can now be larger than negotiated buffer size if server
+supports large readx/writex, even when directio mount flag not specified.
+Write size will in many cases now be 16K instead of 4K which greatly helps
+file copy performance on lightly loaded networks. Fix oops in dnotify
+when experimental config flag enabled. Make cifsFYI more granular.
+
+Version 1.37
+------------
+Fix readdir caching when unlink removes file in current search buffer,
+and this is followed by a rewind search to just before the deleted entry.
+Do not attempt to set ctime unless atime and/or mtime change requested
+(most servers throw it away anyway). Fix length check of received smbs
+to be more accurate. Fix big endian problem with mapchars mount option,
+and with a field returned by statfs.
+
+Version 1.36
+------------
+Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
+For these older servers, add option for passing netbios name of server in
+on mount (servernetbiosname). Add suspend support for power management, to
+avoid cifsd thread preventing software suspend from working.
+Add mount option for disabling the default behavior of sending byte range lock
+requests to the server (necessary for certain applications which break with
+mandatory lock behavior such as Evolution), and also mount option for
+requesting case insensitive matching for path based requests (requesting
+case sensitive is the default).
+
+Version 1.35
+------------
+Add writepage performance improvements. Fix path name conversions
+for long filenames on mounts which were done with "mapchars" mount option
+specified. Ensure multiplex ids do not collide. Fix case in which
+rmmod can oops if done soon after last unmount. Fix truncated
+search (readdir) output when resume filename was a long filename.
+Fix filename conversion when mapchars mount option was specified and
+filename was a long filename.
+
+Version 1.34
+------------
+Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
+Do not oops if root user kills cifs oplock kernel thread or
+kills the cifsd thread (NB: killing the cifs kernel threads is not
+recommended, unmount and rmmod cifs will kill them when they are
+no longer needed). Fix readdir to ASCII servers (ie older servers
+which do not support Unicode) and also require asterisk.
+Fix out of memory case in which data could be written one page
+off in the page cache.
+
+Version 1.33
+------------
+Fix caching problem, in which readdir of directory containing a file
+which was cached could cause the file's time stamp to be updated
+without invalidating the readahead data (so we could get stale
+file data on the client for that file even as the server copy changed).
+Cleanup response processing so cifsd can not loop when abnormally
+terminated.
+
+
+Version 1.32
+------------
+Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
+transact response for an SMB request and search entry split across two frames.
+Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
+as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
+unless server explicitly claims to support them in CIFS Unix extensions
+POSIX ACL capability bit. Fix packet signing when multiuser mounting with
+different users from the same client to the same server. Fix oops in
+cifs_close. Add mount option for remapping reserved characters in
+filenames (also allow recognizing files with created by SFU which have any
+of these seven reserved characters, except backslash, to be recognized).
+Fix invalid transact2 message (we were sometimes trying to interpret
+oplock breaks as SMB responses). Add ioctl for checking that the
+current uid matches the uid of the mounter (needed by umount.cifs).
+Reduce the number of large buffer allocations in cifs response processing
+(significantly reduces memory pressure under heavy stress with multiple
+processes accessing the same server at the same time).
+
+Version 1.31
+------------
+Fix updates of DOS attributes and time fields so that files on NT4 servers
+do not get marked delete on close. Display sizes of cifs buffer pools in
+cifs stats. Fix oops in unmount when cifsd thread being killed by
+shutdown. Add generic readv/writev and aio support. Report inode numbers
+consistently in readdir and lookup (when serverino mount option is
+specified use the inode number that the server reports - for both lookup
+and readdir, otherwise by default the locally generated inode number is used
+for inodes created in either path since servers are not always able to
+provide unique inode numbers when exporting multiple volumes from under one
+sharename).
+
+Version 1.30
+------------
+Allow new nouser_xattr mount parm to disable xattr support for user namespace.
+Do not flag user_xattr mount parm in dmesg. Retry failures setting file time
+(mostly affects NT4 servers) by retry with handle based network operation.
+Add new POSIX Query FS Info for returning statfs info more accurately.
+Handle passwords with multiple commas in them.
+
+Version 1.29
+------------
+Fix default mode in sysfs of cifs module parms. Remove old readdir routine.
+Fix capabilities flags for large readx so as to allow reads larger than 64K.
+
+Version 1.28
+------------
+Add module init parm for large SMB buffer size (to allow it to be changed
+from its default of 16K) which is especially useful for large file copy
+when mounting with the directio mount option. Fix oops after
+returning from mount when experimental ExtendedSecurity enabled and
+SpnegoNegotiated returning invalid error. Fix case to retry better when
+peek returns from 1 to 3 bytes on socket which should have more data.
+Fixed path based calls (such as cifs lookup) to handle path names
+longer than 530 (now can handle PATH_MAX). Fix pass through authentication
+from Samba server to DC (Samba required dummy LM password).
+
+Version 1.27
+------------
+Turn off DNOTIFY (directory change notification support) by default
+(unless built with the experimental flag) to fix hang with KDE
+file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event
+waiting on an SMB response) in SendReceive when session dies but
+reconnects quickly from another task. Add module init parms for
+minimum number of large and small network buffers in the buffer pools,
+and for the maximum number of simultaneous requests.
+
+Version 1.26
+------------
+Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
+and other POSIX CIFS compliant servers. Fix error mapping for getfacl
+to EOPNOTSUPP when server does not support posix acls on the wire. Fix
+improperly zeroed buffer in CIFS Unix extensions set times call.
+
+Version 1.25
+------------
+Fix internationalization problem in cifs readdir with filenames that map to
+longer UTF-8 strings than the string on the wire was in Unicode. Add workaround
+for readdir to netapp servers. Fix search rewind (seek into readdir to return
+non-consecutive entries). Do not do readdir when server negotiates
+buffer size to small to fit filename. Add support for reading POSIX ACLs from
+the server (add also acl and noacl mount options).
+
+Version 1.24
+------------
+Optionally allow using server side inode numbers, rather than client generated
+ones by specifying mount option "serverino" - this is required for some apps
+to work which double check hardlinked files and have persistent inode numbers.
+
+Version 1.23
+------------
+Multiple bigendian fixes. On little endian systems (for reconnect after
+network failure) fix tcp session reconnect code so we do not try first
+to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
+as directories rather than symlinks because we can do follow link on them.
+
+Version 1.22
+------------
+Add config option to enable XATTR (extended attribute) support, mapping
+xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
+minor fixes pointed out by the Stanford SWAT checker (mostly missing
+or out of order NULL pointer checks in little used error paths).
+
+Version 1.21
+------------
+Add new mount parm to control whether mode check (generic_permission) is done
+on the client. If Unix extensions are enabled and the uids on the client
+and server do not match, client permission checks are meaningless on
+server uids that do not exist on the client (this does not affect the
+normal ACL check which occurs on the server). Fix default uid
+on mknod to match create and mkdir. Add optional mount parm to allow
+override of the default uid behavior (in which the server sets the uid
+and gid of newly created files). Normally for network filesystem mounts
+user want the server to set the uid/gid on newly created files (rather than
+using uid of the client processes you would in a local filesystem).
+
+Version 1.20
+------------
+Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
+info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir
+(in build_wildcard_path_from_dentry). Fix mknod to pass type field
+(block/char/fifo) properly. Remove spurious mount warning log entry when
+credentials passed as mount argument. Set major/minor device number in
+inode for block and char devices when unix extensions enabled.
+
+Version 1.19
+------------
+Fix /proc/fs/cifs/Stats and DebugData display to handle larger
+amounts of return data. Properly limit requests to MAX_REQ (50
+is the usual maximum active multiplex SMB/CIFS requests per server).
+Do not kill cifsd (and thus hurt the other SMB session) when more than one
+session to the same server (but with different userids) exists and one
+of the two user's smb sessions is being removed while leaving the other.
+Do not loop reconnecting in cifsd demultiplex thread when admin
+kills the thread without going through unmount.
+
+Version 1.18
+------------
+Do not rename hardlinked files (since that should be a noop). Flush
+cached write behind data when reopening a file after session abend,
+except when already in write. Grab per socket sem during reconnect
+to avoid oops in sendmsg if overlapping with reconnect. Do not
+reset cached inode file size on readdir for files open for write on
+client.
+
+
+Version 1.17
+------------
+Update number of blocks in file so du command is happier (in Linux a fake
+blocksize of 512 is required for calculating number of blocks in inode).
+Fix prepare write of partial pages to read in data from server if possible.
+Fix race on tcpStatus field between unmount and reconnection code, causing
+cifsd process sometimes to hang around forever. Improve out of memory
+checks in cifs_filldir
+
+Version 1.16
+------------
+Fix incorrect file size in file handle based setattr on big endian hardware.
+Fix oops in build_path_from_dentry when out of memory. Add checks for invalid
+and closing file structs in writepage/partialpagewrite. Add statistics
+for each mounted share (new menuconfig option). Fix endianness problem in
+volume information displayed in /proc/fs/cifs/DebugData (only affects
+affects big endian architectures). Prevent renames while constructing
+path names for open, mkdir and rmdir.
+
+Version 1.15
+------------
+Change to mempools for alloc smb request buffers and multiplex structs
+to better handle low memory problems (and potential deadlocks).
+
+Version 1.14
+------------
+Fix incomplete listings of large directories on Samba servers when Unix
+extensions enabled. Fix oops when smb_buffer can not be allocated. Fix
+rename deadlock when writing out dirty pages at same time.
+
+Version 1.13
+------------
+Fix open of files in which O_CREATE can cause the mode to change in
+some cases. Fix case in which retry of write overlaps file close.
+Fix PPC64 build error. Reduce excessive stack usage in smb password
+hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
+
+Version 1.12
+------------
+Fixes for large file copy, signal handling, socket retry, buffer
+allocation and low memory situations.
+
+Version 1.11
+------------
+Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
+also now allowing support for specifying client netbiosname. NT4 support added.
+
+Version 1.10
+------------
+Fix reconnection (and certain failed mounts) to properly wake up the
+blocked users thread so it does not seem hung (in some cases was blocked
+until the cifs receive timeout expired). Fix spurious error logging
+to kernel log when application with open network files killed.
+
+Version 1.09
+------------
+Fix /proc/fs module unload warning message (that could be logged
+to the kernel log). Fix intermittent failure in connectathon
+test7 (hardlink count not immediately refreshed in case in which
+inode metadata can be incorrectly kept cached when time near zero)
+
+Version 1.08
+------------
+Allow file_mode and dir_mode (specified at mount time) to be enforced
+locally (the server already enforced its own ACLs too) for servers
+that do not report the correct mode (do not support the
+CIFS Unix Extensions).
+
+Version 1.07
+------------
+Fix some small memory leaks in some unmount error paths. Fix major leak
+of cache pages in readpages causing multiple read oriented stress
+testcases (including fsx, and even large file copy) to fail over time.
+
+Version 1.06
+------------
+Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
+This allows files that differ only in case and improves performance of file
+creation and file open to such servers. Fix semaphore conflict which causes
+slow delete of open file to Samba (which unfortunately can cause an oplock
+break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
+
+Version 1.05
+------------
+fixes to cifs_readpages for fsx test case
+
+Version 1.04
+------------
+Fix caching data integrity bug when extending file size especially when no
+oplock on file. Fix spurious logging of valid already parsed mount options
+that are parsed outside of the cifs vfs such as nosuid.
+
+
+Version 1.03
+------------
+Connect to server when port number override not specified, and tcp port
+unitialized. Reset search to restart at correct file when kernel routine
+filldir returns error during large directory searches (readdir).
+
+Version 1.02
+------------
+Fix caching problem when files opened by multiple clients in which
+page cache could contain stale data, and write through did
+not occur often enough while file was still open when read ahead
+(read oplock) not allowed. Treat "sep=" when first mount option
+as an override of comma as the default separator between mount
+options.
+
+Version 1.01
+------------
+Allow passwords longer than 16 bytes. Allow null password string.
+
+Version 1.00
+------------
+Gracefully clean up failed mounts when attempting to mount to servers such as
+Windows 98 that terminate tcp sessions during protocol negotiation. Handle
+embedded commas in mount parsing of passwords.
+
+Version 0.99
+------------
+Invalidate local inode cached pages on oplock break and when last file
+instance is closed so that the client does not continue using stale local
+copy rather than later modified server copy of file. Do not reconnect
+when server drops the tcp session prematurely before negotiate
+protocol response. Fix oops in reopen_file when dentry freed. Allow
+the support for CIFS Unix Extensions to be disabled via proc interface.
+
+Version 0.98
+------------
+Fix hang in commit_write during reconnection of open files under heavy load.
+Fix unload_nls oops in a mount failure path. Serialize writes to same socket
+which also fixes any possible races when cifs signatures are enabled in SMBs
+being sent out of signature sequence number order.
+
+Version 0.97
+------------
+Fix byte range locking bug (endian problem) causing bad offset and
+length.
+
+Version 0.96
+------------
+Fix oops (in send_sig) caused by CIFS unmount code trying to
+wake up the demultiplex thread after it had exited. Do not log
+error on harmless oplock release of closed handle.
+
+Version 0.95
+------------
+Fix unsafe global variable usage and password hash failure on gcc 3.3.1
+Fix problem reconnecting secondary mounts to same server after session
+failure. Fix invalid dentry - race in mkdir when directory gets created
+by another client between the lookup and mkdir.
+
+Version 0.94
+------------
+Fix to list processing in reopen_files. Fix reconnection when server hung
+but tcpip session still alive. Set proper timeout on socket read.
+
+Version 0.93
+------------
+Add missing mount options including iocharset. SMP fixes in write and open.
+Fix errors in reconnecting after TCP session failure. Fix module unloading
+of default nls codepage
+
+Version 0.92
+------------
+Active smb transactions should never go negative (fix double FreeXid). Fix
+list processing in file routines. Check return code on kmalloc in open.
+Fix spinlock usage for SMP.
+
+Version 0.91
+------------
+Fix oops in reopen_files when invalid dentry. drop dentry on server rename
+and on revalidate errors. Fix cases where pid is now tgid. Fix return code
+on create hard link when server does not support them.
+
+Version 0.90
+------------
+Fix scheduling while atomic error in getting inode info on newly created file.
+Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
+
+Version 0.89
+------------
+Fix oops on write to dead tcp session. Remove error log write for case when file open
+O_CREAT but not O_EXCL
+
+Version 0.88
+------------
+Fix non-POSIX behavior on rename of open file and delete of open file by taking
+advantage of trans2 SetFileInfo rename facility if available on target server.
+Retry on ENOSPC and EAGAIN socket errors.
+
+Version 0.87
+------------
+Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix
+allocation size miscalculation. After oplock token lost do not read through
+cache.
+
+Version 0.86
+------------
+Fix oops on empty file readahead. Fix for file size handling for locally cached files.
+
+Version 0.85
+------------
+Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
+during auto reconnection to server after server recovered from failure.
+
+Version 0.84
+------------
+Finish support for Linux 2.5 open/create changes, which removes the
+redundant NTCreate/QPathInfo/close that was sent during file create.
+Enable oplock by default. Enable packet signing by default (needed to
+access many recent Windows servers)
+
+Version 0.83
+------------
+Fix oops when mounting to long server names caused by inverted parms to kmalloc.
+Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
+we will choose a cifs user session (smb uid) that better matches the local
+uid if a) the mount uid does not match the current uid and b) we have another
+session to the same server (ip address) for a different mount which
+matches the current local uid.
+
+Version 0.82
+------------
+Add support for mknod of block or character devices. Fix oplock
+code (distributed caching) to properly send response to oplock
+break from server.
+
+Version 0.81
+------------
+Finish up CIFS packet digital signing for the default
+NTLM security case. This should help Windows 2003
+network interoperability since it is common for
+packet signing to be required now. Fix statfs (stat -f)
+which recently started returning errors due to
+invalid value (-1 instead of 0) being set in the
+struct kstatfs f_ffiles field.
+
+Version 0.80
+-----------
+Fix oops on stopping oplock thread when removing cifs when
+built as module.
+
+Version 0.79
+------------
+Fix mount options for ro (readonly), uid, gid and file and directory mode.
+
+Version 0.78
+------------
+Fix errors displayed on failed mounts to be more understandable.
+Fixed various incorrect or misleading smb to posix error code mappings.
+
+Version 0.77
+------------
+Fix display of NTFS DFS junctions to display as symlinks.
+They are the network equivalent. Fix oops in
+cifs_partialpagewrite caused by missing spinlock protection
+of openfile linked list. Allow writebehind caching errors to
+be returned to the application at file close.
+
+Version 0.76
+------------
+Clean up options displayed in /proc/mounts by show_options to
+be more consistent with other filesystems.
+
+Version 0.75
+------------
+Fix delete of readonly file to Windows servers. Reflect
+presence or absence of read only dos attribute in mode
+bits for servers that do not support CIFS Unix extensions.
+Fix shortened results on readdir of large directories to
+servers supporting CIFS Unix extensions (caused by
+incorrect resume key).
+
+Version 0.74
+------------
+Fix truncate bug (set file size) that could cause hangs e.g. running fsx
+
+Version 0.73
+------------
+unload nls if mount fails.
+
+Version 0.72
+------------
+Add resume key support to search (readdir) code to workaround
+Windows bug. Add /proc/fs/cifs/LookupCacheEnable which
+allows disabling caching of attribute information for
+lookups.
+
+Version 0.71
+------------
+Add more oplock handling (distributed caching code). Remove
+dead code. Remove excessive stack space utilization from
+symlink routines.
+
+Version 0.70
+------------
+Fix oops in get dfs referral (triggered when null path sent in to
+mount). Add support for overriding rsize at mount time.
+
+Version 0.69
+------------
+Fix buffer overrun in readdir which caused intermittent kernel oopses.
+Fix writepage code to release kmap on write data. Allow "-ip=" new
+mount option to be passed in on parameter distinct from the first part
+(server name portion of) the UNC name. Allow override of the
+tcp port of the target server via new mount option "-port="
+
+Version 0.68
+------------
+Fix search handle leak on rewind. Fix setuid and gid so that they are
+reflected in the local inode immediately. Cleanup of whitespace
+to make 2.4 and 2.5 versions more consistent.
+
+
+Version 0.67
+------------
+Fix signal sending so that captive thread (cifsd) exits on umount
+(which was causing the warning in kmem_cache_free of the request buffers
+at rmmod time). This had broken as a sideeffect of the recent global
+kernel change to daemonize. Fix memory leak in readdir code which
+showed up in "ls -R" (and applications that did search rewinding).
+
+Version 0.66
+------------
+Reconnect tids and fids after session reconnection (still do not
+reconnect byte range locks though). Fix problem caching
+lookup information for directory inodes, improving performance,
+especially in deep directory trees. Fix various build warnings.
+
+Version 0.65
+------------
+Finish fixes to commit write for caching/readahead consistency. fsx
+now works to Samba servers. Fix oops caused when readahead
+was interrupted by a signal.
+
+Version 0.64
+------------
+Fix data corruption (in partial page after truncate) that caused fsx to
+fail to Windows servers. Cleaned up some extraneous error logging in
+common error paths. Add generic sendfile support.
+
+Version 0.63
+------------
+Fix memory leak in AllocMidQEntry.
+Finish reconnection logic, so connection with server can be dropped
+(or server rebooted) and the cifs client will reconnect.
+
+Version 0.62
+------------
+Fix temporary socket leak when bad userid or password specified
+(or other SMBSessSetup failure). Increase maximum buffer size to slightly
+over 16K to allow negotiation of up to Samba and Windows server default read
+sizes. Add support for readpages
+
+Version 0.61
+------------
+Fix oops when username not passed in on mount. Extensive fixes and improvements
+to error logging (strip redundant newlines, change debug macros to ensure newline
+passed in and to be more consistent). Fix writepage wrong file handle problem,
+a readonly file handle could be incorrectly used to attempt to write out
+file updates through the page cache to multiply open files. This could cause
+the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
+shares to the same Windows server when using different usernames
+(doing this to Samba servers worked but Windows was rejecting it) - now it is
+possible to use different userids when connecting to the same server from a
+Linux client. Fix oops when treeDisconnect called during unmount on
+previously freed socket.
+
+Version 0.60
+------------
+Fix oops in readpages caused by not setting address space operations in inode in
+rare code path.
+
+Version 0.59
+------------
+Includes support for deleting of open files and renaming over existing files (per POSIX
+requirement). Add readlink support for Windows junction points (directory symlinks).
+
+Version 0.58
+------------
+Changed read and write to go through pagecache. Added additional address space operations.
+Memory mapped operations now working.
+
+Version 0.57
+------------
+Added writepage code for additional memory mapping support. Fixed leak in xids causing
+the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on
+every stat call. Additional formatting cleanup.
+
+Version 0.56
+------------
+Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup.
+
+Version 0.55
+------------
+Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
+Also included a modified version of his fix to protect global list manipulation of
+the smb session and tree connection and mid related global variables.
+
+Version 0.54
+------------
+Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre
+changes to superblock layout. Remove wasteful allocation of smb buffers (now the send
+buffer is reused for responses). Add more oplock handling. Additional minor cleanup.
+
+Version 0.53
+------------
+More stylistic updates to better match kernel style. Add additional statistics
+for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2
+and CIFS Packet Signing enablement.
+
+Version 0.52
+------------
+Replace call to sleep_on with safer wait_on_event.
+Make stylistic changes to better match kernel style recommendations.
+Remove most typedef usage (except for the PDUs themselves).
+
+Version 0.51
+------------
+Update mount so the -unc mount option is no longer required (the ip address can be specified
+in a UNC style device name. Implementation of readpage/writepage started.
+
+Version 0.50
+------------
+Fix intermittent problem with incorrect smb header checking on badly
+fragmented tcp responses
+
+Version 0.49
+------------
+Fixes to setting of allocation size and file size.
+
+Version 0.48
+------------
+Various 2.5.38 fixes. Now works on 2.5.38
+
+Version 0.47
+------------
+Prepare for 2.5 kernel merge. Remove ifdefs.
+
+Version 0.46
+------------
+Socket buffer management fixes. Fix dual free.
+
+Version 0.45
+------------
+Various big endian fixes for hardlinks and symlinks and also for dfs.
+
+Version 0.44
+------------
+Various big endian fixes for servers with Unix extensions such as Samba
+
+Version 0.43
+------------
+Various FindNext fixes for incorrect filenames on large directory searches on big endian
+clients. basic posix file i/o tests now work on big endian machines, not just le
+
+Version 0.42
+------------
+SessionSetup and NegotiateProtocol now work from Big Endian machines.
+Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older
+versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
+
+Version 0.41
+------------
+Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked
+files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
+
+Version 0.40
+------------
+Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
+session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
+Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
+(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
+
+Version 0.38
+------------
+Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable
+it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
+
+Version 0.37
+------------
+Rewrote much of connection and mount/unmount logic to handle bugs with
+multiple uses to same share, multiple users to same server etc.
+
+Version 0.36
+------------
+Fixed major problem with dentry corruption (missing call to dput)
+
+Version 0.35
+------------
+Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
+Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs
+although corresponding function not fully implemented in the vfs yet
+
+Version 0.34
+------------
+Fixed dentry caching bug, misc. cleanup
+
+Version 0.33
+------------
+Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build
+on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
+Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added.
+
+Version 0.32
+------------
+Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
+and tested against Samba 2.2.5
+
+
+Version 0.31
+------------
+1) Fixed lockrange to be correct (it was one byte too short)
+
+2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly
+show range as locked when there is a conflict with an existing lock.
+
+3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
+in most cases. Eventually will offer optional ability to query server for the correct perms.
+
+3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded
+but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
+session)
+
+4) Fixed error logging of valid mount options
+
+5) Removed logging of password field.
+
+6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
+and cleaned them up and made them more consistent with other cifs functions.
+
+7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways
+(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
+nor is the symlink support using the Unix extensions
+
+8) Started adding the readlink and follow_link code
+
+Version 0.3
+-----------
+Initial drop
+
diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README
new file mode 100644
index 00000000000..2d5622f60e1
--- /dev/null
+++ b/Documentation/filesystems/cifs/README
@@ -0,0 +1,753 @@
+The CIFS VFS support for Linux supports many advanced network filesystem
+features such as hierarchical dfs like namespace, hardlinks, locking and more.
+It was designed to comply with the SNIA CIFS Technical Reference (which
+supersedes the 1992 X/Open SMB Standard) as well as to perform best practice
+practical interoperability with Windows 2000, Windows XP, Samba and equivalent
+servers. This code was developed in participation with the Protocol Freedom
+Information Foundation.
+
+Please see
+ http://protocolfreedom.org/ and
+ http://samba.org/samba/PFIF/
+for more details.
+
+
+For questions or bug reports please contact:
+ sfrench@samba.org (sfrench@us.ibm.com)
+
+Build instructions:
+==================
+For Linux 2.4:
+1) Get the kernel source (e.g.from http://www.kernel.org)
+and download the cifs vfs source (see the project page
+at http://us1.samba.org/samba/Linux_CIFS_client.html)
+and change directory into the top of the kernel directory
+then patch the kernel (e.g. "patch -p1 < cifs_24.patch")
+to add the cifs vfs to your kernel configure options if
+it has not already been added (e.g. current SuSE and UL
+users do not need to apply the cifs_24.patch since the cifs vfs is
+already in the kernel configure menu) and then
+mkdir linux/fs/cifs and then copy the current cifs vfs files from
+the cifs download to your kernel build directory e.g.
+
+ cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs
+
+2) make menuconfig (or make xconfig)
+3) select cifs from within the network filesystem choices
+4) save and exit
+5) make dep
+6) make modules (or "make" if CIFS VFS not to be built as a module)
+
+For Linux 2.6:
+1) Download the kernel (e.g. from http://www.kernel.org)
+and change directory into the top of the kernel directory tree
+(e.g. /usr/src/linux-2.5.73)
+2) make menuconfig (or make xconfig)
+3) select cifs from within the network filesystem choices
+4) save and exit
+5) make
+
+
+Installation instructions:
+=========================
+If you have built the CIFS vfs as module (successfully) simply
+type "make modules_install" (or if you prefer, manually copy the file to
+the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o).
+
+If you have built the CIFS vfs into the kernel itself, follow the instructions
+for your distribution on how to install a new kernel (usually you
+would simply type "make install").
+
+If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on
+the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
+similar files reside (usually /sbin). Although the helper software is not
+required, mount.cifs is recommended. Eventually the Samba 3.0 utility program
+"net" may also be helpful since it may someday provide easier mount syntax for
+users who are used to Windows e.g.
+ net use <mount point> <UNC name or cifs URL>
+Note that running the Winbind pam/nss module (logon service) on all of your
+Linux clients is useful in mapping Uids and Gids consistently across the
+domain to the proper network user. The mount.cifs mount helper can be
+trivially built from Samba 3.0 or later source e.g. by executing:
+
+ gcc samba/source/client/mount.cifs.c -o mount.cifs
+
+If cifs is built as a module, then the size and number of network buffers
+and maximum number of simultaneous requests to one server can be configured.
+Changing these from their defaults is not recommended. By executing modinfo
+ modinfo kernel/fs/cifs/cifs.ko
+on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made
+at module initialization time (by running insmod cifs.ko) can be seen.
+
+Allowing User Mounts
+====================
+To permit users to mount and unmount over directories they own is possible
+with the cifs vfs. A way to enable such mounting is to mark the mount.cifs
+utility as suid (e.g. "chmod +s /sbin/mount.cifs). To enable users to
+umount shares they mount requires
+1) mount.cifs version 1.4 or later
+2) an entry for the share in /etc/fstab indicating that a user may
+unmount it e.g.
+//server/usersharename /mnt/username cifs user 0 0
+
+Note that when the mount.cifs utility is run suid (allowing user mounts),
+in order to reduce risks, the "nosuid" mount flag is passed in on mount to
+disallow execution of an suid program mounted on the remote target.
+When mount is executed as root, nosuid is not passed in by default,
+and execution of suid programs on the remote target would be enabled
+by default. This can be changed, as with nfs and other filesystems,
+by simply specifying "nosuid" among the mount options. For user mounts
+though to be able to pass the suid flag to mount requires rebuilding
+mount.cifs with the following flag:
+
+ gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs
+
+There is a corresponding manual page for cifs mounting in the Samba 3.0 and
+later source tree in docs/manpages/mount.cifs.8
+
+Allowing User Unmounts
+======================
+To permit users to ummount directories that they have user mounted (see above),
+the utility umount.cifs may be used. It may be invoked directly, or if
+umount.cifs is placed in /sbin, umount can invoke the cifs umount helper
+(at least for most versions of the umount utility) for umount of cifs
+mounts, unless umount is invoked with -i (which will avoid invoking a umount
+helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked
+as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions
+allow adding entries to a file to the /etc/permissions file to achieve the
+equivalent suid effect). For this utility to succeed the target path
+must be a cifs mount, and the uid of the current user must match the uid
+of the user who mounted the resource.
+
+Also note that the customary way of allowing user mounts and unmounts is
+(instead of using mount.cifs and unmount.cifs as suid) to add a line
+to the file /etc/fstab for each //server/share you wish to mount, but
+this can become unwieldy when potential mount targets include many
+or unpredictable UNC names.
+
+Samba Considerations
+====================
+To get the maximum benefit from the CIFS VFS, we recommend using a server that
+supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or
+Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers.
+Note that uid, gid and file permissions will display default values if you do
+not have a server that supports the Unix extensions for CIFS (such as Samba
+2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add
+the line:
+
+ unix extensions = yes
+
+to your smb.conf file on the server. Note that the following smb.conf settings
+are also useful (on the Samba server) when the majority of clients are Unix or
+Linux:
+
+ case sensitive = yes
+ delete readonly = yes
+ ea support = yes
+
+Note that server ea support is required for supporting xattrs from the Linux
+cifs client, and that EA support is present in later versions of Samba (e.g.
+3.0.6 and later (also EA support works in all versions of Windows, at least to
+shares on NTFS filesystems). Extended Attribute (xattr) support is an optional
+feature of most Linux filesystems which may require enabling via
+make menuconfig. Client support for extended attributes (user xattr) can be
+disabled on a per-mount basis by specifying "nouser_xattr" on mount.
+
+The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers
+version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and
+then POSIX support in the CIFS configuration options when building the cifs
+module. POSIX ACL support can be disabled on a per mount basic by specifying
+"noacl" on mount.
+
+Some administrators may want to change Samba's smb.conf "map archive" and
+"create mask" parameters from the default. Unless the create mask is changed
+newly created files can end up with an unnecessarily restrictive default mode,
+which may not be what you want, although if the CIFS Unix extensions are
+enabled on the server and client, subsequent setattr calls (e.g. chmod) can
+fix the mode. Note that creating special devices (mknod) remotely
+may require specifying a mkdev function to Samba if you are not using
+Samba 3.0.6 or later. For more information on these see the manual pages
+("man smb.conf") on the Samba server system. Note that the cifs vfs,
+unlike the smbfs vfs, does not read the smb.conf on the client system
+(the few optional settings are passed in on mount via -o parameters instead).
+Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete
+open files (required for strict POSIX compliance). Windows Servers already
+supported this feature. Samba server does not allow symlinks that refer to files
+outside of the share, so in Samba versions prior to 3.0.6, most symlinks to
+files with absolute paths (ie beginning with slash) such as:
+ ln -s /mnt/foo bar
+would be forbidden. Samba 3.0.6 server or later includes the ability to create
+such symlinks safely by converting unsafe symlinks (ie symlinks to server
+files that are outside of the share) to a samba specific format on the server
+that is ignored by local server applications and non-cifs clients and that will
+not be traversed by the Samba server). This is opaque to the Linux client
+application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or
+later, but only for remote clients using the CIFS Unix extensions, and will
+be invisbile to Windows clients and typically will not affect local
+applications running on the same server as Samba.
+
+Use instructions:
+================
+Once the CIFS VFS support is built into the kernel or installed as a module
+(cifs.o), you can use mount syntax like the following to access Samba or Windows
+servers:
+
+ mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword
+
+Before -o the option -v may be specified to make the mount.cifs
+mount helper display the mount steps more verbosely.
+After -o the following commonly used cifs vfs specific options
+are supported:
+
+ user=<username>
+ pass=<password>
+ domain=<domain name>
+
+Other cifs mount options are described below. Use of TCP names (in addition to
+ip addresses) is available if the mount helper (mount.cifs) is installed. If
+you do not trust the server to which are mounted, or if you do not have
+cifs signing enabled (and the physical network is insecure), consider use
+of the standard mount options "noexec" and "nosuid" to reduce the risk of
+running an altered binary on your local system (downloaded from a hostile server
+or altered by a hostile router).
+
+Although mounting using format corresponding to the CIFS URL specification is
+not possible in mount.cifs yet, it is possible to use an alternate format
+for the server and sharename (which is somewhat similar to NFS style mount
+syntax) instead of the more widely used UNC format (i.e. \\server\share):
+ mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd
+
+When using the mount helper mount.cifs, passwords may be specified via alternate
+mechanisms, instead of specifying it after -o using the normal "pass=" syntax
+on the command line:
+1) By including it in a credential file. Specify credentials=filename as one
+of the mount options. Credential files contain two lines
+ username=someuser
+ password=your_password
+2) By specifying the password in the PASSWD environment variable (similarly
+the user name can be taken from the USER environment variable).
+3) By specifying the password in a file by name via PASSWD_FILE
+4) By specifying the password in a file by file descriptor via PASSWD_FD
+
+If no password is provided, mount.cifs will prompt for password entry
+
+Restrictions
+============
+Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC
+1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a
+problem as most servers support this.
+
+Valid filenames differ between Windows and Linux. Windows typically restricts
+filenames which contain certain reserved characters (e.g.the character :
+which is used to delimit the beginning of a stream name by Windows), while
+Linux allows a slightly wider set of valid characters in filenames. Windows
+servers can remap such characters when an explicit mapping is specified in
+the Server's registry. Samba starting with version 3.10 will allow such
+filenames (ie those which contain valid Linux characters, which normally
+would be forbidden for Windows/CIFS semantics) as long as the server is
+configured for Unix Extensions (and the client has not disabled
+/proc/fs/cifs/LinuxExtensionsEnabled).
+
+
+CIFS VFS Mount Options
+======================
+A partial list of the supported mount options follows:
+ user The user name to use when trying to establish
+ the CIFS session.
+ password The user password. If the mount helper is
+ installed, the user will be prompted for password
+ if not supplied.
+ ip The ip address of the target server
+ unc The target server Universal Network Name (export) to
+ mount.
+ domain Set the SMB/CIFS workgroup name prepended to the
+ username during CIFS session establishment
+ forceuid Set the default uid for inodes to the uid
+ passed in on mount. For mounts to servers
+ which do support the CIFS Unix extensions, such as a
+ properly configured Samba server, the server provides
+ the uid, gid and mode so this parameter should not be
+ specified unless the server and clients uid and gid
+ numbering differ. If the server and client are in the
+ same domain (e.g. running winbind or nss_ldap) and
+ the server supports the Unix Extensions then the uid
+ and gid can be retrieved from the server (and uid
+ and gid would not have to be specifed on the mount.
+ For servers which do not support the CIFS Unix
+ extensions, the default uid (and gid) returned on lookup
+ of existing files will be the uid (gid) of the person
+ who executed the mount (root, except when mount.cifs
+ is configured setuid for user mounts) unless the "uid="
+ (gid) mount option is specified. Also note that permission
+ checks (authorization checks) on accesses to a file occur
+ at the server, but there are cases in which an administrator
+ may want to restrict at the client as well. For those
+ servers which do not report a uid/gid owner
+ (such as Windows), permissions can also be checked at the
+ client, and a crude form of client side permission checking
+ can be enabled by specifying file_mode and dir_mode on
+ the client. (default)
+ forcegid (similar to above but for the groupid instead of uid) (default)
+ noforceuid Fill in file owner information (uid) by requesting it from
+ the server if possible. With this option, the value given in
+ the uid= option (on mount) will only be used if the server
+ can not support returning uids on inodes.
+ noforcegid (similar to above but for the group owner, gid, instead of uid)
+ uid Set the default uid for inodes, and indicate to the
+ cifs kernel driver which local user mounted. If the server
+ supports the unix extensions the default uid is
+ not used to fill in the owner fields of inodes (files)
+ unless the "forceuid" parameter is specified.
+ gid Set the default gid for inodes (similar to above).
+ file_mode If CIFS Unix extensions are not supported by the server
+ this overrides the default mode for file inodes.
+ fsc Enable local disk caching using FS-Cache (off by default). This
+ option could be useful to improve performance on a slow link,
+ heavily loaded server and/or network where reading from the
+ disk is faster than reading from the server (over the network).
+ This could also impact scalability positively as the
+ number of calls to the server are reduced. However, local
+ caching is not suitable for all workloads for e.g. read-once
+ type workloads. So, you need to consider carefully your
+ workload/scenario before using this option. Currently, local
+ disk caching is functional for CIFS files opened as read-only.
+ dir_mode If CIFS Unix extensions are not supported by the server
+ this overrides the default mode for directory inodes.
+ port attempt to contact the server on this tcp port, before
+ trying the usual ports (port 445, then 139).
+ iocharset Codepage used to convert local path names to and from
+ Unicode. Unicode is used by default for network path
+ names if the server supports it. If iocharset is
+ not specified then the nls_default specified
+ during the local client kernel build will be used.
+ If server does not support Unicode, this parameter is
+ unused.
+ rsize default read size (usually 16K). The client currently
+ can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize
+ defaults to 16K and may be changed (from 8K to the maximum
+ kmalloc size allowed by your kernel) at module install time
+ for cifs.ko. Setting CIFSMaxBufSize to a very large value
+ will cause cifs to use more memory and may reduce performance
+ in some cases. To use rsize greater than 127K (the original
+ cifs protocol maximum) also requires that the server support
+ a new Unix Capability flag (for very large read) which some
+ newer servers (e.g. Samba 3.0.26 or later) do. rsize can be
+ set from a minimum of 2048 to a maximum of 130048 (127K or
+ CIFSMaxBufSize, whichever is smaller)
+ wsize default write size (default 57344)
+ maximum wsize currently allowed by CIFS is 57344 (fourteen
+ 4096 byte pages)
+ actimeo=n attribute cache timeout in seconds (default 1 second).
+ After this timeout, the cifs client requests fresh attribute
+ information from the server. This option allows to tune the
+ attribute cache timeout to suit the workload needs. Shorter
+ timeouts mean better the cache coherency, but increased number
+ of calls to the server. Longer timeouts mean reduced number
+ of calls to the server at the expense of less stricter cache
+ coherency checks (i.e. incorrect attribute cache for a short
+ period of time).
+ rw mount the network share read-write (note that the
+ server may still consider the share read-only)
+ ro mount network share read-only
+ version used to distinguish different versions of the
+ mount helper utility (not typically needed)
+ sep if first mount option (after the -o), overrides
+ the comma as the separator between the mount
+ parms. e.g.
+ -o user=myname,password=mypassword,domain=mydom
+ could be passed instead with period as the separator by
+ -o sep=.user=myname.password=mypassword.domain=mydom
+ this might be useful when comma is contained within username
+ or password or domain. This option is less important
+ when the cifs mount helper cifs.mount (version 1.1 or later)
+ is used.
+ nosuid Do not allow remote executables with the suid bit
+ program to be executed. This is only meaningful for mounts
+ to servers such as Samba which support the CIFS Unix Extensions.
+ If you do not trust the servers in your network (your mount
+ targets) it is recommended that you specify this option for
+ greater security.
+ exec Permit execution of binaries on the mount.
+ noexec Do not permit execution of binaries on the mount.
+ dev Recognize block devices on the remote mount.
+ nodev Do not recognize devices on the remote mount.
+ suid Allow remote files on this mountpoint with suid enabled to
+ be executed (default for mounts when executed as root,
+ nosuid is default for user mounts).
+ credentials Although ignored by the cifs kernel component, it is used by
+ the mount helper, mount.cifs. When mount.cifs is installed it
+ opens and reads the credential file specified in order
+ to obtain the userid and password arguments which are passed to
+ the cifs vfs.
+ guest Although ignored by the kernel component, the mount.cifs
+ mount helper will not prompt the user for a password
+ if guest is specified on the mount options. If no
+ password is specified a null password will be used.
+ perm Client does permission checks (vfs_permission check of uid
+ and gid of the file against the mode and desired operation),
+ Note that this is in addition to the normal ACL check on the
+ target machine done by the server software.
+ Client permission checking is enabled by default.
+ noperm Client does not do permission checks. This can expose
+ files on this mount to access by other users on the local
+ client system. It is typically only needed when the server
+ supports the CIFS Unix Extensions but the UIDs/GIDs on the
+ client and server system do not match closely enough to allow
+ access by the user doing the mount, but it may be useful with
+ non CIFS Unix Extension mounts for cases in which the default
+ mode is specified on the mount but is not to be enforced on the
+ client (e.g. perhaps when MultiUserMount is enabled)
+ Note that this does not affect the normal ACL check on the
+ target machine done by the server software (of the server
+ ACL against the user name provided at mount time).
+ serverino Use server's inode numbers instead of generating automatically
+ incrementing inode numbers on the client. Although this will
+ make it easier to spot hardlinked files (as they will have
+ the same inode numbers) and inode numbers may be persistent,
+ note that the server does not guarantee that the inode numbers
+ are unique if multiple server side mounts are exported under a
+ single share (since inode numbers on the servers might not
+ be unique if multiple filesystems are mounted under the same
+ shared higher level directory). Note that some older
+ (e.g. pre-Windows 2000) do not support returning UniqueIDs
+ or the CIFS Unix Extensions equivalent and for those
+ this mount option will have no effect. Exporting cifs mounts
+ under nfsd requires this mount option on the cifs mount.
+ This is now the default if server supports the
+ required network operation.
+ noserverino Client generates inode numbers (rather than using the actual one
+ from the server). These inode numbers will vary after
+ unmount or reboot which can confuse some applications,
+ but not all server filesystems support unique inode
+ numbers.
+ setuids If the CIFS Unix extensions are negotiated with the server
+ the client will attempt to set the effective uid and gid of
+ the local process on newly created files, directories, and
+ devices (create, mkdir, mknod). If the CIFS Unix Extensions
+ are not negotiated, for newly created files and directories
+ instead of using the default uid and gid specified on
+ the mount, cache the new file's uid and gid locally which means
+ that the uid for the file can change when the inode is
+ reloaded (or the user remounts the share).
+ nosetuids The client will not attempt to set the uid and gid on
+ on newly created files, directories, and devices (create,
+ mkdir, mknod) which will result in the server setting the
+ uid and gid to the default (usually the server uid of the
+ user who mounted the share). Letting the server (rather than
+ the client) set the uid and gid is the default. If the CIFS
+ Unix Extensions are not negotiated then the uid and gid for
+ new files will appear to be the uid (gid) of the mounter or the
+ uid (gid) parameter specified on the mount.
+ netbiosname When mounting to servers via port 139, specifies the RFC1001
+ source name to use to represent the client netbios machine
+ name when doing the RFC1001 netbios session initialize.
+ direct Do not do inode data caching on files opened on this mount.
+ This precludes mmapping files on this mount. In some cases
+ with fast networks and little or no caching benefits on the
+ client (e.g. when the application is doing large sequential
+ reads bigger than page size without rereading the same data)
+ this can provide better performance than the default
+ behavior which caches reads (readahead) and writes
+ (writebehind) through the local Linux client pagecache
+ if oplock (caching token) is granted and held. Note that
+ direct allows write operations larger than page size
+ to be sent to the server.
+ strictcache Use for switching on strict cache mode. In this mode the
+ client read from the cache all the time it has Oplock Level II,
+ otherwise - read from the server. All written data are stored
+ in the cache, but if the client doesn't have Exclusive Oplock,
+ it writes the data to the server.
+ rwpidforward Forward pid of a process who opened a file to any read or write
+ operation on that file. This prevent applications like WINE
+ from failing on read and write if we use mandatory brlock style.
+ acl Allow setfacl and getfacl to manage posix ACLs if server
+ supports them. (default)
+ noacl Do not allow setfacl and getfacl calls on this mount
+ user_xattr Allow getting and setting user xattrs (those attributes whose
+ name begins with "user." or "os2.") as OS/2 EAs (extended
+ attributes) to the server. This allows support of the
+ setfattr and getfattr utilities. (default)
+ nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
+ mapchars Translate six of the seven reserved characters (not backslash)
+ *?<>|:
+ to the remap range (above 0xF000), which also
+ allows the CIFS client to recognize files created with
+ such characters by Windows's POSIX emulation. This can
+ also be useful when mounting to most versions of Samba
+ (which also forbids creating and opening files
+ whose names contain any of these seven characters).
+ This has no effect if the server does not support
+ Unicode on the wire.
+ nomapchars Do not translate any of these seven characters (default).
+ nocase Request case insensitive path name matching (case
+ sensitive is the default if the server supports it).
+ (mount option "ignorecase" is identical to "nocase")
+ posixpaths If CIFS Unix extensions are supported, attempt to
+ negotiate posix path name support which allows certain
+ characters forbidden in typical CIFS filenames, without
+ requiring remapping. (default)
+ noposixpaths If CIFS Unix extensions are supported, do not request
+ posix path name support (this may cause servers to
+ reject creatingfile with certain reserved characters).
+ nounix Disable the CIFS Unix Extensions for this mount (tree
+ connection). This is rarely needed, but it may be useful
+ in order to turn off multiple settings all at once (ie
+ posix acls, posix locks, posix paths, symlink support
+ and retrieving uids/gids/mode from the server) or to
+ work around a bug in server which implement the Unix
+ Extensions.
+ nobrl Do not send byte range lock requests to the server.
+ This is necessary for certain applications that break
+ with cifs style mandatory byte range locks (and most
+ cifs servers do not yet support requesting advisory
+ byte range locks).
+ forcemandatorylock Even if the server supports posix (advisory) byte range
+ locking, send only mandatory lock requests. For some
+ (presumably rare) applications, originally coded for
+ DOS/Windows, which require Windows style mandatory byte range
+ locking, they may be able to take advantage of this option,
+ forcing the cifs client to only send mandatory locks
+ even if the cifs server would support posix advisory locks.
+ "forcemand" is accepted as a shorter form of this mount
+ option.
+ nostrictsync If this mount option is set, when an application does an
+ fsync call then the cifs client does not send an SMB Flush
+ to the server (to force the server to write all dirty data
+ for this file immediately to disk), although cifs still sends
+ all dirty (cached) file data to the server and waits for the
+ server to respond to the write. Since SMB Flush can be
+ very slow, and some servers may be reliable enough (to risk
+ delaying slightly flushing the data to disk on the server),
+ turning on this option may be useful to improve performance for
+ applications that fsync too much, at a small risk of server
+ crash. If this mount option is not set, by default cifs will
+ send an SMB flush request (and wait for a response) on every
+ fsync call.
+ nodfs Disable DFS (global name space support) even if the
+ server claims to support it. This can help work around
+ a problem with parsing of DFS paths with Samba server
+ versions 3.0.24 and 3.0.25.
+ remount remount the share (often used to change from ro to rw mounts
+ or vice versa)
+ cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for
+ the file. (EXPERIMENTAL)
+ servern Specify the server 's netbios name (RFC1001 name) to use
+ when attempting to setup a session to the server.
+ This is needed for mounting to some older servers (such
+ as OS/2 or Windows 98 and Windows ME) since they do not
+ support a default server name. A server name can be up
+ to 15 characters long and is usually uppercased.
+ sfu When the CIFS Unix Extensions are not negotiated, attempt to
+ create device files and fifos in a format compatible with
+ Services for Unix (SFU). In addition retrieve bits 10-12
+ of the mode via the SETFILEBITS extended attribute (as
+ SFU does). In the future the bottom 9 bits of the
+ mode also will be emulated using queries of the security
+ descriptor (ACL).
+ mfsymlinks Enable support for Minshall+French symlinks
+ (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+ This option is ignored when specified together with the
+ 'sfu' option. Minshall+French symlinks are used even if
+ the server supports the CIFS Unix Extensions.
+ sign Must use packet signing (helps avoid unwanted data modification
+ by intermediate systems in the route). Note that signing
+ does not work with lanman or plaintext authentication.
+ seal Must seal (encrypt) all data on this mounted share before
+ sending on the network. Requires support for Unix Extensions.
+ Note that this differs from the sign mount option in that it
+ causes encryption of data sent over this mounted share but other
+ shares mounted to the same server are unaffected.
+ locallease This option is rarely needed. Fcntl F_SETLEASE is
+ used by some applications such as Samba and NFSv4 server to
+ check to see whether a file is cacheable. CIFS has no way
+ to explicitly request a lease, but can check whether a file
+ is cacheable (oplocked). Unfortunately, even if a file
+ is not oplocked, it could still be cacheable (ie cifs client
+ could grant fcntl leases if no other local processes are using
+ the file) for cases for example such as when the server does not
+ support oplocks and the user is sure that the only updates to
+ the file will be from this client. Specifying this mount option
+ will allow the cifs client to check for leases (only) locally
+ for files which are not oplocked instead of denying leases
+ in that case. (EXPERIMENTAL)
+ sec Security mode. Allowed values are:
+ none attempt to connection as a null user (no name)
+ krb5 Use Kerberos version 5 authentication
+ krb5i Use Kerberos authentication and packet signing
+ ntlm Use NTLM password hashing (default)
+ ntlmi Use NTLM password hashing with signing (if
+ /proc/fs/cifs/PacketSigningEnabled on or if
+ server requires signing also can be the default)
+ ntlmv2 Use NTLMv2 password hashing
+ ntlmv2i Use NTLMv2 password hashing with packet signing
+ lanman (if configured in kernel config) use older
+ lanman hash
+hard Retry file operations if server is not responding
+soft Limit retries to unresponsive servers (usually only
+ one retry) before returning an error. (default)
+
+The mount.cifs mount helper also accepts a few mount options before -o
+including:
+
+ -S take password from stdin (equivalent to setting the environment
+ variable "PASSWD_FD=0"
+ -V print mount.cifs version
+ -? display simple usage information
+
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
+module can be displayed via modinfo.
+
+Misc /proc/fs/cifs Flags and Debug Info
+=======================================
+Informational pseudo-files:
+DebugData Displays information about active CIFS sessions and
+ shares, features enabled as well as the cifs.ko
+ version.
+Stats Lists summary resource usage information as well as per
+ share statistics, if CONFIG_CIFS_STATS in enabled
+ in the kernel configuration.
+
+Configuration pseudo-files:
+PacketSigningEnabled If set to one, cifs packet signing is enabled
+ and will be used if the server requires
+ it. If set to two, cifs packet signing is
+ required even if the server considers packet
+ signing optional. (default 1)
+SecurityFlags Flags which control security negotiation and
+ also packet signing. Authentication (may/must)
+ flags (e.g. for NTLM and/or NTLMv2) may be combined with
+ the signing flags. Specifying two different password
+ hashing mechanisms (as "must use") on the other hand
+ does not make much sense. Default flags are
+ 0x07007
+ (NTLM, NTLMv2 and packet signing allowed). The maximum
+ allowable flags if you want to allow mounts to servers
+ using weaker password hashes is 0x37037 (lanman,
+ plaintext, ntlm, ntlmv2, signing allowed). Some
+ SecurityFlags require the corresponding menuconfig
+ options to be enabled (lanman and plaintext require
+ CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
+ plaintext authentication currently requires also
+ enabling lanman authentication in the security flags
+ because the cifs module only supports sending
+ laintext passwords using the older lanman dialect
+ form of the session setup SMB. (e.g. for authentication
+ using plain text passwords, set the SecurityFlags
+ to 0x30030):
+
+ may use packet signing 0x00001
+ must use packet signing 0x01001
+ may use NTLM (most common password hash) 0x00002
+ must use NTLM 0x02002
+ may use NTLMv2 0x00004
+ must use NTLMv2 0x04004
+ may use Kerberos security 0x00008
+ must use Kerberos 0x08008
+ may use lanman (weak) password hash 0x00010
+ must use lanman password hash 0x10010
+ may use plaintext passwords 0x00020
+ must use plaintext passwords 0x20020
+ (reserved for future packet encryption) 0x00040
+
+cifsFYI If set to non-zero value, additional debug information
+ will be logged to the system error log. This field
+ contains three flags controlling different classes of
+ debugging entries. The maximum value it can be set
+ to is 7 which enables all debugging points (default 0).
+ Some debugging statements are not compiled into the
+ cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+ kernel configuration. cifsFYI may be set to one or
+ nore of the following flags (7 sets them all):
+
+ log cifs informational messages 0x01
+ log return codes from cifs entry points 0x02
+ log slow responses (ie which take longer than 1 second)
+ CONFIG_CIFS_STATS2 must be enabled in .config 0x04
+
+
+traceSMB If set to one, debug information is logged to the
+ system error log with the start of smb requests
+ and responses (default 0)
+LookupCacheEnable If set to one, inode information is kept cached
+ for one second improving performance of lookups
+ (default 1)
+OplockEnabled If set to one, safe distributed caching enabled.
+ (default 1)
+LinuxExtensionsEnabled If set to one then the client will attempt to
+ use the CIFS "UNIX" extensions which are optional
+ protocol enhancements that allow CIFS servers
+ to return accurate UID/GID information as well
+ as support symbolic links. If you use servers
+ such as Samba that support the CIFS Unix
+ extensions but do not want to use symbolic link
+ support and want to map the uid and gid fields
+ to values supplied at mount (rather than the
+ actual values, then set this to zero. (default 1)
+
+These experimental features and tracing can be enabled by changing flags in
+/proc/fs/cifs (after the cifs module has been installed or built into the
+kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable
+tracing to the kernel message log type:
+
+ echo 7 > /proc/fs/cifs/cifsFYI
+
+cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
+logging of various informational messages. 2 enables logging of non-zero
+SMB return codes while 4 enables logging of requests that take longer
+than one second to complete (except for byte range lock requests).
+Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
+source code (typically by setting it in the beginning of cifsglob.h),
+and setting it to seven enables all three. Finally, tracing
+the start of smb requests and responses can be enabled via:
+
+ echo 1 > /proc/fs/cifs/traceSMB
+
+Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
+if the kernel was configured with cifs statistics enabled. The statistics
+represent the number of successful (ie non-zero return code from the server)
+SMB responses to some of the more common commands (open, delete, mkdir etc.).
+Also recorded is the total bytes read and bytes written to the server for
+that share. Note that due to client caching effects this can be less than the
+number of bytes read and written by the application running on the client.
+The statistics for the number of total SMBs and oplock breaks are different in
+that they represent all for that share, not just those for which the server
+returned success.
+
+Also note that "cat /proc/fs/cifs/DebugData" will display information about
+the active sessions and the shares that are mounted.
+
+Enabling Kerberos (extended security) works but requires version 1.2 or later
+of the helper program cifs.upcall to be present and to be configured in the
+/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
+project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
+require this helper. Note that NTLMv2 security (which does not require the
+cifs.upcall helper program), instead of using Kerberos, is sufficient for
+some use cases.
+
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
+names which begin with host names (rather than IP addresses) requires
+a user space helper (such as cifs.upcall) to be present in order to
+translate host names to ip address, and the user space helper must also
+be configured in the file /etc/request-key.conf. Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
+
+To use cifs Kerberos and DFS support, the Linux keyutils package should be
+installed and something like the following lines should be added to the
+/etc/request-key.conf file:
+
+create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
+create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+
+CIFS kernel module parameters
+=============================
+These module parameters can be specified or modified either during the time of
+module loading or during the runtime by using the interface
+ /proc/module/cifs/parameters/<param>
+
+i.e. echo "value" > /sys/module/cifs/parameters/<param>
+
+1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+ [Y/y/1]. To disable use any of [N/n/0].
+
diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO
new file mode 100644
index 00000000000..355abcdcda9
--- /dev/null
+++ b/Documentation/filesystems/cifs/TODO
@@ -0,0 +1,129 @@
+Version 1.53 May 20, 2008
+
+A Partial List of Missing Features
+==================================
+
+Contributions are welcome. There are plenty of opportunities
+for visible, important contributions to this module. Here
+is a partial list of the known problems and missing features:
+
+a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
+so that these operations can be supported to Windows servers
+
+b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
+SecurityDescriptors
+
+c) Better pam/winbind integration (e.g. to handle uid mapping
+better)
+
+d) Cleanup now unneeded SessSetup code in
+fs/cifs/connect.c and add back in NTLMSSP code if any servers
+need it
+
+e) fix NTLMv2 signing when two mounts with different users to same
+server.
+
+f) Directory entry caching relies on a 1 second timer, rather than
+using FindNotify or equivalent. - (started)
+
+g) quota support (needs minor kernel change since quota calls
+to make it to network filesystems or deviceless filesystems)
+
+h) investigate sync behavior (including syncpage) and check
+for proper behavior of intr/nointr
+
+i) improve support for very old servers (OS/2 and Win9x for example)
+Including support for changing the time remotely (utimes command).
+
+j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
+extra copy in/out of the socket buffers in some cases.
+
+k) Better optimize open (and pathbased setfilesize) to reduce the
+oplock breaks coming from windows srv. Piggyback identical file
+opens on top of each other by incrementing reference count rather
+than resending (helps reduce server resource utilization and avoid
+spurious oplock breaks).
+
+l) Improve performance of readpages by sending more than one read
+at a time when 8 pages or more are requested. In conjuntion
+add support for async_cifs_readpages.
+
+m) Add support for storing symlink info to Windows servers
+in the Extended Attribute format their SFU clients would recognize.
+
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+will autorefresh (partially complete by Asser). Needs minor kernel
+vfs change to support removing D_NOTIFY on a file.
+
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
+the CIFS statistics (started)
+
+p) implement support for security and trusted categories of xattrs
+(requires minor protocol extension) to enable better support for SELINUX
+
+q) Implement O_DIRECT flag on open (already supported on mount)
+
+r) Create UID mapping facility so server UIDs can be mapped on a per
+mount or a per server basis to client UIDs or nobody if no mapping
+exists. This is helpful when Unix extensions are negotiated to
+allow better permission checking when UIDs differ on the server
+and client. Add new protocol request to the CIFS protocol
+standard for asking the server for the corresponding name of a
+particular uid.
+
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
+server side for Samba 4.
+
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
+need to add ability to set time to server (utimes command)
+
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add support for new vfs entry point for fallocate
+
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K)
+
+KNOWN BUGS (updated April 24, 2007)
+====================================
+See http://bugzilla.samba.org - search on product "CifsVFS" for
+current bug list.
+
+1) existing symbolic links (Windows reparse points) are recognized but
+can not be created remotely. They are implemented for Samba and those that
+support the CIFS Unix extensions, although earlier versions of Samba
+overly restrict the pathnames.
+2) follow_link and readdir code does not follow dfs junctions
+but recognizes them
+3) create of new files to FAT partitions on Windows servers can
+succeed but still return access denied (appears to be Windows
+server not cifs client problem) and has not been reproduced recently.
+NTFS partitions do not have this problem.
+4) Unix/POSIX capabilities are reset after reconnection, and affect
+a few fields in the tree connection but we do do not know which
+superblocks to apply these changes to. We should probably walk
+the list of superblocks to set these. Also need to check the
+flags on the second mount to the same share, and see if we
+can do the same trick that NFS does to remount duplicate shares.
+
+Misc testing to do
+==================
+1) check out max path names and max path name components against various server
+types. Try nested symlinks (8 deep). Return max path name in stat -f information
+
+2) Modify file portion of ltp so it can run against a mounted network
+share and run it against cifs vfs in automated fashion.
+
+3) Additional performance testing and optimization using iozone and similar -
+there are some easy changes that can be done to parallelize sequential writes,
+and when signing is disabled to request larger read sizes (larger than
+negotiated size) and send larger write sizes to modern servers.
+
+4) More exhaustively test against less common servers. More testing
+against Windows 9x, Windows ME servers.
+
diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt
new file mode 100644
index 00000000000..2fac91ac96c
--- /dev/null
+++ b/Documentation/filesystems/cifs/cifs.txt
@@ -0,0 +1,31 @@
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. New and improved versions of CIFS are now
+ called SMB2 and SMB3. These dialects are also supported by the
+ CIFS VFS module. CIFS is fully supported by network
+ file servers such as Windows 2000, 2003, 2008 and 2012
+ as well by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems), so
+ this network filesystem client can mount to a wide variety of
+ servers.
+
+ The intent of this module is to provide the most advanced network
+ file system function for CIFS compliant servers, including better
+ POSIX compliance, secure per-user session establishment, high
+ performance safe distributed caching (oplock), optional packet
+ signing, large files, Unicode support and other internationalization
+ improvements. Since both Samba server and this filesystem client support
+ the CIFS Unix extensions, the combination can provide a reasonable
+ alternative to NFSv4 for fileserving in some Linux to Linux environments,
+ not just in Linux to Windows environments.
+
+ This filesystem has an mount utility (mount.cifs) that can be obtained from
+
+ https://ftp.samba.org/pub/linux-cifs/cifs-utils/
+
+ It must be installed in the directory with the other mount helpers.
+
+ For more information on the module see the project wiki page at
+
+ https://wiki.samba.org/index.php/LinuxCIFS_utils
diff --git a/Documentation/filesystems/cifs/winucase_convert.pl b/Documentation/filesystems/cifs/winucase_convert.pl
new file mode 100755
index 00000000000..322a9c833f2
--- /dev/null
+++ b/Documentation/filesystems/cifs/winucase_convert.pl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#
+# winucase_convert.pl -- convert "Windows 8 Upper Case Mapping Table.txt" to
+# a two-level set of C arrays.
+#
+# Copyright 2013: Jeff Layton <jlayton@redhat.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+while(<>) {
+ next if (!/^0x(..)(..)\t0x(....)\t/);
+ $firstchar = hex($1);
+ $secondchar = hex($2);
+ $uppercase = hex($3);
+
+ $top[$firstchar][$secondchar] = $uppercase;
+}
+
+for ($i = 0; $i < 256; $i++) {
+ next if (!$top[$i]);
+
+ printf("static const wchar_t t2_%2.2x[256] = {", $i);
+ for ($j = 0; $j < 256; $j++) {
+ if (($j % 8) == 0) {
+ print "\n\t";
+ } else {
+ print " ";
+ }
+ printf("0x%4.4x,", $top[$i][$j] ? $top[$i][$j] : 0);
+ }
+ print "\n};\n\n";
+}
+
+printf("static const wchar_t *const toplevel[256] = {", $i);
+for ($i = 0; $i < 256; $i++) {
+ if (($i % 8) == 0) {
+ print "\n\t";
+ } elsif ($top[$i]) {
+ print " ";
+ } else {
+ print " ";
+ }
+
+ if ($top[$i]) {
+ printf("t2_%2.2x,", $i);
+ } else {
+ print "NULL,";
+ }
+}
+print "\n};\n\n";
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
index ff7b611abf3..09bbf9a54f8 100644
--- a/Documentation/filesystems/directory-locking
+++ b/Documentation/filesystems/directory-locking
@@ -2,6 +2,10 @@
kinds of locks - per-inode (->i_mutex) and per-filesystem
(->s_vfs_rename_mutex).
+ When taking the i_mutex on multiple non-directory objects, we
+always acquire the locks in order by increasing address. We'll call
+that "inode pointer" order in the following.
+
For our purposes all operations fall in 5 classes:
1) read access. Locking rules: caller locks directory we are accessing.
@@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem
locks victim and calls the method.
4) rename() that is _not_ cross-directory. Locking rules: caller locks
-the parent, finds source and target, if target already exists - locks it
-and then calls the method.
+the parent and finds source and target. If target already exists, lock
+it. If source is a non-directory, lock it. If that means we need to
+lock both, lock them in inode pointer order.
5) link creation. Locking rules:
* lock parent
@@ -30,7 +35,9 @@ rules:
fail with -ENOTEMPTY
* if new parent is equal to or is a descendent of source
fail with -ELOOP
- * if target exists - lock it.
+ * If target exists, lock it. If source is a non-directory, lock
+ it. In case that means we need to lock both source and target,
+ do so in inode pointer order.
* call the method.
@@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B.
renames will be blocked on filesystem lock and we don't start changing
the order until we had acquired all locks).
-(3) any operation holds at most one lock on non-directory object and
- that lock is acquired after all other locks. (Proof: see descriptions
- of operations).
+(3) locks on non-directory objects are acquired only after locks on
+ directory objects, and are acquired in inode pointer order.
+ (Proof: all operations but renames take lock on at most one
+ non-directory object, except renames, which take locks on source and
+ target in inode pointer order in the case they are not directories.)
Now consider the minimal deadlock. Each process is blocked on
attempt to acquire some lock and already holds at least one lock. Let's
@@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is
not contended, since any process blocked on it is not holding any locks.
Thus all processes are blocked on ->i_mutex.
- Non-directory objects are not contended due to (3). Thus link
-creation can't be a part of deadlock - it can't be blocked on source
-and it means that it doesn't hold any locks.
+ By (3), any process holding a non-directory lock can only be
+waiting on another non-directory lock with a larger address. Therefore
+the process holding the "largest" such lock can always make progress, and
+non-directory objects are not included in the set of contended locks.
+
+ Thus link creation can't be a part of deadlock - it can't be
+blocked on source and it means that it doesn't hold any locks.
Any contended object is either held by cross-directory rename or
has a child that is also contended. Indeed, suppose that it is held by
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 293855e9500..7ed0d17d672 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -26,11 +26,12 @@ journal=inum When a journal already exists, this option is ignored.
Otherwise, it specifies the number of the inode which
will represent the ext3 file system's journal file.
+journal_path=path
journal_dev=devnum When the external journal device's major/minor numbers
- have changed, this option allows the user to specify
+ have changed, these options allow the user to specify
the new journal location. The journal device is
- identified through its new major/minor numbers encoded
- in devnum.
+ identified through either its new major/minor numbers
+ encoded in devnum, or via a path to the device.
norecovery Don't load the journal on mounting. Note that this forces
noload mount of inconsistent filesystem, which can lead to
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 34ea4f1fa6e..919a3293aaa 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -2,7 +2,7 @@
Ext4 Filesystem
===============
-Ext4 is an an advanced level of the ext3 filesystem which incorporates
+Ext4 is an advanced level of the ext3 filesystem which incorporates
scalability and reliability enhancements for supporting large filesystems
(64 bit) in keeping with increasing disk capacities and state-of-the-art
feature requirements.
@@ -144,11 +144,12 @@ journal_async_commit Commit block can be written to disk without waiting
mount the device. This will enable 'journal_checksum'
internally.
+journal_path=path
journal_dev=devnum When the external journal device's major/minor numbers
- have changed, this option allows the user to specify
+ have changed, these options allow the user to specify
the new journal location. The journal device is
- identified through its new major/minor numbers encoded
- in devnum.
+ identified through either its new major/minor numbers
+ encoded in devnum, or via a path to the device.
norecovery Don't load the journal on mounting. Note that
noload if the filesystem was not unmounted cleanly,
@@ -494,6 +495,17 @@ Files in /sys/fs/ext4/<devname>
session_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
+
+ reserved_clusters This is RW file and contains number of reserved
+ clusters in the file system which will be used
+ in the specific situations to avoid costly
+ zeroout, unexpected ENOSPC, or possible data
+ loss. The default is 2% or 4096 clusters,
+ whichever is smaller and this can be changed
+ however it can never exceed number of clusters
+ in the file system. If there is not enough space
+ for the reserved space when mounting the file
+ mount will _not_ fail.
..............................................................................
Ioctls
@@ -587,6 +599,16 @@ Table of Ext4 specific ioctls
bitmaps and inode table, the userspace tool thus
just passes the new number of blocks.
+EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
+ (like i_blocks, i_size, i_flags, ...) from
+ the specified inode with inode
+ EXT4_BOOT_LOADER_INO (#5). This is typically
+ used to store a boot loader in a secure part of
+ the filesystem, where it can't be changed by a
+ normal user by accident.
+ The data blocks of the previous boot loader
+ will be associated with the given inode.
+
..............................................................................
References
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index dcf338e62b7..51afba17bba 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -18,8 +18,8 @@ according to its internal geometry or flash memory management scheme, namely FTL
F2FS and its tools support various parameters not only for configuring on-disk
layout, but also for selecting allocation and cleaning algorithms.
-The file system formatting tool, "mkfs.f2fs", is available from the following
-git tree:
+The following git tree provides the file system formatting tool (mkfs.f2fs),
+a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
>> git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
For reporting bugs and sending patches, please use the following mailing list:
@@ -98,8 +98,13 @@ Cleaning Overhead
MOUNT OPTIONS
================================================================================
-background_gc_off Turn off cleaning operations, namely garbage collection,
- triggered in background when I/O subsystem is idle.
+background_gc=%s Turn on/off cleaning operations, namely garbage
+ collection, triggered in background when I/O subsystem is
+ idle. If background_gc=on, it will turn on the garbage
+ collection and if background_gc=off, garbage collection
+ will be truned off.
+ Default value for this option is on. So garbage
+ collection is on by default.
disable_roll_forward Disable the roll-forward recovery routine
discard Issue discard/TRIM commands when a segment is cleaned.
no_heap Disable heap-style segment allocation which finds free
@@ -114,6 +119,13 @@ active_logs=%u Support configuring the number of active logs. In the
Default number is 6.
disable_ext_identify Disable the extension list configured by mkfs, so f2fs
does not aware of cold files such as media files.
+inline_xattr Enable the inline xattrs feature.
+inline_data Enable the inline data feature: New created small(<~3.4k)
+ files can be written into inode block.
+flush_merge Merge concurrent cache_flush commands as much as possible
+ to eliminate redundant command issues. If the underlying
+ device handles the cache_flush command relatively slowly,
+ recommend to enable this option.
================================================================================
DEBUGFS ENTRIES
@@ -128,6 +140,79 @@ f2fs. Each file shows the whole f2fs information.
- current memory footprint consumed by f2fs.
================================================================================
+SYSFS ENTRIES
+================================================================================
+
+Information about mounted f2f2 file systems can be found in
+/sys/fs/f2fs. Each mounted filesystem will have a directory in
+/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
+The files in each per-device directory are shown in table below.
+
+Files in /sys/fs/f2fs/<devname>
+(see also Documentation/ABI/testing/sysfs-fs-f2fs)
+..............................................................................
+ File Content
+
+ gc_max_sleep_time This tuning parameter controls the maximum sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_min_sleep_time This tuning parameter controls the minimum sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_no_gc_sleep_time This tuning parameter controls the default sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_idle This parameter controls the selection of victim
+ policy for garbage collection. Setting gc_idle = 0
+ (default) will disable this option. Setting
+ gc_idle = 1 will select the Cost Benefit approach
+ & setting gc_idle = 2 will select the greedy aproach.
+
+ reclaim_segments This parameter controls the number of prefree
+ segments to be reclaimed. If the number of prefree
+ segments is larger than the number of segments
+ in the proportion to the percentage over total
+ volume size, f2fs tries to conduct checkpoint to
+ reclaim the prefree segments to free segments.
+ By default, 5% over total # of segments.
+
+ max_small_discards This parameter controls the number of discard
+ commands that consist small blocks less than 2MB.
+ The candidates to be discarded are cached until
+ checkpoint is triggered, and issued during the
+ checkpoint. By default, it is disabled with 0.
+
+ ipu_policy This parameter controls the policy of in-place
+ updates in f2fs. There are five policies:
+ 0: F2FS_IPU_FORCE, 1: F2FS_IPU_SSR,
+ 2: F2FS_IPU_UTIL, 3: F2FS_IPU_SSR_UTIL,
+ 4: F2FS_IPU_DISABLE.
+
+ min_ipu_util This parameter controls the threshold to trigger
+ in-place-updates. The number indicates percentage
+ of the filesystem utilization, and used by
+ F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
+
+ max_victim_search This parameter controls the number of trials to
+ find a victim segment when conducting SSR and
+ cleaning operations. The default value is 4096
+ which covers 8GB block address range.
+
+ dir_level This parameter controls the directory level to
+ support large directory. If a directory has a
+ number of files, it can reduce the file lookup
+ latency by increasing this dir_level value.
+ Otherwise, it needs to decrease this value to
+ reduce the space overhead. The default value is 0.
+
+ ram_thresh This parameter controls the memory footprint used
+ by free nids and cached nat entries. By default,
+ 10 is set, which indicates 10 MB / 1 GB RAM.
+
+================================================================================
USAGE
================================================================================
@@ -144,9 +229,13 @@ USAGE
# mkfs.f2fs -l label /dev/block_device
# mount -t f2fs /dev/block_device /mnt/f2fs
-Format options
---------------
--l [label] : Give a volume label, up to 256 unicode name.
+mkfs.f2fs
+---------
+The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
+which builds a basic on-disk layout.
+
+The options consist of:
+-l [label] : Give a volume label, up to 512 unicode name.
-a [0 or 1] : Split start location of each area for heap-based allocation.
1 is set by default, which performs this.
-o [int] : Set overprovision ratio in percent over volume size.
@@ -156,6 +245,39 @@ Format options
-z [int] : Set the number of sections per zone.
1 is set by default.
-e [str] : Set basic extension list. e.g. "mp3,gif,mov"
+-t [0 or 1] : Disable discard command or not.
+ 1 is set by default, which conducts discard.
+
+fsck.f2fs
+---------
+The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
+partition, which examines whether the filesystem metadata and user-made data
+are cross-referenced correctly or not.
+Note that, initial version of the tool does not fix any inconsistency.
+
+The options consist of:
+ -d debug level [default:0]
+
+dump.f2fs
+---------
+The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
+file. Each file is dump_ssa and dump_sit.
+
+The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
+It shows on-disk inode information reconized by a given inode number, and is
+able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
+./dump_sit respectively.
+
+The options consist of:
+ -d debug level [default:0]
+ -i inode no (hex)
+ -s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
+ -a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
+
+Examples:
+# dump.f2fs -i [ino] /dev/sdx
+# dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
+# dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
================================================================================
DESIGN
@@ -339,9 +461,11 @@ The number of blocks and buckets are determined by,
# of blocks in level #n = |
`- 4, Otherwise
- ,- 2^n, if n < MAX_DIR_HASH_DEPTH / 2,
+ ,- 2^(n + dir_level),
+ | if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
# of buckets in level #n = |
- `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), Otherwise
+ `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
+ Otherwise
When F2FS finds a file name in a directory, at first a hash value of the file
name is calculated. Then, F2FS scans the hash table in level #0 to find the
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
index af1628a1061..59f7569fc9e 100644
--- a/Documentation/filesystems/hfsplus.txt
+++ b/Documentation/filesystems/hfsplus.txt
@@ -56,4 +56,4 @@ References
kernel source: <file:fs/hfsplus>
-Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
+Apple Technote 1150 https://developer.apple.com/legacy/library/technotes/tn/tn1150.html
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
index f7433355394..41fd757997b 100644
--- a/Documentation/filesystems/jfs.txt
+++ b/Documentation/filesystems/jfs.txt
@@ -42,7 +42,7 @@ nodiscard(*) block device when blocks are freed. This is useful for SSD
devices and sparse/thinly-provisioned LUNs. The FITRIM ioctl
command is also available together with the nodiscard option.
The value of minlen specifies the minimum blockcount, when
- a TRIM command to the block device is considered usefull.
+ a TRIM command to the block device is considered useful.
When no value is given to the discard option, it defaults to
64 blocks, which means 256KiB in JFS.
The minlen value of discard overrides the minlen value given
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 1716874a651..53f3b596ac0 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,6 +12,8 @@ nfs41-server.txt
- info on the Linux server implementation of NFSv4 minor version 1.
nfs-rdma.txt
- how to install and setup the Linux NFS/RDMA client and server software
+nfsd-admin-interfaces.txt
+ - Administrative interfaces for nfsd.
nfsroot.txt
- short guide on setting up a diskless box with NFS root filesystem.
pnfs.txt
@@ -20,3 +22,5 @@ rpc-cache.txt
- introduction to the caching mechanisms in the sunrpc layer.
idmapper.txt
- information for configuring request-keys to be used by idmapper
+rpc-server-gss.txt
+ - Information on GSS authentication support in the NFS Server
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index 09994c24728..e543b1a619c 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -93,7 +93,7 @@ For a filesystem to be exportable it must:
2/ make sure that d_splice_alias is used rather than d_add
when ->lookup finds an inode for a given parent and name.
- If inode is NULL, d_splice_alias(inode, dentry) is eqivalent to
+ If inode is NULL, d_splice_alias(inode, dentry) is equivalent to
d_add(dentry, inode), NULL
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 01c2db76979..c49cd7e796e 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -5,11 +5,11 @@ Server support for minorversion 1 can be controlled using the
by reading this file will contain either "+4.1" or "-4.1"
correspondingly.
-Currently, server support for minorversion 1 is disabled by default.
-It can be enabled at run time by writing the string "+4.1" to
+Currently, server support for minorversion 1 is enabled by default.
+It can be disabled at run time by writing the string "-4.1" to
the /proc/fs/nfsd/versions control file. Note that to write this
-control file, the nfsd service must be taken down. Use your user-mode
-nfs-utils to set this up; see rpc.nfsd(8)
+control file, the nfsd service must be taken down. You can use rpc.nfsd
+for this; see rpc.nfsd(8).
(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and
"-4", respectively. Therefore, code meant to work on both new and old
@@ -29,29 +29,6 @@ are still under development out of tree.
See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
for more information.
-The current implementation is intended for developers only: while it
-does support ordinary file operations on clients we have tested against
-(including the linux client), it is incomplete in ways which may limit
-features unexpectedly, cause known bugs in rare cases, or cause
-interoperability problems with future clients. Known issues:
-
- - gss support is questionable: currently mounts with kerberos
- from a linux client are possible, but we aren't really
- conformant with the spec (for example, we don't use kerberos
- on the backchannel correctly).
- - We do not support SSV, which provides security for shared
- client-server state (thus preventing unauthorized tampering
- with locks and opens, for example). It is mandatory for
- servers to support this, though no clients use it yet.
-
-In addition, some limitations are inherited from the current NFSv4
-implementation:
-
- - Incomplete delegation enforcement: if a file is renamed or
- unlinked by a local process, a client holding a delegation may
- continue to indefinitely allow opens of the file under the old
- name.
-
The table below, taken from the NFSv4.1 document, lists
the operations that are mandatory to implement (REQ), optional
(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -169,6 +146,16 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
Implementation notes:
+SSV:
+* The spec claims this is mandatory, but we don't actually know of any
+ implementations, so we're ignoring it for now. The server returns
+ NFS4ERR_ENCR_ALG_UNSUPP on EXCHANGE_ID, which should be future-proof.
+
+GSS on the backchannel:
+* Again, theoretically required but not widely implemented (in
+ particular, the current Linux client doesn't request it). We return
+ NFS4ERR_ENCR_ALG_UNSUPP on CREATE_SESSION.
+
DELEGPURGE:
* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or
CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that
@@ -176,7 +163,6 @@ DELEGPURGE:
now.
EXCHANGE_ID:
-* only SP4_NONE state protection supported
* implementation ids are ignored
CREATE_SESSION:
@@ -190,7 +176,5 @@ Nonstandard compound limitations:
ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
fail to live up to the promise we made in CREATE_SESSION fore channel
negotiation.
-* No more than one read-like operation allowed per compound; encoding
- replies that cross page boundaries (except for read data) not handled.
See also http://wiki.linux-nfs.org/wiki/index.php/Server_4.0_and_4.1_issues.
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index 52ae07f5f57..adc81a35fe2 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -12,7 +12,7 @@ struct pnfs_layout_hdr
----------------------
The on-the-wire command LAYOUTGET corresponds to struct
pnfs_layout_segment, usually referred to by the variable name lseg.
-Each nfs_inode may hold a pointer to a cache of of these layout
+Each nfs_inode may hold a pointer to a cache of these layout
segments in nfsi->layout, of type struct pnfs_layout_hdr.
We reference the header for the inode pointing to it, across each
diff --git a/Documentation/filesystems/nfs/rpc-server-gss.txt b/Documentation/filesystems/nfs/rpc-server-gss.txt
new file mode 100644
index 00000000000..716f4be8e8b
--- /dev/null
+++ b/Documentation/filesystems/nfs/rpc-server-gss.txt
@@ -0,0 +1,91 @@
+
+rpcsec_gss support for kernel RPC servers
+=========================================
+
+This document gives references to the standards and protocols used to
+implement RPCGSS authentication in kernel RPC servers such as the NFS
+server and the NFS client's NFSv4.0 callback server. (But note that
+NFSv4.1 and higher don't require the client to act as a server for the
+purposes of authentication.)
+
+RPCGSS is specified in a few IETF documents:
+ - RFC2203 v1: http://tools.ietf.org/rfc/rfc2203.txt
+ - RFC5403 v2: http://tools.ietf.org/rfc/rfc5403.txt
+and there is a 3rd version being proposed:
+ - http://tools.ietf.org/id/draft-williams-rpcsecgssv3.txt
+ (At draft n. 02 at the time of writing)
+
+Background
+----------
+
+The RPCGSS Authentication method describes a way to perform GSSAPI
+Authentication for NFS. Although GSSAPI is itself completely mechanism
+agnostic, in many cases only the KRB5 mechanism is supported by NFS
+implementations.
+
+The Linux kernel, at the moment, supports only the KRB5 mechanism, and
+depends on GSSAPI extensions that are KRB5 specific.
+
+GSSAPI is a complex library, and implementing it completely in kernel is
+unwarranted. However GSSAPI operations are fundementally separable in 2
+parts:
+- initial context establishment
+- integrity/privacy protection (signing and encrypting of individual
+ packets)
+
+The former is more complex and policy-independent, but less
+performance-sensitive. The latter is simpler and needs to be very fast.
+
+Therefore, we perform per-packet integrity and privacy protection in the
+kernel, but leave the initial context establishment to userspace. We
+need upcalls to request userspace to perform context establishment.
+
+NFS Server Legacy Upcall Mechanism
+----------------------------------
+
+The classic upcall mechanism uses a custom text based upcall mechanism
+to talk to a custom daemon called rpc.svcgssd that is provide by the
+nfs-utils package.
+
+This upcall mechanism has 2 limitations:
+
+A) It can handle tokens that are no bigger than 2KiB
+
+In some Kerberos deployment GSSAPI tokens can be quite big, up and
+beyond 64KiB in size due to various authorization extensions attacked to
+the Kerberos tickets, that needs to be sent through the GSS layer in
+order to perform context establishment.
+
+B) It does not properly handle creds where the user is member of more
+than a few housand groups (the current hard limit in the kernel is 65K
+groups) due to limitation on the size of the buffer that can be send
+back to the kernel (4KiB).
+
+NFS Server New RPC Upcall Mechanism
+-----------------------------------
+
+The newer upcall mechanism uses RPC over a unix socket to a daemon
+called gss-proxy, implemented by a userspace program called Gssproxy.
+
+The gss_proxy RPC protocol is currently documented here:
+
+ https://fedorahosted.org/gss-proxy/wiki/ProtocolDocumentation
+
+This upcall mechanism uses the kernel rpc client and connects to the gssproxy
+userspace program over a regular unix socket. The gssproxy protocol does not
+suffer from the size limitations of the legacy protocol.
+
+Negotiating Upcall Mechanisms
+-----------------------------
+
+To provide backward compatibility, the kernel defaults to using the
+legacy mechanism. To switch to the new mechanism, gss-proxy must bind
+to /var/run/gssproxy.sock and then write "1" to
+/proc/net/rpc/use-gss-proxy. If gss-proxy dies, it must repeat both
+steps.
+
+Once the upcall mechanism is chosen, it cannot be changed. To prevent
+locking into the legacy mechanisms, the above steps must be performed
+before starting nfsd. Whoever starts nfsd can guarantee this by reading
+from /proc/net/rpc/use-gss-proxy and checking that it contains a
+"1"--the read will block until gss-proxy has done its write to the file.
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 873a2ab2e9f..41c3d332acc 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -25,9 +25,8 @@ available from the following download page. At least "mkfs.nilfs2",
cleaner or garbage collector) are required. Details on the tools are
described in the man pages included in the package.
-Project web page: http://www.nilfs.org/en/
-Download page: http://www.nilfs.org/en/download.html
-Git tree web page: http://www.nilfs.org/git/
+Project web page: http://nilfs.sourceforge.net/
+Download page: http://nilfs.sourceforge.net/en/download.html
List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
Caveats
@@ -81,6 +80,69 @@ nodiscard(*) The discard/TRIM commands are sent to the underlying
block device when blocks are freed. This is useful
for SSD devices and sparse/thinly-provisioned LUNs.
+Ioctls
+======
+
+There is some NILFS2 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all NILFS2 specific ioctls are
+shown in the table below.
+
+Table of NILFS2 specific ioctls
+..............................................................................
+ Ioctl Description
+ NILFS_IOCTL_CHANGE_CPMODE Change mode of given checkpoint between
+ checkpoint and snapshot state. This ioctl is
+ used in chcp and mkcp utilities.
+
+ NILFS_IOCTL_DELETE_CHECKPOINT Remove checkpoint from NILFS2 file system.
+ This ioctl is used in rmcp utility.
+
+ NILFS_IOCTL_GET_CPINFO Return info about requested checkpoints. This
+ ioctl is used in lscp utility and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_CPSTAT Return checkpoints statistics. This ioctl is
+ used by lscp, rmcp utilities and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_SUINFO Return segment usage info about requested
+ segments. This ioctl is used in lssu,
+ nilfs_resize utilities and by nilfs_cleanerd
+ daemon.
+
+ NILFS_IOCTL_SET_SUINFO Modify segment usage info of requested
+ segments. This ioctl is used by
+ nilfs_cleanerd daemon to skip unnecessary
+ cleaning operation of segments and reduce
+ performance penalty or wear of flash device
+ due to redundant move of in-use blocks.
+
+ NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl
+ is used in lssu, nilfs_resize utilities and
+ by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_VINFO Return information on virtual block addresses.
+ This ioctl is used by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_BDESCS Return information about descriptors of disk
+ block numbers. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_CLEAN_SEGMENTS Do garbage collection operation in the
+ environment of requested parameters from
+ userspace. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_SYNC Make a checkpoint. This ioctl is used in
+ mkcp utility.
+
+ NILFS_IOCTL_RESIZE Resize NILFS2 volume. This ioctl is used
+ by nilfs_resize utility.
+
+ NILFS_IOCTL_SET_ALLOC_RANGE Define lower limit of segments in bytes and
+ upper limit of segments in bytes. This ioctl
+ is used by nilfs_resize utility.
+
NILFS2 usage
============
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 791af8dac06..61947facfc0 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -455,8 +455,6 @@ not have this problem with odd numbers of sectors.
ChangeLog
=========
-Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
-
2.1.30:
- Fix writev() (it kept writing the first segment over and over again
instead of moving onto subsequent segments).
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0472c31c163..0f3a1390bf0 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -295,9 +295,9 @@ in the beginning of ->setattr unconditionally.
->clear_inode() and ->delete_inode() are gone; ->evict_inode() should
be used instead. It gets called whenever the inode is evicted, whether it has
remaining links or not. Caller does *not* evict the pagecache or inode-associated
-metadata buffers; getting rid of those is responsibility of method, as it had
-been for ->delete_inode(). Caller makes sure async writeback cannot be running
-for the inode while (or after) ->evict_inode() is called.
+metadata buffers; the method has to use truncate_inode_pages_final() to get rid
+of those. Caller makes sure async writeback cannot be running for the inode while
+(or after) ->evict_inode() is called.
->drop_inode() returns int now; it's called on final iput() with
inode->i_lock held and it returns true if filesystems wants the inode to be
@@ -441,3 +441,25 @@ d_make_root() drops the reference to inode if dentry allocation fails.
two, it gets "is it an O_EXCL or equivalent?" boolean argument. Note that
local filesystems can ignore tha argument - they are guaranteed that the
object doesn't exist. It's remote/distributed ones that might care...
+--
+[mandatory]
+ FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate()
+in your dentry operations instead.
+--
+[mandatory]
+ vfs_readdir() is gone; switch to iterate_dir() instead
+--
+[mandatory]
+ ->readdir() is gone now; switch to ->iterate()
+[mandatory]
+ vfs_follow_link has been removed. Filesystems must use nd_set_link
+ from ->follow_link for normal symlinks, or nd_jump_link for magic
+ /proc/<pid> style links.
+--
+[mandatory]
+ iget5_locked()/ilookup5()/ilookup5_nowait() test() callback used to be
+ called with both ->i_lock and inode_hash_lock held; the former is *not*
+ taken anymore, so verify that your callbacks do not rely on it (none
+ of the in-tree instances did). inode_hash_lock is still held,
+ of course, so they are still serialized wrt removal from inode hash,
+ as well as wrt set() callback of iget5_locked().
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d594fc..ddc531a74d0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -234,7 +234,7 @@ Table 1-2: Contents of the status files (as of 2.6.30-rc7)
ShdPnd bitmap of shared pending signals for the process
SigBlk bitmap of blocked signals
SigIgn bitmap of ignored signals
- SigCgt bitmap of catched signals
+ SigCgt bitmap of caught signals
CapInh bitmap of inheritable capabilities
CapPrm bitmap of permitted capabilities
CapEff bitmap of effective capabilities
@@ -300,7 +300,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
pending bitmap of pending signals
blocked bitmap of blocked signals
sigign bitmap of ignored signals
- sigcatch bitmap of catched signals
+ sigcatch bitmap of caught signals
wchan address where process went to sleep
0 (place holder)
0 (place holder)
@@ -460,6 +460,7 @@ manner. The codes are the following:
nl - non-linear mapping
ar - architecture specific flag
dd - do not include area into core dump
+ sd - soft-dirty flag
mm - mixed map area
hg - huge page advise flag
nh - no-huge page advise flag
@@ -473,7 +474,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
To clear the bits for all the pages associated with the process
> echo 1 > /proc/PID/clear_refs
@@ -482,6 +484,10 @@ To clear the bits for the anonymous pages associated with the process
To clear the bits for the file mapped pages associated with the process
> echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+ > echo 4 > /proc/PID/clear_refs
+
Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
@@ -541,7 +547,7 @@ Table 1-5: Kernel info in /proc
sys See chapter 2
sysvipc Info of SysVIPC Resources (msg, sem, shm) (2.4)
tty Info of tty drivers
- uptime System uptime
+ uptime Wall clock since boot, combined idle time of all cpus
version Kernel version
video bttv info of video resources (2.4)
vmallocinfo Show vmalloced areas
@@ -761,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not.
MemTotal: 16344972 kB
MemFree: 13634064 kB
+MemAvailable: 14836172 kB
Buffers: 3656 kB
Cached: 1195708 kB
SwapCached: 0 kB
@@ -793,6 +800,14 @@ AnonHugePages: 49152 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
bits and the kernel binary code)
MemFree: The sum of LowFree+HighFree
+MemAvailable: An estimate of how much memory is available for starting new
+ applications, without swapping. Calculated from MemFree,
+ SReclaimable, the size of the file LRU lists, and the low
+ watermarks in each zone.
+ The estimate takes into account that the system needs some
+ page cache to function well, and that not all reclaimable
+ slab will be reclaimable, due to items being in use. The
+ impact of those factors will vary from system to system.
Buffers: Relatively temporary storage for raw disk blocks
shouldn't get tremendously large (20MB or so)
Cached: in-memory cache for files read from the disk (the
@@ -839,7 +854,8 @@ WritebackTmp: Memory used by FUSE for temporary writeback buffers
if strict overcommit accounting is enabled (mode 2 in
'vm.overcommit_memory').
The CommitLimit is calculated with the following formula:
- CommitLimit = ('vm.overcommit_ratio' * Physical RAM) + Swap
+ CommitLimit = ([total RAM pages] - [total huge TLB pages]) *
+ overcommit_ratio / 100 + [total swap pages]
For example, on a system with 1G of physical RAM and 7G
of swap with a `vm.overcommit_ratio` of 30 it would
yield a CommitLimit of 7.3G.
@@ -849,16 +865,15 @@ Committed_AS: The amount of memory presently allocated on the system.
The committed memory is a sum of all of the memory which
has been allocated by processes, even if it has not been
"used" by them as of yet. A process which malloc()'s 1G
- of memory, but only touches 300M of it will only show up
- as using 300M of memory even if it has the address space
- allocated for the entire 1G. This 1G is memory which has
- been "committed" to by the VM and can be used at any time
- by the allocating application. With strict overcommit
- enabled on the system (mode 2 in 'vm.overcommit_memory'),
- allocations which would exceed the CommitLimit (detailed
- above) will not be permitted. This is useful if one needs
- to guarantee that processes will not fail due to lack of
- memory once that memory has been successfully allocated.
+ of memory, but only touches 300M of it will show up as
+ using 1G. This 1G is memory which has been "committed" to
+ by the VM and can be used at any time by the allocating
+ application. With strict overcommit enabled on the system
+ (mode 2 in 'vm.overcommit_memory'),allocations which would
+ exceed the CommitLimit (detailed above) will not be permitted.
+ This is useful if one needs to guarantee that processes will
+ not fail due to lack of memory once that memory has been
+ successfully allocated.
VmallocTotal: total size of vmalloc memory area
VmallocUsed: amount of vmalloc area which is used
VmallocChunk: largest contiguous block of vmalloc area which is free
@@ -1231,8 +1246,9 @@ second). The meanings of the columns are as follows, from left to right:
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
-interrupts serviced; each subsequent column is the total for that particular
-interrupt.
+interrupts serviced including unnumbered architecture specific interrupts;
+each subsequent column is the total for that particular numbered interrupt.
+Unnumbered interrupts are not shown, only summed into the total.
The "ctxt" line gives the total number of context switches across all CPUs.
@@ -1372,8 +1388,8 @@ may allocate from based on an estimation of its current memory and swap use.
For example, if a task is using all allowed memory, its badness score will be
1000. If it is using half of its allowed memory, its score will be 500.
-There is an additional factor included in the badness score: root
-processes are given 3% extra memory over other tasks.
+There is an additional factor included in the badness score: the current memory
+and swap usage is discounted by 3% for root processes.
The amount of "allowed" memory depends on the context in which the oom killer
was called. If it is due to the memory assigned to the allocating task's cpuset
@@ -1634,18 +1650,21 @@ pids, so one need to either stop or freeze processes being inspected
if precise results are needed.
-3.7 /proc/<pid>/fdinfo/<fd> - Information about opened file
+3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
---------------------------------------------------------------
This file provides information associated with an opened file. The regular
-files have at least two fields -- 'pos' and 'flags'. The 'pos' represents
-the current offset of the opened file in decimal form [see lseek(2) for
-details] and 'flags' denotes the octal O_xxx mask the file has been
-created with [see open(2) for details].
+files have at least three fields -- 'pos', 'flags' and mnt_id. The 'pos'
+represents the current offset of the opened file in decimal form [see lseek(2)
+for details], 'flags' denotes the octal O_xxx mask the file has been
+created with [see open(2) for details] and 'mnt_id' represents mount ID of
+the file system containing the opened file [see 3.5 /proc/<pid>/mountinfo
+for details].
A typical output is
pos: 0
flags: 0100002
+ mnt_id: 19
The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags
pair provide additional information particular to the objects they represent.
@@ -1654,6 +1673,7 @@ pair provide additional information particular to the objects they represent.
~~~~~~~~~~~~~
pos: 0
flags: 04002
+ mnt_id: 9
eventfd-count: 5a
where 'eventfd-count' is hex value of a counter.
@@ -1662,6 +1682,7 @@ pair provide additional information particular to the objects they represent.
~~~~~~~~~~~~~~
pos: 0
flags: 04002
+ mnt_id: 9
sigmask: 0000000000000200
where 'sigmask' is hex value of the signal mask associated
@@ -1671,6 +1692,7 @@ pair provide additional information particular to the objects they represent.
~~~~~~~~~~~
pos: 0
flags: 02
+ mnt_id: 9
tfd: 5 events: 1d data: ffffffffffffffff
where 'tfd' is a target file descriptor number in decimal form,
@@ -1704,6 +1726,7 @@ pair provide additional information particular to the objects they represent.
pos: 0
flags: 02
+ mnt_id: 9
fanotify flags:10 event-flags:0
fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt
index e59f2f09f56..40867978913 100644
--- a/Documentation/filesystems/qnx6.txt
+++ b/Documentation/filesystems/qnx6.txt
@@ -148,8 +148,8 @@ smaller than addressing space in the bitmap.
Bitmap system area
------------------
-The bitmap itself is devided into three parts.
-First the system area, that is split into two halfs.
+The bitmap itself is divided into three parts.
+First the system area, that is split into two halves.
Then userspace.
The requirement for a static, fixed preallocated system area comes from how
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 59b4a0962e0..b176928e696 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -79,6 +79,10 @@ to just make sure certain lists can't become empty.
Most systems just mount another filesystem over rootfs and ignore it. The
amount of space an empty instance of ramfs takes up is tiny.
+If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by
+default. To force ramfs, add "rootfstype=ramfs" to the kernel command
+line.
+
What is initramfs?
------------------
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 510b722667a..33e2f369473 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -31,7 +31,7 @@ Semantics
Each relay channel has one buffer per CPU, each buffer has one or more
sub-buffers. Messages are written to the first sub-buffer until it is
-too full to contain a new message, in which case it it is written to
+too full to contain a new message, in which case it is written to
the next (if available). Messages are never split across sub-buffers.
At this point, userspace can be notified so it empties the first
sub-buffer, while the kernel continues writing to the next.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index a1e2e0dda90..1fe0ccb1af5 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -54,6 +54,15 @@ how the mechanism works without getting lost in other details. (Those
wanting to see the full source for this module can find it at
http://lwn.net/Articles/22359/).
+Deprecated create_proc_entry
+
+Note that the above article uses create_proc_entry which was removed in
+kernel 3.10. Current versions require the following update
+
+- entry = create_proc_entry("sequence", 0, NULL);
+- if (entry)
+- entry->proc_fops = &ct_file_ops;
++ entry = proc_create("sequence", 0, NULL, &ct_file_ops);
The iterator interface
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 4ede421c968..32a173dd315 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -727,7 +727,7 @@ replicas continue to be exactly same.
mkdir -p /tmp/m3
mount --rbind /root /tmp/m3
- I wont' draw the tree..but it has 24 vfsmounts
+ I won't draw the tree..but it has 24 vfsmounts
at step i the number of vfsmounts is V[i] = i*V[i-1].
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt
index caaaf1266d8..eb843e49c5a 100644
--- a/Documentation/filesystems/sysfs-tagging.txt
+++ b/Documentation/filesystems/sysfs-tagging.txt
@@ -24,7 +24,7 @@ flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will
point to the namespace to which it belongs.
Each sysfs superblock's sysfs_super_info contains an array void
-*ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace
+*ns[KOBJ_NS_TYPES]. When a task in a tagging namespace
kobj_nstype first mounts sysfs, a new superblock is created. It
will be differentiated from other sysfs mounts by having its
s_fs_info->ns[kobj_nstype] set to the new namespace. Note that
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index a6619b7064b..b35a64b82f9 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -108,12 +108,12 @@ static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo);
is equivalent to doing:
static struct device_attribute dev_attr_foo = {
- .attr = {
+ .attr = {
.name = "foo",
.mode = S_IWUSR | S_IRUGO,
- .show = show_foo,
- .store = store_foo,
},
+ .show = show_foo,
+ .store = store_foo,
};
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index d230dd9c99b..ce1126aceed 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -150,12 +150,33 @@ discard -- If set, issues discard/TRIM commands to the block
device when blocks are freed. This is useful for SSD devices
and sparse/thinly-provisoned LUNs.
-nfs -- This option maintains an index (cache) of directory
- inodes by i_logstart which is used by the nfs-related code to
- improve look-ups.
+nfs=stale_rw|nostale_ro
+ Enable this only if you want to export the FAT filesystem
+ over NFS.
+
+ stale_rw: This option maintains an index (cache) of directory
+ inodes by i_logstart which is used by the nfs-related code to
+ improve look-ups. Full file operations (read/write) over NFS is
+ supported but with cache eviction at NFS server, this could
+ result in ESTALE issues.
+
+ nostale_ro: This option bases the inode number and filehandle
+ on the on-disk location of a file in the MS-DOS directory entry.
+ This ensures that ESTALE will not be returned after a file is
+ evicted from the inode cache. However, it means that operations
+ such as rename, create and unlink could cause filehandles that
+ previously pointed at one file to point at a different file,
+ potentially causing data corruption. For this reason, this
+ option also mounts the filesystem readonly.
+
+ To maintain backward compatibility, '-o nfs' is also accepted,
+ defaulting to stale_rw
+
+dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
+ configuration, determined by backing device size. These static
+ parameters match defaults assumed by DOS 1.x for 160 kiB,
+ 180 kiB, 320 kiB, and 360 kiB floppies and floppy images.
- Enable this only if you want to export the FAT filesystem
- over NFS
<bool>: 0,1,yes,no,true,false
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index e3869098163..a1d0d7a3016 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -347,6 +347,8 @@ struct inode_operations {
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
+ int (*rename2) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
void * (*follow_link) (struct dentry *, struct nameidata *);
void (*put_link) (struct dentry *, struct nameidata *, void *);
@@ -359,9 +361,9 @@ struct inode_operations {
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
void (*update_time)(struct inode *, struct timespec *, int);
- int (*atomic_open)(struct inode *, struct dentry *,
- struct file *, unsigned open_flag,
- umode_t create_mode, int *opened);
+ int (*atomic_open)(struct inode *, struct dentry *, struct file *,
+ unsigned open_flag, umode_t create_mode, int *opened);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
};
Again, all methods are called without any locks being held, unless
@@ -414,6 +416,20 @@ otherwise noted.
rename: called by the rename(2) system call to rename the object to
have the parent and name given by the second inode and dentry.
+ rename2: this has an additional flags argument compared to rename.
+ If no flags are supported by the filesystem then this method
+ need not be implemented. If some flags are supported then the
+ filesystem must return -EINVAL for any unsupported or unknown
+ flags. Currently the following flags are implemented:
+ (1) RENAME_NOREPLACE: this flag indicates that if the target
+ of the rename exists the rename should fail with -EEXIST
+ instead of replacing the target. The VFS already checks for
+ existence, so for local filesystems the RENAME_NOREPLACE
+ implementation is equivalent to plain rename.
+ (2) RENAME_EXCHANGE: exchange source and target. Both must
+ exist; this is checked by the VFS. Unlike plain rename,
+ source and target may be of different type.
+
readlink: called by the readlink(2) system call. Only required if
you want to support reading symbolic links
@@ -468,9 +484,14 @@ otherwise noted.
method the filesystem can look up, possibly create and open the file in
one atomic operation. If it cannot perform this (e.g. the file type
turned out to be wrong) it may signal this by returning 1 instead of
- usual 0 or -ve . This method is only called if the last
- component is negative or needs lookup. Cached positive dentries are
- still handled by f_op->open().
+ usual 0 or -ve . This method is only called if the last component is
+ negative or needs lookup. Cached positive dentries are still handled by
+ f_op->open(). If the file was created, the FILE_CREATED flag should be
+ set in "opened". In case of O_EXCL the method must only succeed if the
+ file didn't exist and hence FILE_CREATED shall always be set on success.
+
+ tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to
+ atomically creating, opening and unlinking a file in given directory.
The Address Space Object
========================
@@ -549,12 +570,11 @@ struct address_space_operations
-------------------------------
This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
- int (*sync_page)(struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
@@ -566,16 +586,18 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
- int (*invalidatepage) (struct page *, unsigned long);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
- ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
+ ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
+ int (*is_partially_uptodate) (struct page *, unsigned long,
+ unsigned long);
+ void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
int (*swap_activate)(struct file *);
int (*swap_deactivate)(struct file *);
@@ -607,13 +629,6 @@ struct address_space_operations {
In this case, the page will be relocated, relocked and if
that all succeeds, ->readpage will be called again.
- sync_page: called by the VM to notify the backing store to perform all
- queued I/O operations for a page. I/O operations for other pages
- associated with this address_space object may also be performed.
-
- This function is optional and is called only for pages with
- PG_Writeback set while waiting for the writeback to complete.
-
writepages: called by the VM to write out pages associated with the
address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then
the writeback_control will specify a range of pages that must be
@@ -685,14 +700,14 @@ struct address_space_operations {
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
from the address space. This generally corresponds to either a
- truncation or a complete invalidation of the address space
- (in the latter case 'offset' will always be 0).
- Any private data associated with the page should be updated
- to reflect this truncation. If offset is 0, then
- the private data should be released, because the page
- must be able to be completely discarded. This may be done by
- calling the ->releasepage function, but in this case the
- release MUST succeed.
+ truncation, punch hole or a complete invalidation of the address
+ space (in the latter case 'offset' will always be 0 and 'length'
+ will be PAGE_CACHE_SIZE). Any private data associated with the page
+ should be updated to reflect this truncation. If offset is 0 and
+ length is PAGE_CACHE_SIZE, then the private data should be released,
+ because the page must be able to be completely discarded. This may
+ be done by calling the ->releasepage function, but in this case the
+ release MUST succeed.
releasepage: releasepage is called on PagePrivate pages to indicate
that the page should be freed if possible. ->releasepage
@@ -742,6 +757,20 @@ struct address_space_operations {
prevent redirtying the page, it is kept locked during the whole
operation.
+ is_partially_uptodate: Called by the VM when reading a file through the
+ pagecache when the underlying blocksize != pagesize. If the required
+ block is up to date then the read can complete without needing the IO
+ to bring the whole page up to date.
+
+ is_dirty_writeback: Called by the VM when attempting to reclaim a page.
+ The VM uses dirty and writeback information to determine if it needs
+ to stall to allow flushers a chance to complete some IO. Ordinarily
+ it can use PageDirty and PageWriteback but some filesystems have
+ more complex state (unstable pages in NFS prevent reclaim) or
+ do not set those flags due to locking problems (jbd). This callback
+ allows a filesystem to indicate to the VM if a page should be
+ treated as dirty or writeback for the purposes of stalling.
+
error_remove_page: normally set to generic_error_remove_page if truncation
is ok for this address space. Used for memory failure handling.
Setting this implies you deal with pages going away under you,
@@ -768,7 +797,7 @@ struct file_operations
----------------------
This describes how the VFS can manipulate an open file. As of kernel
-3.5, the following members are defined:
+3.12, the following members are defined:
struct file_operations {
struct module *owner;
@@ -777,7 +806,9 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
- int (*readdir) (struct file *, void *, filldir_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -789,9 +820,6 @@ struct file_operations {
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
- ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
- ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
- ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
@@ -800,6 +828,7 @@ struct file_operations {
ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long arg, struct file_lock **);
long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
+ int (*show_fdinfo)(struct seq_file *m, struct file *f);
};
Again, all methods are called without any locks being held, unless
@@ -809,13 +838,17 @@ otherwise noted.
read: called by read(2) and related system calls
- aio_read: called by io_submit(2) and other asynchronous I/O operations
+ aio_read: vectored, possibly asynchronous read
+
+ read_iter: possibly asynchronous read with iov_iter as destination
write: called by write(2) and related system calls
- aio_write: called by io_submit(2) and other asynchronous I/O operations
+ aio_write: vectored, possibly asynchronous write
+
+ write_iter: possibly asynchronous write with iov_iter as source
- readdir: called when the VFS needs to read the directory contents
+ iterate: called when the VFS needs to read the directory contents
poll: called by the VFS when a process wants to check if there is
activity on this file and (optionally) go to sleep until there
@@ -850,12 +883,6 @@ otherwise noted.
lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
commands
- readv: called by the readv(2) system call
-
- writev: called by the writev(2) system call
-
- sendfile: called by the sendfile(2) system call
-
get_unmapped_area: called by the mmap(2) system call
check_flags: called by the fcntl(2) system call for F_SETFL command
@@ -900,10 +927,9 @@ defined:
struct dentry_operations {
int (*d_revalidate)(struct dentry *, unsigned int);
- int (*d_hash)(const struct dentry *, const struct inode *,
- struct qstr *);
- int (*d_compare)(const struct dentry *, const struct inode *,
- const struct dentry *, const struct inode *,
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
void (*d_release)(struct dentry *);
@@ -915,8 +941,13 @@ struct dentry_operations {
d_revalidate: called when the VFS needs to revalidate a dentry. This
is called whenever a name look-up finds a dentry in the
- dcache. Most filesystems leave this as NULL, because all their
- dentries in the dcache are valid
+ dcache. Most local filesystems leave this as NULL, because all their
+ dentries in the dcache are valid. Network filesystems are different
+ since things can change on the server without the client necessarily
+ being aware of it.
+
+ This function should return a positive value if the dentry is still
+ valid, and zero or a negative error code if it isn't.
d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU).
If in rcu-walk mode, the filesystem must revalidate the dentry without
@@ -927,27 +958,40 @@ struct dentry_operations {
If a situation is encountered that rcu-walk cannot handle, return
-ECHILD and it will be called again in ref-walk mode.
+ d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry.
+ This is called when a path-walk ends at dentry that was not acquired by
+ doing a lookup in the parent directory. This includes "/", "." and "..",
+ as well as procfs-style symlinks and mountpoint traversal.
+
+ In this case, we are less concerned with whether the dentry is still
+ fully correct, but rather that the inode is still valid. As with
+ d_revalidate, most local filesystems will set this to NULL since their
+ dcache entries are always valid.
+
+ This function has the same return code semantics as d_revalidate.
+
+ d_weak_revalidate is only called after leaving rcu-walk mode.
+
d_hash: called when the VFS adds a dentry to the hash table. The first
dentry passed to d_hash is the parent directory that the name is
- to be hashed into. The inode is the dentry's inode.
+ to be hashed into.
Same locking and synchronisation rules as d_compare regarding
what is safe to dereference etc.
d_compare: called to compare a dentry name with a given name. The first
dentry is the parent of the dentry to be compared, the second is
- the parent's inode, then the dentry and inode (may be NULL) of the
- child dentry. len and name string are properties of the dentry to be
- compared. qstr is the name to compare it with.
+ the child dentry. len and name string are properties of the dentry
+ to be compared. qstr is the name to compare it with.
Must be constant and idempotent, and should not take locks if
- possible, and should not or store into the dentry or inodes.
- Should not dereference pointers outside the dentry or inodes without
+ possible, and should not or store into the dentry.
+ Should not dereference pointers outside the dentry without
lots of care (eg. d_parent, d_inode, d_name should not be used).
However, our vfsmount is pinned, and RCU held, so the dentries and
inodes won't disappear, neither will our sb or filesystem module.
- ->i_sb and ->d_sb may be used.
+ ->d_sb may be used.
It is a tricky calling convention because it needs to be called under
"rcu-walk", ie. without any locks or references on things.
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt
new file mode 100644
index 00000000000..05aa455163e
--- /dev/null
+++ b/Documentation/filesystems/xfs-self-describing-metadata.txt
@@ -0,0 +1,350 @@
+XFS Self Describing Metadata
+----------------------------
+
+Introduction
+------------
+
+The largest scalability problem facing XFS is not one of algorithmic
+scalability, but of verification of the filesystem structure. Scalabilty of the
+structures and indexes on disk and the algorithms for iterating them are
+adequate for supporting PB scale filesystems with billions of inodes, however it
+is this very scalability that causes the verification problem.
+
+Almost all metadata on XFS is dynamically allocated. The only fixed location
+metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all
+other metadata structures need to be discovered by walking the filesystem
+structure in different ways. While this is already done by userspace tools for
+validating and repairing the structure, there are limits to what they can
+verify, and this in turn limits the supportable size of an XFS filesystem.
+
+For example, it is entirely possible to manually use xfs_db and a bit of
+scripting to analyse the structure of a 100TB filesystem when trying to
+determine the root cause of a corruption problem, but it is still mainly a
+manual task of verifying that things like single bit errors or misplaced writes
+weren't the ultimate cause of a corruption event. It may take a few hours to a
+few days to perform such forensic analysis, so for at this scale root cause
+analysis is entirely possible.
+
+However, if we scale the filesystem up to 1PB, we now have 10x as much metadata
+to analyse and so that analysis blows out towards weeks/months of forensic work.
+Most of the analysis work is slow and tedious, so as the amount of analysis goes
+up, the more likely that the cause will be lost in the noise. Hence the primary
+concern for supporting PB scale filesystems is minimising the time and effort
+required for basic forensic analysis of the filesystem structure.
+
+
+Self Describing Metadata
+------------------------
+
+One of the problems with the current metadata format is that apart from the
+magic number in the metadata block, we have no other way of identifying what it
+is supposed to be. We can't even identify if it is the right place. Put simply,
+you can't look at a single metadata block in isolation and say "yes, it is
+supposed to be there and the contents are valid".
+
+Hence most of the time spent on forensic analysis is spent doing basic
+verification of metadata values, looking for values that are in range (and hence
+not detected by automated verification checks) but are not correct. Finding and
+understanding how things like cross linked block lists (e.g. sibling
+pointers in a btree end up with loops in them) are the key to understanding what
+went wrong, but it is impossible to tell what order the blocks were linked into
+each other or written to disk after the fact.
+
+Hence we need to record more information into the metadata to allow us to
+quickly determine if the metadata is intact and can be ignored for the purpose
+of analysis. We can't protect against every possible type of error, but we can
+ensure that common types of errors are easily detectable. Hence the concept of
+self describing metadata.
+
+The first, fundamental requirement of self describing metadata is that the
+metadata object contains some form of unique identifier in a well known
+location. This allows us to identify the expected contents of the block and
+hence parse and verify the metadata object. IF we can't independently identify
+the type of metadata in the object, then the metadata doesn't describe itself
+very well at all!
+
+Luckily, almost all XFS metadata has magic numbers embedded already - only the
+AGFL, remote symlinks and remote attribute blocks do not contain identifying
+magic numbers. Hence we can change the on-disk format of all these objects to
+add more identifying information and detect this simply by changing the magic
+numbers in the metadata objects. That is, if it has the current magic number,
+the metadata isn't self identifying. If it contains a new magic number, it is
+self identifying and we can do much more expansive automated verification of the
+metadata object at runtime, during forensic analysis or repair.
+
+As a primary concern, self describing metadata needs some form of overall
+integrity checking. We cannot trust the metadata if we cannot verify that it has
+not been changed as a result of external influences. Hence we need some form of
+integrity check, and this is done by adding CRC32c validation to the metadata
+block. If we can verify the block contains the metadata it was intended to
+contain, a large amount of the manual verification work can be skipped.
+
+CRC32c was selected as metadata cannot be more than 64k in length in XFS and
+hence a 32 bit CRC is more than sufficient to detect multi-bit errors in
+metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is
+fast. So while CRC32c is not the strongest of possible integrity checks that
+could be used, it is more than sufficient for our needs and has relatively
+little overhead. Adding support for larger integrity fields and/or algorithms
+does really provide any extra value over CRC32c, but it does add a lot of
+complexity and so there is no provision for changing the integrity checking
+mechanism.
+
+Self describing metadata needs to contain enough information so that the
+metadata block can be verified as being in the correct place without needing to
+look at any other metadata. This means it needs to contain location information.
+Just adding a block number to the metadata is not sufficient to protect against
+mis-directed writes - a write might be misdirected to the wrong LUN and so be
+written to the "correct block" of the wrong filesystem. Hence location
+information must contain a filesystem identifier as well as a block number.
+
+Another key information point in forensic analysis is knowing who the metadata
+block belongs to. We already know the type, the location, that it is valid
+and/or corrupted, and how long ago that it was last modified. Knowing the owner
+of the block is important as it allows us to find other related metadata to
+determine the scope of the corruption. For example, if we have a extent btree
+object, we don't know what inode it belongs to and hence have to walk the entire
+filesystem to find the owner of the block. Worse, the corruption could mean that
+no owner can be found (i.e. it's an orphan block), and so without an owner field
+in the metadata we have no idea of the scope of the corruption. If we have an
+owner field in the metadata object, we can immediately do top down validation to
+determine the scope of the problem.
+
+Different types of metadata have different owner identifiers. For example,
+directory, attribute and extent tree blocks are all owned by an inode, whilst
+freespace btree blocks are owned by an allocation group. Hence the size and
+contents of the owner field are determined by the type of metadata object we are
+looking at. The owner information can also identify misplaced writes (e.g.
+freespace btree block written to the wrong AG).
+
+Self describing metadata also needs to contain some indication of when it was
+written to the filesystem. One of the key information points when doing forensic
+analysis is how recently the block was modified. Correlation of set of corrupted
+metadata blocks based on modification times is important as it can indicate
+whether the corruptions are related, whether there's been multiple corruption
+events that lead to the eventual failure, and even whether there are corruptions
+present that the run-time verification is not detecting.
+
+For example, we can determine whether a metadata object is supposed to be free
+space or still allocated if it is still referenced by its owner by looking at
+when the free space btree block that contains the block was last written
+compared to when the metadata object itself was last written. If the free space
+block is more recent than the object and the object's owner, then there is a
+very good chance that the block should have been removed from the owner.
+
+To provide this "written timestamp", each metadata block gets the Log Sequence
+Number (LSN) of the most recent transaction it was modified on written into it.
+This number will always increase over the life of the filesystem, and the only
+thing that resets it is running xfs_repair on the filesystem. Further, by use of
+the LSN we can tell if the corrupted metadata all belonged to the same log
+checkpoint and hence have some idea of how much modification occurred between
+the first and last instance of corrupt metadata on disk and, further, how much
+modification occurred between the corruption being written and when it was
+detected.
+
+Runtime Validation
+------------------
+
+Validation of self-describing metadata takes place at runtime in two places:
+
+ - immediately after a successful read from disk
+ - immediately prior to write IO submission
+
+The verification is completely stateless - it is done independently of the
+modification process, and seeks only to check that the metadata is what it says
+it is and that the metadata fields are within bounds and internally consistent.
+As such, we cannot catch all types of corruption that can occur within a block
+as there may be certain limitations that operational state enforces of the
+metadata, or there may be corruption of interblock relationships (e.g. corrupted
+sibling pointer lists). Hence we still need stateful checking in the main code
+body, but in general most of the per-field validation is handled by the
+verifiers.
+
+For read verification, the caller needs to specify the expected type of metadata
+that it should see, and the IO completion process verifies that the metadata
+object matches what was expected. If the verification process fails, then it
+marks the object being read as EFSCORRUPTED. The caller needs to catch this
+error (same as for IO errors), and if it needs to take special action due to a
+verification error it can do so by catching the EFSCORRUPTED error value. If we
+need more discrimination of error type at higher levels, we can define new
+error numbers for different errors as necessary.
+
+The first step in read verification is checking the magic number and determining
+whether CRC validating is necessary. If it is, the CRC32c is calculated and
+compared against the value stored in the object itself. Once this is validated,
+further checks are made against the location information, followed by extensive
+object specific metadata validation. If any of these checks fail, then the
+buffer is considered corrupt and the EFSCORRUPTED error is set appropriately.
+
+Write verification is the opposite of the read verification - first the object
+is extensively verified and if it is OK we then update the LSN from the last
+modification made to the object, After this, we calculate the CRC and insert it
+into the object. Once this is done the write IO is allowed to continue. If any
+error occurs during this process, the buffer is again marked with a EFSCORRUPTED
+error for the higher layers to catch.
+
+Structures
+----------
+
+A typical on-disk structure needs to contain the following information:
+
+struct xfs_ondisk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC, not logged */
+ uuid_t uuid; /* filesystem identifier */
+ __be64 owner; /* parent object */
+ __be64 blkno; /* location on disk */
+ __be64 lsn; /* last modification in log, not logged */
+};
+
+Depending on the metadata, this information may be part of a header structure
+separate to the metadata contents, or may be distributed through an existing
+structure. The latter occurs with metadata that already contains some of this
+information, such as the superblock and AG headers.
+
+Other metadata may have different formats for the information, but the same
+level of information is generally provided. For example:
+
+ - short btree blocks have a 32 bit owner (ag number) and a 32 bit block
+ number for location. The two of these combined provide the same
+ information as @owner and @blkno in eh above structure, but using 8
+ bytes less space on disk.
+
+ - directory/attribute node blocks have a 16 bit magic number, and the
+ header that contains the magic number has other information in it as
+ well. hence the additional metadata headers change the overall format
+ of the metadata.
+
+A typical buffer read verifier is structured as follows:
+
+#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc)
+
+static void
+xfs_foo_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_FOO_CRC_OFF)) ||
+ !xfs_foo_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+The code ensures that the CRC is only checked if the filesystem has CRCs enabled
+by checking the superblock of the feature bit, and then if the CRC verifies OK
+(or is not needed) it verifies the actual contents of the block.
+
+The verifier function will take a couple of different forms, depending on
+whether the magic number can be used to determine the format of the block. In
+the case it can't, the code is structured as follows:
+
+static bool
+xfs_foo_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+
+ if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (bp->b_bn != be64_to_cpu(hdr->blkno))
+ return false;
+ if (hdr->owner == 0)
+ return false;
+ }
+
+ /* object specific verification checks here */
+
+ return true;
+}
+
+If there are different magic numbers for the different formats, the verifier
+will look like:
+
+static bool
+xfs_foo_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+
+ if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
+ if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (bp->b_bn != be64_to_cpu(hdr->blkno))
+ return false;
+ if (hdr->owner == 0)
+ return false;
+ } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
+ return false;
+
+ /* object specific verification checks here */
+
+ return true;
+}
+
+Write verifiers are very similar to the read verifiers, they just do things in
+the opposite order to the read verifiers. A typical write verifier:
+
+static void
+xfs_foo_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_foo_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+
+ if (bip) {
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+ hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF);
+}
+
+This will verify the internal structure of the metadata before we go any
+further, detecting corruptions that have occurred as the metadata has been
+modified in memory. If the metadata verifies OK, and CRCs are enabled, we then
+update the LSN field (when it was last modified) and calculate the CRC on the
+metadata. Once this is done, we can issue the IO.
+
+Inodes and Dquots
+-----------------
+
+Inodes and dquots are special snowflakes. They have per-object CRC and
+self-identifiers, but they are packed so that there are multiple objects per
+buffer. Hence we do not use per-buffer verifiers to do the work of per-object
+verification and CRC calculations. The per-buffer verifiers simply perform basic
+identification of the buffer - that they contain inodes or dquots, and that
+there are magic numbers in all the expected spots. All further CRC and
+verification checks are done when each inode is read from or written back to the
+buffer.
+
+The structure of the verifiers and the identifiers checks is very similar to the
+buffer code described above. The only difference is where they are called. For
+example, inode read verification is done in xfs_iread() when the inode is first
+read out of the buffer and the struct xfs_inode is instantiated. The inode is
+already extensively verified during writeback in xfs_iflush_int, so the only
+addition here is to add the LSN and CRC to the inode as it is copied back into
+the buffer.
+
+XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of
+the unlinked list modifications check or update CRCs, neither during unlink nor
+log recovery. So, it's gone unnoticed until now. This won't matter immediately -
+repair will probably complain about it - but it needs to be fixed.
+
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3e4b3dd1e04..5be51fd888b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -18,6 +18,8 @@ Mount Options
=============
When mounting an XFS filesystem, the following options are accepted.
+For boolean mount options, the names with the (*) suffix is the
+default behaviour.
allocsize=size
Sets the buffered I/O end-of-file preallocation size when
@@ -25,94 +27,128 @@ When mounting an XFS filesystem, the following options are accepted.
Valid values for this option are page size (typically 4KiB)
through to 1GiB, inclusive, in power-of-2 increments.
- attr2/noattr2
- The options enable/disable (default is disabled for backward
- compatibility on-disk) an "opportunistic" improvement to be
- made in the way inline extended attributes are stored on-disk.
- When the new form is used for the first time (by setting or
- removing extended attributes) the on-disk superblock feature
- bit field will be updated to reflect this format being in use.
-
- barrier
- Enables the use of block layer write barriers for writes into
- the journal and unwritten extent conversion. This allows for
- drive level write caching to be enabled, for devices that
- support write barriers.
+ The default behaviour is for dynamic end-of-file
+ preallocation size, which uses a set of heuristics to
+ optimise the preallocation size based on the current
+ allocation patterns within the file and the access patterns
+ to the file. Specifying a fixed allocsize value turns off
+ the dynamic behaviour.
+
+ attr2
+ noattr2
+ The options enable/disable an "opportunistic" improvement to
+ be made in the way inline extended attributes are stored
+ on-disk. When the new form is used for the first time when
+ attr2 is selected (either when setting or removing extended
+ attributes) the on-disk superblock feature bit field will be
+ updated to reflect this format being in use.
+
+ The default behaviour is determined by the on-disk feature
+ bit indicating that attr2 behaviour is active. If either
+ mount option it set, then that becomes the new default used
+ by the filesystem.
+
+ CRC enabled filesystems always use the attr2 format, and so
+ will reject the noattr2 mount option if it is set.
+
+ barrier (*)
+ nobarrier
+ Enables/disables the use of block layer write barriers for
+ writes into the journal and for data integrity operations.
+ This allows for drive level write caching to be enabled, for
+ devices that support write barriers.
discard
- Issue command to let the block device reclaim space freed by the
- filesystem. This is useful for SSD devices, thinly provisioned
- LUNs and virtual machine images, but may have a performance
- impact.
-
- dmapi
- Enable the DMAPI (Data Management API) event callouts.
- Use with the "mtpt" option.
-
- grpid/bsdgroups and nogrpid/sysvgroups
- These options define what group ID a newly created file gets.
- When grpid is set, it takes the group ID of the directory in
- which it is created; otherwise (the default) it takes the fsgid
- of the current process, unless the directory has the setgid bit
- set, in which case it takes the gid from the parent directory,
- and also gets the setgid bit set if it is a directory itself.
-
- ihashsize=value
- In memory inode hashes have been removed, so this option has
- no function as of August 2007. Option is deprecated.
-
- ikeep/noikeep
- When ikeep is specified, XFS does not delete empty inode clusters
- and keeps them around on disk. ikeep is the traditional XFS
- behaviour. When noikeep is specified, empty inode clusters
- are returned to the free space pool. The default is noikeep for
- non-DMAPI mounts, while ikeep is the default when DMAPI is in use.
-
- inode64
- Indicates that XFS is allowed to create inodes at any location
- in the filesystem, including those which will result in inode
- numbers occupying more than 32 bits of significance. This is
- the default allocation option. Applications which do not handle
- inode numbers bigger than 32 bits, should use inode32 option.
+ nodiscard (*)
+ Enable/disable the issuing of commands to let the block
+ device reclaim space freed by the filesystem. This is
+ useful for SSD devices, thinly provisioned LUNs and virtual
+ machine images, but may have a performance impact.
+
+ Note: It is currently recommended that you use the fstrim
+ application to discard unused blocks rather than the discard
+ mount option because the performance impact of this option
+ is quite severe.
+
+ grpid/bsdgroups
+ nogrpid/sysvgroups (*)
+ These options define what group ID a newly created file
+ gets. When grpid is set, it takes the group ID of the
+ directory in which it is created; otherwise it takes the
+ fsgid of the current process, unless the directory has the
+ setgid bit set, in which case it takes the gid from the
+ parent directory, and also gets the setgid bit set if it is
+ a directory itself.
+
+ filestreams
+ Make the data allocator use the filestreams allocation mode
+ across the entire filesystem rather than just on directories
+ configured to use it.
+
+ ikeep
+ noikeep (*)
+ When ikeep is specified, XFS does not delete empty inode
+ clusters and keeps them around on disk. When noikeep is
+ specified, empty inode clusters are returned to the free
+ space pool.
inode32
- Indicates that XFS is limited to create inodes at locations which
- will not result in inode numbers with more than 32 bits of
- significance. This is provided for backwards compatibility, since
- 64 bits inode numbers might cause problems for some applications
- that cannot handle large inode numbers.
-
- largeio/nolargeio
+ inode64 (*)
+ When inode32 is specified, it indicates that XFS limits
+ inode creation to locations which will not result in inode
+ numbers with more than 32 bits of significance.
+
+ When inode64 is specified, it indicates that XFS is allowed
+ to create inodes at any location in the filesystem,
+ including those which will result in inode numbers occupying
+ more than 32 bits of significance.
+
+ inode32 is provided for backwards compatibility with older
+ systems and applications, since 64 bits inode numbers might
+ cause problems for some applications that cannot handle
+ large inode numbers. If applications are in use which do
+ not handle inode numbers bigger than 32 bits, the inode32
+ option should be specified.
+
+
+ largeio
+ nolargeio (*)
If "nolargeio" is specified, the optimal I/O reported in
- st_blksize by stat(2) will be as small as possible to allow user
- applications to avoid inefficient read/modify/write I/O.
- If "largeio" specified, a filesystem that has a "swidth" specified
- will return the "swidth" value (in bytes) in st_blksize. If the
- filesystem does not have a "swidth" specified but does specify
- an "allocsize" then "allocsize" (in bytes) will be returned
- instead.
- If neither of these two options are specified, then filesystem
- will behave as if "nolargeio" was specified.
+ st_blksize by stat(2) will be as small as possible to allow
+ user applications to avoid inefficient read/modify/write
+ I/O. This is typically the page size of the machine, as
+ this is the granularity of the page cache.
+
+ If "largeio" specified, a filesystem that was created with a
+ "swidth" specified will return the "swidth" value (in bytes)
+ in st_blksize. If the filesystem does not have a "swidth"
+ specified but does specify an "allocsize" then "allocsize"
+ (in bytes) will be returned instead. Otherwise the behaviour
+ is the same as if "nolargeio" was specified.
logbufs=value
- Set the number of in-memory log buffers. Valid numbers range
- from 2-8 inclusive.
- The default value is 8 buffers for filesystems with a
- blocksize of 64KiB, 4 buffers for filesystems with a blocksize
- of 32KiB, 3 buffers for filesystems with a blocksize of 16KiB
- and 2 buffers for all other configurations. Increasing the
- number of buffers may increase performance on some workloads
- at the cost of the memory used for the additional log buffers
- and their associated control structures.
+ Set the number of in-memory log buffers. Valid numbers
+ range from 2-8 inclusive.
+
+ The default value is 8 buffers.
+
+ If the memory cost of 8 log buffers is too high on small
+ systems, then it may be reduced at some cost to performance
+ on metadata intensive workloads. The logbsize option below
+ controls the size of each buffer and so is also relevant to
+ this case.
logbsize=value
- Set the size of each in-memory log buffer.
- Size may be specified in bytes, or in kilobytes with a "k" suffix.
- Valid sizes for version 1 and version 2 logs are 16384 (16k) and
- 32768 (32k). Valid sizes for version 2 logs also include
- 65536 (64k), 131072 (128k) and 262144 (256k).
- The default value for machines with more than 32MiB of memory
- is 32768, machines with less memory use 16384 by default.
+ Set the size of each in-memory log buffer. The size may be
+ specified in bytes, or in kilobytes with a "k" suffix.
+ Valid sizes for version 1 and version 2 logs are 16384 (16k)
+ and 32768 (32k). Valid sizes for version 2 logs also
+ include 65536 (64k), 131072 (128k) and 262144 (256k). The
+ logbsize must be an integer multiple of the log
+ stripe unit configured at mkfs time.
+
+ The default value for for version 1 logs is 32768, while the
+ default value for version 2 logs is MAX(32768, log_sunit).
logdev=device and rtdev=device
Use an external log (metadata journal) and/or real-time device.
@@ -121,16 +157,11 @@ When mounting an XFS filesystem, the following options are accepted.
optional, and the log section can be separate from the data
section or contained within it.
- mtpt=mountpoint
- Use with the "dmapi" option. The value specified here will be
- included in the DMAPI mount event, and should be the path of
- the actual mountpoint that is used.
-
noalign
- Data allocations will not be aligned at stripe unit boundaries.
-
- noatime
- Access timestamps are not updated when a file is read.
+ Data allocations will not be aligned at stripe unit
+ boundaries. This is only relevant to filesystems created
+ with non-zero data alignment parameters (sunit, swidth) by
+ mkfs.
norecovery
The filesystem will be mounted without running log recovery.
@@ -141,8 +172,14 @@ When mounting an XFS filesystem, the following options are accepted.
the mount will fail.
nouuid
- Don't check for double mounted file systems using the file system uuid.
- This is useful to mount LVM snapshot volumes.
+ Don't check for double mounted file systems using the file
+ system uuid. This is useful to mount LVM snapshot volumes,
+ and often used in combination with "norecovery" for mounting
+ read-only snapshots.
+
+ noquota
+ Forcibly turns off all quota accounting and enforcement
+ within the filesystem.
uquota/usrquota/uqnoenforce/quota
User disk quota accounting enabled, and limits (optionally)
@@ -157,24 +194,64 @@ When mounting an XFS filesystem, the following options are accepted.
enforced. Refer to xfs_quota(8) for further details.
sunit=value and swidth=value
- Used to specify the stripe unit and width for a RAID device or
- a stripe volume. "value" must be specified in 512-byte block
- units.
- If this option is not specified and the filesystem was made on
- a stripe volume or the stripe width or unit were specified for
- the RAID device at mkfs time, then the mount system call will
- restore the value from the superblock. For filesystems that
- are made directly on RAID devices, these options can be used
- to override the information in the superblock if the underlying
- disk layout changes after the filesystem has been created.
- The "swidth" option is required if the "sunit" option has been
- specified, and must be a multiple of the "sunit" value.
+ Used to specify the stripe unit and width for a RAID device
+ or a stripe volume. "value" must be specified in 512-byte
+ block units. These options are only relevant to filesystems
+ that were created with non-zero data alignment parameters.
+
+ The sunit and swidth parameters specified must be compatible
+ with the existing filesystem alignment characteristics. In
+ general, that means the only valid changes to sunit are
+ increasing it by a power-of-2 multiple. Valid swidth values
+ are any integer multiple of a valid sunit value.
+
+ Typically the only time these mount options are necessary if
+ after an underlying RAID device has had it's geometry
+ modified, such as adding a new disk to a RAID5 lun and
+ reshaping it.
swalloc
Data allocations will be rounded up to stripe width boundaries
when the current end of file is being extended and the file
size is larger than the stripe width size.
+ wsync
+ When specified, all filesystem namespace operations are
+ executed synchronously. This ensures that when the namespace
+ operation (create, unlink, etc) completes, the change to the
+ namespace is on stable storage. This is useful in HA setups
+ where failover must not result in clients seeing
+ inconsistent namespace presentation during or after a
+ failover event.
+
+
+Deprecated Mount Options
+========================
+
+ delaylog/nodelaylog
+ Delayed logging is the only logging method that XFS supports
+ now, so these mount options are now ignored.
+
+ Due for removal in 3.12.
+
+ ihashsize=value
+ In memory inode hashes have been removed, so this option has
+ no function as of August 2007. Option is deprecated.
+
+ Due for removal in 3.12.
+
+ irixsgid
+ This behaviour is now controlled by a sysctl, so the mount
+ option is ignored.
+
+ Due for removal in 3.12.
+
+ osyncisdsync
+ osyncisosync
+ O_SYNC and O_DSYNC are fully supported, so there is no need
+ for these options any more.
+
+ Due for removal in 3.12.
sysctls
=======
@@ -186,15 +263,20 @@ The following sysctls are available for the XFS filesystem:
in /proc/fs/xfs/stat. It then immediately resets to "0".
fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
- The interval at which the xfssyncd thread flushes metadata
- out to disk. This thread will flush log activity out, and
- do some processing on unlinked inodes.
+ The interval at which the filesystem flushes metadata
+ out to disk and runs internal cache cleanup routines.
- fs.xfs.xfsbufd_centisecs (Min: 50 Default: 100 Max: 3000)
- The interval at which xfsbufd scans the dirty metadata buffers list.
+ fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000)
+ The interval at which the filesystem ages filestreams cache
+ references and returns timed-out AGs back to the free stream
+ pool.
- fs.xfs.age_buffer_centisecs (Min: 100 Default: 1500 Max: 720000)
- The age at which xfsbufd flushes dirty metadata buffers to disk.
+ fs.xfs.speculative_prealloc_lifetime
+ (Units: seconds Min: 1 Default: 300 Max: 86400)
+ The interval at which the background scanning for inodes
+ with unused speculative preallocation runs. The scan
+ removes unused preallocation from clean inodes and releases
+ the unused space back to the free pool.
fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
A volume knob for error reporting when internal errors occur.
@@ -251,9 +333,31 @@ The following sysctls are available for the XFS filesystem:
by the xfs_io(8) chattr command on a directory to be
inherited by files in that directory.
+ fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nodefrag" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
In "inode32" allocation mode, this option determines how many
files the allocator attempts to allocate in the same allocation
group before moving to the next allocation group. The intent
is to control the rate at which the allocator moves between
allocation groups when allocating extents for new files.
+
+Deprecated Sysctls
+==================
+
+ fs.xfs.xfsbufd_centisecs (Min: 50 Default: 100 Max: 3000)
+ Dirty metadata is now tracked by the log subsystem and
+ flushing is driven by log space and idling demands. The
+ xfsbufd no longer exists, so this syctl does nothing.
+
+ Due for removal in 3.14.
+
+ fs.xfs.age_buffer_centisecs (Min: 100 Default: 1500 Max: 720000)
+ Dirty metadata is now tracked by the log subsystem and
+ flushing is driven by log space and idling demands. The
+ xfsbufd no longer exists, so this syctl does nothing.
+
+ Due for removal in 3.14.