aboutsummaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX60
-rw-r--r--Documentation/filesystems/9p.txt17
-rw-r--r--Documentation/filesystems/Locking113
-rw-r--r--Documentation/filesystems/affs.txt9
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/btrfs.txt225
-rw-r--r--Documentation/filesystems/caching/backend-api.txt47
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt150
-rw-r--r--Documentation/filesystems/caching/object.txt23
-rw-r--r--Documentation/filesystems/caching/operations.txt2
-rw-r--r--Documentation/filesystems/cifs.txt51
-rw-r--r--Documentation/filesystems/cifs/AUTHORS56
-rw-r--r--Documentation/filesystems/cifs/CHANGES1065
-rw-r--r--Documentation/filesystems/cifs/README753
-rw-r--r--Documentation/filesystems/cifs/TODO129
-rw-r--r--Documentation/filesystems/cifs/cifs.txt31
-rwxr-xr-xDocumentation/filesystems/cifs/winucase_convert.pl62
-rw-r--r--Documentation/filesystems/debugfs.txt7
-rw-r--r--Documentation/filesystems/directory-locking31
-rw-r--r--Documentation/filesystems/efivarfs.txt16
-rw-r--r--Documentation/filesystems/ext3.txt13
-rw-r--r--Documentation/filesystems/ext4.txt61
-rw-r--r--Documentation/filesystems/f2fs.txt545
-rw-r--r--Documentation/filesystems/files.txt4
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt119
-rw-r--r--Documentation/filesystems/gfs2-uevents.txt2
-rw-r--r--Documentation/filesystems/gfs2.txt9
-rw-r--r--Documentation/filesystems/hfsplus.txt2
-rw-r--r--Documentation/filesystems/jfs.txt19
-rw-r--r--Documentation/filesystems/nfs/00-INDEX4
-rw-r--r--Documentation/filesystems/nfs/Exporting2
-rw-r--r--Documentation/filesystems/nfs/idmapper.txt20
-rw-r--r--Documentation/filesystems/nfs/nfs.txt44
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt60
-rw-r--r--Documentation/filesystems/nfs/nfsd-admin-interfaces.txt41
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt10
-rw-r--r--Documentation/filesystems/nfs/pnfs.txt56
-rw-r--r--Documentation/filesystems/nfs/rpc-server-gss.txt91
-rw-r--r--Documentation/filesystems/nilfs2.txt68
-rw-r--r--Documentation/filesystems/ntfs.txt2
-rw-r--r--Documentation/filesystems/pohmelfs/network_protocol.txt2
-rw-r--r--Documentation/filesystems/porting74
-rw-r--r--Documentation/filesystems/proc.txt257
-rw-r--r--Documentation/filesystems/qnx6.txt174
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt6
-rw-r--r--Documentation/filesystems/relay.txt2
-rw-r--r--Documentation/filesystems/seq_file.txt9
-rw-r--r--Documentation/filesystems/sharedsubtree.txt2
-rw-r--r--Documentation/filesystems/sysfs-tagging.txt2
-rw-r--r--Documentation/filesystems/sysfs.txt6
-rw-r--r--Documentation/filesystems/vfat.txt41
-rw-r--r--Documentation/filesystems/vfs.txt205
-rw-r--r--Documentation/filesystems/xfs-self-describing-metadata.txt350
-rw-r--r--Documentation/filesystems/xfs.txt319
54 files changed, 4950 insertions, 520 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8c624a18f67..ac28149aede 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -2,6 +2,8 @@
- this file (info on some of the filesystems supported by linux).
Locking
- info on locking rules as they pertain to Linux VFS.
+Makefile
+ - Makefile for building the filsystems-part of DocBook.
9p.txt
- 9p (v9fs) is an implementation of the Plan 9 remote fs protocol.
adfs.txt
@@ -10,24 +12,32 @@ afs.txt
- info and examples for the distributed AFS (Andrew File System) fs.
affs.txt
- info and mount options for the Amiga Fast File System.
+autofs4-mount-control.txt
+ - info on device control operations for autofs4 module.
automount-support.txt
- information about filesystem automount support.
befs.txt
- information about the BeOS filesystem for Linux.
bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS).
+btrfs.txt
+ - info for the BTRFS filesystem.
+caching/
+ - directory containing filesystem cache documentation.
ceph.txt
- - info for the Ceph Distributed File System
-cifs.txt
- - description of the CIFS filesystem.
+ - info for the Ceph Distributed File System.
+cifs/
+ - directory containing CIFS filesystem documentation and example code.
coda.txt
- description of the CODA filesystem.
configfs/
- directory containing configfs documentation and example code.
cramfs.txt
- info on the cram filesystem for small storage (ROMs etc).
-dentry-locking.txt
- - info on the RCU-based dcache locking model.
+debugfs.txt
+ - info on the debugfs filesystem.
+devpts.txt
+ - info on the devpts filesystem.
directory-locking
- info about the locking scheme used for directory operations.
dlmfs.txt
@@ -35,9 +45,11 @@ dlmfs.txt
dnotify.txt
- info about directory notification in Linux.
dnotify_test.c
- - example program for dnotify
+ - example program for dnotify.
ecryptfs.txt
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
+efivarfs.txt
+ - info for the efivarfs filesystem.
exofs.txt
- info, usage, mount options, design about EXOFS.
ext2.txt
@@ -46,10 +58,18 @@ ext3.txt
- info, mount options and specifications for the Ext3 filesystem.
ext4.txt
- info, mount options and specifications for the Ext4 filesystem.
+f2fs.txt
+ - info and mount options for the F2FS filesystem.
+fiemap.txt
+ - info on fiemap ioctl.
files.txt
- info on file management in the Linux kernel.
fuse.txt
- info on the Filesystem in User SpacE including mount options.
+gfs2-glocks.txt
+ - info on the Global File System 2 - Glock internal locking rules.
+gfs2-uevents.txt
+ - info on the Global File System 2 - uevents.
gfs2.txt
- info on the Global File System 2.
hfs.txt
@@ -80,40 +100,58 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
ocfs2.txt
- info and mount options for the OCFS2 clustered filesystem.
+omfs.txt
+ - info on the Optimized MPEG FileSystem.
+path-lookup.txt
+ - info on path walking and name lookup locking.
+pohmelfs/
+ - directory containing pohmelfs filesystem documentation.
porting
- various information on filesystem porting.
proc.txt
- info on Linux's /proc filesystem.
+qnx6.txt
+ - info on the QNX6 filesystem.
+quota.txt
+ - info on Quota subsystem.
ramfs-rootfs-initramfs.txt
- info on the 'in memory' filesystems ramfs, rootfs and initramfs.
-reiser4.txt
- - info on the Reiser4 filesystem based on dancing tree algorithms.
relay.txt
- info on relay, for efficient streaming from kernel to user space.
romfs.txt
- description of the ROMFS filesystem.
seq_file.txt
- - how to use the seq_file API
+ - how to use the seq_file API.
sharedsubtree.txt
- a description of shared subtrees for namespaces.
spufs.txt
- info and mount options for the SPU filesystem used on Cell.
+squashfs.txt
+ - info on the squashfs filesystem.
sysfs-pci.txt
- info on accessing PCI device resources through sysfs.
+sysfs-tagging.txt
+ - info on sysfs tagging to avoid duplicates.
sysfs.txt
- info on sysfs, a ram-based filesystem for exporting kernel objects.
sysv-fs.txt
- info on the SystemV/V7/Xenix/Coherent filesystem.
tmpfs.txt
- info on tmpfs, a filesystem that holds all files in virtual memory.
+ubifs.txt
+ - info on the Unsorted Block Images FileSystem.
udf.txt
- info and mount options for the UDF filesystem.
ufs.txt
- info on the ufs filesystem.
vfat.txt
- - info on using the VFAT filesystem used in Windows NT and Windows 95
+ - info on using the VFAT filesystem used in Windows NT and Windows 95.
vfs.txt
- - overview of the Virtual File System
+ - overview of the Virtual File System.
+xfs-delayed-logging-design.txt
+ - info on the XFS Delayed Logging Design.
+xfs-self-describing-metadata.txt
+ - info on XFS Self Describing Metadata.
xfs.txt
- info and mount options for the XFS filesystem.
xip.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index 2c032144284..fec7144e817 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -69,10 +69,14 @@ OPTIONS
offering several exported file systems.
cache=mode specifies a caching policy. By default, no caches are used.
+ none = default no cache policy, metadata and data
+ alike are synchronous.
loose = no attempts are made at consistency,
intended for exclusive, read-only mounts
- fscache = use FS-Cache for a persistent, read-only
+ fscache = use FS-Cache for a persistent, read-only
cache backend.
+ mmap = minimal cache that is only used for read-write
+ mmap. Northing else is cached, like cache=none
debug=n specifies debug level. The debug level is a bitmask.
0x01 = display verbose error messages
@@ -147,8 +151,7 @@ on sourceforge (http://sourceforge.net/projects/v9fs).
News and other information is maintained on a Wiki.
(http://sf.net/apps/mediawiki/v9fs/index.php).
-Bug reports may be issued through the kernel.org bugzilla
-(http://bugzilla.kernel.org)
+Bug reports are best issued via the mailing list.
For more information on the Plan 9 Operating System check out
http://plan9.bell-labs.com/plan9
@@ -156,11 +159,3 @@ http://plan9.bell-labs.com/plan9
For information on Plan 9 from User Space (Plan 9 applications and libraries
ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9
-
-STATUS
-======
-
-The 2.6 kernel support is working on PPC and x86.
-
-PLEASE USE THE KERNEL BUGZILLA TO REPORT PROBLEMS. (http://bugzilla.kernel.org)
-
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4fca82e5276..b18dd177902 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -9,11 +9,10 @@ be able to use diff(1).
--------------------------- dentry_operations --------------------------
prototypes:
- int (*d_revalidate)(struct dentry *, struct nameidata *);
- int (*d_hash)(const struct dentry *, const struct inode *,
- struct qstr *);
- int (*d_compare)(const struct dentry *, const struct inode *,
- const struct dentry *, const struct inode *,
+ int (*d_revalidate)(struct dentry *, unsigned int);
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
@@ -25,6 +24,7 @@ prototypes:
locking rules:
rename_lock ->d_lock may block rcu-walk
d_revalidate: no no yes (ref-walk) maybe
+d_weak_revalidate:no no yes no
d_hash no no no maybe
d_compare: yes no no maybe
d_delete: no yes no no
@@ -37,9 +37,8 @@ d_manage: no no yes (ref-walk) maybe
--------------------------- inode_operations ---------------------------
prototypes:
- int (*create) (struct inode *,struct dentry *,umode_t, struct nameidata *);
- struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid
-ata *);
+ int (*create) (struct inode *,struct dentry *,umode_t, bool);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
@@ -48,6 +47,8 @@ ata *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
+ int (*rename2) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
void * (*follow_link) (struct dentry *, struct nameidata *);
void (*put_link) (struct dentry *, struct nameidata *, void *);
@@ -60,8 +61,12 @@ ata *);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
- void (*truncate_range)(struct inode *, loff_t, loff_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+ void (*update_time)(struct inode *, struct timespec *, int);
+ int (*atomic_open)(struct inode *, struct dentry *,
+ struct file *, unsigned open_flag,
+ umode_t create_mode, int *opened);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
locking rules:
all may block
@@ -75,10 +80,10 @@ mkdir: yes
unlink: yes (both)
rmdir: yes (both) (see below)
rename: yes (all) (see below)
+rename2: yes (all) (see below)
readlink: no
follow_link: no
put_link: no
-truncate: yes (see below)
setattr: yes
permission: no (may not block if called in rcu-walk mode)
get_acl: no
@@ -87,16 +92,15 @@ setxattr: yes
getxattr: no
listxattr: no
removexattr: yes
-truncate_range: yes
fiemap: no
+update_time: no
+atomic_open: yes
+tmpfile: no
+
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
- cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
- ->truncate() is never called directly - it's a callback, not a
-method. It's called by vmtruncate() - deprecated library function used by
-->setattr(). Locking information above applies to that call (i.e. is
-inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
-passed).
+ cross-directory ->rename() and rename2() has (per-superblock)
+->s_vfs_rename_sem.
See Documentation/filesystems/directory-locking for more detailed discussion
of the locking scheme for directory operations.
@@ -110,7 +114,6 @@ prototypes:
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
- void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_fs) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
@@ -132,10 +135,9 @@ write_inode:
drop_inode: !!!inode->i_lock!!!
evict_inode:
put_super: write
-write_super: read
sync_fs: read
-freeze_fs: read
-unfreeze_fs: read
+freeze_fs: write
+unfreeze_fs: write
statfs: maybe(read) (see below)
remount_fs: write
umount_begin: no
@@ -191,17 +193,18 @@ prototypes:
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
- int (*invalidatepage) (struct page *, unsigned long);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
- int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
+ int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
unsigned long *);
int (*migratepage)(struct address_space *, struct page *, struct page *);
int (*launder_page)(struct page *);
- int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+ int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
+ int (*swap_activate)(struct file *);
+ int (*swap_deactivate)(struct file *);
locking rules:
All except set_page_dirty and freepage may block
@@ -225,6 +228,8 @@ migratepage: yes (both)
launder_page: yes
is_partially_uptodate: yes
error_remove_page: yes
+swap_activate: no
+swap_deactivate: no
->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
@@ -308,8 +313,8 @@ filesystems and by the swapper. The latter will eventually go away. Please,
keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated. It
-returns zero on success. If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
block_invalidatepage() instead.
->releasepage() is called when the kernel is about to try to drop the
@@ -326,6 +331,15 @@ cleaned, or an error value if not. Note that in order to prevent the page
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.
+ ->swap_activate will be called with a non-zero argument on
+files backing (non block device backed) swapfiles. A return value
+of zero indicates success, in which case this file can be used for
+backing swapspace. The swapspace operations will be proxied to the
+address space operations.
+
+ ->swap_deactivate() will be called in the sys_swapoff()
+path after ->swap_activate() returned success.
+
----------------------- file_lock_operations ------------------------------
prototypes:
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
@@ -333,27 +347,38 @@ prototypes:
locking rules:
- file_lock_lock may block
+ inode->i_lock may block
fl_copy_lock: yes no
fl_release_private: maybe no
----------------------- lock_manager_operations ---------------------------
prototypes:
int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+ unsigned long (*lm_owner_key)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
int (*lm_grant)(struct file_lock *, struct file_lock *, int);
- void (*lm_release_private)(struct file_lock *);
void (*lm_break)(struct file_lock *); /* break_lease callback */
int (*lm_change)(struct file_lock **, int);
locking rules:
- file_lock_lock may block
-lm_compare_owner: yes no
-lm_notify: yes no
-lm_grant: no no
-lm_release_private: maybe no
-lm_break: yes no
-lm_change yes no
+
+ inode->i_lock blocked_lock_lock may block
+lm_compare_owner: yes[1] maybe no
+lm_owner_key yes[1] yes no
+lm_notify: yes yes no
+lm_grant: no no no
+lm_break: yes no no
+lm_change yes no no
+
+[1]: ->lm_compare_owner and ->lm_owner_key are generally called with
+*an* inode->i_lock held. It may not be the i_lock of the inode
+associated with either file_lock argument! This is the case with deadlock
+detection, since the code has to chase down the owners of locks that may
+be entirely unrelated to the one on which the lock is being acquired.
+For deadlock detection however, the blocked_lock_lock is also held. The
+fact that these locks are held ensures that the file_locks do not
+disappear out from under you while doing the comparison or generating an
+owner key.
--------------------------- buffer_head -----------------------------------
prototypes:
@@ -405,7 +430,9 @@ prototypes:
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
- int (*readdir) (struct file *, void *, filldir_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -503,6 +530,7 @@ locking rules:
open: yes
close: yes
fault: yes can return with page locked
+map_pages: yes
page_mkwrite: yes can return with page locked
access: yes
@@ -514,6 +542,15 @@ the page, then ensure it is not already truncated (the page lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page.
+ ->map_pages() is called when VM asks to map easy accessible pages.
+Filesystem should find and map pages associated with offsets from "pgoff"
+till "max_pgoff". ->map_pages() is called with page table locked and must
+not block. If it's not possible to reach a page without blocking,
+filesystem should skip it. Filesystem should use do_set_pte() to setup
+page table entry. Pointer to entry associated with offset "pgoff" is
+passed in "pte" field in vm_fault structure. Pointers to entries for other
+offsets should be calculated relative to "pte".
+
->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are
no truncate/invalidate races, and then return with the page locked. If
@@ -522,7 +559,7 @@ like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
will cause the VM to retry the fault.
->access() is called when get_user_pages() fails in
-acces_process_vm(), typically used to debug a process through
+access_process_vm(), typically used to debug a process through
/proc/pid/mem or ptrace. This function is needed only for
VM_IO | VM_PFNMAP VMAs.
diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt
index 81ac488e375..71b63c2b984 100644
--- a/Documentation/filesystems/affs.txt
+++ b/Documentation/filesystems/affs.txt
@@ -49,6 +49,10 @@ mode=mode Sets the mode flags to the given (octal) value, regardless
This is useful since most of the plain AmigaOS files
will map to 600.
+nofilenametruncate
+ The file system will return an error when filename exceeds
+ standard maximum filename length (30 characters).
+
reserved=num Sets the number of reserved blocks at the start of the
partition to num. You should never need this option.
Default is 2.
@@ -181,9 +185,8 @@ tested, though several hundred MB have been read and written using
this fs. For a most up-to-date list of bugs please consult
fs/affs/Changes.
-Filenames are truncated to 30 characters without warning (this
-can be changed by setting the compile-time option AFFS_NO_TRUNCATE
-in include/linux/amigaffs.h).
+By default, filenames are truncated to 30 characters without warning.
+'nofilenametruncate' mount option can change that behavior.
Case is ignored by the affs in filename matching, but Linux shells
do care about the case. Example (with /wb being an affs mounted fs):
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index 4c95935cbcf..aff22113a98 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -255,7 +255,7 @@ AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
Obtain and release a file descriptor for an autofs managed mount point
path. The open call requires an initialized struct autofs_dev_ioctl with
-the the path field set and the size field adjusted appropriately as well
+the path field set and the size field adjusted appropriately as well
as the arg1 field set to the device number of the autofs mount. The
device number can be obtained from the mount options shown in
/proc/mounts. The close call requires an initialized struct
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index 7671352216f..d11cc2f8077 100644
--- a/Documentation/filesystems/btrfs.txt
+++ b/Documentation/filesystems/btrfs.txt
@@ -1,8 +1,8 @@
- BTRFS
- =====
+BTRFS
+=====
-Btrfs is a new copy on write filesystem for Linux aimed at
+Btrfs is a copy on write filesystem for Linux aimed at
implementing advanced features while focusing on fault tolerance,
repair and easy administration. Initially developed by Oracle, Btrfs
is licensed under the GPL and open for contribution from anyone.
@@ -34,9 +34,198 @@ The main Btrfs features include:
* Online filesystem defragmentation
+Mount Options
+=============
- MAILING LIST
- ============
+When mounting a btrfs filesystem, the following option are accepted.
+Options with (*) are default options and will not show in the mount options.
+
+ alloc_start=<bytes>
+ Debugging option to force all block allocations above a certain
+ byte threshold on each block device. The value is specified in
+ bytes, optionally with a K, M, or G suffix, case insensitive.
+ Default is 1MB.
+
+ noautodefrag(*)
+ autodefrag
+ Disable/enable auto defragmentation.
+ Auto defragmentation detects small random writes into files and queue
+ them up for the defrag process. Works best for small files;
+ Not well suited for large database workloads.
+
+ check_int
+ check_int_data
+ check_int_print_mask=<value>
+ These debugging options control the behavior of the integrity checking
+ module (the BTRFS_FS_CHECK_INTEGRITY config option required).
+
+ check_int enables the integrity checker module, which examines all
+ block write requests to ensure on-disk consistency, at a large
+ memory and CPU cost.
+
+ check_int_data includes extent data in the integrity checks, and
+ implies the check_int option.
+
+ check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values
+ as defined in fs/btrfs/check-integrity.c, to control the integrity
+ checker module behavior.
+
+ See comments at the top of fs/btrfs/check-integrity.c for more info.
+
+ commit=<seconds>
+ Set the interval of periodic commit, 30 seconds by default. Higher
+ values defer data being synced to permanent storage with obvious
+ consequences when the system crashes. The upper bound is not forced,
+ but a warning is printed if it's more than 300 seconds (5 minutes).
+
+ compress
+ compress=<type>
+ compress-force
+ compress-force=<type>
+ Control BTRFS file data compression. Type may be specified as "zlib"
+ "lzo" or "no" (for no compression, used for remounting). If no type
+ is specified, zlib is used. If compress-force is specified,
+ all files will be compressed, whether or not they compress well.
+ If compression is enabled, nodatacow and nodatasum are disabled.
+
+ degraded
+ Allow mounts to continue with missing devices. A read-write mount may
+ fail with too many devices missing, for example if a stripe member
+ is completely missing.
+
+ device=<devicepath>
+ Specify a device during mount so that ioctls on the control device
+ can be avoided. Especially useful when trying to mount a multi-device
+ setup as root. May be specified multiple times for multiple devices.
+
+ nodiscard(*)
+ discard
+ Disable/enable discard mount option.
+ Discard issues frequent commands to let the block device reclaim space
+ freed by the filesystem.
+ This is useful for SSD devices, thinly provisioned
+ LUNs and virtual machine images, but may have a significant
+ performance impact. (The fstrim command is also available to
+ initiate batch trims from userspace).
+
+ noenospc_debug(*)
+ enospc_debug
+ Disable/enable debugging option to be more verbose in some ENOSPC conditions.
+
+ fatal_errors=<action>
+ Action to take when encountering a fatal error:
+ "bug" - BUG() on a fatal error. This is the default.
+ "panic" - panic() on a fatal error.
+
+ noflushoncommit(*)
+ flushoncommit
+ The 'flushoncommit' mount option forces any data dirtied by a write in a
+ prior transaction to commit as part of the current commit. This makes
+ the committed state a fully consistent view of the file system from the
+ application's perspective (i.e., it includes all completed file system
+ operations). This was previously the behavior only when a snapshot is
+ created.
+
+ inode_cache
+ Enable free inode number caching. Defaults to off due to an overflow
+ problem when the free space crcs don't fit inside a single page.
+
+ max_inline=<bytes>
+ Specify the maximum amount of space, in bytes, that can be inlined in
+ a metadata B-tree leaf. The value is specified in bytes, optionally
+ with a K, M, or G suffix, case insensitive. In practice, this value
+ is limited by the root sector size, with some space unavailable due
+ to leaf headers. For a 4k sectorsize, max inline data is ~3900 bytes.
+
+ metadata_ratio=<value>
+ Specify that 1 metadata chunk should be allocated after every <value>
+ data chunks. Off by default.
+
+ acl(*)
+ noacl
+ Enable/disable support for Posix Access Control Lists (ACLs). See the
+ acl(5) manual page for more information about ACLs.
+
+ barrier(*)
+ nobarrier
+ Enable/disable the use of block layer write barriers. Write barriers
+ ensure that certain IOs make it through the device cache and are on
+ persistent storage. If disabled on a device with a volatile
+ (non-battery-backed) write-back cache, nobarrier option will lead to
+ filesystem corruption on a system crash or power loss.
+
+ datacow(*)
+ nodatacow
+ Enable/disable data copy-on-write for newly created files.
+ Nodatacow implies nodatasum, and disables all compression.
+
+ datasum(*)
+ nodatasum
+ Enable/disable data checksumming for newly created files.
+ Datasum implies datacow.
+
+ treelog(*)
+ notreelog
+ Enable/disable the tree logging used for fsync and O_SYNC writes.
+
+ recovery
+ Enable autorecovery attempts if a bad tree root is found at mount time.
+ Currently this scans a list of several previous tree roots and tries to
+ use the first readable.
+
+ rescan_uuid_tree
+ Force check and rebuild procedure of the UUID tree. This should not
+ normally be needed.
+
+ skip_balance
+ Skip automatic resume of interrupted balance operation after mount.
+ May be resumed with "btrfs balance resume."
+
+ space_cache (*)
+ Enable the on-disk freespace cache.
+ nospace_cache
+ Disable freespace cache loading without clearing the cache.
+ clear_cache
+ Force clearing and rebuilding of the disk space cache if something
+ has gone wrong.
+
+ ssd
+ nossd
+ ssd_spread
+ Options to control ssd allocation schemes. By default, BTRFS will
+ enable or disable ssd allocation heuristics depending on whether a
+ rotational or nonrotational disk is in use. The ssd and nossd options
+ can override this autodetection.
+
+ The ssd_spread mount option attempts to allocate into big chunks
+ of unused space, and may perform better on low-end ssds. ssd_spread
+ implies ssd, enabling all other ssd heuristics as well.
+
+ subvol=<path>
+ Mount subvolume at <path> rather than the root subvolume. <path> is
+ relative to the top level subvolume.
+
+ subvolid=<ID>
+ Mount subvolume specified by an ID number rather than the root subvolume.
+ This allows mounting of subvolumes which are not in the root of the mounted
+ filesystem.
+ You can use "btrfs subvolume list" to see subvolume ID numbers.
+
+ subvolrootid=<objectid> (deprecated)
+ Mount subvolume specified by <objectid> rather than the root subvolume.
+ This allows mounting of subvolumes which are not in the root of the mounted
+ filesystem.
+ You can use "btrfs subvolume show " to see the object ID for a subvolume.
+
+ thread_pool=<number>
+ The number of worker threads to allocate. The default number is equal
+ to the number of CPUs + 2, or 8, whichever is smaller.
+
+ user_subvol_rm_allowed
+ Allow subvolumes to be deleted by a non-root user. Use with caution.
+
+MAILING LIST
+============
There is a Btrfs mailing list hosted on vger.kernel.org. You can
find details on how to subscribe here:
@@ -49,8 +238,8 @@ http://dir.gmane.org/gmane.comp.file-systems.btrfs
- IRC
- ===
+IRC
+===
Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
IRC network.
@@ -68,24 +257,14 @@ available from the git repository at the following location:
These include the following tools:
-mkfs.btrfs: create a filesystem
-
-btrfsctl: control program to create snapshots and subvolumes:
+* mkfs.btrfs: create a filesystem
- mount /dev/sda2 /mnt
- btrfsctl -s new_subvol_name /mnt
- btrfsctl -s snapshot_of_default /mnt/default
- btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
- btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
- ls /mnt
- default snapshot_of_a_snapshot snapshot_of_new_subvol
- new_subvol_name snapshot_of_default
+* btrfs: a single tool to manage the filesystems, refer to the manpage for more details
- Snapshots and subvolumes cannot be deleted right now, but you can
- rm -rf all the files and directories inside them.
+* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem
-btrfsck: do a limited check of the FS extent trees.
+Other tools for specific tasks:
-btrfs-debug-tree: print all of the FS metadata in text form. Example:
+* btrfs-convert: in-place conversion from ext2/3/4 filesystems
- btrfs-debug-tree /dev/sda2 >& big_output_file
+* btrfs-image: dump filesystem metadata for debugging
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
index 382d52cdaf2..277d1e81067 100644
--- a/Documentation/filesystems/caching/backend-api.txt
+++ b/Documentation/filesystems/caching/backend-api.txt
@@ -299,6 +299,15 @@ performed on the denizens of the cache. These are held in a structure of type:
enough space in the cache to permit this.
+ (*) Check coherency state of an object [mandatory]:
+
+ int (*check_consistency)(struct fscache_object *object)
+
+ This method is called to have the cache check the saved auxiliary data of
+ the object against the netfs's idea of the state. 0 should be returned
+ if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS
+ may also be returned.
+
(*) Update object [mandatory]:
int (*update_object)(struct fscache_object *object)
@@ -308,6 +317,18 @@ performed on the denizens of the cache. These are held in a structure of type:
obtained by calling object->cookie->def->get_aux()/get_attr().
+ (*) Invalidate data object [mandatory]:
+
+ int (*invalidate_object)(struct fscache_operation *op)
+
+ This is called to invalidate a data object (as pointed to by op->object).
+ All the data stored for this object should be discarded and an
+ attr_changed operation should be performed. The caller will follow up
+ with an object update operation.
+
+ fscache_op_complete() must be called on op before returning.
+
+
(*) Discard object [mandatory]:
void (*drop_object)(struct fscache_object *object)
@@ -419,7 +440,10 @@ performed on the denizens of the cache. These are held in a structure of type:
If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
returned if possible or fscache_end_io() called with a suitable error
- code..
+ code.
+
+ fscache_put_retrieval() should be called after a page or pages are dealt
+ with. This will complete the operation when all pages are dealt with.
(*) Request pages be read from cache [mandatory]:
@@ -526,6 +550,27 @@ FS-Cache provides some utilities that a cache backend may make use of:
error value should be 0 if successful and an error otherwise.
+ (*) Record that one or more pages being retrieved or allocated have been dealt
+ with:
+
+ void fscache_retrieval_complete(struct fscache_retrieval *op,
+ int n_pages);
+
+ This is called to record the fact that one or more pages have been dealt
+ with and are no longer the concern of this operation. When the number of
+ pages remaining in the operation reaches 0, the operation will be
+ completed.
+
+
+ (*) Record operation completion:
+
+ void fscache_op_complete(struct fscache_operation *op);
+
+ This is called to record the completion of an operation. This deducts
+ this operation from the parent object's run state, potentially permitting
+ one or more pending operations to start running.
+
+
(*) Set highest store limit:
void fscache_set_store_limit(struct fscache_object *object,
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 7cc6bf2871e..aed6b94160b 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -29,14 +29,16 @@ This document contains the following sections:
(6) Index registration
(7) Data file registration
(8) Miscellaneous object registration
- (9) Setting the data file size
+ (9) Setting the data file size
(10) Page alloc/read/write
(11) Page uncaching
- (12) Index and data file update
- (13) Miscellaneous cookie operations
- (14) Cookie unregistration
- (15) Index and data file invalidation
- (16) FS-Cache specific page flags.
+ (12) Index and data file consistency
+ (13) Cookie enablement
+ (14) Miscellaneous cookie operations
+ (15) Cookie unregistration
+ (16) Index invalidation
+ (17) Data file invalidation
+ (18) FS-Cache specific page flags.
=============================
@@ -333,7 +335,8 @@ the path to the file:
struct fscache_cookie *
fscache_acquire_cookie(struct fscache_cookie *parent,
const struct fscache_object_def *def,
- void *netfs_data);
+ void *netfs_data,
+ bool enable);
This function creates an index entry in the index represented by parent,
filling in the index entry by calling the operations pointed to by def.
@@ -349,6 +352,10 @@ object needs to be created somewhere down the hierarchy. Furthermore, an index
may be created in several different caches independently at different times.
This is all handled transparently, and the netfs doesn't see any of it.
+A cookie will be created in the disabled state if enabled is false. A cookie
+must be enabled to do anything with it. A disabled cookie can be enabled by
+calling fscache_enable_cookie() (see below).
+
For example, with AFS, a cell would be added to the primary index. This index
entry would have a dependent inode containing a volume location index for the
volume mappings within this cell:
@@ -356,7 +363,7 @@ volume mappings within this cell:
cell->cache =
fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell);
+ cell, true);
Then when a volume location was accessed, it would be entered into the cell's
index and an inode would be allocated that acts as a volume type and hash chain
@@ -365,7 +372,7 @@ combination:
vlocation->cache =
fscache_acquire_cookie(cell->cache,
&afs_vlocation_cache_index_def,
- vlocation);
+ vlocation, true);
And then a particular flavour of volume (R/O for example) could be added to
that index, creating another index for vnodes (AFS inode equivalents):
@@ -373,7 +380,7 @@ that index, creating another index for vnodes (AFS inode equivalents):
volume->cache =
fscache_acquire_cookie(vlocation->cache,
&afs_volume_cache_index_def,
- volume);
+ volume, true);
======================
@@ -387,7 +394,7 @@ the object definition should be something other than index type.
vnode->cache =
fscache_acquire_cookie(volume->cache,
&afs_vnode_cache_object_def,
- vnode);
+ vnode, true);
=================================
@@ -403,7 +410,7 @@ it would be some other type of object such as a data file.
xattr->cache =
fscache_acquire_cookie(vnode->cache,
&afs_xattr_cache_object_def,
- xattr);
+ xattr, true);
Miscellaneous objects might be used to store extended attributes or directory
entries for example.
@@ -432,7 +439,7 @@ to the caller. The attribute adjustment excludes read and write operations.
=====================
-PAGE READ/ALLOC/WRITE
+PAGE ALLOC/READ/WRITE
=====================
And the sixth step is to store and retrieve pages in the cache. There are
@@ -498,7 +505,7 @@ Else if there's a copy of the page resident in the cache:
(*) An argument that's 0 on success or negative for an error code.
If an error occurs, it should be assumed that the page contains no usable
- data.
+ data. fscache_readpages_cancel() may need to be called.
end_io_func() will be called in process context if the read is results in
an error, but it might be called in interrupt context if the read is
@@ -622,6 +629,22 @@ some of the pages being read and some being allocated. Those pages will have
been marked appropriately and will need uncaching.
+CANCELLATION OF UNREAD PAGES
+----------------------------
+
+If one or more pages are passed to fscache_read_or_alloc_pages() but not then
+read from the cache and also not read from the underlying filesystem then
+those pages will need to have any marks and reservations removed. This can be
+done by calling:
+
+ void fscache_readpages_cancel(struct fscache_cookie *cookie,
+ struct list_head *pages);
+
+prior to returning to the caller. The cookie argument should be as passed to
+fscache_read_or_alloc_pages(). Every page in the pages list will be examined
+and any that have PG_fscache set will be uncached.
+
+
==============
PAGE UNCACHING
==============
@@ -689,9 +712,18 @@ written to the cache and for the cache to finish with the page generally. No
error is returned.
-==========================
-INDEX AND DATA FILE UPDATE
-==========================
+===============================
+INDEX AND DATA FILE CONSISTENCY
+===============================
+
+To find out whether auxiliary data for an object is up to data within the
+cache, the following function can be called:
+
+ int fscache_check_consistency(struct fscache_cookie *cookie)
+
+This will call back to the netfs to check whether the auxiliary data associated
+with a cookie is correct. It returns 0 if it is and -ESTALE if it isn't; it
+may also return -ENOMEM and -ERESTARTSYS.
To request an update of the index data for an index or other object, the
following function should be called:
@@ -707,6 +739,47 @@ Note that partial updates may happen automatically at other times, such as when
data blocks are added to a data file object.
+=================
+COOKIE ENABLEMENT
+=================
+
+Cookies exist in one of two states: enabled and disabled. If a cookie is
+disabled, it ignores all attempts to acquire child cookies; check, update or
+invalidate its state; allocate, read or write backing pages - though it is
+still possible to uncache pages and relinquish the cookie.
+
+The initial enablement state is set by fscache_acquire_cookie(), but the cookie
+can be enabled or disabled later. To disable a cookie, call:
+
+ void fscache_disable_cookie(struct fscache_cookie *cookie,
+ bool invalidate);
+
+If the cookie is not already disabled, this locks the cookie against other
+enable and disable ops, marks the cookie as being disabled, discards or
+invalidates any backing objects and waits for cessation of activity on any
+associated object before unlocking the cookie.
+
+All possible failures are handled internally. The caller should consider
+calling fscache_uncache_all_inode_pages() afterwards to make sure all page
+markings are cleared up.
+
+Cookies can be enabled or reenabled with:
+
+ void fscache_enable_cookie(struct fscache_cookie *cookie,
+ bool (*can_enable)(void *data),
+ void *data)
+
+If the cookie is not already enabled, this locks the cookie against other
+enable and disable ops, invokes can_enable() and, if the cookie is not an index
+cookie, will begin the procedure of acquiring backing objects.
+
+The optional can_enable() function is passed the data argument and returns a
+ruling as to whether or not enablement should actually be permitted to begin.
+
+All possible failures are handled internally. The cookie will only be marked
+as enabled if provisional backing objects are allocated.
+
+
===============================
MISCELLANEOUS COOKIE OPERATIONS
===============================
@@ -752,7 +825,7 @@ COOKIE UNREGISTRATION
To get rid of a cookie, this function should be called.
void fscache_relinquish_cookie(struct fscache_cookie *cookie,
- int retire);
+ bool retire);
If retire is non-zero, then the object will be marked for recycling, and all
copies of it will be removed from all active caches in which it is present.
@@ -767,13 +840,42 @@ the cookies for "child" indices, objects and pages have been relinquished
first.
-================================
-INDEX AND DATA FILE INVALIDATION
-================================
+==================
+INDEX INVALIDATION
+==================
+
+There is no direct way to invalidate an index subtree. To do this, the caller
+should relinquish and retire the cookie they have, and then acquire a new one.
+
+
+======================
+DATA FILE INVALIDATION
+======================
+
+Sometimes it will be necessary to invalidate an object that contains data.
+Typically this will be necessary when the server tells the netfs of a foreign
+change - at which point the netfs has to throw away all the state it had for an
+inode and reload from the server.
+
+To indicate that a cache object should be invalidated, the following function
+can be called:
+
+ void fscache_invalidate(struct fscache_cookie *cookie);
+
+This can be called with spinlocks held as it defers the work to a thread pool.
+All extant storage, retrieval and attribute change ops at this point are
+cancelled and discarded. Some future operations will be rejected until the
+cache has had a chance to insert a barrier in the operations queue. After
+that, operations will be queued again behind the invalidation operation.
+
+The invalidation operation will perform an attribute change operation and an
+auxiliary data update operation as it is very likely these will have changed.
+
+Using the following function, the netfs can wait for the invalidation operation
+to have reached a point at which it can start submitting ordinary operations
+once again:
-There is no direct way to invalidate an index subtree or a data file. To do
-this, the caller should relinquish and retire the cookie they have, and then
-acquire a new one.
+ void fscache_wait_on_invalidate(struct fscache_cookie *cookie);
===========================
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt
index 58313348da8..100ff41127e 100644
--- a/Documentation/filesystems/caching/object.txt
+++ b/Documentation/filesystems/caching/object.txt
@@ -216,7 +216,14 @@ servicing netfs requests:
The normal running state. In this state, requests the netfs makes will be
passed on to the cache.
- (6) State FSCACHE_OBJECT_UPDATING.
+ (6) State FSCACHE_OBJECT_INVALIDATING.
+
+ The object is undergoing invalidation. When the state comes here, it
+ discards all pending read, write and attribute change operations as it is
+ going to clear out the cache entirely and reinitialise it. It will then
+ continue to the FSCACHE_OBJECT_UPDATING state.
+
+ (7) State FSCACHE_OBJECT_UPDATING.
The state machine comes here to update the object in the cache from the
netfs's records. This involves updating the auxiliary data that is used
@@ -225,13 +232,13 @@ servicing netfs requests:
And there are terminal states in which an object cleans itself up, deallocates
memory and potentially deletes stuff from disk:
- (7) State FSCACHE_OBJECT_LC_DYING.
+ (8) State FSCACHE_OBJECT_LC_DYING.
The object comes here if it is dying because of a lookup or creation
error. This would be due to a disk error or system error of some sort.
Temporary data is cleaned up, and the parent is released.
- (8) State FSCACHE_OBJECT_DYING.
+ (9) State FSCACHE_OBJECT_DYING.
The object comes here if it is dying due to an error, because its parent
cookie has been relinquished by the netfs or because the cache is being
@@ -241,27 +248,27 @@ memory and potentially deletes stuff from disk:
can destroy themselves. This object waits for all its children to go away
before advancing to the next state.
- (9) State FSCACHE_OBJECT_ABORT_INIT.
+(10) State FSCACHE_OBJECT_ABORT_INIT.
The object comes to this state if it was waiting on its parent in
FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself
so that the parent may proceed from the FSCACHE_OBJECT_DYING state.
-(10) State FSCACHE_OBJECT_RELEASING.
-(11) State FSCACHE_OBJECT_RECYCLING.
+(11) State FSCACHE_OBJECT_RELEASING.
+(12) State FSCACHE_OBJECT_RECYCLING.
The object comes to one of these two states when dying once it is rid of
all its children, if it is dying because the netfs relinquished its
cookie. In the first state, the cached data is expected to persist, and
in the second it will be deleted.
-(12) State FSCACHE_OBJECT_WITHDRAWING.
+(13) State FSCACHE_OBJECT_WITHDRAWING.
The object transits to this state if the cache decides it wants to
withdraw the object from service, perhaps to make space, but also due to
error or just because the whole cache is being withdrawn.
-(13) State FSCACHE_OBJECT_DEAD.
+(14) State FSCACHE_OBJECT_DEAD.
The object transits to this state when the in-memory object record is
ready to be deleted. The object processor shouldn't ever see an object in
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
index b6b070c57cb..bee2a5f93d6 100644
--- a/Documentation/filesystems/caching/operations.txt
+++ b/Documentation/filesystems/caching/operations.txt
@@ -174,7 +174,7 @@ Operations are used through the following procedure:
necessary (the object might have died whilst the thread was waiting).
When it has finished doing its processing, it should call
- fscache_put_operation() on it.
+ fscache_op_complete() and fscache_put_operation() on it.
(4) The operation holds an effective lock upon the object, preventing other
exclusive ops conflicting until it is released. The operation can be
diff --git a/Documentation/filesystems/cifs.txt b/Documentation/filesystems/cifs.txt
deleted file mode 100644
index 49cc923a93e..00000000000
--- a/Documentation/filesystems/cifs.txt
+++ /dev/null
@@ -1,51 +0,0 @@
- This is the client VFS module for the Common Internet File System
- (CIFS) protocol which is the successor to the Server Message Block
- (SMB) protocol, the native file sharing mechanism for most early
- PC operating systems. CIFS is fully supported by current network
- file servers such as Windows 2000, Windows 2003 (including
- Windows XP) as well by Samba (which provides excellent CIFS
- server support for Linux and many other operating systems), so
- this network filesystem client can mount to a wide variety of
- servers. The smbfs module should be used instead of this cifs module
- for mounting to older SMB servers such as OS/2. The smbfs and cifs
- modules can coexist and do not conflict. The CIFS VFS filesystem
- module is designed to work well with servers that implement the
- newer versions (dialects) of the SMB/CIFS protocol such as Samba,
- the program written by Andrew Tridgell that turns any Unix host
- into a SMB/CIFS file server.
-
- The intent of this module is to provide the most advanced network
- file system function for CIFS compliant servers, including better
- POSIX compliance, secure per-user session establishment, high
- performance safe distributed caching (oplock), optional packet
- signing, large files, Unicode support and other internationalization
- improvements. Since both Samba server and this filesystem client support
- the CIFS Unix extensions, the combination can provide a reasonable
- alternative to NFSv4 for fileserving in some Linux to Linux environments,
- not just in Linux to Windows environments.
-
- This filesystem has an optional mount utility (mount.cifs) that can
- be obtained from the project page and installed in the path in the same
- directory with the other mount helpers (such as mount.smbfs).
- Mounting using the cifs filesystem without installing the mount helper
- requires specifying the server's ip address.
-
- For Linux 2.4:
- mount //anything/here /mnt_target -o
- user=username,pass=password,unc=//ip_address_of_server/sharename
-
- For Linux 2.5:
- mount //ip_address_of_server/sharename /mnt_target -o user=username, pass=password
-
-
- For more information on the module see the project page at
-
- http://us1.samba.org/samba/Linux_CIFS_client.html
-
- For more information on CIFS see:
-
- http://www.snia.org/tech_activities/CIFS
-
- or the Samba site:
-
- http://www.samba.org
diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS
new file mode 100644
index 00000000000..ca4a67a0bb1
--- /dev/null
+++ b/Documentation/filesystems/cifs/AUTHORS
@@ -0,0 +1,56 @@
+Original Author
+===============
+Steve French (sfrench@samba.org)
+
+The author wishes to express his appreciation and thanks to:
+Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS
+improvements. Thanks to IBM for allowing me time and test resources to pursue
+this project, to Jim McDonough from IBM (and the Samba Team) for his help, to
+the IBM Linux JFS team for explaining many esoteric Linux filesystem features.
+Jeremy Allison of the Samba team has done invaluable work in adding the server
+side of the original CIFS Unix extensions and reviewing and implementing
+portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank
+Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client)
+for proving years ago that very good smb/cifs clients could be done on Unix-like
+operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John
+Newbigin and others for their work on the Linux smbfs module. Thanks to
+the other members of the Storage Network Industry Association CIFS Technical
+Workgroup for their work specifying this highly complex protocol and finally
+thanks to the Samba team for their technical advice and encouragement.
+
+Patch Contributors
+------------------
+Zwane Mwaikambo
+Andi Kleen
+Amrut Joshi
+Shobhit Dayal
+Sergey Vlasov
+Richard Hughes
+Yury Umanets
+Mark Hamzy (for some of the early cifs IPv6 work)
+Domen Puncer
+Jesper Juhl (in particular for lots of whitespace/formatting cleanup)
+Vince Negri and Dave Stahl (for finding an important caching bug)
+Adrian Bunk (kcalloc cleanups)
+Miklos Szeredi
+Kazeon team for various fixes especially for 2.4 version.
+Asser Ferno (Change Notify support)
+Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
+Gunter Kukkukk (testing and suggestions for support of old servers)
+Igor Mammedov (DFS support)
+Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
+Scott Lovenberg
+
+Test case and Bug Report contributors
+-------------------------------------
+Thanks to those in the community who have submitted detailed bug reports
+and debug of problems they have found: Jochen Dolze, David Blaine,
+Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori,
+Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen,
+Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special
+mention to the Stanford Checker (SWAT) which pointed out many minor
+bugs in error paths. Valuable suggestions also have come from Al Viro
+and Dave Miller.
+
+And thanks to the IBM LTC and Power test teams and SuSE testers for
+finding multiple bugs during excellent stress test runs.
diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES
new file mode 100644
index 00000000000..bc0025cdd1c
--- /dev/null
+++ b/Documentation/filesystems/cifs/CHANGES
@@ -0,0 +1,1065 @@
+Version 1.62
+------------
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
+
+Version 1.61
+------------
+Fix append problem to Samba servers (files opened with O_APPEND could
+have duplicated data). Fix oops in cifs_lookup. Workaround problem
+mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
+Disable use of server inode numbers when server only
+partially supports them (e.g. for one server querying inode numbers on
+FindFirst fails but QPathInfo queries works). Fix oops with dfs in
+cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
+for OpenOffice when on forcedirectio mount e.g.)
+
+Version 1.60
+-------------
+Fix memory leak in reconnect. Fix oops in DFS mount error path.
+Set s_maxbytes to smaller (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again. Add noforcegid
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
+
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client generated ones by default (mount option "serverino" turned
+on by default if server supports it). Add forceuid and forcegid
+mount options (so that when negotiating unix extensions specifying
+which uid mounted does not immediately force the server's reported
+uids to be overridden). Add support for scope mount parm. Improve
+hard link detection to use same inode for both. Do not set
+read-only dos attribute on directories (for chmod) since Windows
+explorer special cases this attribute bit for directories for
+a different purpose.
+
+Version 1.58
+------------
+Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
+when the UTF-8 string is composed of unusually long (more than 4 byte) converted
+characters. Add support for mounting root of a share which redirects immediately
+to DFS target. Convert string conversion functions from Unicode to more
+accurately mark string length before allocating memory (which may help the
+rare cases where a UTF-8 string is much larger than the UCS2 string that
+we converted from). Fix endianness of the vcnum field used during
+session setup to distinguish multiple mounts to same server from different
+userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
+flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
+
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session. This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts. fsync now sends SMB Flush operation
+to better ensure that we wait for server to write all of the data to
+server disk (not just write it over the network). Add new mount
+parameter to allow user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if server
+fails to support it properly, as with Samba server versions prior to 3.3.2)
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount). Endian convert inode numbers if necessary (makes it easier
+to compare inode numbers on network files from big endian systems).
+
+Version 1.56
+------------
+Add "forcemandatorylock" mount option to allow user to use mandatory
+rather than posix (advisory) byte range locks, even though server would
+support posix byte range locks. Fix query of root inode when prefixpath
+specified and user does not have access to query information about the
+top of the share. Fix problem in 2.6.28 resolving DFS paths to
+Samba servers (worked to Windows). Fix rmdir so that pending search
+(readdir) requests do not get invalid results which include the now
+removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
+when using DFS. Add better file create support to servers which support
+the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
+
+Version 1.55
+------------
+Various fixes to make delete of open files behavior more predictable
+(when delete of an open file fails we mark the file as "delete-on-close"
+in a way that more servers accept, but only if we can first rename the
+file to a temporary name). Add experimental support for more safely
+handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests). Fix case in which a portion of
+data can in some cases not get written to the file on the server before the
+file is closed. Fix DFS parsing to properly handle path consumed field,
+and to handle certain codepage conversions better. Fix mount and
+umount race that can cause oops in mount or umount or reconnect.
+
+Version 1.54
+------------
+Fix premature write failure on congested networks (we would give up
+on EAGAIN from the socket too quickly on large writes).
+Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
+Fix endian problems in acl (mode from/to cifs acl) on bigendian
+architectures. Fix problems with preserving timestamps on copying open
+files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
+on parent directory when server supports Unix Extensions but not POSIX
+create. Update cifs.upcall version to handle new Kerberos sec flags
+(this requires update of cifs.upcall program from Samba). Fix memory leak
+on dns_upcall (resolving DFS referralls). Fix plain text password
+authentication (requires setting SecurityFlags to 0x30030 to enable
+lanman and plain text though). Fix writes to be at correct offset when
+file is open with O_APPEND and file is on a directio (forcediretio) mount.
+Fix bug in rewinding readdir directory searches. Add nodfs mount option.
+
+Version 1.53
+------------
+DFS support added (Microsoft Distributed File System client support needed
+for referrals which enable a hierarchical name space among servers).
+Disable temporary caching of mode bits to servers which do not support
+storing of mode (e.g. Windows servers, when client mounts without cifsacl
+mount option) and add new "dynperm" mount option to enable temporary caching
+of mode (enable old behavior). Fix hang on mount caused when server crashes
+tcp session during negotiate protocol.
+
+Version 1.52
+------------
+Fix oops on second mount to server when null auth is used.
+Enable experimental Kerberos support. Return writebehind errors on flush
+and sync so that events like out of disk space get reported properly on
+cached files. Fix setxattr failure to certain Samba versions. Fix mount
+of second share to disconnected server session (autoreconnect on this).
+Add ability to modify cifs acls for handling chmod (when mounted with
+cifsacl flag). Fix prefixpath path separator so we can handle mounts
+with prefixpaths longer than one directory (one path component) when
+mounted to Windows servers. Fix slow file open when cifsacl
+enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
+
+
+Version 1.51
+------------
+Fix memory leak in statfs when mounted to very old servers (e.g.
+Windows 9x). Add new feature "POSIX open" which allows servers
+which support the current POSIX Extensions to provide better semantics
+(e.g. delete for open files opened with posix open). Take into
+account umask on posix mkdir not just older style mkdir. Add
+ability to mount to IPC$ share (which allows CIFS named pipes to be
+opened, read and written as if they were files). When 1st tree
+connect fails (e.g. due to signing negotiation failure) fix
+leak that causes cifsd not to stop and rmmod to fail to cleanup
+cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
+bigendian architectures. Fix possible memory corruption when
+EAGAIN returned on kern_recvmsg. Return better error if server
+requires packet signing but client has disabled it. When mounted
+with cifsacl mount option - mode bits are approximated based
+on the contents of the ACL of the file or directory. When cifs
+mount helper is missing convert make sure that UNC name
+has backslash (not forward slash) between ip address of server
+and the share name.
+
+Version 1.50
+------------
+Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
+done with "serverino" mount option). Add support for POSIX Unlink
+(helps with certain sharing violation cases when server such as
+Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
+mount option to allow disabling the CIFS Unix Extensions for just
+that mount. Fix hang on spinlock in find_writable_file (race when
+reopening file after session crash). Byte range unlock request to
+windows server could unlock more bytes (on server copy of file)
+than intended if start of unlock request is well before start of
+a previous byte range lock that we issued.
+
+Version 1.49
+------------
+IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
+address after the "ip=" mount option, at least until mount.cifs is fixed to
+handle DNS host to ipv6 name translation). Accept override of uid or gid
+on mount even when Unix Extensions are negotiated (it used to be ignored
+when Unix Extensions were ignored). This allows users to override the
+default uid and gid for files when they are certain that the uids or
+gids on the server do not match those of the client. Make "sec=none"
+mount override username (so that null user connection is attempted)
+to match what documentation said. Support for very large reads, over 127K,
+available to some newer servers (such as Samba 3.0.26 and later but
+note that it also requires setting CIFSMaxBufSize at module install
+time to a larger value which may hurt performance in some cases).
+Make sign option force signing (or fail if server does not support it).
+
+Version 1.48
+------------
+Fix mtime bouncing around from local idea of last write times to remote time.
+Fix hang (in i_size_read) when simultaneous size update of same remote file
+on smp system corrupts sequence number. Do not reread unnecessarily partial page
+(which we are about to overwrite anyway) when writing out file opened rw.
+When DOS attribute of file on non-Unix server's file changes on the server side
+from read-only back to read-write, reflect this change in default file mode
+(we had been leaving a file's mode read-only until the inode were reloaded).
+Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
+when archive dos attribute not set and we are changing mode back to writeable
+on server which does not support the Unix Extensions). Remove read only dos
+attribute on chmod when adding any write permission (ie on any of
+user/group/other (not all of user/group/other ie 0222) when
+mounted to windows. Add support for POSIX MkDir (slight performance
+enhancement and eliminates the network race between the mkdir and set
+path info of the mode).
+
+
+Version 1.47
+------------
+Fix oops in list_del during mount caused by unaligned string.
+Fix file corruption which could occur on some large file
+copies caused by writepages page i/o completion bug.
+Seek to SEEK_END forces check for update of file size for non-cached
+files. Allow file size to be updated on remote extend of locally open,
+non-cached file. Fix reconnect to newer Samba servers (or other servers
+which support the CIFS Unix/POSIX extensions) so that we again tell the
+server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
+Add experimental support for new POSIX Open/Mkdir (which returns
+stat information on the open, and allows setting the mode).
+
+Version 1.46
+------------
+Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps.
+Allow null user to be specified on mount ("username="). Do not return
+EINVAL on readdir when filldir fails due to overwritten blocksize
+(fixes FC problem). Return error in rename 2nd attempt retry (ie report
+if rename by handle also fails, after rename by path fails, we were
+not reporting whether the retry worked or not). Fix NTLMv2 to
+work to Windows servers (mount with option "sec=ntlmv2").
+
+Version 1.45
+------------
+Do not time out lockw calls when using posix extensions. Do not
+time out requests if server still responding reasonably fast
+on requests on other threads. Improve POSIX locking emulation,
+(lock cancel now works, and unlock of merged range works even
+to Windows servers now). Fix oops on mount to lanman servers
+(win9x, os/2 etc.) when null password. Do not send listxattr
+(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux
+problem (instantiate inodes/dentries in right order for readdir).
+
+Version 1.44
+------------
+Rewritten sessionsetup support, including support for legacy SMB
+session setup needed for OS/2 and older servers such as Windows 95 and 98.
+Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst
+so we can do search (ls etc.) to OS/2. Do not send NTCreateX
+or recent levels of FindFirst unless server says it supports NT SMBs
+(instead use legacy equivalents from LANMAN dialect). Fix to allow
+NTLMv2 authentication support (now can use stronger password hashing
+on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
+Allow override of global cifs security flags on mount via "sec=" option(s).
+
+Version 1.43
+------------
+POSIX locking to servers which support CIFS POSIX Extensions
+(disabled by default controlled by proc/fs/cifs/Experimental).
+Handle conversion of long share names (especially Asian languages)
+to Unicode during mount. Fix memory leak in sess struct on reconnect.
+Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on
+cifs open which helps rare case when setpathinfo fails or server does
+not support it.
+
+Version 1.42
+------------
+Fix slow oplock break when mounted to different servers at the same time and
+the tids match and we try to find matching fid on wrong server. Fix read
+looping when signing required by server (2.6.16 kernel only). Fix readdir
+vs. rename race which could cause each to hang. Return . and .. even
+if server does not. Allow searches to skip first three entries and
+begin at any location. Fix oops in find_writeable_file.
+
+Version 1.41
+------------
+Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
+configure stronger authentication. Fix sfu symlinks so they can
+be followed (not just recognized). Fix wraparound of bcc on
+read responses when buffer size over 64K and also fix wrap of
+max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in
+cifs_user_read and cifs_readpages (when EAGAIN on send of smb
+on socket is returned over and over). Add POSIX (advisory) byte range
+locking support (requires server with newest CIFS UNIX Extensions
+to the protocol implemented). Slow down negprot slightly in port 139
+RFC1001 case to give session_init time on buggy servers.
+
+Version 1.40
+------------
+Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
+of readpages by eliminating one extra memcpy. Allow update of file size
+from remote server even if file is open for write as long as mount is
+directio. Recognize share mode security and send NTLM encrypted password
+on tree connect if share mode negotiated.
+
+Version 1.39
+------------
+Defer close of a file handle slightly if pending writes depend on that handle
+(this reduces the EBADF bad file handle errors that can be logged under heavy
+stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2
+Fix SFU style symlinks and mknod needed for servers which do not support the
+CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
+dentries so files that the client sees as deleted but that later get created
+on the server will be recognized. Add client side permission check on setattr.
+Timeout stuck requests better (where server has never responded or sent corrupt
+responses)
+
+Version 1.38
+------------
+Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
+to be smaller at first (but increasing) so large write performance performance
+over GigE is better. Do not hang thread on illegal byte range lock response
+from Windows (Windows can send an RFC1001 size which does not match smb size) by
+allowing an SMBs TCP length to be up to a few bytes longer than it should be.
+wsize and rsize can now be larger than negotiated buffer size if server
+supports large readx/writex, even when directio mount flag not specified.
+Write size will in many cases now be 16K instead of 4K which greatly helps
+file copy performance on lightly loaded networks. Fix oops in dnotify
+when experimental config flag enabled. Make cifsFYI more granular.
+
+Version 1.37
+------------
+Fix readdir caching when unlink removes file in current search buffer,
+and this is followed by a rewind search to just before the deleted entry.
+Do not attempt to set ctime unless atime and/or mtime change requested
+(most servers throw it away anyway). Fix length check of received smbs
+to be more accurate. Fix big endian problem with mapchars mount option,
+and with a field returned by statfs.
+
+Version 1.36
+------------
+Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
+For these older servers, add option for passing netbios name of server in
+on mount (servernetbiosname). Add suspend support for power management, to
+avoid cifsd thread preventing software suspend from working.
+Add mount option for disabling the default behavior of sending byte range lock
+requests to the server (necessary for certain applications which break with
+mandatory lock behavior such as Evolution), and also mount option for
+requesting case insensitive matching for path based requests (requesting
+case sensitive is the default).
+
+Version 1.35
+------------
+Add writepage performance improvements. Fix path name conversions
+for long filenames on mounts which were done with "mapchars" mount option
+specified. Ensure multiplex ids do not collide. Fix case in which
+rmmod can oops if done soon after last unmount. Fix truncated
+search (readdir) output when resume filename was a long filename.
+Fix filename conversion when mapchars mount option was specified and
+filename was a long filename.
+
+Version 1.34
+------------
+Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
+Do not oops if root user kills cifs oplock kernel thread or
+kills the cifsd thread (NB: killing the cifs kernel threads is not
+recommended, unmount and rmmod cifs will kill them when they are
+no longer needed). Fix readdir to ASCII servers (ie older servers
+which do not support Unicode) and also require asterisk.
+Fix out of memory case in which data could be written one page
+off in the page cache.
+
+Version 1.33
+------------
+Fix caching problem, in which readdir of directory containing a file
+which was cached could cause the file's time stamp to be updated
+without invalidating the readahead data (so we could get stale
+file data on the client for that file even as the server copy changed).
+Cleanup response processing so cifsd can not loop when abnormally
+terminated.
+
+
+Version 1.32
+------------
+Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
+transact response for an SMB request and search entry split across two frames.
+Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
+as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
+unless server explicitly claims to support them in CIFS Unix extensions
+POSIX ACL capability bit. Fix packet signing when multiuser mounting with
+different users from the same client to the same server. Fix oops in
+cifs_close. Add mount option for remapping reserved characters in
+filenames (also allow recognizing files with created by SFU which have any
+of these seven reserved characters, except backslash, to be recognized).
+Fix invalid transact2 message (we were sometimes trying to interpret
+oplock breaks as SMB responses). Add ioctl for checking that the
+current uid matches the uid of the mounter (needed by umount.cifs).
+Reduce the number of large buffer allocations in cifs response processing
+(significantly reduces memory pressure under heavy stress with multiple
+processes accessing the same server at the same time).
+
+Version 1.31
+------------
+Fix updates of DOS attributes and time fields so that files on NT4 servers
+do not get marked delete on close. Display sizes of cifs buffer pools in
+cifs stats. Fix oops in unmount when cifsd thread being killed by
+shutdown. Add generic readv/writev and aio support. Report inode numbers
+consistently in readdir and lookup (when serverino mount option is
+specified use the inode number that the server reports - for both lookup
+and readdir, otherwise by default the locally generated inode number is used
+for inodes created in either path since servers are not always able to
+provide unique inode numbers when exporting multiple volumes from under one
+sharename).
+
+Version 1.30
+------------
+Allow new nouser_xattr mount parm to disable xattr support for user namespace.
+Do not flag user_xattr mount parm in dmesg. Retry failures setting file time
+(mostly affects NT4 servers) by retry with handle based network operation.
+Add new POSIX Query FS Info for returning statfs info more accurately.
+Handle passwords with multiple commas in them.
+
+Version 1.29
+------------
+Fix default mode in sysfs of cifs module parms. Remove old readdir routine.
+Fix capabilities flags for large readx so as to allow reads larger than 64K.
+
+Version 1.28
+------------
+Add module init parm for large SMB buffer size (to allow it to be changed
+from its default of 16K) which is especially useful for large file copy
+when mounting with the directio mount option. Fix oops after
+returning from mount when experimental ExtendedSecurity enabled and
+SpnegoNegotiated returning invalid error. Fix case to retry better when
+peek returns from 1 to 3 bytes on socket which should have more data.
+Fixed path based calls (such as cifs lookup) to handle path names
+longer than 530 (now can handle PATH_MAX). Fix pass through authentication
+from Samba server to DC (Samba required dummy LM password).
+
+Version 1.27
+------------
+Turn off DNOTIFY (directory change notification support) by default
+(unless built with the experimental flag) to fix hang with KDE
+file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event
+waiting on an SMB response) in SendReceive when session dies but
+reconnects quickly from another task. Add module init parms for
+minimum number of large and small network buffers in the buffer pools,
+and for the maximum number of simultaneous requests.
+
+Version 1.26
+------------
+Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
+and other POSIX CIFS compliant servers. Fix error mapping for getfacl
+to EOPNOTSUPP when server does not support posix acls on the wire. Fix
+improperly zeroed buffer in CIFS Unix extensions set times call.
+
+Version 1.25
+------------
+Fix internationalization problem in cifs readdir with filenames that map to
+longer UTF-8 strings than the string on the wire was in Unicode. Add workaround
+for readdir to netapp servers. Fix search rewind (seek into readdir to return
+non-consecutive entries). Do not do readdir when server negotiates
+buffer size to small to fit filename. Add support for reading POSIX ACLs from
+the server (add also acl and noacl mount options).
+
+Version 1.24
+------------
+Optionally allow using server side inode numbers, rather than client generated
+ones by specifying mount option "serverino" - this is required for some apps
+to work which double check hardlinked files and have persistent inode numbers.
+
+Version 1.23
+------------
+Multiple bigendian fixes. On little endian systems (for reconnect after
+network failure) fix tcp session reconnect code so we do not try first
+to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
+as directories rather than symlinks because we can do follow link on them.
+
+Version 1.22
+------------
+Add config option to enable XATTR (extended attribute) support, mapping
+xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
+minor fixes pointed out by the Stanford SWAT checker (mostly missing
+or out of order NULL pointer checks in little used error paths).
+
+Version 1.21
+------------
+Add new mount parm to control whether mode check (generic_permission) is done
+on the client. If Unix extensions are enabled and the uids on the client
+and server do not match, client permission checks are meaningless on
+server uids that do not exist on the client (this does not affect the
+normal ACL check which occurs on the server). Fix default uid
+on mknod to match create and mkdir. Add optional mount parm to allow
+override of the default uid behavior (in which the server sets the uid
+and gid of newly created files). Normally for network filesystem mounts
+user want the server to set the uid/gid on newly created files (rather than
+using uid of the client processes you would in a local filesystem).
+
+Version 1.20
+------------
+Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
+info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir
+(in build_wildcard_path_from_dentry). Fix mknod to pass type field
+(block/char/fifo) properly. Remove spurious mount warning log entry when
+credentials passed as mount argument. Set major/minor device number in
+inode for block and char devices when unix extensions enabled.
+
+Version 1.19
+------------
+Fix /proc/fs/cifs/Stats and DebugData display to handle larger
+amounts of return data. Properly limit requests to MAX_REQ (50
+is the usual maximum active multiplex SMB/CIFS requests per server).
+Do not kill cifsd (and thus hurt the other SMB session) when more than one
+session to the same server (but with different userids) exists and one
+of the two user's smb sessions is being removed while leaving the other.
+Do not loop reconnecting in cifsd demultiplex thread when admin
+kills the thread without going through unmount.
+
+Version 1.18
+------------
+Do not rename hardlinked files (since that should be a noop). Flush
+cached write behind data when reopening a file after session abend,
+except when already in write. Grab per socket sem during reconnect
+to avoid oops in sendmsg if overlapping with reconnect. Do not
+reset cached inode file size on readdir for files open for write on
+client.
+
+
+Version 1.17
+------------
+Update number of blocks in file so du command is happier (in Linux a fake
+blocksize of 512 is required for calculating number of blocks in inode).
+Fix prepare write of partial pages to read in data from server if possible.
+Fix race on tcpStatus field between unmount and reconnection code, causing
+cifsd process sometimes to hang around forever. Improve out of memory
+checks in cifs_filldir
+
+Version 1.16
+------------
+Fix incorrect file size in file handle based setattr on big endian hardware.
+Fix oops in build_path_from_dentry when out of memory. Add checks for invalid
+and closing file structs in writepage/partialpagewrite. Add statistics
+for each mounted share (new menuconfig option). Fix endianness problem in
+volume information displayed in /proc/fs/cifs/DebugData (only affects
+affects big endian architectures). Prevent renames while constructing
+path names for open, mkdir and rmdir.
+
+Version 1.15
+------------
+Change to mempools for alloc smb request buffers and multiplex structs
+to better handle low memory problems (and potential deadlocks).
+
+Version 1.14
+------------
+Fix incomplete listings of large directories on Samba servers when Unix
+extensions enabled. Fix oops when smb_buffer can not be allocated. Fix
+rename deadlock when writing out dirty pages at same time.
+
+Version 1.13
+------------
+Fix open of files in which O_CREATE can cause the mode to change in
+some cases. Fix case in which retry of write overlaps file close.
+Fix PPC64 build error. Reduce excessive stack usage in smb password
+hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
+
+Version 1.12
+------------
+Fixes for large file copy, signal handling, socket retry, buffer
+allocation and low memory situations.
+
+Version 1.11
+------------
+Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
+also now allowing support for specifying client netbiosname. NT4 support added.
+
+Version 1.10
+------------
+Fix reconnection (and certain failed mounts) to properly wake up the
+blocked users thread so it does not seem hung (in some cases was blocked
+until the cifs receive timeout expired). Fix spurious error logging
+to kernel log when application with open network files killed.
+
+Version 1.09
+------------
+Fix /proc/fs module unload warning message (that could be logged
+to the kernel log). Fix intermittent failure in connectathon
+test7 (hardlink count not immediately refreshed in case in which
+inode metadata can be incorrectly kept cached when time near zero)
+
+Version 1.08
+------------
+Allow file_mode and dir_mode (specified at mount time) to be enforced
+locally (the server already enforced its own ACLs too) for servers
+that do not report the correct mode (do not support the
+CIFS Unix Extensions).
+
+Version 1.07
+------------
+Fix some small memory leaks in some unmount error paths. Fix major leak
+of cache pages in readpages causing multiple read oriented stress
+testcases (including fsx, and even large file copy) to fail over time.
+
+Version 1.06
+------------
+Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
+This allows files that differ only in case and improves performance of file
+creation and file open to such servers. Fix semaphore conflict which causes
+slow delete of open file to Samba (which unfortunately can cause an oplock
+break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
+
+Version 1.05
+------------
+fixes to cifs_readpages for fsx test case
+
+Version 1.04
+------------
+Fix caching data integrity bug when extending file size especially when no
+oplock on file. Fix spurious logging of valid already parsed mount options
+that are parsed outside of the cifs vfs such as nosuid.
+
+
+Version 1.03
+------------
+Connect to server when port number override not specified, and tcp port
+unitialized. Reset search to restart at correct file when kernel routine
+filldir returns error during large directory searches (readdir).
+
+Version 1.02
+------------
+Fix caching problem when files opened by multiple clients in which
+page cache could contain stale data, and write through did
+not occur often enough while file was still open when read ahead
+(read oplock) not allowed. Treat "sep=" when first mount option
+as an override of comma as the default separator between mount
+options.
+
+Version 1.01
+------------
+Allow passwords longer than 16 bytes. Allow null password string.
+
+Version 1.00
+------------
+Gracefully clean up failed mounts when attempting to mount to servers such as
+Windows 98 that terminate tcp sessions during protocol negotiation. Handle
+embedded commas in mount parsing of passwords.
+
+Version 0.99
+------------
+Invalidate local inode cached pages on oplock break and when last file
+instance is closed so that the client does not continue using stale local
+copy rather than later modified server copy of file. Do not reconnect
+when server drops the tcp session prematurely before negotiate
+protocol response. Fix oops in reopen_file when dentry freed. Allow
+the support for CIFS Unix Extensions to be disabled via proc interface.
+
+Version 0.98
+------------
+Fix hang in commit_write during reconnection of open files under heavy load.
+Fix unload_nls oops in a mount failure path. Serialize writes to same socket
+which also fixes any possible races when cifs signatures are enabled in SMBs
+being sent out of signature sequence number order.
+
+Version 0.97
+------------
+Fix byte range locking bug (endian problem) causing bad offset and
+length.
+
+Version 0.96
+------------
+Fix oops (in send_sig) caused by CIFS unmount code trying to
+wake up the demultiplex thread after it had exited. Do not log
+error on harmless oplock release of closed handle.
+
+Version 0.95
+------------
+Fix unsafe global variable usage and password hash failure on gcc 3.3.1
+Fix problem reconnecting secondary mounts to same server after session
+failure. Fix invalid dentry - race in mkdir when directory gets created
+by another client between the lookup and mkdir.
+
+Version 0.94
+------------
+Fix to list processing in reopen_files. Fix reconnection when server hung
+but tcpip session still alive. Set proper timeout on socket read.
+
+Version 0.93
+------------
+Add missing mount options including iocharset. SMP fixes in write and open.
+Fix errors in reconnecting after TCP session failure. Fix module unloading
+of default nls codepage
+
+Version 0.92
+------------
+Active smb transactions should never go negative (fix double FreeXid). Fix
+list processing in file routines. Check return code on kmalloc in open.
+Fix spinlock usage for SMP.
+
+Version 0.91
+------------
+Fix oops in reopen_files when invalid dentry. drop dentry on server rename
+and on revalidate errors. Fix cases where pid is now tgid. Fix return code
+on create hard link when server does not support them.
+
+Version 0.90
+------------
+Fix scheduling while atomic error in getting inode info on newly created file.
+Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
+
+Version 0.89
+------------
+Fix oops on write to dead tcp session. Remove error log write for case when file open
+O_CREAT but not O_EXCL
+
+Version 0.88
+------------
+Fix non-POSIX behavior on rename of open file and delete of open file by taking
+advantage of trans2 SetFileInfo rename facility if available on target server.
+Retry on ENOSPC and EAGAIN socket errors.
+
+Version 0.87
+------------
+Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix
+allocation size miscalculation. After oplock token lost do not read through
+cache.
+
+Version 0.86
+------------
+Fix oops on empty file readahead. Fix for file size handling for locally cached files.
+
+Version 0.85
+------------
+Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
+during auto reconnection to server after server recovered from failure.
+
+Version 0.84
+------------
+Finish support for Linux 2.5 open/create changes, which removes the
+redundant NTCreate/QPathInfo/close that was sent during file create.
+Enable oplock by default. Enable packet signing by default (needed to
+access many recent Windows servers)
+
+Version 0.83
+------------
+Fix oops when mounting to long server names caused by inverted parms to kmalloc.
+Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
+we will choose a cifs user session (smb uid) that better matches the local
+uid if a) the mount uid does not match the current uid and b) we have another
+session to the same server (ip address) for a different mount which
+matches the current local uid.
+
+Version 0.82
+------------
+Add support for mknod of block or character devices. Fix oplock
+code (distributed caching) to properly send response to oplock
+break from server.
+
+Version 0.81
+------------
+Finish up CIFS packet digital signing for the default
+NTLM security case. This should help Windows 2003
+network interoperability since it is common for
+packet signing to be required now. Fix statfs (stat -f)
+which recently started returning errors due to
+invalid value (-1 instead of 0) being set in the
+struct kstatfs f_ffiles field.
+
+Version 0.80
+-----------
+Fix oops on stopping oplock thread when removing cifs when
+built as module.
+
+Version 0.79
+------------
+Fix mount options for ro (readonly), uid, gid and file and directory mode.
+
+Version 0.78
+------------
+Fix errors displayed on failed mounts to be more understandable.
+Fixed various incorrect or misleading smb to posix error code mappings.
+
+Version 0.77
+------------
+Fix display of NTFS DFS junctions to display as symlinks.
+They are the network equivalent. Fix oops in
+cifs_partialpagewrite caused by missing spinlock protection
+of openfile linked list. Allow writebehind caching errors to
+be returned to the application at file close.
+
+Version 0.76
+------------
+Clean up options displayed in /proc/mounts by show_options to
+be more consistent with other filesystems.
+
+Version 0.75
+------------
+Fix delete of readonly file to Windows servers. Reflect
+presence or absence of read only dos attribute in mode
+bits for servers that do not support CIFS Unix extensions.
+Fix shortened results on readdir of large directories to
+servers supporting CIFS Unix extensions (caused by
+incorrect resume key).
+
+Version 0.74
+------------
+Fix truncate bug (set file size) that could cause hangs e.g. running fsx
+
+Version 0.73
+------------
+unload nls if mount fails.
+
+Version 0.72
+------------
+Add resume key support to search (readdir) code to workaround
+Windows bug. Add /proc/fs/cifs/LookupCacheEnable which
+allows disabling caching of attribute information for
+lookups.
+
+Version 0.71
+------------
+Add more oplock handling (distributed caching code). Remove
+dead code. Remove excessive stack space utilization from
+symlink routines.
+
+Version 0.70
+------------
+Fix oops in get dfs referral (triggered when null path sent in to
+mount). Add support for overriding rsize at mount time.
+
+Version 0.69
+------------
+Fix buffer overrun in readdir which caused intermittent kernel oopses.
+Fix writepage code to release kmap on write data. Allow "-ip=" new
+mount option to be passed in on parameter distinct from the first part
+(server name portion of) the UNC name. Allow override of the
+tcp port of the target server via new mount option "-port="
+
+Version 0.68
+------------
+Fix search handle leak on rewind. Fix setuid and gid so that they are
+reflected in the local inode immediately. Cleanup of whitespace
+to make 2.4 and 2.5 versions more consistent.
+
+
+Version 0.67
+------------
+Fix signal sending so that captive thread (cifsd) exits on umount
+(which was causing the warning in kmem_cache_free of the request buffers
+at rmmod time). This had broken as a sideeffect of the recent global
+kernel change to daemonize. Fix memory leak in readdir code which
+showed up in "ls -R" (and applications that did search rewinding).
+
+Version 0.66
+------------
+Reconnect tids and fids after session reconnection (still do not
+reconnect byte range locks though). Fix problem caching
+lookup information for directory inodes, improving performance,
+especially in deep directory trees. Fix various build warnings.
+
+Version 0.65
+------------
+Finish fixes to commit write for caching/readahead consistency. fsx
+now works to Samba servers. Fix oops caused when readahead
+was interrupted by a signal.
+
+Version 0.64
+------------
+Fix data corruption (in partial page after truncate) that caused fsx to
+fail to Windows servers. Cleaned up some extraneous error logging in
+common error paths. Add generic sendfile support.
+
+Version 0.63
+------------
+Fix memory leak in AllocMidQEntry.
+Finish reconnection logic, so connection with server can be dropped
+(or server rebooted) and the cifs client will reconnect.
+
+Version 0.62
+------------
+Fix temporary socket leak when bad userid or password specified
+(or other SMBSessSetup failure). Increase maximum buffer size to slightly
+over 16K to allow negotiation of up to Samba and Windows server default read
+sizes. Add support for readpages
+
+Version 0.61
+------------
+Fix oops when username not passed in on mount. Extensive fixes and improvements
+to error logging (strip redundant newlines, change debug macros to ensure newline
+passed in and to be more consistent). Fix writepage wrong file handle problem,
+a readonly file handle could be incorrectly used to attempt to write out
+file updates through the page cache to multiply open files. This could cause
+the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
+shares to the same Windows server when using different usernames
+(doing this to Samba servers worked but Windows was rejecting it) - now it is
+possible to use different userids when connecting to the same server from a
+Linux client. Fix oops when treeDisconnect called during unmount on
+previously freed socket.
+
+Version 0.60
+------------
+Fix oops in readpages caused by not setting address space operations in inode in
+rare code path.
+
+Version 0.59
+------------
+Includes support for deleting of open files and renaming over existing files (per POSIX
+requirement). Add readlink support for Windows junction points (directory symlinks).
+
+Version 0.58
+------------
+Changed read and write to go through pagecache. Added additional address space operations.
+Memory mapped operations now working.
+
+Version 0.57
+------------
+Added writepage code for additional memory mapping support. Fixed leak in xids causing
+the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on
+every stat call. Additional formatting cleanup.
+
+Version 0.56
+------------
+Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup.
+
+Version 0.55
+------------
+Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
+Also included a modified version of his fix to protect global list manipulation of
+the smb session and tree connection and mid related global variables.
+
+Version 0.54
+------------
+Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre
+changes to superblock layout. Remove wasteful allocation of smb buffers (now the send
+buffer is reused for responses). Add more oplock handling. Additional minor cleanup.
+
+Version 0.53
+------------
+More stylistic updates to better match kernel style. Add additional statistics
+for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2
+and CIFS Packet Signing enablement.
+
+Version 0.52
+------------
+Replace call to sleep_on with safer wait_on_event.
+Make stylistic changes to better match kernel style recommendations.
+Remove most typedef usage (except for the PDUs themselves).
+
+Version 0.51
+------------
+Update mount so the -unc mount option is no longer required (the ip address can be specified
+in a UNC style device name. Implementation of readpage/writepage started.
+
+Version 0.50
+------------
+Fix intermittent problem with incorrect smb header checking on badly
+fragmented tcp responses
+
+Version 0.49
+------------
+Fixes to setting of allocation size and file size.
+
+Version 0.48
+------------
+Various 2.5.38 fixes. Now works on 2.5.38
+
+Version 0.47
+------------
+Prepare for 2.5 kernel merge. Remove ifdefs.
+
+Version 0.46
+------------
+Socket buffer management fixes. Fix dual free.
+
+Version 0.45
+------------
+Various big endian fixes for hardlinks and symlinks and also for dfs.
+
+Version 0.44
+------------
+Various big endian fixes for servers with Unix extensions such as Samba
+
+Version 0.43
+------------
+Various FindNext fixes for incorrect filenames on large directory searches on big endian
+clients. basic posix file i/o tests now work on big endian machines, not just le
+
+Version 0.42
+------------
+SessionSetup and NegotiateProtocol now work from Big Endian machines.
+Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older
+versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
+
+Version 0.41
+------------
+Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked
+files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
+
+Version 0.40
+------------
+Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
+session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
+Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
+(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
+
+Version 0.38
+------------
+Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable
+it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
+
+Version 0.37
+------------
+Rewrote much of connection and mount/unmount logic to handle bugs with
+multiple uses to same share, multiple users to same server etc.
+
+Version 0.36
+------------
+Fixed major problem with dentry corruption (missing call to dput)
+
+Version 0.35
+------------
+Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
+Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs
+although corresponding function not fully implemented in the vfs yet
+
+Version 0.34
+------------
+Fixed dentry caching bug, misc. cleanup
+
+Version 0.33
+------------
+Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build
+on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
+Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added.
+
+Version 0.32
+------------
+Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
+and tested against Samba 2.2.5
+
+
+Version 0.31
+------------
+1) Fixed lockrange to be correct (it was one byte too short)
+
+2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly
+show range as locked when there is a conflict with an existing lock.
+
+3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
+in most cases. Eventually will offer optional ability to query server for the correct perms.
+
+3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded
+but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
+session)
+
+4) Fixed error logging of valid mount options
+
+5) Removed logging of password field.
+
+6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
+and cleaned them up and made them more consistent with other cifs functions.
+
+7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways
+(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
+nor is the symlink support using the Unix extensions
+
+8) Started adding the readlink and follow_link code
+
+Version 0.3
+-----------
+Initial drop
+
diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README
new file mode 100644
index 00000000000..2d5622f60e1
--- /dev/null
+++ b/Documentation/filesystems/cifs/README
@@ -0,0 +1,753 @@
+The CIFS VFS support for Linux supports many advanced network filesystem
+features such as hierarchical dfs like namespace, hardlinks, locking and more.
+It was designed to comply with the SNIA CIFS Technical Reference (which
+supersedes the 1992 X/Open SMB Standard) as well as to perform best practice
+practical interoperability with Windows 2000, Windows XP, Samba and equivalent
+servers. This code was developed in participation with the Protocol Freedom
+Information Foundation.
+
+Please see
+ http://protocolfreedom.org/ and
+ http://samba.org/samba/PFIF/
+for more details.
+
+
+For questions or bug reports please contact:
+ sfrench@samba.org (sfrench@us.ibm.com)
+
+Build instructions:
+==================
+For Linux 2.4:
+1) Get the kernel source (e.g.from http://www.kernel.org)
+and download the cifs vfs source (see the project page
+at http://us1.samba.org/samba/Linux_CIFS_client.html)
+and change directory into the top of the kernel directory
+then patch the kernel (e.g. "patch -p1 < cifs_24.patch")
+to add the cifs vfs to your kernel configure options if
+it has not already been added (e.g. current SuSE and UL
+users do not need to apply the cifs_24.patch since the cifs vfs is
+already in the kernel configure menu) and then
+mkdir linux/fs/cifs and then copy the current cifs vfs files from
+the cifs download to your kernel build directory e.g.
+
+ cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs
+
+2) make menuconfig (or make xconfig)
+3) select cifs from within the network filesystem choices
+4) save and exit
+5) make dep
+6) make modules (or "make" if CIFS VFS not to be built as a module)
+
+For Linux 2.6:
+1) Download the kernel (e.g. from http://www.kernel.org)
+and change directory into the top of the kernel directory tree
+(e.g. /usr/src/linux-2.5.73)
+2) make menuconfig (or make xconfig)
+3) select cifs from within the network filesystem choices
+4) save and exit
+5) make
+
+
+Installation instructions:
+=========================
+If you have built the CIFS vfs as module (successfully) simply
+type "make modules_install" (or if you prefer, manually copy the file to
+the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o).
+
+If you have built the CIFS vfs into the kernel itself, follow the instructions
+for your distribution on how to install a new kernel (usually you
+would simply type "make install").
+
+If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on
+the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
+similar files reside (usually /sbin). Although the helper software is not
+required, mount.cifs is recommended. Eventually the Samba 3.0 utility program
+"net" may also be helpful since it may someday provide easier mount syntax for
+users who are used to Windows e.g.
+ net use <mount point> <UNC name or cifs URL>
+Note that running the Winbind pam/nss module (logon service) on all of your
+Linux clients is useful in mapping Uids and Gids consistently across the
+domain to the proper network user. The mount.cifs mount helper can be
+trivially built from Samba 3.0 or later source e.g. by executing:
+
+ gcc samba/source/client/mount.cifs.c -o mount.cifs
+
+If cifs is built as a module, then the size and number of network buffers
+and maximum number of simultaneous requests to one server can be configured.
+Changing these from their defaults is not recommended. By executing modinfo
+ modinfo kernel/fs/cifs/cifs.ko
+on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made
+at module initialization time (by running insmod cifs.ko) can be seen.
+
+Allowing User Mounts
+====================
+To permit users to mount and unmount over directories they own is possible
+with the cifs vfs. A way to enable such mounting is to mark the mount.cifs
+utility as suid (e.g. "chmod +s /sbin/mount.cifs). To enable users to
+umount shares they mount requires
+1) mount.cifs version 1.4 or later
+2) an entry for the share in /etc/fstab indicating that a user may
+unmount it e.g.
+//server/usersharename /mnt/username cifs user 0 0
+
+Note that when the mount.cifs utility is run suid (allowing user mounts),
+in order to reduce risks, the "nosuid" mount flag is passed in on mount to
+disallow execution of an suid program mounted on the remote target.
+When mount is executed as root, nosuid is not passed in by default,
+and execution of suid programs on the remote target would be enabled
+by default. This can be changed, as with nfs and other filesystems,
+by simply specifying "nosuid" among the mount options. For user mounts
+though to be able to pass the suid flag to mount requires rebuilding
+mount.cifs with the following flag:
+
+ gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs
+
+There is a corresponding manual page for cifs mounting in the Samba 3.0 and
+later source tree in docs/manpages/mount.cifs.8
+
+Allowing User Unmounts
+======================
+To permit users to ummount directories that they have user mounted (see above),
+the utility umount.cifs may be used. It may be invoked directly, or if
+umount.cifs is placed in /sbin, umount can invoke the cifs umount helper
+(at least for most versions of the umount utility) for umount of cifs
+mounts, unless umount is invoked with -i (which will avoid invoking a umount
+helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked
+as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions
+allow adding entries to a file to the /etc/permissions file to achieve the
+equivalent suid effect). For this utility to succeed the target path
+must be a cifs mount, and the uid of the current user must match the uid
+of the user who mounted the resource.
+
+Also note that the customary way of allowing user mounts and unmounts is
+(instead of using mount.cifs and unmount.cifs as suid) to add a line
+to the file /etc/fstab for each //server/share you wish to mount, but
+this can become unwieldy when potential mount targets include many
+or unpredictable UNC names.
+
+Samba Considerations
+====================
+To get the maximum benefit from the CIFS VFS, we recommend using a server that
+supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or
+Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers.
+Note that uid, gid and file permissions will display default values if you do
+not have a server that supports the Unix extensions for CIFS (such as Samba
+2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add
+the line:
+
+ unix extensions = yes
+
+to your smb.conf file on the server. Note that the following smb.conf settings
+are also useful (on the Samba server) when the majority of clients are Unix or
+Linux:
+
+ case sensitive = yes
+ delete readonly = yes
+ ea support = yes
+
+Note that server ea support is required for supporting xattrs from the Linux
+cifs client, and that EA support is present in later versions of Samba (e.g.
+3.0.6 and later (also EA support works in all versions of Windows, at least to
+shares on NTFS filesystems). Extended Attribute (xattr) support is an optional
+feature of most Linux filesystems which may require enabling via
+make menuconfig. Client support for extended attributes (user xattr) can be
+disabled on a per-mount basis by specifying "nouser_xattr" on mount.
+
+The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers
+version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and
+then POSIX support in the CIFS configuration options when building the cifs
+module. POSIX ACL support can be disabled on a per mount basic by specifying
+"noacl" on mount.
+
+Some administrators may want to change Samba's smb.conf "map archive" and
+"create mask" parameters from the default. Unless the create mask is changed
+newly created files can end up with an unnecessarily restrictive default mode,
+which may not be what you want, although if the CIFS Unix extensions are
+enabled on the server and client, subsequent setattr calls (e.g. chmod) can
+fix the mode. Note that creating special devices (mknod) remotely
+may require specifying a mkdev function to Samba if you are not using
+Samba 3.0.6 or later. For more information on these see the manual pages
+("man smb.conf") on the Samba server system. Note that the cifs vfs,
+unlike the smbfs vfs, does not read the smb.conf on the client system
+(the few optional settings are passed in on mount via -o parameters instead).
+Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete
+open files (required for strict POSIX compliance). Windows Servers already
+supported this feature. Samba server does not allow symlinks that refer to files
+outside of the share, so in Samba versions prior to 3.0.6, most symlinks to
+files with absolute paths (ie beginning with slash) such as:
+ ln -s /mnt/foo bar
+would be forbidden. Samba 3.0.6 server or later includes the ability to create
+such symlinks safely by converting unsafe symlinks (ie symlinks to server
+files that are outside of the share) to a samba specific format on the server
+that is ignored by local server applications and non-cifs clients and that will
+not be traversed by the Samba server). This is opaque to the Linux client
+application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or
+later, but only for remote clients using the CIFS Unix extensions, and will
+be invisbile to Windows clients and typically will not affect local
+applications running on the same server as Samba.
+
+Use instructions:
+================
+Once the CIFS VFS support is built into the kernel or installed as a module
+(cifs.o), you can use mount syntax like the following to access Samba or Windows
+servers:
+
+ mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword
+
+Before -o the option -v may be specified to make the mount.cifs
+mount helper display the mount steps more verbosely.
+After -o the following commonly used cifs vfs specific options
+are supported:
+
+ user=<username>
+ pass=<password>
+ domain=<domain name>
+
+Other cifs mount options are described below. Use of TCP names (in addition to
+ip addresses) is available if the mount helper (mount.cifs) is installed. If
+you do not trust the server to which are mounted, or if you do not have
+cifs signing enabled (and the physical network is insecure), consider use
+of the standard mount options "noexec" and "nosuid" to reduce the risk of
+running an altered binary on your local system (downloaded from a hostile server
+or altered by a hostile router).
+
+Although mounting using format corresponding to the CIFS URL specification is
+not possible in mount.cifs yet, it is possible to use an alternate format
+for the server and sharename (which is somewhat similar to NFS style mount
+syntax) instead of the more widely used UNC format (i.e. \\server\share):
+ mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd
+
+When using the mount helper mount.cifs, passwords may be specified via alternate
+mechanisms, instead of specifying it after -o using the normal "pass=" syntax
+on the command line:
+1) By including it in a credential file. Specify credentials=filename as one
+of the mount options. Credential files contain two lines
+ username=someuser
+ password=your_password
+2) By specifying the password in the PASSWD environment variable (similarly
+the user name can be taken from the USER environment variable).
+3) By specifying the password in a file by name via PASSWD_FILE
+4) By specifying the password in a file by file descriptor via PASSWD_FD
+
+If no password is provided, mount.cifs will prompt for password entry
+
+Restrictions
+============
+Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC
+1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a
+problem as most servers support this.
+
+Valid filenames differ between Windows and Linux. Windows typically restricts
+filenames which contain certain reserved characters (e.g.the character :
+which is used to delimit the beginning of a stream name by Windows), while
+Linux allows a slightly wider set of valid characters in filenames. Windows
+servers can remap such characters when an explicit mapping is specified in
+the Server's registry. Samba starting with version 3.10 will allow such
+filenames (ie those which contain valid Linux characters, which normally
+would be forbidden for Windows/CIFS semantics) as long as the server is
+configured for Unix Extensions (and the client has not disabled
+/proc/fs/cifs/LinuxExtensionsEnabled).
+
+
+CIFS VFS Mount Options
+======================
+A partial list of the supported mount options follows:
+ user The user name to use when trying to establish
+ the CIFS session.
+ password The user password. If the mount helper is
+ installed, the user will be prompted for password
+ if not supplied.
+ ip The ip address of the target server
+ unc The target server Universal Network Name (export) to
+ mount.
+ domain Set the SMB/CIFS workgroup name prepended to the
+ username during CIFS session establishment
+ forceuid Set the default uid for inodes to the uid
+ passed in on mount. For mounts to servers
+ which do support the CIFS Unix extensions, such as a
+ properly configured Samba server, the server provides
+ the uid, gid and mode so this parameter should not be
+ specified unless the server and clients uid and gid
+ numbering differ. If the server and client are in the
+ same domain (e.g. running winbind or nss_ldap) and
+ the server supports the Unix Extensions then the uid
+ and gid can be retrieved from the server (and uid
+ and gid would not have to be specifed on the mount.
+ For servers which do not support the CIFS Unix
+ extensions, the default uid (and gid) returned on lookup
+ of existing files will be the uid (gid) of the person
+ who executed the mount (root, except when mount.cifs
+ is configured setuid for user mounts) unless the "uid="
+ (gid) mount option is specified. Also note that permission
+ checks (authorization checks) on accesses to a file occur
+ at the server, but there are cases in which an administrator
+ may want to restrict at the client as well. For those
+ servers which do not report a uid/gid owner
+ (such as Windows), permissions can also be checked at the
+ client, and a crude form of client side permission checking
+ can be enabled by specifying file_mode and dir_mode on
+ the client. (default)
+ forcegid (similar to above but for the groupid instead of uid) (default)
+ noforceuid Fill in file owner information (uid) by requesting it from
+ the server if possible. With this option, the value given in
+ the uid= option (on mount) will only be used if the server
+ can not support returning uids on inodes.
+ noforcegid (similar to above but for the group owner, gid, instead of uid)
+ uid Set the default uid for inodes, and indicate to the
+ cifs kernel driver which local user mounted. If the server
+ supports the unix extensions the default uid is
+ not used to fill in the owner fields of inodes (files)
+ unless the "forceuid" parameter is specified.
+ gid Set the default gid for inodes (similar to above).
+ file_mode If CIFS Unix extensions are not supported by the server
+ this overrides the default mode for file inodes.
+ fsc Enable local disk caching using FS-Cache (off by default). This
+ option could be useful to improve performance on a slow link,
+ heavily loaded server and/or network where reading from the
+ disk is faster than reading from the server (over the network).
+ This could also impact scalability positively as the
+ number of calls to the server are reduced. However, local
+ caching is not suitable for all workloads for e.g. read-once
+ type workloads. So, you need to consider carefully your
+ workload/scenario before using this option. Currently, local
+ disk caching is functional for CIFS files opened as read-only.
+ dir_mode If CIFS Unix extensions are not supported by the server
+ this overrides the default mode for directory inodes.
+ port attempt to contact the server on this tcp port, before
+ trying the usual ports (port 445, then 139).
+ iocharset Codepage used to convert local path names to and from
+ Unicode. Unicode is used by default for network path
+ names if the server supports it. If iocharset is
+ not specified then the nls_default specified
+ during the local client kernel build will be used.
+ If server does not support Unicode, this parameter is
+ unused.
+ rsize default read size (usually 16K). The client currently
+ can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize
+ defaults to 16K and may be changed (from 8K to the maximum
+ kmalloc size allowed by your kernel) at module install time
+ for cifs.ko. Setting CIFSMaxBufSize to a very large value
+ will cause cifs to use more memory and may reduce performance
+ in some cases. To use rsize greater than 127K (the original
+ cifs protocol maximum) also requires that the server support
+ a new Unix Capability flag (for very large read) which some
+ newer servers (e.g. Samba 3.0.26 or later) do. rsize can be
+ set from a minimum of 2048 to a maximum of 130048 (127K or
+ CIFSMaxBufSize, whichever is smaller)
+ wsize default write size (default 57344)
+ maximum wsize currently allowed by CIFS is 57344 (fourteen
+ 4096 byte pages)
+ actimeo=n attribute cache timeout in seconds (default 1 second).
+ After this timeout, the cifs client requests fresh attribute
+ information from the server. This option allows to tune the
+ attribute cache timeout to suit the workload needs. Shorter
+ timeouts mean better the cache coherency, but increased number
+ of calls to the server. Longer timeouts mean reduced number
+ of calls to the server at the expense of less stricter cache
+ coherency checks (i.e. incorrect attribute cache for a short
+ period of time).
+ rw mount the network share read-write (note that the
+ server may still consider the share read-only)
+ ro mount network share read-only
+ version used to distinguish different versions of the
+ mount helper utility (not typically needed)
+ sep if first mount option (after the -o), overrides
+ the comma as the separator between the mount
+ parms. e.g.
+ -o user=myname,password=mypassword,domain=mydom
+ could be passed instead with period as the separator by
+ -o sep=.user=myname.password=mypassword.domain=mydom
+ this might be useful when comma is contained within username
+ or password or domain. This option is less important
+ when the cifs mount helper cifs.mount (version 1.1 or later)
+ is used.
+ nosuid Do not allow remote executables with the suid bit
+ program to be executed. This is only meaningful for mounts
+ to servers such as Samba which support the CIFS Unix Extensions.
+ If you do not trust the servers in your network (your mount
+ targets) it is recommended that you specify this option for
+ greater security.
+ exec Permit execution of binaries on the mount.
+ noexec Do not permit execution of binaries on the mount.
+ dev Recognize block devices on the remote mount.
+ nodev Do not recognize devices on the remote mount.
+ suid Allow remote files on this mountpoint with suid enabled to
+ be executed (default for mounts when executed as root,
+ nosuid is default for user mounts).
+ credentials Although ignored by the cifs kernel component, it is used by
+ the mount helper, mount.cifs. When mount.cifs is installed it
+ opens and reads the credential file specified in order
+ to obtain the userid and password arguments which are passed to
+ the cifs vfs.
+ guest Although ignored by the kernel component, the mount.cifs
+ mount helper will not prompt the user for a password
+ if guest is specified on the mount options. If no
+ password is specified a null password will be used.
+ perm Client does permission checks (vfs_permission check of uid
+ and gid of the file against the mode and desired operation),
+ Note that this is in addition to the normal ACL check on the
+ target machine done by the server software.
+ Client permission checking is enabled by default.
+ noperm Client does not do permission checks. This can expose
+ files on this mount to access by other users on the local
+ client system. It is typically only needed when the server
+ supports the CIFS Unix Extensions but the UIDs/GIDs on the
+ client and server system do not match closely enough to allow
+ access by the user doing the mount, but it may be useful with
+ non CIFS Unix Extension mounts for cases in which the default
+ mode is specified on the mount but is not to be enforced on the
+ client (e.g. perhaps when MultiUserMount is enabled)
+ Note that this does not affect the normal ACL check on the
+ target machine done by the server software (of the server
+ ACL against the user name provided at mount time).
+ serverino Use server's inode numbers instead of generating automatically
+ incrementing inode numbers on the client. Although this will
+ make it easier to spot hardlinked files (as they will have
+ the same inode numbers) and inode numbers may be persistent,
+ note that the server does not guarantee that the inode numbers
+ are unique if multiple server side mounts are exported under a
+ single share (since inode numbers on the servers might not
+ be unique if multiple filesystems are mounted under the same
+ shared higher level directory). Note that some older
+ (e.g. pre-Windows 2000) do not support returning UniqueIDs
+ or the CIFS Unix Extensions equivalent and for those
+ this mount option will have no effect. Exporting cifs mounts
+ under nfsd requires this mount option on the cifs mount.
+ This is now the default if server supports the
+ required network operation.
+ noserverino Client generates inode numbers (rather than using the actual one
+ from the server). These inode numbers will vary after
+ unmount or reboot which can confuse some applications,
+ but not all server filesystems support unique inode
+ numbers.
+ setuids If the CIFS Unix extensions are negotiated with the server
+ the client will attempt to set the effective uid and gid of
+ the local process on newly created files, directories, and
+ devices (create, mkdir, mknod). If the CIFS Unix Extensions
+ are not negotiated, for newly created files and directories
+ instead of using the default uid and gid specified on
+ the mount, cache the new file's uid and gid locally which means
+ that the uid for the file can change when the inode is
+ reloaded (or the user remounts the share).
+ nosetuids The client will not attempt to set the uid and gid on
+ on newly created files, directories, and devices (create,
+ mkdir, mknod) which will result in the server setting the
+ uid and gid to the default (usually the server uid of the
+ user who mounted the share). Letting the server (rather than
+ the client) set the uid and gid is the default. If the CIFS
+ Unix Extensions are not negotiated then the uid and gid for
+ new files will appear to be the uid (gid) of the mounter or the
+ uid (gid) parameter specified on the mount.
+ netbiosname When mounting to servers via port 139, specifies the RFC1001
+ source name to use to represent the client netbios machine
+ name when doing the RFC1001 netbios session initialize.
+ direct Do not do inode data caching on files opened on this mount.
+ This precludes mmapping files on this mount. In some cases
+ with fast networks and little or no caching benefits on the
+ client (e.g. when the application is doing large sequential
+ reads bigger than page size without rereading the same data)
+ this can provide better performance than the default
+ behavior which caches reads (readahead) and writes
+ (writebehind) through the local Linux client pagecache
+ if oplock (caching token) is granted and held. Note that
+ direct allows write operations larger than page size
+ to be sent to the server.
+ strictcache Use for switching on strict cache mode. In this mode the
+ client read from the cache all the time it has Oplock Level II,
+ otherwise - read from the server. All written data are stored
+ in the cache, but if the client doesn't have Exclusive Oplock,
+ it writes the data to the server.
+ rwpidforward Forward pid of a process who opened a file to any read or write
+ operation on that file. This prevent applications like WINE
+ from failing on read and write if we use mandatory brlock style.
+ acl Allow setfacl and getfacl to manage posix ACLs if server
+ supports them. (default)
+ noacl Do not allow setfacl and getfacl calls on this mount
+ user_xattr Allow getting and setting user xattrs (those attributes whose
+ name begins with "user." or "os2.") as OS/2 EAs (extended
+ attributes) to the server. This allows support of the
+ setfattr and getfattr utilities. (default)
+ nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs
+ mapchars Translate six of the seven reserved characters (not backslash)
+ *?<>|:
+ to the remap range (above 0xF000), which also
+ allows the CIFS client to recognize files created with
+ such characters by Windows's POSIX emulation. This can
+ also be useful when mounting to most versions of Samba
+ (which also forbids creating and opening files
+ whose names contain any of these seven characters).
+ This has no effect if the server does not support
+ Unicode on the wire.
+ nomapchars Do not translate any of these seven characters (default).
+ nocase Request case insensitive path name matching (case
+ sensitive is the default if the server supports it).
+ (mount option "ignorecase" is identical to "nocase")
+ posixpaths If CIFS Unix extensions are supported, attempt to
+ negotiate posix path name support which allows certain
+ characters forbidden in typical CIFS filenames, without
+ requiring remapping. (default)
+ noposixpaths If CIFS Unix extensions are supported, do not request
+ posix path name support (this may cause servers to
+ reject creatingfile with certain reserved characters).
+ nounix Disable the CIFS Unix Extensions for this mount (tree
+ connection). This is rarely needed, but it may be useful
+ in order to turn off multiple settings all at once (ie
+ posix acls, posix locks, posix paths, symlink support
+ and retrieving uids/gids/mode from the server) or to
+ work around a bug in server which implement the Unix
+ Extensions.
+ nobrl Do not send byte range lock requests to the server.
+ This is necessary for certain applications that break
+ with cifs style mandatory byte range locks (and most
+ cifs servers do not yet support requesting advisory
+ byte range locks).
+ forcemandatorylock Even if the server supports posix (advisory) byte range
+ locking, send only mandatory lock requests. For some
+ (presumably rare) applications, originally coded for
+ DOS/Windows, which require Windows style mandatory byte range
+ locking, they may be able to take advantage of this option,
+ forcing the cifs client to only send mandatory locks
+ even if the cifs server would support posix advisory locks.
+ "forcemand" is accepted as a shorter form of this mount
+ option.
+ nostrictsync If this mount option is set, when an application does an
+ fsync call then the cifs client does not send an SMB Flush
+ to the server (to force the server to write all dirty data
+ for this file immediately to disk), although cifs still sends
+ all dirty (cached) file data to the server and waits for the
+ server to respond to the write. Since SMB Flush can be
+ very slow, and some servers may be reliable enough (to risk
+ delaying slightly flushing the data to disk on the server),
+ turning on this option may be useful to improve performance for
+ applications that fsync too much, at a small risk of server
+ crash. If this mount option is not set, by default cifs will
+ send an SMB flush request (and wait for a response) on every
+ fsync call.
+ nodfs Disable DFS (global name space support) even if the
+ server claims to support it. This can help work around
+ a problem with parsing of DFS paths with Samba server
+ versions 3.0.24 and 3.0.25.
+ remount remount the share (often used to change from ro to rw mounts
+ or vice versa)
+ cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for
+ the file. (EXPERIMENTAL)
+ servern Specify the server 's netbios name (RFC1001 name) to use
+ when attempting to setup a session to the server.
+ This is needed for mounting to some older servers (such
+ as OS/2 or Windows 98 and Windows ME) since they do not
+ support a default server name. A server name can be up
+ to 15 characters long and is usually uppercased.
+ sfu When the CIFS Unix Extensions are not negotiated, attempt to
+ create device files and fifos in a format compatible with
+ Services for Unix (SFU). In addition retrieve bits 10-12
+ of the mode via the SETFILEBITS extended attribute (as
+ SFU does). In the future the bottom 9 bits of the
+ mode also will be emulated using queries of the security
+ descriptor (ACL).
+ mfsymlinks Enable support for Minshall+French symlinks
+ (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
+ This option is ignored when specified together with the
+ 'sfu' option. Minshall+French symlinks are used even if
+ the server supports the CIFS Unix Extensions.
+ sign Must use packet signing (helps avoid unwanted data modification
+ by intermediate systems in the route). Note that signing
+ does not work with lanman or plaintext authentication.
+ seal Must seal (encrypt) all data on this mounted share before
+ sending on the network. Requires support for Unix Extensions.
+ Note that this differs from the sign mount option in that it
+ causes encryption of data sent over this mounted share but other
+ shares mounted to the same server are unaffected.
+ locallease This option is rarely needed. Fcntl F_SETLEASE is
+ used by some applications such as Samba and NFSv4 server to
+ check to see whether a file is cacheable. CIFS has no way
+ to explicitly request a lease, but can check whether a file
+ is cacheable (oplocked). Unfortunately, even if a file
+ is not oplocked, it could still be cacheable (ie cifs client
+ could grant fcntl leases if no other local processes are using
+ the file) for cases for example such as when the server does not
+ support oplocks and the user is sure that the only updates to
+ the file will be from this client. Specifying this mount option
+ will allow the cifs client to check for leases (only) locally
+ for files which are not oplocked instead of denying leases
+ in that case. (EXPERIMENTAL)
+ sec Security mode. Allowed values are:
+ none attempt to connection as a null user (no name)
+ krb5 Use Kerberos version 5 authentication
+ krb5i Use Kerberos authentication and packet signing
+ ntlm Use NTLM password hashing (default)
+ ntlmi Use NTLM password hashing with signing (if
+ /proc/fs/cifs/PacketSigningEnabled on or if
+ server requires signing also can be the default)
+ ntlmv2 Use NTLMv2 password hashing
+ ntlmv2i Use NTLMv2 password hashing with packet signing
+ lanman (if configured in kernel config) use older
+ lanman hash
+hard Retry file operations if server is not responding
+soft Limit retries to unresponsive servers (usually only
+ one retry) before returning an error. (default)
+
+The mount.cifs mount helper also accepts a few mount options before -o
+including:
+
+ -S take password from stdin (equivalent to setting the environment
+ variable "PASSWD_FD=0"
+ -V print mount.cifs version
+ -? display simple usage information
+
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
+module can be displayed via modinfo.
+
+Misc /proc/fs/cifs Flags and Debug Info
+=======================================
+Informational pseudo-files:
+DebugData Displays information about active CIFS sessions and
+ shares, features enabled as well as the cifs.ko
+ version.
+Stats Lists summary resource usage information as well as per
+ share statistics, if CONFIG_CIFS_STATS in enabled
+ in the kernel configuration.
+
+Configuration pseudo-files:
+PacketSigningEnabled If set to one, cifs packet signing is enabled
+ and will be used if the server requires
+ it. If set to two, cifs packet signing is
+ required even if the server considers packet
+ signing optional. (default 1)
+SecurityFlags Flags which control security negotiation and
+ also packet signing. Authentication (may/must)
+ flags (e.g. for NTLM and/or NTLMv2) may be combined with
+ the signing flags. Specifying two different password
+ hashing mechanisms (as "must use") on the other hand
+ does not make much sense. Default flags are
+ 0x07007
+ (NTLM, NTLMv2 and packet signing allowed). The maximum
+ allowable flags if you want to allow mounts to servers
+ using weaker password hashes is 0x37037 (lanman,
+ plaintext, ntlm, ntlmv2, signing allowed). Some
+ SecurityFlags require the corresponding menuconfig
+ options to be enabled (lanman and plaintext require
+ CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
+ plaintext authentication currently requires also
+ enabling lanman authentication in the security flags
+ because the cifs module only supports sending
+ laintext passwords using the older lanman dialect
+ form of the session setup SMB. (e.g. for authentication
+ using plain text passwords, set the SecurityFlags
+ to 0x30030):
+
+ may use packet signing 0x00001
+ must use packet signing 0x01001
+ may use NTLM (most common password hash) 0x00002
+ must use NTLM 0x02002
+ may use NTLMv2 0x00004
+ must use NTLMv2 0x04004
+ may use Kerberos security 0x00008
+ must use Kerberos 0x08008
+ may use lanman (weak) password hash 0x00010
+ must use lanman password hash 0x10010
+ may use plaintext passwords 0x00020
+ must use plaintext passwords 0x20020
+ (reserved for future packet encryption) 0x00040
+
+cifsFYI If set to non-zero value, additional debug information
+ will be logged to the system error log. This field
+ contains three flags controlling different classes of
+ debugging entries. The maximum value it can be set
+ to is 7 which enables all debugging points (default 0).
+ Some debugging statements are not compiled into the
+ cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+ kernel configuration. cifsFYI may be set to one or
+ nore of the following flags (7 sets them all):
+
+ log cifs informational messages 0x01
+ log return codes from cifs entry points 0x02
+ log slow responses (ie which take longer than 1 second)
+ CONFIG_CIFS_STATS2 must be enabled in .config 0x04
+
+
+traceSMB If set to one, debug information is logged to the
+ system error log with the start of smb requests
+ and responses (default 0)
+LookupCacheEnable If set to one, inode information is kept cached
+ for one second improving performance of lookups
+ (default 1)
+OplockEnabled If set to one, safe distributed caching enabled.
+ (default 1)
+LinuxExtensionsEnabled If set to one then the client will attempt to
+ use the CIFS "UNIX" extensions which are optional
+ protocol enhancements that allow CIFS servers
+ to return accurate UID/GID information as well
+ as support symbolic links. If you use servers
+ such as Samba that support the CIFS Unix
+ extensions but do not want to use symbolic link
+ support and want to map the uid and gid fields
+ to values supplied at mount (rather than the
+ actual values, then set this to zero. (default 1)
+
+These experimental features and tracing can be enabled by changing flags in
+/proc/fs/cifs (after the cifs module has been installed or built into the
+kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable
+tracing to the kernel message log type:
+
+ echo 7 > /proc/fs/cifs/cifsFYI
+
+cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
+logging of various informational messages. 2 enables logging of non-zero
+SMB return codes while 4 enables logging of requests that take longer
+than one second to complete (except for byte range lock requests).
+Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
+source code (typically by setting it in the beginning of cifsglob.h),
+and setting it to seven enables all three. Finally, tracing
+the start of smb requests and responses can be enabled via:
+
+ echo 1 > /proc/fs/cifs/traceSMB
+
+Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
+if the kernel was configured with cifs statistics enabled. The statistics
+represent the number of successful (ie non-zero return code from the server)
+SMB responses to some of the more common commands (open, delete, mkdir etc.).
+Also recorded is the total bytes read and bytes written to the server for
+that share. Note that due to client caching effects this can be less than the
+number of bytes read and written by the application running on the client.
+The statistics for the number of total SMBs and oplock breaks are different in
+that they represent all for that share, not just those for which the server
+returned success.
+
+Also note that "cat /proc/fs/cifs/DebugData" will display information about
+the active sessions and the shares that are mounted.
+
+Enabling Kerberos (extended security) works but requires version 1.2 or later
+of the helper program cifs.upcall to be present and to be configured in the
+/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
+project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
+require this helper. Note that NTLMv2 security (which does not require the
+cifs.upcall helper program), instead of using Kerberos, is sufficient for
+some use cases.
+
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
+names which begin with host names (rather than IP addresses) requires
+a user space helper (such as cifs.upcall) to be present in order to
+translate host names to ip address, and the user space helper must also
+be configured in the file /etc/request-key.conf. Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
+
+To use cifs Kerberos and DFS support, the Linux keyutils package should be
+installed and something like the following lines should be added to the
+/etc/request-key.conf file:
+
+create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
+create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+
+CIFS kernel module parameters
+=============================
+These module parameters can be specified or modified either during the time of
+module loading or during the runtime by using the interface
+ /proc/module/cifs/parameters/<param>
+
+i.e. echo "value" > /sys/module/cifs/parameters/<param>
+
+1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+ [Y/y/1]. To disable use any of [N/n/0].
+
diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO
new file mode 100644
index 00000000000..355abcdcda9
--- /dev/null
+++ b/Documentation/filesystems/cifs/TODO
@@ -0,0 +1,129 @@
+Version 1.53 May 20, 2008
+
+A Partial List of Missing Features
+==================================
+
+Contributions are welcome. There are plenty of opportunities
+for visible, important contributions to this module. Here
+is a partial list of the known problems and missing features:
+
+a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
+so that these operations can be supported to Windows servers
+
+b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
+SecurityDescriptors
+
+c) Better pam/winbind integration (e.g. to handle uid mapping
+better)
+
+d) Cleanup now unneeded SessSetup code in
+fs/cifs/connect.c and add back in NTLMSSP code if any servers
+need it
+
+e) fix NTLMv2 signing when two mounts with different users to same
+server.
+
+f) Directory entry caching relies on a 1 second timer, rather than
+using FindNotify or equivalent. - (started)
+
+g) quota support (needs minor kernel change since quota calls
+to make it to network filesystems or deviceless filesystems)
+
+h) investigate sync behavior (including syncpage) and check
+for proper behavior of intr/nointr
+
+i) improve support for very old servers (OS/2 and Win9x for example)
+Including support for changing the time remotely (utimes command).
+
+j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
+extra copy in/out of the socket buffers in some cases.
+
+k) Better optimize open (and pathbased setfilesize) to reduce the
+oplock breaks coming from windows srv. Piggyback identical file
+opens on top of each other by incrementing reference count rather
+than resending (helps reduce server resource utilization and avoid
+spurious oplock breaks).
+
+l) Improve performance of readpages by sending more than one read
+at a time when 8 pages or more are requested. In conjuntion
+add support for async_cifs_readpages.
+
+m) Add support for storing symlink info to Windows servers
+in the Extended Attribute format their SFU clients would recognize.
+
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+will autorefresh (partially complete by Asser). Needs minor kernel
+vfs change to support removing D_NOTIFY on a file.
+
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
+the CIFS statistics (started)
+
+p) implement support for security and trusted categories of xattrs
+(requires minor protocol extension) to enable better support for SELINUX
+
+q) Implement O_DIRECT flag on open (already supported on mount)
+
+r) Create UID mapping facility so server UIDs can be mapped on a per
+mount or a per server basis to client UIDs or nobody if no mapping
+exists. This is helpful when Unix extensions are negotiated to
+allow better permission checking when UIDs differ on the server
+and client. Add new protocol request to the CIFS protocol
+standard for asking the server for the corresponding name of a
+particular uid.
+
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
+server side for Samba 4.
+
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers)
+need to add ability to set time to server (utimes command)
+
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add support for new vfs entry point for fallocate
+
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K)
+
+KNOWN BUGS (updated April 24, 2007)
+====================================
+See http://bugzilla.samba.org - search on product "CifsVFS" for
+current bug list.
+
+1) existing symbolic links (Windows reparse points) are recognized but
+can not be created remotely. They are implemented for Samba and those that
+support the CIFS Unix extensions, although earlier versions of Samba
+overly restrict the pathnames.
+2) follow_link and readdir code does not follow dfs junctions
+but recognizes them
+3) create of new files to FAT partitions on Windows servers can
+succeed but still return access denied (appears to be Windows
+server not cifs client problem) and has not been reproduced recently.
+NTFS partitions do not have this problem.
+4) Unix/POSIX capabilities are reset after reconnection, and affect
+a few fields in the tree connection but we do do not know which
+superblocks to apply these changes to. We should probably walk
+the list of superblocks to set these. Also need to check the
+flags on the second mount to the same share, and see if we
+can do the same trick that NFS does to remount duplicate shares.
+
+Misc testing to do
+==================
+1) check out max path names and max path name components against various server
+types. Try nested symlinks (8 deep). Return max path name in stat -f information
+
+2) Modify file portion of ltp so it can run against a mounted network
+share and run it against cifs vfs in automated fashion.
+
+3) Additional performance testing and optimization using iozone and similar -
+there are some easy changes that can be done to parallelize sequential writes,
+and when signing is disabled to request larger read sizes (larger than
+negotiated size) and send larger write sizes to modern servers.
+
+4) More exhaustively test against less common servers. More testing
+against Windows 9x, Windows ME servers.
+
diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt
new file mode 100644
index 00000000000..2fac91ac96c
--- /dev/null
+++ b/Documentation/filesystems/cifs/cifs.txt
@@ -0,0 +1,31 @@
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. New and improved versions of CIFS are now
+ called SMB2 and SMB3. These dialects are also supported by the
+ CIFS VFS module. CIFS is fully supported by network
+ file servers such as Windows 2000, 2003, 2008 and 2012
+ as well by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems), so
+ this network filesystem client can mount to a wide variety of
+ servers.
+
+ The intent of this module is to provide the most advanced network
+ file system function for CIFS compliant servers, including better
+ POSIX compliance, secure per-user session establishment, high
+ performance safe distributed caching (oplock), optional packet
+ signing, large files, Unicode support and other internationalization
+ improvements. Since both Samba server and this filesystem client support
+ the CIFS Unix extensions, the combination can provide a reasonable
+ alternative to NFSv4 for fileserving in some Linux to Linux environments,
+ not just in Linux to Windows environments.
+
+ This filesystem has an mount utility (mount.cifs) that can be obtained from
+
+ https://ftp.samba.org/pub/linux-cifs/cifs-utils/
+
+ It must be installed in the directory with the other mount helpers.
+
+ For more information on the module see the project wiki page at
+
+ https://wiki.samba.org/index.php/LinuxCIFS_utils
diff --git a/Documentation/filesystems/cifs/winucase_convert.pl b/Documentation/filesystems/cifs/winucase_convert.pl
new file mode 100755
index 00000000000..322a9c833f2
--- /dev/null
+++ b/Documentation/filesystems/cifs/winucase_convert.pl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#
+# winucase_convert.pl -- convert "Windows 8 Upper Case Mapping Table.txt" to
+# a two-level set of C arrays.
+#
+# Copyright 2013: Jeff Layton <jlayton@redhat.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+while(<>) {
+ next if (!/^0x(..)(..)\t0x(....)\t/);
+ $firstchar = hex($1);
+ $secondchar = hex($2);
+ $uppercase = hex($3);
+
+ $top[$firstchar][$secondchar] = $uppercase;
+}
+
+for ($i = 0; $i < 256; $i++) {
+ next if (!$top[$i]);
+
+ printf("static const wchar_t t2_%2.2x[256] = {", $i);
+ for ($j = 0; $j < 256; $j++) {
+ if (($j % 8) == 0) {
+ print "\n\t";
+ } else {
+ print " ";
+ }
+ printf("0x%4.4x,", $top[$i][$j] ? $top[$i][$j] : 0);
+ }
+ print "\n};\n\n";
+}
+
+printf("static const wchar_t *const toplevel[256] = {", $i);
+for ($i = 0; $i < 256; $i++) {
+ if (($i % 8) == 0) {
+ print "\n\t";
+ } elsif ($top[$i]) {
+ print " ";
+ } else {
+ print " ";
+ }
+
+ if ($top[$i]) {
+ printf("t2_%2.2x,", $i);
+ } else {
+ print "NULL,";
+ }
+}
+print "\n};\n\n";
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 6872c91bce3..3a863f69272 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -14,7 +14,10 @@ Debugfs is typically mounted with a command like:
mount -t debugfs none /sys/kernel/debug
-(Or an equivalent /etc/fstab line).
+(Or an equivalent /etc/fstab line).
+The debugfs root directory is accessible only to the root user by
+default. To change access to the tree the "uid", "gid" and "mode" mount
+options can be used.
Note that the debugfs API is exported GPL-only to modules.
@@ -133,7 +136,7 @@ file.
void __iomem *base;
};
- struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset);
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
index ff7b611abf3..09bbf9a54f8 100644
--- a/Documentation/filesystems/directory-locking
+++ b/Documentation/filesystems/directory-locking
@@ -2,6 +2,10 @@
kinds of locks - per-inode (->i_mutex) and per-filesystem
(->s_vfs_rename_mutex).
+ When taking the i_mutex on multiple non-directory objects, we
+always acquire the locks in order by increasing address. We'll call
+that "inode pointer" order in the following.
+
For our purposes all operations fall in 5 classes:
1) read access. Locking rules: caller locks directory we are accessing.
@@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem
locks victim and calls the method.
4) rename() that is _not_ cross-directory. Locking rules: caller locks
-the parent, finds source and target, if target already exists - locks it
-and then calls the method.
+the parent and finds source and target. If target already exists, lock
+it. If source is a non-directory, lock it. If that means we need to
+lock both, lock them in inode pointer order.
5) link creation. Locking rules:
* lock parent
@@ -30,7 +35,9 @@ rules:
fail with -ENOTEMPTY
* if new parent is equal to or is a descendent of source
fail with -ELOOP
- * if target exists - lock it.
+ * If target exists, lock it. If source is a non-directory, lock
+ it. In case that means we need to lock both source and target,
+ do so in inode pointer order.
* call the method.
@@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B.
renames will be blocked on filesystem lock and we don't start changing
the order until we had acquired all locks).
-(3) any operation holds at most one lock on non-directory object and
- that lock is acquired after all other locks. (Proof: see descriptions
- of operations).
+(3) locks on non-directory objects are acquired only after locks on
+ directory objects, and are acquired in inode pointer order.
+ (Proof: all operations but renames take lock on at most one
+ non-directory object, except renames, which take locks on source and
+ target in inode pointer order in the case they are not directories.)
Now consider the minimal deadlock. Each process is blocked on
attempt to acquire some lock and already holds at least one lock. Let's
@@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is
not contended, since any process blocked on it is not holding any locks.
Thus all processes are blocked on ->i_mutex.
- Non-directory objects are not contended due to (3). Thus link
-creation can't be a part of deadlock - it can't be blocked on source
-and it means that it doesn't hold any locks.
+ By (3), any process holding a non-directory lock can only be
+waiting on another non-directory lock with a larger address. Therefore
+the process holding the "largest" such lock can always make progress, and
+non-directory objects are not included in the set of contended locks.
+
+ Thus link creation can't be a part of deadlock - it can't be
+blocked on source and it means that it doesn't hold any locks.
Any contended object is either held by cross-directory rename or
has a child that is also contended. Indeed, suppose that it is held by
diff --git a/Documentation/filesystems/efivarfs.txt b/Documentation/filesystems/efivarfs.txt
new file mode 100644
index 00000000000..c477af086e6
--- /dev/null
+++ b/Documentation/filesystems/efivarfs.txt
@@ -0,0 +1,16 @@
+
+efivarfs - a (U)EFI variable filesystem
+
+The efivarfs filesystem was created to address the shortcomings of
+using entries in sysfs to maintain EFI variables. The old sysfs EFI
+variables code only supported variables of up to 1024 bytes. This
+limitation existed in version 0.99 of the EFI specification, but was
+removed before any full releases. Since variables can now be larger
+than a single page, sysfs isn't the best interface for this.
+
+Variables can be created, deleted and modified with the efivarfs
+filesystem.
+
+efivarfs is typically mounted like this,
+
+ mount -t efivarfs none /sys/firmware/efi/efivars
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index b100adc38ad..7ed0d17d672 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -26,11 +26,12 @@ journal=inum When a journal already exists, this option is ignored.
Otherwise, it specifies the number of the inode which
will represent the ext3 file system's journal file.
+journal_path=path
journal_dev=devnum When the external journal device's major/minor numbers
- have changed, this option allows the user to specify
+ have changed, these options allow the user to specify
the new journal location. The journal device is
- identified through its new major/minor numbers encoded
- in devnum.
+ identified through either its new major/minor numbers
+ encoded in devnum, or via a path to the device.
norecovery Don't load the journal on mounting. Note that this forces
noload mount of inconsistent filesystem, which can lead to
@@ -59,9 +60,9 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata
Setting it to very large values will improve
performance.
-barrier=<0(*)|1> This enables/disables the use of write barriers in
-barrier the jbd code. barrier=0 disables, barrier=1 enables.
-nobarrier (*) This also requires an IO stack which can support
+barrier=<0|1(*)> This enables/disables the use of write barriers in
+barrier (*) the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier This also requires an IO stack which can support
barriers, and if jbd gets an error on a barrier
write, it will disable again with a warning.
Write barriers enforce proper on-disk ordering
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 10ec4639f15..919a3293aaa 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -2,7 +2,7 @@
Ext4 Filesystem
===============
-Ext4 is an an advanced level of the ext3 filesystem which incorporates
+Ext4 is an advanced level of the ext3 filesystem which incorporates
scalability and reliability enhancements for supporting large filesystems
(64 bit) in keeping with increasing disk capacities and state-of-the-art
feature requirements.
@@ -144,14 +144,12 @@ journal_async_commit Commit block can be written to disk without waiting
mount the device. This will enable 'journal_checksum'
internally.
-journal=update Update the ext4 file system's journal to the current
- format.
-
+journal_path=path
journal_dev=devnum When the external journal device's major/minor numbers
- have changed, this option allows the user to specify
+ have changed, these options allow the user to specify
the new journal location. The journal device is
- identified through its new major/minor numbers encoded
- in devnum.
+ identified through either its new major/minor numbers
+ encoded in devnum, or via a path to the device.
norecovery Don't load the journal on mounting. Note that
noload if the filesystem was not unmounted cleanly,
@@ -203,12 +201,9 @@ inode_readahead_blks=n This tuning parameter controls the maximum
table readahead algorithm will pre-read into
the buffer cache. The default value is 32 blocks.
-nouser_xattr Disables Extended User Attributes. If you have extended
- attribute support enabled in the kernel configuration
- (CONFIG_EXT4_FS_XATTR), extended attribute support
- is enabled by default on mount. See the attr(5) manual
- page and http://acl.bestbits.at/ for more information
- about extended attributes.
+nouser_xattr Disables Extended User Attributes. See the
+ attr(5) manual page and http://acl.bestbits.at/
+ for more information about extended attributes.
noacl This option disables POSIX Access Control List
support. If ACL support is enabled in the kernel
@@ -308,7 +303,7 @@ min_batch_time=usec This parameter sets the commit time (as
fast disks, at the cost of increasing latency.
journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
- highest priorty) which should be used for I/O
+ highest priority) which should be used for I/O
operations submitted by kjournald2 during a
commit operation. This defaults to 3, which is
a slightly higher priority than the default I/O
@@ -343,7 +338,7 @@ noinit_itable Do not initialize any uninitialized inode table
init_itable=n The lazy itable init code will wait n times the
number of milliseconds it took to zero out the
previous block group's inode table. This
- minimizes the impact on the systme performance
+ minimizes the impact on the system performance
while file system's inode table is being initialized.
discard Controls whether ext4 should issue discard/TRIM
@@ -356,11 +351,6 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for
interoperability with older kernels which only
store and expect 16-bit values.
-resize Allows to resize filesystem to the end of the last
- existing block group, further resize has to be done
- with resize2fs either online, or offline. It can be
- used only with conjunction with remount.
-
block_validity This options allows to enables/disables the in-kernel
noblock_validity facility for tracking filesystem metadata blocks
within internal data structures. This allows multi-
@@ -383,6 +373,16 @@ dioread_nolock locking. If the dioread_nolock option is specified
Because of the restrictions this options comprises
it is off by default (e.g. dioread_lock).
+max_dir_size_kb=n This limits the size of directories so that any
+ attempt to expand them beyond the specified
+ limit in kilobytes will cause an ENOSPC error.
+ This is useful in memory constrained
+ environments, where a very large directory can
+ cause severe performance problems or even
+ provoke the Out Of Memory killer. (For example,
+ if there is only 512mb memory available, a 176mb
+ directory may seriously cramp the system's style.)
+
i_version Enable 64-bit inode version support. This option is
off by default.
@@ -495,6 +495,17 @@ Files in /sys/fs/ext4/<devname>
session_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
+
+ reserved_clusters This is RW file and contains number of reserved
+ clusters in the file system which will be used
+ in the specific situations to avoid costly
+ zeroout, unexpected ENOSPC, or possible data
+ loss. The default is 2% or 4096 clusters,
+ whichever is smaller and this can be changed
+ however it can never exceed number of clusters
+ in the file system. If there is not enough space
+ for the reserved space when mounting the file
+ mount will _not_ fail.
..............................................................................
Ioctls
@@ -588,6 +599,16 @@ Table of Ext4 specific ioctls
bitmaps and inode table, the userspace tool thus
just passes the new number of blocks.
+EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
+ (like i_blocks, i_size, i_flags, ...) from
+ the specified inode with inode
+ EXT4_BOOT_LOADER_INO (#5). This is typically
+ used to store a boot loader in a secure part of
+ the filesystem, where it can't be changed by a
+ normal user by accident.
+ The data blocks of the previous boot loader
+ will be associated with the given inode.
+
..............................................................................
References
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
new file mode 100644
index 00000000000..51afba17bba
--- /dev/null
+++ b/Documentation/filesystems/f2fs.txt
@@ -0,0 +1,545 @@
+================================================================================
+WHAT IS Flash-Friendly File System (F2FS)?
+================================================================================
+
+NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
+been equipped on a variety systems ranging from mobile to server systems. Since
+they are known to have different characteristics from the conventional rotating
+disks, a file system, an upper layer to the storage device, should adapt to the
+changes from the sketch in the design level.
+
+F2FS is a file system exploiting NAND flash memory-based storage devices, which
+is based on Log-structured File System (LFS). The design has been focused on
+addressing the fundamental issues in LFS, which are snowball effect of wandering
+tree and high cleaning overhead.
+
+Since a NAND flash memory-based storage device shows different characteristic
+according to its internal geometry or flash memory management scheme, namely FTL,
+F2FS and its tools support various parameters not only for configuring on-disk
+layout, but also for selecting allocation and cleaning algorithms.
+
+The following git tree provides the file system formatting tool (mkfs.f2fs),
+a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
+>> git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
+
+For reporting bugs and sending patches, please use the following mailing list:
+>> linux-f2fs-devel@lists.sourceforge.net
+
+================================================================================
+BACKGROUND AND DESIGN ISSUES
+================================================================================
+
+Log-structured File System (LFS)
+--------------------------------
+"A log-structured file system writes all modifications to disk sequentially in
+a log-like structure, thereby speeding up both file writing and crash recovery.
+The log is the only structure on disk; it contains indexing information so that
+files can be read back from the log efficiently. In order to maintain large free
+areas on disk for fast writing, we divide the log into segments and use a
+segment cleaner to compress the live information from heavily fragmented
+segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
+implementation of a log-structured file system", ACM Trans. Computer Systems
+10, 1, 26–52.
+
+Wandering Tree Problem
+----------------------
+In LFS, when a file data is updated and written to the end of log, its direct
+pointer block is updated due to the changed location. Then the indirect pointer
+block is also updated due to the direct pointer block update. In this manner,
+the upper index structures such as inode, inode map, and checkpoint block are
+also updated recursively. This problem is called as wandering tree problem [1],
+and in order to enhance the performance, it should eliminate or relax the update
+propagation as much as possible.
+
+[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
+
+Cleaning Overhead
+-----------------
+Since LFS is based on out-of-place writes, it produces so many obsolete blocks
+scattered across the whole storage. In order to serve new empty log space, it
+needs to reclaim these obsolete blocks seamlessly to users. This job is called
+as a cleaning process.
+
+The process consists of three operations as follows.
+1. A victim segment is selected through referencing segment usage table.
+2. It loads parent index structures of all the data in the victim identified by
+ segment summary blocks.
+3. It checks the cross-reference between the data and its parent index structure.
+4. It moves valid data selectively.
+
+This cleaning job may cause unexpected long delays, so the most important goal
+is to hide the latencies to users. And also definitely, it should reduce the
+amount of valid data to be moved, and move them quickly as well.
+
+================================================================================
+KEY FEATURES
+================================================================================
+
+Flash Awareness
+---------------
+- Enlarge the random write area for better performance, but provide the high
+ spatial locality
+- Align FS data structures to the operational units in FTL as best efforts
+
+Wandering Tree Problem
+----------------------
+- Use a term, “node”, that represents inodes as well as various pointer blocks
+- Introduce Node Address Table (NAT) containing the locations of all the “node”
+ blocks; this will cut off the update propagation.
+
+Cleaning Overhead
+-----------------
+- Support a background cleaning process
+- Support greedy and cost-benefit algorithms for victim selection policies
+- Support multi-head logs for static/dynamic hot and cold data separation
+- Introduce adaptive logging for efficient block allocation
+
+================================================================================
+MOUNT OPTIONS
+================================================================================
+
+background_gc=%s Turn on/off cleaning operations, namely garbage
+ collection, triggered in background when I/O subsystem is
+ idle. If background_gc=on, it will turn on the garbage
+ collection and if background_gc=off, garbage collection
+ will be truned off.
+ Default value for this option is on. So garbage
+ collection is on by default.
+disable_roll_forward Disable the roll-forward recovery routine
+discard Issue discard/TRIM commands when a segment is cleaned.
+no_heap Disable heap-style segment allocation which finds free
+ segments for data from the beginning of main area, while
+ for node from the end of main area.
+nouser_xattr Disable Extended User Attributes. Note: xattr is enabled
+ by default if CONFIG_F2FS_FS_XATTR is selected.
+noacl Disable POSIX Access Control List. Note: acl is enabled
+ by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
+active_logs=%u Support configuring the number of active logs. In the
+ current design, f2fs supports only 2, 4, and 6 logs.
+ Default number is 6.
+disable_ext_identify Disable the extension list configured by mkfs, so f2fs
+ does not aware of cold files such as media files.
+inline_xattr Enable the inline xattrs feature.
+inline_data Enable the inline data feature: New created small(<~3.4k)
+ files can be written into inode block.
+flush_merge Merge concurrent cache_flush commands as much as possible
+ to eliminate redundant command issues. If the underlying
+ device handles the cache_flush command relatively slowly,
+ recommend to enable this option.
+
+================================================================================
+DEBUGFS ENTRIES
+================================================================================
+
+/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
+f2fs. Each file shows the whole f2fs information.
+
+/sys/kernel/debug/f2fs/status includes:
+ - major file system information managed by f2fs currently
+ - average SIT information about whole segments
+ - current memory footprint consumed by f2fs.
+
+================================================================================
+SYSFS ENTRIES
+================================================================================
+
+Information about mounted f2f2 file systems can be found in
+/sys/fs/f2fs. Each mounted filesystem will have a directory in
+/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
+The files in each per-device directory are shown in table below.
+
+Files in /sys/fs/f2fs/<devname>
+(see also Documentation/ABI/testing/sysfs-fs-f2fs)
+..............................................................................
+ File Content
+
+ gc_max_sleep_time This tuning parameter controls the maximum sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_min_sleep_time This tuning parameter controls the minimum sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_no_gc_sleep_time This tuning parameter controls the default sleep
+ time for the garbage collection thread. Time is
+ in milliseconds.
+
+ gc_idle This parameter controls the selection of victim
+ policy for garbage collection. Setting gc_idle = 0
+ (default) will disable this option. Setting
+ gc_idle = 1 will select the Cost Benefit approach
+ & setting gc_idle = 2 will select the greedy aproach.
+
+ reclaim_segments This parameter controls the number of prefree
+ segments to be reclaimed. If the number of prefree
+ segments is larger than the number of segments
+ in the proportion to the percentage over total
+ volume size, f2fs tries to conduct checkpoint to
+ reclaim the prefree segments to free segments.
+ By default, 5% over total # of segments.
+
+ max_small_discards This parameter controls the number of discard
+ commands that consist small blocks less than 2MB.
+ The candidates to be discarded are cached until
+ checkpoint is triggered, and issued during the
+ checkpoint. By default, it is disabled with 0.
+
+ ipu_policy This parameter controls the policy of in-place
+ updates in f2fs. There are five policies:
+ 0: F2FS_IPU_FORCE, 1: F2FS_IPU_SSR,
+ 2: F2FS_IPU_UTIL, 3: F2FS_IPU_SSR_UTIL,
+ 4: F2FS_IPU_DISABLE.
+
+ min_ipu_util This parameter controls the threshold to trigger
+ in-place-updates. The number indicates percentage
+ of the filesystem utilization, and used by
+ F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
+
+ max_victim_search This parameter controls the number of trials to
+ find a victim segment when conducting SSR and
+ cleaning operations. The default value is 4096
+ which covers 8GB block address range.
+
+ dir_level This parameter controls the directory level to
+ support large directory. If a directory has a
+ number of files, it can reduce the file lookup
+ latency by increasing this dir_level value.
+ Otherwise, it needs to decrease this value to
+ reduce the space overhead. The default value is 0.
+
+ ram_thresh This parameter controls the memory footprint used
+ by free nids and cached nat entries. By default,
+ 10 is set, which indicates 10 MB / 1 GB RAM.
+
+================================================================================
+USAGE
+================================================================================
+
+1. Download userland tools and compile them.
+
+2. Skip, if f2fs was compiled statically inside kernel.
+ Otherwise, insert the f2fs.ko module.
+ # insmod f2fs.ko
+
+3. Create a directory trying to mount
+ # mkdir /mnt/f2fs
+
+4. Format the block device, and then mount as f2fs
+ # mkfs.f2fs -l label /dev/block_device
+ # mount -t f2fs /dev/block_device /mnt/f2fs
+
+mkfs.f2fs
+---------
+The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
+which builds a basic on-disk layout.
+
+The options consist of:
+-l [label] : Give a volume label, up to 512 unicode name.
+-a [0 or 1] : Split start location of each area for heap-based allocation.
+ 1 is set by default, which performs this.
+-o [int] : Set overprovision ratio in percent over volume size.
+ 5 is set by default.
+-s [int] : Set the number of segments per section.
+ 1 is set by default.
+-z [int] : Set the number of sections per zone.
+ 1 is set by default.
+-e [str] : Set basic extension list. e.g. "mp3,gif,mov"
+-t [0 or 1] : Disable discard command or not.
+ 1 is set by default, which conducts discard.
+
+fsck.f2fs
+---------
+The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
+partition, which examines whether the filesystem metadata and user-made data
+are cross-referenced correctly or not.
+Note that, initial version of the tool does not fix any inconsistency.
+
+The options consist of:
+ -d debug level [default:0]
+
+dump.f2fs
+---------
+The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
+file. Each file is dump_ssa and dump_sit.
+
+The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
+It shows on-disk inode information reconized by a given inode number, and is
+able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
+./dump_sit respectively.
+
+The options consist of:
+ -d debug level [default:0]
+ -i inode no (hex)
+ -s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
+ -a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
+
+Examples:
+# dump.f2fs -i [ino] /dev/sdx
+# dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
+# dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
+
+================================================================================
+DESIGN
+================================================================================
+
+On-disk Layout
+--------------
+
+F2FS divides the whole volume into a number of segments, each of which is fixed
+to 2MB in size. A section is composed of consecutive segments, and a zone
+consists of a set of sections. By default, section and zone sizes are set to one
+segment size identically, but users can easily modify the sizes by mkfs.
+
+F2FS splits the entire volume into six areas, and all the areas except superblock
+consists of multiple segments as described below.
+
+ align with the zone size <-|
+ |-> align with the segment size
+ _________________________________________________________________________
+ | | | Segment | Node | Segment | |
+ | Superblock | Checkpoint | Info. | Address | Summary | Main |
+ | (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | |
+ |____________|_____2______|______N______|______N______|______N_____|__N___|
+ . .
+ . .
+ . .
+ ._________________________________________.
+ |_Segment_|_..._|_Segment_|_..._|_Segment_|
+ . .
+ ._________._________
+ |_section_|__...__|_
+ . .
+ .________.
+ |__zone__|
+
+- Superblock (SB)
+ : It is located at the beginning of the partition, and there exist two copies
+ to avoid file system crash. It contains basic partition information and some
+ default parameters of f2fs.
+
+- Checkpoint (CP)
+ : It contains file system information, bitmaps for valid NAT/SIT sets, orphan
+ inode lists, and summary entries of current active segments.
+
+- Segment Information Table (SIT)
+ : It contains segment information such as valid block count and bitmap for the
+ validity of all the blocks.
+
+- Node Address Table (NAT)
+ : It is composed of a block address table for all the node blocks stored in
+ Main area.
+
+- Segment Summary Area (SSA)
+ : It contains summary entries which contains the owner information of all the
+ data and node blocks stored in Main area.
+
+- Main Area
+ : It contains file and directory data including their indices.
+
+In order to avoid misalignment between file system and flash-based storage, F2FS
+aligns the start block address of CP with the segment size. Also, it aligns the
+start block address of Main area with the zone size by reserving some segments
+in SSA area.
+
+Reference the following survey for additional technical details.
+https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey
+
+File System Metadata Structure
+------------------------------
+
+F2FS adopts the checkpointing scheme to maintain file system consistency. At
+mount time, F2FS first tries to find the last valid checkpoint data by scanning
+CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
+One of them always indicates the last valid data, which is called as shadow copy
+mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
+
+For file system consistency, each CP points to which NAT and SIT copies are
+valid, as shown as below.
+
+ +--------+----------+---------+
+ | CP | SIT | NAT |
+ +--------+----------+---------+
+ . . . .
+ . . . .
+ . . . .
+ +-------+-------+--------+--------+--------+--------+
+ | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
+ +-------+-------+--------+--------+--------+--------+
+ | ^ ^
+ | | |
+ `----------------------------------------'
+
+Index Structure
+---------------
+
+The key data structure to manage the data locations is a "node". Similar to
+traditional file structures, F2FS has three types of node: inode, direct node,
+indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
+indices, two direct node pointers, two indirect node pointers, and one double
+indirect node pointer as described below. One direct node block contains 1018
+data blocks, and one indirect node block contains also 1018 node blocks. Thus,
+one inode block (i.e., a file) covers:
+
+ 4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
+
+ Inode block (4KB)
+ |- data (923)
+ |- direct node (2)
+ | `- data (1018)
+ |- indirect node (2)
+ | `- direct node (1018)
+ | `- data (1018)
+ `- double indirect node (1)
+ `- indirect node (1018)
+ `- direct node (1018)
+ `- data (1018)
+
+Note that, all the node blocks are mapped by NAT which means the location of
+each node is translated by the NAT table. In the consideration of the wandering
+tree problem, F2FS is able to cut off the propagation of node updates caused by
+leaf data writes.
+
+Directory Structure
+-------------------
+
+A directory entry occupies 11 bytes, which consists of the following attributes.
+
+- hash hash value of the file name
+- ino inode number
+- len the length of file name
+- type file type such as directory, symlink, etc
+
+A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
+used to represent whether each dentry is valid or not. A dentry block occupies
+4KB with the following composition.
+
+ Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
+ dentries(11 * 214 bytes) + file name (8 * 214 bytes)
+
+ [Bucket]
+ +--------------------------------+
+ |dentry block 1 | dentry block 2 |
+ +--------------------------------+
+ . .
+ . .
+ . [Dentry Block Structure: 4KB] .
+ +--------+----------+----------+------------+
+ | bitmap | reserved | dentries | file names |
+ +--------+----------+----------+------------+
+ [Dentry Block: 4KB] . .
+ . .
+ . .
+ +------+------+-----+------+
+ | hash | ino | len | type |
+ +------+------+-----+------+
+ [Dentry Structure: 11 bytes]
+
+F2FS implements multi-level hash tables for directory structure. Each level has
+a hash table with dedicated number of hash buckets as shown below. Note that
+"A(2B)" means a bucket includes 2 data blocks.
+
+----------------------
+A : bucket
+B : block
+N : MAX_DIR_HASH_DEPTH
+----------------------
+
+level #0 | A(2B)
+ |
+level #1 | A(2B) - A(2B)
+ |
+level #2 | A(2B) - A(2B) - A(2B) - A(2B)
+ . | . . . .
+level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
+ . | . . . .
+level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
+
+The number of blocks and buckets are determined by,
+
+ ,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
+ # of blocks in level #n = |
+ `- 4, Otherwise
+
+ ,- 2^(n + dir_level),
+ | if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
+ # of buckets in level #n = |
+ `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
+ Otherwise
+
+When F2FS finds a file name in a directory, at first a hash value of the file
+name is calculated. Then, F2FS scans the hash table in level #0 to find the
+dentry consisting of the file name and its inode number. If not found, F2FS
+scans the next hash table in level #1. In this way, F2FS scans hash tables in
+each levels incrementally from 1 to N. In each levels F2FS needs to scan only
+one bucket determined by the following equation, which shows O(log(# of files))
+complexity.
+
+ bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
+
+In the case of file creation, F2FS finds empty consecutive slots that cover the
+file name. F2FS searches the empty slots in the hash tables of whole levels from
+1 to N in the same way as the lookup operation.
+
+The following figure shows an example of two cases holding children.
+ --------------> Dir <--------------
+ | |
+ child child
+
+ child - child [hole] - child
+
+ child - child - child [hole] - [hole] - child
+
+ Case 1: Case 2:
+ Number of children = 6, Number of children = 3,
+ File size = 7 File size = 7
+
+Default Block Allocation
+------------------------
+
+At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
+and Hot/Warm/Cold data.
+
+- Hot node contains direct node blocks of directories.
+- Warm node contains direct node blocks except hot node blocks.
+- Cold node contains indirect node blocks
+- Hot data contains dentry blocks
+- Warm data contains data blocks except hot and cold data blocks
+- Cold data contains multimedia data or migrated data blocks
+
+LFS has two schemes for free space management: threaded log and copy-and-compac-
+tion. The copy-and-compaction scheme which is known as cleaning, is well-suited
+for devices showing very good sequential write performance, since free segments
+are served all the time for writing new data. However, it suffers from cleaning
+overhead under high utilization. Contrarily, the threaded log scheme suffers
+from random writes, but no cleaning process is needed. F2FS adopts a hybrid
+scheme where the copy-and-compaction scheme is adopted by default, but the
+policy is dynamically changed to the threaded log scheme according to the file
+system status.
+
+In order to align F2FS with underlying flash-based storage, F2FS allocates a
+segment in a unit of section. F2FS expects that the section size would be the
+same as the unit size of garbage collection in FTL. Furthermore, with respect
+to the mapping granularity in FTL, F2FS allocates each section of the active
+logs from different zones as much as possible, since FTL can write the data in
+the active logs into one allocation unit according to its mapping granularity.
+
+Cleaning process
+----------------
+
+F2FS does cleaning both on demand and in the background. On-demand cleaning is
+triggered when there are not enough free segments to serve VFS calls. Background
+cleaner is operated by a kernel thread, and triggers the cleaning job when the
+system is idle.
+
+F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
+In the greedy algorithm, F2FS selects a victim segment having the smallest number
+of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
+according to the segment age and the number of valid blocks in order to address
+log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
+algorithm for on-demand cleaner, while background cleaner adopts cost-benefit
+algorithm.
+
+In order to identify whether the data in the victim segment are valid or not,
+F2FS manages a bitmap. Each bit represents the validity of a block, and the
+bitmap is composed of a bit stream covering whole blocks in main area.
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index ac2facc50d2..46dfc6b038c 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -113,8 +113,8 @@ the fdtable structure -
if (fd >= 0) {
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
- FD_SET(fd, fdt->open_fds);
- FD_CLR(fd, fdt->close_on_exec);
+ __set_open_fd(fd, fdt);
+ __clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
.....
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
index 0494f78d87e..fcc79957be6 100644
--- a/Documentation/filesystems/gfs2-glocks.txt
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -61,7 +61,9 @@ go_unlock | Called on the final local unlock of a lock
go_dump | Called to print content of object for debugfs file, or on
| error to dump glock to the log.
go_type | The type of the glock, LM_TYPE_.....
-go_min_hold_time | The minimum hold time
+go_callback | Called if the DLM sends a callback to drop this lock
+go_flags | GLOF_ASPACE is set, if the glock has an address space
+ | associated with it
The minimum hold time for each lock is the time after a remote lock
grant for which we ignore remote demote requests. This is in order to
@@ -89,6 +91,7 @@ go_demote_ok | Sometimes | Yes
go_lock | Yes | No
go_unlock | Yes | No
go_dump | Sometimes | Yes
+go_callback | Sometimes (N/A) | Yes
N.B. Operations must not drop either the bit lock or the spinlock
if its held on entry. go_dump and do_demote_ok must never block.
@@ -111,4 +114,118 @@ itself (locking order as above), and the other, known as the iopen
glock is used in conjunction with the i_nlink field in the inode to
determine the lifetime of the inode in question. Locking of inodes
is on a per-inode basis. Locking of rgrps is on a per rgrp basis.
+In general we prefer to lock local locks prior to cluster locks.
+
+ Glock Statistics
+ ------------------
+
+The stats are divided into two sets: those relating to the
+super block and those relating to an individual glock. The
+super block stats are done on a per cpu basis in order to
+try and reduce the overhead of gathering them. They are also
+further divided by glock type. All timings are in nanoseconds.
+
+In the case of both the super block and glock statistics,
+the same information is gathered in each case. The super
+block timing statistics are used to provide default values for
+the glock timing statistics, so that newly created glocks
+should have, as far as possible, a sensible starting point.
+The per-glock counters are initialised to zero when the
+glock is created. The per-glock statistics are lost when
+the glock is ejected from memory.
+
+The statistics are divided into three pairs of mean and
+variance, plus two counters. The mean/variance pairs are
+smoothed exponential estimates and the algorithm used is
+one which will be very familiar to those used to calculation
+of round trip times in network code. See "TCP/IP Illustrated,
+Volume 1", W. Richard Stevens, sect 21.3, "Round-Trip Time Measurement",
+p. 299 and onwards. Also, Volume 2, Sect. 25.10, p. 838 and onwards.
+Unlike the TCP/IP Illustrated case, the mean and variance are
+not scaled, but are in units of integer nanoseconds.
+
+The three pairs of mean/variance measure the following
+things:
+
+ 1. DLM lock time (non-blocking requests)
+ 2. DLM lock time (blocking requests)
+ 3. Inter-request time (again to the DLM)
+
+A non-blocking request is one which will complete right
+away, whatever the state of the DLM lock in question. That
+currently means any requests when (a) the current state of
+the lock is exclusive, i.e. a lock demotion (b) the requested
+state is either null or unlocked (again, a demotion) or (c) the
+"try lock" flag is set. A blocking request covers all the other
+lock requests.
+
+There are two counters. The first is there primarily to show
+how many lock requests have been made, and thus how much data
+has gone into the mean/variance calculations. The other counter
+is counting queuing of holders at the top layer of the glock
+code. Hopefully that number will be a lot larger than the number
+of dlm lock requests issued.
+
+So why gather these statistics? There are several reasons
+we'd like to get a better idea of these timings:
+
+1. To be able to better set the glock "min hold time"
+2. To spot performance issues more easily
+3. To improve the algorithm for selecting resource groups for
+allocation (to base it on lock wait time, rather than blindly
+using a "try lock")
+
+Due to the smoothing action of the updates, a step change in
+some input quantity being sampled will only fully be taken
+into account after 8 samples (or 4 for the variance) and this
+needs to be carefully considered when interpreting the
+results.
+
+Knowing both the time it takes a lock request to complete and
+the average time between lock requests for a glock means we
+can compute the total percentage of the time for which the
+node is able to use a glock vs. time that the rest of the
+cluster has its share. That will be very useful when setting
+the lock min hold time.
+
+Great care has been taken to ensure that we
+measure exactly the quantities that we want, as accurately
+as possible. There are always inaccuracies in any
+measuring system, but I hope this is as accurate as we
+can reasonably make it.
+
+Per sb stats can be found here:
+/sys/kernel/debug/gfs2/<fsname>/sbstats
+Per glock stats can be found here:
+/sys/kernel/debug/gfs2/<fsname>/glstats
+
+Assuming that debugfs is mounted on /sys/kernel/debug and also
+that <fsname> is replaced with the name of the gfs2 filesystem
+in question.
+
+The abbreviations used in the output as are follows:
+
+srtt - Smoothed round trip time for non-blocking dlm requests
+srttvar - Variance estimate for srtt
+srttb - Smoothed round trip time for (potentially) blocking dlm requests
+srttvarb - Variance estimate for srttb
+sirt - Smoothed inter-request time (for dlm requests)
+sirtvar - Variance estimate for sirt
+dlm - Number of dlm requests made (dcnt in glstats file)
+queue - Number of glock requests queued (qcnt in glstats file)
+
+The sbstats file contains a set of these stats for each glock type (so 8 lines
+for each type) and for each cpu (one column per cpu). The glstats file contains
+a set of these stats for each glock in a similar format to the glocks file, but
+using the format mean/variance for each of the timing stats.
+
+The gfs2_glock_lock_time tracepoint prints out the current values of the stats
+for the glock in question, along with some addition information on each dlm
+reply that is received:
+
+status - The status of the dlm request
+flags - The dlm request flags
+tdiff - The time taken by this specific request
+(remaining fields as per above list)
+
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt
index d8188966929..19a19ebebc3 100644
--- a/Documentation/filesystems/gfs2-uevents.txt
+++ b/Documentation/filesystems/gfs2-uevents.txt
@@ -62,7 +62,7 @@ be fixed.
The REMOVE uevent is generated at the end of an unsuccessful mount
or at the end of a umount of the filesystem. All REMOVE uevents will
-have been preceded by at least an ADD uevent for the same fileystem,
+have been preceded by at least an ADD uevent for the same filesystem,
and unlike the other uevents is generated automatically by the kernel's
kobject subsystem.
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 4cda926628a..cc4f2306609 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -1,7 +1,7 @@
Global File System
------------------
-http://sources.redhat.com/cluster/wiki/
+https://fedorahosted.org/cluster/wiki/HomePage
GFS is a cluster file system. It allows a cluster of computers to
simultaneously use a block device that is shared between them (with FC,
@@ -30,7 +30,8 @@ needed, simply:
If you are using Fedora, you need to install the gfs2-utils package
and, for lock_dlm, you will also need to install the cman package
-and write a cluster.conf as per the documentation.
+and write a cluster.conf as per the documentation. For F17 and above
+cman has been replaced by the dlm package.
GFS2 is not on-disk compatible with previous versions of GFS, but it
is pretty close.
@@ -39,8 +40,6 @@ The following man pages can be found at the URL above:
fsck.gfs2 to repair a filesystem
gfs2_grow to expand a filesystem online
gfs2_jadd to add journals to a filesystem online
- gfs2_tool to manipulate, examine and tune a filesystem
- gfs2_quota to examine and change quota values in a filesystem
+ tunegfs2 to manipulate, examine and tune a filesystem
gfs2_convert to convert a gfs filesystem to gfs2 in-place
- mount.gfs2 to help mount(8) mount a filesystem
mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
index af1628a1061..59f7569fc9e 100644
--- a/Documentation/filesystems/hfsplus.txt
+++ b/Documentation/filesystems/hfsplus.txt
@@ -56,4 +56,4 @@ References
kernel source: <file:fs/hfsplus>
-Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
+Apple Technote 1150 https://developer.apple.com/legacy/library/technotes/tn/tn1150.html
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
index 26ebde77e82..41fd757997b 100644
--- a/Documentation/filesystems/jfs.txt
+++ b/Documentation/filesystems/jfs.txt
@@ -3,6 +3,7 @@ IBM's Journaled File System (JFS) for Linux
JFS Homepage: http://jfs.sourceforge.net/
The following mount options are supported:
+(*) == default
iocharset=name Character set to use for converting from Unicode to
ASCII. The default is to do no conversion. Use
@@ -21,12 +22,12 @@ nointegrity Do not write to the journal. The primary use of this option
from backup media. The integrity of the volume is not
guaranteed if the system abnormally abends.
-integrity Default. Commit metadata changes to the journal. Use this
- option to remount a volume where the nointegrity option was
+integrity(*) Commit metadata changes to the journal. Use this option to
+ remount a volume where the nointegrity option was
previously specified in order to restore normal behavior.
errors=continue Keep going on a filesystem error.
-errors=remount-ro Default. Remount the filesystem read-only on an error.
+errors=remount-ro(*) Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
uid=value Override on-disk uid with specified value
@@ -35,7 +36,17 @@ umask=value Override on-disk umask with specified octal value. For
directories, the execute bit will be set if the corresponding
read bit is set.
-Please send bugs, comments, cards and letters to shaggy@linux.vnet.ibm.com.
+discard=minlen This enables/disables the use of discard/TRIM commands.
+discard The discard/TRIM commands are sent to the underlying
+nodiscard(*) block device when blocks are freed. This is useful for SSD
+ devices and sparse/thinly-provisioned LUNs. The FITRIM ioctl
+ command is also available together with the nodiscard option.
+ The value of minlen specifies the minimum blockcount, when
+ a TRIM command to the block device is considered useful.
+ When no value is given to the discard option, it defaults to
+ 64 blocks, which means 256KiB in JFS.
+ The minlen value of discard overrides the minlen value given
+ on an FITRIM ioctl().
The JFS mailing list can be subscribed to by using the link labeled
"Mail list Subscribe" at our web page http://jfs.sourceforge.net/
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
index 1716874a651..53f3b596ac0 100644
--- a/Documentation/filesystems/nfs/00-INDEX
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -12,6 +12,8 @@ nfs41-server.txt
- info on the Linux server implementation of NFSv4 minor version 1.
nfs-rdma.txt
- how to install and setup the Linux NFS/RDMA client and server software
+nfsd-admin-interfaces.txt
+ - Administrative interfaces for nfsd.
nfsroot.txt
- short guide on setting up a diskless box with NFS root filesystem.
pnfs.txt
@@ -20,3 +22,5 @@ rpc-cache.txt
- introduction to the caching mechanisms in the sunrpc layer.
idmapper.txt
- information for configuring request-keys to be used by idmapper
+rpc-server-gss.txt
+ - Information on GSS authentication support in the NFS Server
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index 09994c24728..e543b1a619c 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -93,7 +93,7 @@ For a filesystem to be exportable it must:
2/ make sure that d_splice_alias is used rather than d_add
when ->lookup finds an inode for a given parent and name.
- If inode is NULL, d_splice_alias(inode, dentry) is eqivalent to
+ If inode is NULL, d_splice_alias(inode, dentry) is equivalent to
d_add(dentry, inode), NULL
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt
index 120fd3cf7fd..fe03d10bb79 100644
--- a/Documentation/filesystems/nfs/idmapper.txt
+++ b/Documentation/filesystems/nfs/idmapper.txt
@@ -4,13 +4,21 @@ ID Mapper
=========
Id mapper is used by NFS to translate user and group ids into names, and to
translate user and group names into ids. Part of this translation involves
-performing an upcall to userspace to request the information. Id mapper will
-user request-key to perform this upcall and cache the result. The program
-/usr/sbin/nfs.idmap should be called by request-key, and will perform the
-translation and initialize a key with the resulting information.
+performing an upcall to userspace to request the information. There are two
+ways NFS could obtain this information: placing a call to /sbin/request-key
+or by placing a call to the rpc.idmap daemon.
+
+NFS will attempt to call /sbin/request-key first. If this succeeds, the
+result will be cached using the generic request-key cache. This call should
+only fail if /etc/request-key.conf is not configured for the id_resolver key
+type, see the "Configuring" section below if you wish to use the request-key
+method.
+
+If the call to /sbin/request-key fails (if /etc/request-key.conf is not
+configured with the id_resolver key type), then the idmapper will ask the
+legacy rpc.idmap daemon for the id mapping. This result will be stored
+in a custom NFS idmap cache.
- NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this
- feature.
===========
Configuring
diff --git a/Documentation/filesystems/nfs/nfs.txt b/Documentation/filesystems/nfs/nfs.txt
index f50f26ce6cd..f2571c8bef7 100644
--- a/Documentation/filesystems/nfs/nfs.txt
+++ b/Documentation/filesystems/nfs/nfs.txt
@@ -12,9 +12,47 @@ and work is in progress on adding support for minor version 1 of the NFSv4
protocol.
The purpose of this document is to provide information on some of the
-upcall interfaces that are used in order to provide the NFS client with
-some of the information that it requires in order to fully comply with
-the NFS spec.
+special features of the NFS client that can be configured by system
+administrators.
+
+
+The nfs4_unique_id parameter
+============================
+
+NFSv4 requires clients to identify themselves to servers with a unique
+string. File open and lock state shared between one client and one server
+is associated with this identity. To support robust NFSv4 state recovery
+and transparent state migration, this identity string must not change
+across client reboots.
+
+Without any other intervention, the Linux client uses a string that contains
+the local system's node name. System administrators, however, often do not
+take care to ensure that node names are fully qualified and do not change
+over the lifetime of a client system. Node names can have other
+administrative requirements that require particular behavior that does not
+work well as part of an nfs_client_id4 string.
+
+The nfs.nfs4_unique_id boot parameter specifies a unique string that can be
+used instead of a system's node name when an NFS client identifies itself to
+a server. Thus, if the system's node name is not unique, or it changes, its
+nfs.nfs4_unique_id stays the same, preventing collision with other clients
+or loss of state during NFS reboot recovery or transparent state migration.
+
+The nfs.nfs4_unique_id string is typically a UUID, though it can contain
+anything that is believed to be unique across all NFS clients. An
+nfs4_unique_id string should be chosen when a client system is installed,
+just as a system's root file system gets a fresh UUID in its label at
+install time.
+
+The string should remain fixed for the lifetime of the client. It can be
+changed safely if care is taken that the client shuts down cleanly and all
+outstanding NFSv4 state has expired, to prevent loss of NFSv4 state.
+
+This string can be stored in an NFS client's grub.conf, or it can be provided
+via a net boot facility such as PXE. It may also be specified as an nfs.ko
+module parameter. Specifying a uniquifier string is not support for NFS
+clients running in containers.
+
The DNS resolver
================
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 092fad92a3f..c49cd7e796e 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -5,11 +5,11 @@ Server support for minorversion 1 can be controlled using the
by reading this file will contain either "+4.1" or "-4.1"
correspondingly.
-Currently, server support for minorversion 1 is disabled by default.
-It can be enabled at run time by writing the string "+4.1" to
+Currently, server support for minorversion 1 is enabled by default.
+It can be disabled at run time by writing the string "-4.1" to
the /proc/fs/nfsd/versions control file. Note that to write this
-control file, the nfsd service must be taken down. Use your user-mode
-nfs-utils to set this up; see rpc.nfsd(8)
+control file, the nfsd service must be taken down. You can use rpc.nfsd
+for this; see rpc.nfsd(8).
(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and
"-4", respectively. Therefore, code meant to work on both new and old
@@ -29,40 +29,6 @@ are still under development out of tree.
See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
for more information.
-The current implementation is intended for developers only: while it
-does support ordinary file operations on clients we have tested against
-(including the linux client), it is incomplete in ways which may limit
-features unexpectedly, cause known bugs in rare cases, or cause
-interoperability problems with future clients. Known issues:
-
- - gss support is questionable: currently mounts with kerberos
- from a linux client are possible, but we aren't really
- conformant with the spec (for example, we don't use kerberos
- on the backchannel correctly).
- - Incomplete backchannel support: incomplete backchannel gss
- support and no support for BACKCHANNEL_CTL mean that
- callbacks (hence delegations and layouts) may not be
- available and clients confused by the incomplete
- implementation may fail.
- - We do not support SSV, which provides security for shared
- client-server state (thus preventing unauthorized tampering
- with locks and opens, for example). It is mandatory for
- servers to support this, though no clients use it yet.
- - Mandatory operations which we do not support, such as
- DESTROY_CLIENTID, are not currently used by clients, but will be
- (and the spec recommends their uses in common cases), and
- clients should not be expected to know how to recover from the
- case where they are not supported. This will eventually cause
- interoperability failures.
-
-In addition, some limitations are inherited from the current NFSv4
-implementation:
-
- - Incomplete delegation enforcement: if a file is renamed or
- unlinked by a local process, a client holding a delegation may
- continue to indefinitely allow opens of the file under the old
- name.
-
The table below, taken from the NFSv4.1 document, lists
the operations that are mandatory to implement (REQ), optional
(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -89,7 +55,7 @@ Operations
| | MNI | or OPT) | |
+----------------------+------------+--------------+----------------+
| ACCESS | REQ | | Section 18.1 |
-NS | BACKCHANNEL_CTL | REQ | | Section 18.33 |
+I | BACKCHANNEL_CTL | REQ | | Section 18.33 |
I | BIND_CONN_TO_SESSION | REQ | | Section 18.34 |
| CLOSE | REQ | | Section 18.2 |
| COMMIT | REQ | | Section 18.3 |
@@ -99,7 +65,7 @@ NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 |
| DELEGRETURN | OPT | FDELG, | Section 18.6 |
| | | DDELG, pNFS | |
| | | (REQ) | |
-NS | DESTROY_CLIENTID | REQ | | Section 18.50 |
+I | DESTROY_CLIENTID | REQ | | Section 18.50 |
I | DESTROY_SESSION | REQ | | Section 18.37 |
I | EXCHANGE_ID | REQ | | Section 18.35 |
I | FREE_STATEID | REQ | | Section 18.38 |
@@ -180,6 +146,16 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
Implementation notes:
+SSV:
+* The spec claims this is mandatory, but we don't actually know of any
+ implementations, so we're ignoring it for now. The server returns
+ NFS4ERR_ENCR_ALG_UNSUPP on EXCHANGE_ID, which should be future-proof.
+
+GSS on the backchannel:
+* Again, theoretically required but not widely implemented (in
+ particular, the current Linux client doesn't request it). We return
+ NFS4ERR_ENCR_ALG_UNSUPP on CREATE_SESSION.
+
DELEGPURGE:
* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or
CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that
@@ -187,12 +163,10 @@ DELEGPURGE:
now.
EXCHANGE_ID:
-* only SP4_NONE state protection supported
* implementation ids are ignored
CREATE_SESSION:
* backchannel attributes are ignored
-* backchannel security parameters are ignored
SEQUENCE:
* no support for dynamic slot table renegotiation (optional)
@@ -202,7 +176,5 @@ Nonstandard compound limitations:
ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
fail to live up to the promise we made in CREATE_SESSION fore channel
negotiation.
-* No more than one IO operation (read, write, readdir) allowed per
- compound.
See also http://wiki.linux-nfs.org/wiki/index.php/Server_4.0_and_4.1_issues.
diff --git a/Documentation/filesystems/nfs/nfsd-admin-interfaces.txt b/Documentation/filesystems/nfs/nfsd-admin-interfaces.txt
new file mode 100644
index 00000000000..56a96fb08a7
--- /dev/null
+++ b/Documentation/filesystems/nfs/nfsd-admin-interfaces.txt
@@ -0,0 +1,41 @@
+Administrative interfaces for nfsd
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Note that normally these interfaces are used only by the utilities in
+nfs-utils.
+
+nfsd is controlled mainly by pseudofiles under the "nfsd" filesystem,
+which is normally mounted at /proc/fs/nfsd/.
+
+The server is always started by the first write of a nonzero value to
+nfsd/threads.
+
+Before doing that, NFSD can be told which sockets to listen on by
+writing to nfsd/portlist; that write may be:
+
+ - an ascii-encoded file descriptor, which should refer to a
+ bound (and listening, for tcp) socket, or
+ - "transportname port", where transportname is currently either
+ "udp", "tcp", or "rdma".
+
+If nfsd is started without doing any of these, then it will create one
+udp and one tcp listener at port 2049 (see nfsd_init_socks).
+
+On startup, nfsd and lockd grace periods start.
+
+nfsd is shut down by a write of 0 to nfsd/threads. All locks and state
+are thrown away at that point.
+
+Between startup and shutdown, the number of threads may be adjusted up
+or down by additional writes to nfsd/threads or by writes to
+nfsd/pool_threads.
+
+For more detail about files under nfsd/ and what they control, see
+fs/nfsd/nfsctl.c; most of them have detailed comments.
+
+Implementation notes
+^^^^^^^^^^^^^^^^^^^^
+
+Note that the rpc server requires the caller to serialize addition and
+removal of listening sockets, and startup and shutdown of the server.
+For nfsd this is done using nfsd_mutex.
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index ffdd9d866ad..2d66ed68812 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -78,7 +78,8 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
flags = hard, nointr, noposix, cto, ac
-ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
+ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
+ <dns0-ip>:<dns1-ip>
This parameter tells the kernel how to configure IP addresses of devices
and also how to set up the IP routing table. It was originally called
@@ -158,6 +159,13 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
Default: any
+ <dns0-ip> IP address of first nameserver.
+ Value gets exported by /proc/net/pnp which is often linked
+ on embedded systems by /etc/resolv.conf.
+
+ <dns1-ip> IP address of secound nameserver.
+ Same as above.
+
nfsrootdebug
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index 983e14abe7e..adc81a35fe2 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -12,7 +12,7 @@ struct pnfs_layout_hdr
----------------------
The on-the-wire command LAYOUTGET corresponds to struct
pnfs_layout_segment, usually referred to by the variable name lseg.
-Each nfs_inode may hold a pointer to a cache of of these layout
+Each nfs_inode may hold a pointer to a cache of these layout
segments in nfsi->layout, of type struct pnfs_layout_hdr.
We reference the header for the inode pointing to it, across each
@@ -53,3 +53,57 @@ lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
bit which holds it in the pnfs_layout_hdr's list. When the final lseg
is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
bit is set, preventing any new lsegs from being added.
+
+layout drivers
+--------------
+
+PNFS utilizes what is called layout drivers. The STD defines 3 basic
+layout types: "files" "objects" and "blocks". For each of these types
+there is a layout-driver with a common function-vectors table which
+are called by the nfs-client pnfs-core to implement the different layout
+types.
+
+Files-layout-driver code is in: fs/nfs/nfs4filelayout.c && nfs4filelayoutdev.c
+Objects-layout-deriver code is in: fs/nfs/objlayout/.. directory
+Blocks-layout-deriver code is in: fs/nfs/blocklayout/.. directory
+
+objects-layout setup
+--------------------
+
+As part of the full STD implementation the objlayoutdriver.ko needs, at times,
+to automatically login to yet undiscovered iscsi/osd devices. For this the
+driver makes up-calles to a user-mode script called *osd_login*
+
+The path_name of the script to use is by default:
+ /sbin/osd_login.
+This name can be overridden by the Kernel module parameter:
+ objlayoutdriver.osd_login_prog
+
+If Kernel does not find the osd_login_prog path it will zero it out
+and will not attempt farther logins. An admin can then write new value
+to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it.
+
+The /sbin/osd_login is part of the nfs-utils package, and should usually
+be installed on distributions that support this Kernel version.
+
+The API to the login script is as follows:
+ Usage: $0 -u <URI> -o <OSDNAME> -s <SYSTEMID>
+ Options:
+ -u target uri e.g. iscsi://<ip>:<port>
+ (allways exists)
+ (More protocols can be defined in the future.
+ The client does not interpret this string it is
+ passed unchanged as received from the Server)
+ -o osdname of the requested target OSD
+ (Might be empty)
+ (A string which denotes the OSD name, there is a
+ limit of 64 chars on this string)
+ -s systemid of the requested target OSD
+ (Might be empty)
+ (This string, if not empty is always an hex
+ representation of the 20 bytes osd_system_id)
+
+blocks-layout setup
+-------------------
+
+TODO: Document the setup needs of the blocks layout driver
diff --git a/Documentation/filesystems/nfs/rpc-server-gss.txt b/Documentation/filesystems/nfs/rpc-server-gss.txt
new file mode 100644
index 00000000000..716f4be8e8b
--- /dev/null
+++ b/Documentation/filesystems/nfs/rpc-server-gss.txt
@@ -0,0 +1,91 @@
+
+rpcsec_gss support for kernel RPC servers
+=========================================
+
+This document gives references to the standards and protocols used to
+implement RPCGSS authentication in kernel RPC servers such as the NFS
+server and the NFS client's NFSv4.0 callback server. (But note that
+NFSv4.1 and higher don't require the client to act as a server for the
+purposes of authentication.)
+
+RPCGSS is specified in a few IETF documents:
+ - RFC2203 v1: http://tools.ietf.org/rfc/rfc2203.txt
+ - RFC5403 v2: http://tools.ietf.org/rfc/rfc5403.txt
+and there is a 3rd version being proposed:
+ - http://tools.ietf.org/id/draft-williams-rpcsecgssv3.txt
+ (At draft n. 02 at the time of writing)
+
+Background
+----------
+
+The RPCGSS Authentication method describes a way to perform GSSAPI
+Authentication for NFS. Although GSSAPI is itself completely mechanism
+agnostic, in many cases only the KRB5 mechanism is supported by NFS
+implementations.
+
+The Linux kernel, at the moment, supports only the KRB5 mechanism, and
+depends on GSSAPI extensions that are KRB5 specific.
+
+GSSAPI is a complex library, and implementing it completely in kernel is
+unwarranted. However GSSAPI operations are fundementally separable in 2
+parts:
+- initial context establishment
+- integrity/privacy protection (signing and encrypting of individual
+ packets)
+
+The former is more complex and policy-independent, but less
+performance-sensitive. The latter is simpler and needs to be very fast.
+
+Therefore, we perform per-packet integrity and privacy protection in the
+kernel, but leave the initial context establishment to userspace. We
+need upcalls to request userspace to perform context establishment.
+
+NFS Server Legacy Upcall Mechanism
+----------------------------------
+
+The classic upcall mechanism uses a custom text based upcall mechanism
+to talk to a custom daemon called rpc.svcgssd that is provide by the
+nfs-utils package.
+
+This upcall mechanism has 2 limitations:
+
+A) It can handle tokens that are no bigger than 2KiB
+
+In some Kerberos deployment GSSAPI tokens can be quite big, up and
+beyond 64KiB in size due to various authorization extensions attacked to
+the Kerberos tickets, that needs to be sent through the GSS layer in
+order to perform context establishment.
+
+B) It does not properly handle creds where the user is member of more
+than a few housand groups (the current hard limit in the kernel is 65K
+groups) due to limitation on the size of the buffer that can be send
+back to the kernel (4KiB).
+
+NFS Server New RPC Upcall Mechanism
+-----------------------------------
+
+The newer upcall mechanism uses RPC over a unix socket to a daemon
+called gss-proxy, implemented by a userspace program called Gssproxy.
+
+The gss_proxy RPC protocol is currently documented here:
+
+ https://fedorahosted.org/gss-proxy/wiki/ProtocolDocumentation
+
+This upcall mechanism uses the kernel rpc client and connects to the gssproxy
+userspace program over a regular unix socket. The gssproxy protocol does not
+suffer from the size limitations of the legacy protocol.
+
+Negotiating Upcall Mechanisms
+-----------------------------
+
+To provide backward compatibility, the kernel defaults to using the
+legacy mechanism. To switch to the new mechanism, gss-proxy must bind
+to /var/run/gssproxy.sock and then write "1" to
+/proc/net/rpc/use-gss-proxy. If gss-proxy dies, it must repeat both
+steps.
+
+Once the upcall mechanism is chosen, it cannot be changed. To prevent
+locking into the legacy mechanisms, the above steps must be performed
+before starting nfsd. Whoever starts nfsd can guarantee this by reading
+from /proc/net/rpc/use-gss-proxy and checking that it contains a
+"1"--the read will block until gss-proxy has done its write to the file.
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 873a2ab2e9f..41c3d332acc 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -25,9 +25,8 @@ available from the following download page. At least "mkfs.nilfs2",
cleaner or garbage collector) are required. Details on the tools are
described in the man pages included in the package.
-Project web page: http://www.nilfs.org/en/
-Download page: http://www.nilfs.org/en/download.html
-Git tree web page: http://www.nilfs.org/git/
+Project web page: http://nilfs.sourceforge.net/
+Download page: http://nilfs.sourceforge.net/en/download.html
List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
Caveats
@@ -81,6 +80,69 @@ nodiscard(*) The discard/TRIM commands are sent to the underlying
block device when blocks are freed. This is useful
for SSD devices and sparse/thinly-provisioned LUNs.
+Ioctls
+======
+
+There is some NILFS2 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all NILFS2 specific ioctls are
+shown in the table below.
+
+Table of NILFS2 specific ioctls
+..............................................................................
+ Ioctl Description
+ NILFS_IOCTL_CHANGE_CPMODE Change mode of given checkpoint between
+ checkpoint and snapshot state. This ioctl is
+ used in chcp and mkcp utilities.
+
+ NILFS_IOCTL_DELETE_CHECKPOINT Remove checkpoint from NILFS2 file system.
+ This ioctl is used in rmcp utility.
+
+ NILFS_IOCTL_GET_CPINFO Return info about requested checkpoints. This
+ ioctl is used in lscp utility and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_CPSTAT Return checkpoints statistics. This ioctl is
+ used by lscp, rmcp utilities and by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_SUINFO Return segment usage info about requested
+ segments. This ioctl is used in lssu,
+ nilfs_resize utilities and by nilfs_cleanerd
+ daemon.
+
+ NILFS_IOCTL_SET_SUINFO Modify segment usage info of requested
+ segments. This ioctl is used by
+ nilfs_cleanerd daemon to skip unnecessary
+ cleaning operation of segments and reduce
+ performance penalty or wear of flash device
+ due to redundant move of in-use blocks.
+
+ NILFS_IOCTL_GET_SUSTAT Return segment usage statistics. This ioctl
+ is used in lssu, nilfs_resize utilities and
+ by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_VINFO Return information on virtual block addresses.
+ This ioctl is used by nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_GET_BDESCS Return information about descriptors of disk
+ block numbers. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_CLEAN_SEGMENTS Do garbage collection operation in the
+ environment of requested parameters from
+ userspace. This ioctl is used by
+ nilfs_cleanerd daemon.
+
+ NILFS_IOCTL_SYNC Make a checkpoint. This ioctl is used in
+ mkcp utility.
+
+ NILFS_IOCTL_RESIZE Resize NILFS2 volume. This ioctl is used
+ by nilfs_resize utility.
+
+ NILFS_IOCTL_SET_ALLOC_RANGE Define lower limit of segments in bytes and
+ upper limit of segments in bytes. This ioctl
+ is used by nilfs_resize utility.
+
NILFS2 usage
============
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 791af8dac06..61947facfc0 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -455,8 +455,6 @@ not have this problem with odd numbers of sectors.
ChangeLog
=========
-Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
-
2.1.30:
- Fix writev() (it kept writing the first segment over and over again
instead of moving onto subsequent segments).
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt
index 65e03dd4482..c680b4b5353 100644
--- a/Documentation/filesystems/pohmelfs/network_protocol.txt
+++ b/Documentation/filesystems/pohmelfs/network_protocol.txt
@@ -20,7 +20,7 @@ Commands can be embedded into transaction command (which in turn has own command
so one can extend protocol as needed without breaking backward compatibility as long
as old commands are supported. All string lengths include tail 0 byte.
-All commands are transferred over the network in big-endian. CPU endianess is used at the end peers.
+All commands are transferred over the network in big-endian. CPU endianness is used at the end peers.
@cmd - command number, which specifies command to be processed. Following
commands are used currently:
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index b4a3d765ff9..0f3a1390bf0 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -94,9 +94,8 @@ protected.
---
[mandatory]
-BKL is also moved from around sb operations. ->write_super() Is now called
-without BKL held. BKL should have been shifted into individual fs sb_op
-functions. If you don't need it, remove it.
+BKL is also moved from around sb operations. BKL should have been shifted into
+individual fs sb_op functions. If you don't need it, remove it.
---
[informational]
@@ -282,7 +281,7 @@ ext2_write_failed and callers for an example.
[mandatory]
- ->truncate is going away. The whole truncate sequence needs to be
+ ->truncate is gone. The whole truncate sequence needs to be
implemented in ->setattr, which is now mandatory for filesystems
implementing on-disk size changes. Start with a copy of the old inode_setattr
and vmtruncate, and the reorder the vmtruncate + foofs_vmtruncate sequence to
@@ -296,8 +295,9 @@ in the beginning of ->setattr unconditionally.
->clear_inode() and ->delete_inode() are gone; ->evict_inode() should
be used instead. It gets called whenever the inode is evicted, whether it has
remaining links or not. Caller does *not* evict the pagecache or inode-associated
-metadata buffers; getting rid of those is responsibility of method, as it had
-been for ->delete_inode().
+metadata buffers; the method has to use truncate_inode_pages_final() to get rid
+of those. Caller makes sure async writeback cannot be running for the inode while
+(or after) ->evict_inode() is called.
->drop_inode() returns int now; it's called on final iput() with
inode->i_lock held and it returns true if filesystems wants the inode to be
@@ -306,14 +306,11 @@ updated appropriately. generic_delete_inode() is also alive and it consists
simply of return 1. Note that all actual eviction work is done by caller after
->drop_inode() returns.
- clear_inode() is gone; use end_writeback() instead. As before, it must
-be called exactly once on each call of ->evict_inode() (as it used to be for
-each call of ->delete_inode()). Unlike before, if you are using inode-associated
-metadata buffers (i.e. mark_buffer_dirty_inode()), it's your responsibility to
-call invalidate_inode_buffers() before end_writeback().
- No async writeback (and thus no calls of ->write_inode()) will happen
-after end_writeback() returns, so actions that should not overlap with ->write_inode()
-(e.g. freeing on-disk inode if i_nlink is 0) ought to be done after that call.
+ As before, clear_inode() must be called exactly once on each call of
+->evict_inode() (as it used to be for each call of ->delete_inode()). Unlike
+before, if you are using inode-associated metadata buffers (i.e.
+mark_buffer_dirty_inode()), it's your responsibility to call
+invalidate_inode_buffers() before clear_inode().
NOTE: checking i_nlink in the beginning of ->write_inode() and bailing out
if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput()
@@ -357,12 +354,10 @@ protects *all* the dcache state of a given dentry.
via rcu-walk path walk (basically, if the file can have had a path name in the
vfs namespace).
- i_dentry and i_rcu share storage in a union, and the vfs expects
-i_dentry to be reinitialized before it is freed, so an:
-
- INIT_LIST_HEAD(&inode->i_dentry);
-
-must be done in the RCU callback.
+ Even though i_dentry and i_rcu share storage in a union, we will
+initialize the former in inode_init_always(), so just leave it alone in
+the callback. It used to be necessary to clean it there, but not anymore
+(starting at 3.2).
--
[recommended]
@@ -429,3 +424,42 @@ filemap_write_and_wait_range() so that all dirty pages are synced out properly.
You must also keep in mind that ->fsync() is not called with i_mutex held
anymore, so if you require i_mutex locking you must make sure to take it and
release it yourself.
+
+--
+[mandatory]
+ d_alloc_root() is gone, along with a lot of bugs caused by code
+misusing it. Replacement: d_make_root(inode). The difference is,
+d_make_root() drops the reference to inode if dentry allocation fails.
+
+--
+[mandatory]
+ The witch is dead! Well, 2/3 of it, anyway. ->d_revalidate() and
+->lookup() do *not* take struct nameidata anymore; just the flags.
+--
+[mandatory]
+ ->create() doesn't take struct nameidata *; unlike the previous
+two, it gets "is it an O_EXCL or equivalent?" boolean argument. Note that
+local filesystems can ignore tha argument - they are guaranteed that the
+object doesn't exist. It's remote/distributed ones that might care...
+--
+[mandatory]
+ FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate()
+in your dentry operations instead.
+--
+[mandatory]
+ vfs_readdir() is gone; switch to iterate_dir() instead
+--
+[mandatory]
+ ->readdir() is gone now; switch to ->iterate()
+[mandatory]
+ vfs_follow_link has been removed. Filesystems must use nd_set_link
+ from ->follow_link for normal symlinks, or nd_jump_link for magic
+ /proc/<pid> style links.
+--
+[mandatory]
+ iget5_locked()/ilookup5()/ilookup5_nowait() test() callback used to be
+ called with both ->i_lock and inode_hash_lock held; the former is *not*
+ taken anymore, so verify that your callbacks do not rely on it (none
+ of the in-tree instances did). inode_hash_lock is still held,
+ of course, so they are still serialized wrt removal from inode hash,
+ as well as wrt set() callback of iget5_locked().
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a76a26a1db8..ddc531a74d0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -40,6 +40,8 @@ Table of Contents
3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
3.5 /proc/<pid>/mountinfo - Information about mounts
3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
+ 3.7 /proc/<pid>/task/<tid>/children - Information about task children
+ 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
4 Configuring procfs
4.1 Mount options
@@ -141,7 +143,7 @@ Table 1-1: Process specific entries in /proc
pagemap Page table
stack Report full stack trace, enable via CONFIG_STACKTRACE
smaps a extension based on maps, showing the memory consumption of
- each mapping
+ each mapping and flags associated with it
..............................................................................
For example, to get the status information of a process, all you have to do is
@@ -180,6 +182,7 @@ read the file /proc/PID/status:
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: ffffffffffffffff
+ Seccomp: 0
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1
@@ -231,11 +234,12 @@ Table 1-2: Contents of the status files (as of 2.6.30-rc7)
ShdPnd bitmap of shared pending signals for the process
SigBlk bitmap of blocked signals
SigIgn bitmap of ignored signals
- SigCgt bitmap of catched signals
+ SigCgt bitmap of caught signals
CapInh bitmap of inheritable capabilities
CapPrm bitmap of permitted capabilities
CapEff bitmap of effective capabilities
CapBnd bitmap of capabilities bounding set
+ Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...)
Cpus_allowed mask of CPUs on which this process may run
Cpus_allowed_list Same as previous, but in "list format"
Mems_allowed mask of memory nodes allowed to this process
@@ -290,13 +294,13 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
rsslim current limit in bytes on the rss
start_code address above which program text can run
end_code address below which program text can run
- start_stack address of the start of the stack
+ start_stack address of the start of the main process stack
esp current value of ESP
eip current value of EIP
pending bitmap of pending signals
blocked bitmap of blocked signals
sigign bitmap of ignored signals
- sigcatch bitmap of catched signals
+ sigcatch bitmap of caught signals
wchan address where process went to sleep
0 (place holder)
0 (place holder)
@@ -310,6 +314,11 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
start_data address above which program data+bss is placed
end_data address below which program data+bss is placed
start_brk address above which program heap can be expanded with brk()
+ arg_start address above which program command line is placed
+ arg_end address below which program command line is placed
+ env_start address above which program environment is placed
+ env_end address below which program environment is placed
+ exit_code the thread's exit_code in the form reported by the waitpid system call
..............................................................................
The /proc/PID/maps file containing the currently mapped memory regions and
@@ -325,7 +334,7 @@ address perms offset dev inode pathname
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
@@ -357,11 +366,39 @@ is not associated with a file:
[heap] = the heap of the program
[stack] = the stack of the main process
+ [stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
or if empty, the mapping is anonymous.
+The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
+of the individual tasks of a process. In this file you will see a mapping marked
+as [stack] if that task sees it as a stack. This is a key difference from the
+content of /proc/PID/maps, where you will see all mappings that are being used
+as stack by all of those tasks. Hence, for the example above, the task-level
+map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+
+08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
+08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
+0804a000-0806b000 rw-p 00000000 00:00 0 [heap]
+a7cb1000-a7cb2000 ---p 00000000 00:00 0
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0
+a7eb2000-a7eb3000 ---p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack]
+a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
+a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
+a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
+a800b000-a800e000 rw-p 00000000 00:00 0
+a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
+a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
+a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
+a8024000-a8027000 rw-p 00000000 00:00 0
+a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
+a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
+a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
+aff35000-aff4a000 rw-p 00000000 00:00 0
+ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
The /proc/PID/smaps is an extension based on maps, showing the memory
consumption for each of the process's mappings. For each of mappings there
@@ -381,8 +418,9 @@ Swap: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
+VmFlags: rd ex mr mw me de
-The first of these lines shows the same information as is displayed for the
+the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
(size), the amount of the mapping that is currently resident in RAM (RSS), the
process' proportional share of this mapping (PSS), the number of clean and
@@ -396,11 +434,48 @@ and a page is modified, the file page is replaced by a private anonymous copy.
"Swap" shows how much would-be-anonymous memory is also used, but out on
swap.
+"VmFlags" field deserves a separate description. This member represents the kernel
+flags associated with the particular virtual memory area in two letter encoded
+manner. The codes are the following:
+ rd - readable
+ wr - writeable
+ ex - executable
+ sh - shared
+ mr - may read
+ mw - may write
+ me - may execute
+ ms - may share
+ gd - stack segment growns down
+ pf - pure PFN range
+ dw - disabled write to the mapped file
+ lo - pages are locked in memory
+ io - memory mapped I/O area
+ sr - sequential read advise provided
+ rr - random read advise provided
+ dc - do not copy area on fork
+ de - do not expand area on remapping
+ ac - area is accountable
+ nr - swap space is not reserved for the area
+ ht - area uses huge tlb pages
+ nl - non-linear mapping
+ ar - architecture specific flag
+ dd - do not include area into core dump
+ sd - soft-dirty flag
+ mm - mixed map area
+ hg - huge page advise flag
+ nh - no-huge page advise flag
+ mg - mergable advise flag
+
+Note that there is no guarantee that every flag and associated mnemonic will
+be present in all further kernel releases. Things get changed, the flags may
+be vanished or the reverse -- new added.
+
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
-bits on both physical and virtual pages associated with a process.
+bits on both physical and virtual pages associated with a process, and the
+soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
To clear the bits for all the pages associated with the process
> echo 1 > /proc/PID/clear_refs
@@ -409,6 +484,10 @@ To clear the bits for the anonymous pages associated with the process
To clear the bits for the file mapped pages associated with the process
> echo 3 > /proc/PID/clear_refs
+
+To clear the soft-dirty bit
+ > echo 4 > /proc/PID/clear_refs
+
Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
@@ -468,7 +547,7 @@ Table 1-5: Kernel info in /proc
sys See chapter 2
sysvipc Info of SysVIPC Resources (msg, sem, shm) (2.4)
tty Info of tty drivers
- uptime System uptime
+ uptime Wall clock since boot, combined idle time of all cpus
version Kernel version
video bttv info of video resources (2.4)
vmallocinfo Show vmalloced areas
@@ -688,6 +767,7 @@ The "Locked" indicates whether the mapping is locked in memory or not.
MemTotal: 16344972 kB
MemFree: 13634064 kB
+MemAvailable: 14836172 kB
Buffers: 3656 kB
Cached: 1195708 kB
SwapCached: 0 kB
@@ -715,10 +795,19 @@ Committed_AS: 100056 kB
VmallocTotal: 112216 kB
VmallocUsed: 428 kB
VmallocChunk: 111088 kB
+AnonHugePages: 49152 kB
MemTotal: Total usable ram (i.e. physical ram minus a few reserved
bits and the kernel binary code)
MemFree: The sum of LowFree+HighFree
+MemAvailable: An estimate of how much memory is available for starting new
+ applications, without swapping. Calculated from MemFree,
+ SReclaimable, the size of the file LRU lists, and the low
+ watermarks in each zone.
+ The estimate takes into account that the system needs some
+ page cache to function well, and that not all reclaimable
+ slab will be reclaimable, due to items being in use. The
+ impact of those factors will vary from system to system.
Buffers: Relatively temporary storage for raw disk blocks
shouldn't get tremendously large (20MB or so)
Cached: in-memory cache for files read from the disk (the
@@ -748,6 +837,7 @@ VmallocChunk: 111088 kB
Dirty: Memory which is waiting to get written back to the disk
Writeback: Memory which is actively being written back to the disk
AnonPages: Non-file backed pages mapped into userspace page tables
+AnonHugePages: Non-file backed huge pages mapped into userspace page tables
Mapped: files which have been mmaped, such as libraries
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
@@ -764,7 +854,8 @@ WritebackTmp: Memory used by FUSE for temporary writeback buffers
if strict overcommit accounting is enabled (mode 2 in
'vm.overcommit_memory').
The CommitLimit is calculated with the following formula:
- CommitLimit = ('vm.overcommit_ratio' * Physical RAM) + Swap
+ CommitLimit = ([total RAM pages] - [total huge TLB pages]) *
+ overcommit_ratio / 100 + [total swap pages]
For example, on a system with 1G of physical RAM and 7G
of swap with a `vm.overcommit_ratio` of 30 it would
yield a CommitLimit of 7.3G.
@@ -774,16 +865,15 @@ Committed_AS: The amount of memory presently allocated on the system.
The committed memory is a sum of all of the memory which
has been allocated by processes, even if it has not been
"used" by them as of yet. A process which malloc()'s 1G
- of memory, but only touches 300M of it will only show up
- as using 300M of memory even if it has the address space
- allocated for the entire 1G. This 1G is memory which has
- been "committed" to by the VM and can be used at any time
- by the allocating application. With strict overcommit
- enabled on the system (mode 2 in 'vm.overcommit_memory'),
- allocations which would exceed the CommitLimit (detailed
- above) will not be permitted. This is useful if one needs
- to guarantee that processes will not fail due to lack of
- memory once that memory has been successfully allocated.
+ of memory, but only touches 300M of it will show up as
+ using 1G. This 1G is memory which has been "committed" to
+ by the VM and can be used at any time by the allocating
+ application. With strict overcommit enabled on the system
+ (mode 2 in 'vm.overcommit_memory'),allocations which would
+ exceed the CommitLimit (detailed above) will not be permitted.
+ This is useful if one needs to guarantee that processes will
+ not fail due to lack of memory once that memory has been
+ successfully allocated.
VmallocTotal: total size of vmalloc memory area
VmallocUsed: amount of vmalloc area which is used
VmallocChunk: largest contiguous block of vmalloc area which is free
@@ -968,7 +1058,6 @@ Table 1-9: Network info in /proc/net
snmp SNMP data
sockstat Socket statistics
tcp TCP sockets
- tr_rif Token ring RIF routing table
udp UDP sockets
unix UNIX domain sockets
wireless Wireless interface data (Wavelan etc)
@@ -1157,8 +1246,9 @@ second). The meanings of the columns are as follows, from left to right:
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
-interrupts serviced; each subsequent column is the total for that particular
-interrupt.
+interrupts serviced including unnumbered architecture specific interrupts;
+each subsequent column is the total for that particular numbered interrupt.
+Unnumbered interrupts are not shown, only summed into the total.
The "ctxt" line gives the total number of context switches across all CPUs.
@@ -1298,8 +1388,8 @@ may allocate from based on an estimation of its current memory and swap use.
For example, if a task is using all allowed memory, its badness score will be
1000. If it is using half of its allowed memory, its score will be 500.
-There is an additional factor included in the badness score: root
-processes are given 3% extra memory over other tasks.
+There is an additional factor included in the badness score: the current memory
+and swap usage is discounted by 3% for root processes.
The amount of "allowed" memory depends on the context in which the oom killer
was called. If it is due to the memory assigned to the allocating task's cpuset
@@ -1332,16 +1422,10 @@ be used to tune the badness score. Its acceptable values range from -16
(OOM_DISABLE) to disable oom killing entirely for that task. Its value is
scaled linearly with /proc/<pid>/oom_score_adj.
-Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
-other with its scaled value.
-
The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
requires CAP_SYS_RESOURCE.
-NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
-Documentation/feature-removal-schedule.txt.
-
Caveat: when a parent task is selected, the oom killer will sacrifice any first
generation children with separate address spaces instead, if possible. This
avoids servers and important system daemons from being killed and loses the
@@ -1352,7 +1436,7 @@ minimal amount of work.
-------------------------------------------------------------
This file can be used to check the current score used by the oom-killer is for
-any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
+any given <pid>. Use it together with /proc/<pid>/oom_score_adj to tune which
process should be killed in an out-of-memory situation.
@@ -1549,6 +1633,117 @@ then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
comm value.
+3.7 /proc/<pid>/task/<tid>/children - Information about task children
+-------------------------------------------------------------------------
+This file provides a fast way to retrieve first level children pids
+of a task pointed by <pid>/<tid> pair. The format is a space separated
+stream of pids.
+
+Note the "first level" here -- if a child has own children they will
+not be listed here, one needs to read /proc/<children-pid>/task/<tid>/children
+to obtain the descendants.
+
+Since this interface is intended to be fast and cheap it doesn't
+guarantee to provide precise results and some children might be
+skipped, especially if they've exited right after we printed their
+pids, so one need to either stop or freeze processes being inspected
+if precise results are needed.
+
+
+3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file
+---------------------------------------------------------------
+This file provides information associated with an opened file. The regular
+files have at least three fields -- 'pos', 'flags' and mnt_id. The 'pos'
+represents the current offset of the opened file in decimal form [see lseek(2)
+for details], 'flags' denotes the octal O_xxx mask the file has been
+created with [see open(2) for details] and 'mnt_id' represents mount ID of
+the file system containing the opened file [see 3.5 /proc/<pid>/mountinfo
+for details].
+
+A typical output is
+
+ pos: 0
+ flags: 0100002
+ mnt_id: 19
+
+The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags
+pair provide additional information particular to the objects they represent.
+
+ Eventfd files
+ ~~~~~~~~~~~~~
+ pos: 0
+ flags: 04002
+ mnt_id: 9
+ eventfd-count: 5a
+
+ where 'eventfd-count' is hex value of a counter.
+
+ Signalfd files
+ ~~~~~~~~~~~~~~
+ pos: 0
+ flags: 04002
+ mnt_id: 9
+ sigmask: 0000000000000200
+
+ where 'sigmask' is hex value of the signal mask associated
+ with a file.
+
+ Epoll files
+ ~~~~~~~~~~~
+ pos: 0
+ flags: 02
+ mnt_id: 9
+ tfd: 5 events: 1d data: ffffffffffffffff
+
+ where 'tfd' is a target file descriptor number in decimal form,
+ 'events' is events mask being watched and the 'data' is data
+ associated with a target [see epoll(7) for more details].
+
+ Fsnotify files
+ ~~~~~~~~~~~~~~
+ For inotify files the format is the following
+
+ pos: 0
+ flags: 02000000
+ inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d
+
+ where 'wd' is a watch descriptor in decimal form, ie a target file
+ descriptor number, 'ino' and 'sdev' are inode and device where the
+ target file resides and the 'mask' is the mask of events, all in hex
+ form [see inotify(7) for more details].
+
+ If the kernel was built with exportfs support, the path to the target
+ file is encoded as a file handle. The file handle is provided by three
+ fields 'fhandle-bytes', 'fhandle-type' and 'f_handle', all in hex
+ format.
+
+ If the kernel is built without exportfs support the file handle won't be
+ printed out.
+
+ If there is no inotify mark attached yet the 'inotify' line will be omitted.
+
+ For fanotify files the format is
+
+ pos: 0
+ flags: 02
+ mnt_id: 9
+ fanotify flags:10 event-flags:0
+ fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003
+ fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4
+
+ where fanotify 'flags' and 'event-flags' are values used in fanotify_init
+ call, 'mnt_id' is the mount point identifier, 'mflags' is the value of
+ flags associated with mark which are tracked separately from events
+ mask. 'ino', 'sdev' are target inode and device, 'mask' is the events
+ mask and 'ignored_mask' is the mask of events which are to be ignored.
+ All in hex format. Incorporation of 'mflags', 'mask' and 'ignored_mask'
+ does provide information about flags and mask used in fanotify_mark
+ call [see fsnotify manpage for details].
+
+ While the first three lines are mandatory and always printed, the rest is
+ optional and may be omitted if no marks created yet.
+
+
------------------------------------------------------------------------------
Configuring procfs
------------------------------------------------------------------------------
diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt
new file mode 100644
index 00000000000..40867978913
--- /dev/null
+++ b/Documentation/filesystems/qnx6.txt
@@ -0,0 +1,174 @@
+The QNX6 Filesystem
+===================
+
+The qnx6fs is used by newer QNX operating system versions. (e.g. Neutrino)
+It got introduced in QNX 6.4.0 and is used default since 6.4.1.
+
+Option
+======
+
+mmi_fs Mount filesystem as used for example by Audi MMI 3G system
+
+Specification
+=============
+
+qnx6fs shares many properties with traditional Unix filesystems. It has the
+concepts of blocks, inodes and directories.
+On QNX it is possible to create little endian and big endian qnx6 filesystems.
+This feature makes it possible to create and use a different endianness fs
+for the target (QNX is used on quite a range of embedded systems) plattform
+running on a different endianness.
+The Linux driver handles endianness transparently. (LE and BE)
+
+Blocks
+------
+
+The space in the device or file is split up into blocks. These are a fixed
+size of 512, 1024, 2048 or 4096, which is decided when the filesystem is
+created.
+Blockpointers are 32bit, so the maximum space that can be addressed is
+2^32 * 4096 bytes or 16TB
+
+The superblocks
+---------------
+
+The superblock contains all global information about the filesystem.
+Each qnx6fs got two superblocks, each one having a 64bit serial number.
+That serial number is used to identify the "active" superblock.
+In write mode with reach new snapshot (after each synchronous write), the
+serial of the new master superblock is increased (old superblock serial + 1)
+
+So basically the snapshot functionality is realized by an atomic final
+update of the serial number. Before updating that serial, all modifications
+are done by copying all modified blocks during that specific write request
+(or period) and building up a new (stable) filesystem structure under the
+inactive superblock.
+
+Each superblock holds a set of root inodes for the different filesystem
+parts. (Inode, Bitmap and Longfilenames)
+Each of these root nodes holds information like total size of the stored
+data and the addressing levels in that specific tree.
+If the level value is 0, up to 16 direct blocks can be addressed by each
+node.
+Level 1 adds an additional indirect addressing level where each indirect
+addressing block holds up to blocksize / 4 bytes pointers to data blocks.
+Level 2 adds an additional indirect addressing block level (so, already up
+to 16 * 256 * 256 = 1048576 blocks that can be addressed by such a tree).
+
+Unused block pointers are always set to ~0 - regardless of root node,
+indirect addressing blocks or inodes.
+Data leaves are always on the lowest level. So no data is stored on upper
+tree levels.
+
+The first Superblock is located at 0x2000. (0x2000 is the bootblock size)
+The Audi MMI 3G first superblock directly starts at byte 0.
+Second superblock position can either be calculated from the superblock
+information (total number of filesystem blocks) or by taking the highest
+device address, zeroing the last 3 bytes and then subtracting 0x1000 from
+that address.
+
+0x1000 is the size reserved for each superblock - regardless of the
+blocksize of the filesystem.
+
+Inodes
+------
+
+Each object in the filesystem is represented by an inode. (index node)
+The inode structure contains pointers to the filesystem blocks which contain
+the data held in the object and all of the metadata about an object except
+its longname. (filenames longer than 27 characters)
+The metadata about an object includes the permissions, owner, group, flags,
+size, number of blocks used, access time, change time and modification time.
+
+Object mode field is POSIX format. (which makes things easier)
+
+There are also pointers to the first 16 blocks, if the object data can be
+addressed with 16 direct blocks.
+For more than 16 blocks an indirect addressing in form of another tree is
+used. (scheme is the same as the one used for the superblock root nodes)
+
+The filesize is stored 64bit. Inode counting starts with 1. (whilst long
+filename inodes start with 0)
+
+Directories
+-----------
+
+A directory is a filesystem object and has an inode just like a file.
+It is a specially formatted file containing records which associate each
+name with an inode number.
+'.' inode number points to the directory inode
+'..' inode number points to the parent directory inode
+Eeach filename record additionally got a filename length field.
+
+One special case are long filenames or subdirectory names.
+These got set a filename length field of 0xff in the corresponding directory
+record plus the longfile inode number also stored in that record.
+With that longfilename inode number, the longfilename tree can be walked
+starting with the superblock longfilename root node pointers.
+
+Special files
+-------------
+
+Symbolic links are also filesystem objects with inodes. They got a specific
+bit in the inode mode field identifying them as symbolic link.
+The directory entry file inode pointer points to the target file inode.
+
+Hard links got an inode, a directory entry, but a specific mode bit set,
+no block pointers and the directory file record pointing to the target file
+inode.
+
+Character and block special devices do not exist in QNX as those files
+are handled by the QNX kernel/drivers and created in /dev independent of the
+underlaying filesystem.
+
+Long filenames
+--------------
+
+Long filenames are stored in a separate addressing tree. The staring point
+is the longfilename root node in the active superblock.
+Each data block (tree leaves) holds one long filename. That filename is
+limited to 510 bytes. The first two starting bytes are used as length field
+for the actual filename.
+If that structure shall fit for all allowed blocksizes, it is clear why there
+is a limit of 510 bytes for the actual filename stored.
+
+Bitmap
+------
+
+The qnx6fs filesystem allocation bitmap is stored in a tree under bitmap
+root node in the superblock and each bit in the bitmap represents one
+filesystem block.
+The first block is block 0, which starts 0x1000 after superblock start.
+So for a normal qnx6fs 0x3000 (bootblock + superblock) is the physical
+address at which block 0 is located.
+
+Bits at the end of the last bitmap block are set to 1, if the device is
+smaller than addressing space in the bitmap.
+
+Bitmap system area
+------------------
+
+The bitmap itself is divided into three parts.
+First the system area, that is split into two halves.
+Then userspace.
+
+The requirement for a static, fixed preallocated system area comes from how
+qnx6fs deals with writes.
+Each superblock got it's own half of the system area. So superblock #1
+always uses blocks from the lower half whilst superblock #2 just writes to
+blocks represented by the upper half bitmap system area bits.
+
+Bitmap blocks, Inode blocks and indirect addressing blocks for those two
+tree structures are treated as system blocks.
+
+The rational behind that is that a write request can work on a new snapshot
+(system area of the inactive - resp. lower serial numbered superblock) while
+at the same time there is still a complete stable filesystem structer in the
+other half of the system area.
+
+When finished with writing (a sync write is completed, the maximum sync leap
+time or a filesystem sync is requested), serial of the previously inactive
+superblock atomically is increased and the fs switches over to that - then
+stable declared - superblock.
+
+For all data outside the system area, blocks are just copied while writing.
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index a8273d5fad2..b176928e696 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -79,6 +79,10 @@ to just make sure certain lists can't become empty.
Most systems just mount another filesystem over rootfs and ignore it. The
amount of space an empty instance of ramfs takes up is tiny.
+If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by
+default. To force ramfs, add "rootfstype=ramfs" to the kernel command
+line.
+
What is initramfs?
------------------
@@ -297,7 +301,7 @@ the above threads) is:
either way about the archive format, and there are alternative tools,
such as:
- http://freshmeat.net/projects/afio/
+ http://freecode.com/projects/afio
2) The cpio archive format chosen by the kernel is simpler and cleaner (and
thus easier to create and parse) than any of the (literally dozens of)
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 510b722667a..33e2f369473 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -31,7 +31,7 @@ Semantics
Each relay channel has one buffer per CPU, each buffer has one or more
sub-buffers. Messages are written to the first sub-buffer until it is
-too full to contain a new message, in which case it it is written to
+too full to contain a new message, in which case it is written to
the next (if available). Messages are never split across sub-buffers.
At this point, userspace can be notified so it empties the first
sub-buffer, while the kernel continues writing to the next.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index a1e2e0dda90..1fe0ccb1af5 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -54,6 +54,15 @@ how the mechanism works without getting lost in other details. (Those
wanting to see the full source for this module can find it at
http://lwn.net/Articles/22359/).
+Deprecated create_proc_entry
+
+Note that the above article uses create_proc_entry which was removed in
+kernel 3.10. Current versions require the following update
+
+- entry = create_proc_entry("sequence", 0, NULL);
+- if (entry)
+- entry->proc_fops = &ct_file_ops;
++ entry = proc_create("sequence", 0, NULL, &ct_file_ops);
The iterator interface
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 4ede421c968..32a173dd315 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -727,7 +727,7 @@ replicas continue to be exactly same.
mkdir -p /tmp/m3
mount --rbind /root /tmp/m3
- I wont' draw the tree..but it has 24 vfsmounts
+ I won't draw the tree..but it has 24 vfsmounts
at step i the number of vfsmounts is V[i] = i*V[i-1].
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt
index caaaf1266d8..eb843e49c5a 100644
--- a/Documentation/filesystems/sysfs-tagging.txt
+++ b/Documentation/filesystems/sysfs-tagging.txt
@@ -24,7 +24,7 @@ flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will
point to the namespace to which it belongs.
Each sysfs superblock's sysfs_super_info contains an array void
-*ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace
+*ns[KOBJ_NS_TYPES]. When a task in a tagging namespace
kobj_nstype first mounts sysfs, a new superblock is created. It
will be differentiated from other sysfs mounts by having its
s_fs_info->ns[kobj_nstype] set to the new namespace. Note that
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index a6619b7064b..b35a64b82f9 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -108,12 +108,12 @@ static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo);
is equivalent to doing:
static struct device_attribute dev_attr_foo = {
- .attr = {
+ .attr = {
.name = "foo",
.mode = S_IWUSR | S_IRUGO,
- .show = show_foo,
- .store = store_foo,
},
+ .show = show_foo,
+ .store = store_foo,
};
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ead764b2728..ce1126aceed 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -111,6 +111,15 @@ tz=UTC -- Interpret timestamps as UTC rather than local time.
useful when mounting devices (like digital cameras)
that are set to UTC in order to avoid the pitfalls of
local time.
+time_offset=minutes
+ -- Set offset for conversion of timestamps from local time
+ used by FAT to UTC. I.e. <minutes> minutes will be subtracted
+ from each timestamp to convert it to UTC used internally by
+ Linux. This is useful when time zone set in sys_tz is
+ not the time zone used by the filesystem. Note that this
+ option still does not provide correct time stamps in all
+ cases in presence of DST - time stamps in a different DST
+ setting will be off by one hour.
showexec -- If set, the execute permission bits of the file will be
allowed only if the extension part of the name is .EXE,
@@ -137,6 +146,38 @@ errors=panic|continue|remount-ro
without doing anything or remount the partition in
read-only mode (default behavior).
+discard -- If set, issues discard/TRIM commands to the block
+ device when blocks are freed. This is useful for SSD devices
+ and sparse/thinly-provisoned LUNs.
+
+nfs=stale_rw|nostale_ro
+ Enable this only if you want to export the FAT filesystem
+ over NFS.
+
+ stale_rw: This option maintains an index (cache) of directory
+ inodes by i_logstart which is used by the nfs-related code to
+ improve look-ups. Full file operations (read/write) over NFS is
+ supported but with cache eviction at NFS server, this could
+ result in ESTALE issues.
+
+ nostale_ro: This option bases the inode number and filehandle
+ on the on-disk location of a file in the MS-DOS directory entry.
+ This ensures that ESTALE will not be returned after a file is
+ evicted from the inode cache. However, it means that operations
+ such as rename, create and unlink could cause filehandles that
+ previously pointed at one file to point at a different file,
+ potentially causing data corruption. For this reason, this
+ option also mounts the filesystem readonly.
+
+ To maintain backward compatibility, '-o nfs' is also accepted,
+ defaulting to stale_rw
+
+dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
+ configuration, determined by backing device size. These static
+ parameters match defaults assumed by DOS 1.x for 160 kiB,
+ 180 kiB, 320 kiB, and 360 kiB floppies and floppy images.
+
+
<bool>: 0,1,yes,no,true,false
TODO
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3d9393b845b..a1d0d7a3016 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -114,7 +114,7 @@ members are defined:
struct file_system_type {
const char *name;
int fs_flags;
- struct dentry (*mount) (struct file_system_type *, int,
+ struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner;
@@ -216,7 +216,6 @@ struct super_operations {
void (*drop_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);
- void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_fs) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
@@ -273,9 +272,6 @@ or bottom half).
put_super: called when the VFS wishes to free the superblock
(i.e. unmount). This is called with the superblock lock held
- write_super: called when the VFS superblock needs to be written to
- disc. This method is optional
-
sync_fs: called when VFS is writing out all dirty data associated with
a superblock. The second parameter indicates whether the method
should wait until the write out has been completed. Optional.
@@ -341,8 +337,8 @@ This describes how the VFS can manipulate an inode in your
filesystem. As of kernel 2.6.22, the following members are defined:
struct inode_operations {
- int (*create) (struct inode *,struct dentry *, umode_t, struct nameidata *);
- struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
+ int (*create) (struct inode *,struct dentry *, umode_t, bool);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
@@ -351,10 +347,11 @@ struct inode_operations {
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
+ int (*rename2) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
void * (*follow_link) (struct dentry *, struct nameidata *);
void (*put_link) (struct dentry *, struct nameidata *, void *);
- void (*truncate) (struct inode *);
int (*permission) (struct inode *, int);
int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
@@ -363,7 +360,10 @@ struct inode_operations {
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
- void (*truncate_range)(struct inode *, loff_t, loff_t);
+ void (*update_time)(struct inode *, struct timespec *, int);
+ int (*atomic_open)(struct inode *, struct dentry *, struct file *,
+ unsigned open_flag, umode_t create_mode, int *opened);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
};
Again, all methods are called without any locks being held, unless
@@ -416,6 +416,20 @@ otherwise noted.
rename: called by the rename(2) system call to rename the object to
have the parent and name given by the second inode and dentry.
+ rename2: this has an additional flags argument compared to rename.
+ If no flags are supported by the filesystem then this method
+ need not be implemented. If some flags are supported then the
+ filesystem must return -EINVAL for any unsupported or unknown
+ flags. Currently the following flags are implemented:
+ (1) RENAME_NOREPLACE: this flag indicates that if the target
+ of the rename exists the rename should fail with -EEXIST
+ instead of replacing the target. The VFS already checks for
+ existence, so for local filesystems the RENAME_NOREPLACE
+ implementation is equivalent to plain rename.
+ (2) RENAME_EXCHANGE: exchange source and target. Both must
+ exist; this is checked by the VFS. Unlike plain rename,
+ source and target may be of different type.
+
readlink: called by the readlink(2) system call. Only required if
you want to support reading symbolic links
@@ -432,16 +446,6 @@ otherwise noted.
started might not be in the page cache at the end of the
walk).
- truncate: Deprecated. This will not be called if ->setsize is defined.
- Called by the VFS to change the size of a file. The
- i_size field of the inode is set to the desired size by the
- VFS before this method is called. This method is called by
- the truncate(2) system call and related functionality.
-
- Note: ->truncate and vmtruncate are deprecated. Do not add new
- instances/calls of these. Filesystems should be converted to do their
- truncate sequence via ->setattr().
-
permission: called by the VFS to check for access rights on a POSIX-like
filesystem.
@@ -472,9 +476,22 @@ otherwise noted.
removexattr: called by the VFS to remove an extended attribute from
a file. This method is called by removexattr(2) system call.
- truncate_range: a method provided by the underlying filesystem to truncate a
- range of blocks , i.e. punch a hole somewhere in a file.
+ update_time: called by the VFS to update a specific time or the i_version of
+ an inode. If this is not defined the VFS will update the inode itself
+ and call mark_inode_dirty_sync.
+
+ atomic_open: called on the last component of an open. Using this optional
+ method the filesystem can look up, possibly create and open the file in
+ one atomic operation. If it cannot perform this (e.g. the file type
+ turned out to be wrong) it may signal this by returning 1 instead of
+ usual 0 or -ve . This method is only called if the last component is
+ negative or needs lookup. Cached positive dentries are still handled by
+ f_op->open(). If the file was created, the FILE_CREATED flag should be
+ set in "opened". In case of O_EXCL the method must only succeed if the
+ file didn't exist and hence FILE_CREATED shall always be set on success.
+ tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to
+ atomically creating, opening and unlinking a file in given directory.
The Address Space Object
========================
@@ -553,12 +570,11 @@ struct address_space_operations
-------------------------------
This describes how the VFS can manipulate mapping of a file to page cache in
-your filesystem. As of kernel 2.6.22, the following members are defined:
+your filesystem. The following members are defined:
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
- int (*sync_page)(struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
@@ -570,17 +586,21 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
- int (*invalidatepage) (struct page *, unsigned long);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
- ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
+ ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
+ int (*is_partially_uptodate) (struct page *, unsigned long,
+ unsigned long);
+ void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
+ int (*swap_activate)(struct file *);
+ int (*swap_deactivate)(struct file *);
};
writepage: called by the VM to write a dirty page to backing store.
@@ -609,13 +629,6 @@ struct address_space_operations {
In this case, the page will be relocated, relocked and if
that all succeeds, ->readpage will be called again.
- sync_page: called by the VM to notify the backing store to perform all
- queued I/O operations for a page. I/O operations for other pages
- associated with this address_space object may also be performed.
-
- This function is optional and is called only for pages with
- PG_Writeback set while waiting for the writeback to complete.
-
writepages: called by the VM to write out pages associated with the
address_space object. If wbc->sync_mode is WBC_SYNC_ALL, then
the writeback_control will specify a range of pages that must be
@@ -687,14 +700,14 @@ struct address_space_operations {
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
from the address space. This generally corresponds to either a
- truncation or a complete invalidation of the address space
- (in the latter case 'offset' will always be 0).
- Any private data associated with the page should be updated
- to reflect this truncation. If offset is 0, then
- the private data should be released, because the page
- must be able to be completely discarded. This may be done by
- calling the ->releasepage function, but in this case the
- release MUST succeed.
+ truncation, punch hole or a complete invalidation of the address
+ space (in the latter case 'offset' will always be 0 and 'length'
+ will be PAGE_CACHE_SIZE). Any private data associated with the page
+ should be updated to reflect this truncation. If offset is 0 and
+ length is PAGE_CACHE_SIZE, then the private data should be released,
+ because the page must be able to be completely discarded. This may
+ be done by calling the ->releasepage function, but in this case the
+ release MUST succeed.
releasepage: releasepage is called on PagePrivate pages to indicate
that the page should be freed if possible. ->releasepage
@@ -744,11 +757,35 @@ struct address_space_operations {
prevent redirtying the page, it is kept locked during the whole
operation.
+ is_partially_uptodate: Called by the VM when reading a file through the
+ pagecache when the underlying blocksize != pagesize. If the required
+ block is up to date then the read can complete without needing the IO
+ to bring the whole page up to date.
+
+ is_dirty_writeback: Called by the VM when attempting to reclaim a page.
+ The VM uses dirty and writeback information to determine if it needs
+ to stall to allow flushers a chance to complete some IO. Ordinarily
+ it can use PageDirty and PageWriteback but some filesystems have
+ more complex state (unstable pages in NFS prevent reclaim) or
+ do not set those flags due to locking problems (jbd). This callback
+ allows a filesystem to indicate to the VM if a page should be
+ treated as dirty or writeback for the purposes of stalling.
+
error_remove_page: normally set to generic_error_remove_page if truncation
is ok for this address space. Used for memory failure handling.
Setting this implies you deal with pages going away under you,
unless you have them locked or reference counts increased.
+ swap_activate: Called when swapon is used on a file to allocate
+ space if necessary and pin the block lookup information in
+ memory. A return value of zero indicates success,
+ in which case this file can be used to back swapspace. The
+ swapspace operations will be proxied to this address space's
+ ->swap_{out,in} methods.
+
+ swap_deactivate: Called during swapoff on files where swap_activate
+ was successful.
+
The File Object
===============
@@ -760,7 +797,7 @@ struct file_operations
----------------------
This describes how the VFS can manipulate an open file. As of kernel
-2.6.22, the following members are defined:
+3.12, the following members are defined:
struct file_operations {
struct module *owner;
@@ -769,7 +806,9 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
- int (*readdir) (struct file *, void *, filldir_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
@@ -781,15 +820,15 @@ struct file_operations {
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
- ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
- ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
- ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
+ int (*setlease)(struct file *, long arg, struct file_lock **);
+ long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
+ int (*show_fdinfo)(struct seq_file *m, struct file *f);
};
Again, all methods are called without any locks being held, unless
@@ -799,13 +838,17 @@ otherwise noted.
read: called by read(2) and related system calls
- aio_read: called by io_submit(2) and other asynchronous I/O operations
+ aio_read: vectored, possibly asynchronous read
+
+ read_iter: possibly asynchronous read with iov_iter as destination
write: called by write(2) and related system calls
- aio_write: called by io_submit(2) and other asynchronous I/O operations
+ aio_write: vectored, possibly asynchronous write
- readdir: called when the VFS needs to read the directory contents
+ write_iter: possibly asynchronous write with iov_iter as source
+
+ iterate: called when the VFS needs to read the directory contents
poll: called by the VFS when a process wants to check if there is
activity on this file and (optionally) go to sleep until there
@@ -840,12 +883,6 @@ otherwise noted.
lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW
commands
- readv: called by the readv(2) system call
-
- writev: called by the writev(2) system call
-
- sendfile: called by the sendfile(2) system call
-
get_unmapped_area: called by the mmap(2) system call
check_flags: called by the fcntl(2) system call for F_SETFL command
@@ -858,6 +895,11 @@ otherwise noted.
splice_read: called by the VFS to splice data from file to a pipe. This
method is used by the splice(2) system call
+ setlease: called by the VFS to set or release a file lock lease.
+ setlease has the file_lock_lock held and must not sleep.
+
+ fallocate: called by the VFS to preallocate blocks or punch a hole.
+
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special
@@ -884,11 +926,10 @@ the VFS uses a default. As of kernel 2.6.22, the following members are
defined:
struct dentry_operations {
- int (*d_revalidate)(struct dentry *, struct nameidata *);
- int (*d_hash)(const struct dentry *, const struct inode *,
- struct qstr *);
- int (*d_compare)(const struct dentry *, const struct inode *,
- const struct dentry *, const struct inode *,
+ int (*d_revalidate)(struct dentry *, unsigned int);
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
void (*d_release)(struct dentry *);
@@ -900,39 +941,57 @@ struct dentry_operations {
d_revalidate: called when the VFS needs to revalidate a dentry. This
is called whenever a name look-up finds a dentry in the
- dcache. Most filesystems leave this as NULL, because all their
- dentries in the dcache are valid
+ dcache. Most local filesystems leave this as NULL, because all their
+ dentries in the dcache are valid. Network filesystems are different
+ since things can change on the server without the client necessarily
+ being aware of it.
- d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU).
+ This function should return a positive value if the dentry is still
+ valid, and zero or a negative error code if it isn't.
+
+ d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU).
If in rcu-walk mode, the filesystem must revalidate the dentry without
blocking or storing to the dentry, d_parent and d_inode should not be
- used without care (because they can go NULL), instead nd->inode should
- be used.
+ used without care (because they can change and, in d_inode case, even
+ become NULL under us).
If a situation is encountered that rcu-walk cannot handle, return
-ECHILD and it will be called again in ref-walk mode.
+ d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry.
+ This is called when a path-walk ends at dentry that was not acquired by
+ doing a lookup in the parent directory. This includes "/", "." and "..",
+ as well as procfs-style symlinks and mountpoint traversal.
+
+ In this case, we are less concerned with whether the dentry is still
+ fully correct, but rather that the inode is still valid. As with
+ d_revalidate, most local filesystems will set this to NULL since their
+ dcache entries are always valid.
+
+ This function has the same return code semantics as d_revalidate.
+
+ d_weak_revalidate is only called after leaving rcu-walk mode.
+
d_hash: called when the VFS adds a dentry to the hash table. The first
dentry passed to d_hash is the parent directory that the name is
- to be hashed into. The inode is the dentry's inode.
+ to be hashed into.
Same locking and synchronisation rules as d_compare regarding
what is safe to dereference etc.
d_compare: called to compare a dentry name with a given name. The first
dentry is the parent of the dentry to be compared, the second is
- the parent's inode, then the dentry and inode (may be NULL) of the
- child dentry. len and name string are properties of the dentry to be
- compared. qstr is the name to compare it with.
+ the child dentry. len and name string are properties of the dentry
+ to be compared. qstr is the name to compare it with.
Must be constant and idempotent, and should not take locks if
- possible, and should not or store into the dentry or inodes.
- Should not dereference pointers outside the dentry or inodes without
+ possible, and should not or store into the dentry.
+ Should not dereference pointers outside the dentry without
lots of care (eg. d_parent, d_inode, d_name should not be used).
However, our vfsmount is pinned, and RCU held, so the dentries and
inodes won't disappear, neither will our sb or filesystem module.
- ->i_sb and ->d_sb may be used.
+ ->d_sb may be used.
It is a tricky calling convention because it needs to be called under
"rcu-walk", ie. without any locks or references on things.
@@ -993,7 +1052,7 @@ struct dentry_operations {
If the 'rcu_walk' parameter is true, then the caller is doing a
pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
- and the caller can be asked to leave it and call again by returing
+ and the caller can be asked to leave it and call again by returning
-ECHILD.
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt
new file mode 100644
index 00000000000..05aa455163e
--- /dev/null
+++ b/Documentation/filesystems/xfs-self-describing-metadata.txt
@@ -0,0 +1,350 @@
+XFS Self Describing Metadata
+----------------------------
+
+Introduction
+------------
+
+The largest scalability problem facing XFS is not one of algorithmic
+scalability, but of verification of the filesystem structure. Scalabilty of the
+structures and indexes on disk and the algorithms for iterating them are
+adequate for supporting PB scale filesystems with billions of inodes, however it
+is this very scalability that causes the verification problem.
+
+Almost all metadata on XFS is dynamically allocated. The only fixed location
+metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all
+other metadata structures need to be discovered by walking the filesystem
+structure in different ways. While this is already done by userspace tools for
+validating and repairing the structure, there are limits to what they can
+verify, and this in turn limits the supportable size of an XFS filesystem.
+
+For example, it is entirely possible to manually use xfs_db and a bit of
+scripting to analyse the structure of a 100TB filesystem when trying to
+determine the root cause of a corruption problem, but it is still mainly a
+manual task of verifying that things like single bit errors or misplaced writes
+weren't the ultimate cause of a corruption event. It may take a few hours to a
+few days to perform such forensic analysis, so for at this scale root cause
+analysis is entirely possible.
+
+However, if we scale the filesystem up to 1PB, we now have 10x as much metadata
+to analyse and so that analysis blows out towards weeks/months of forensic work.
+Most of the analysis work is slow and tedious, so as the amount of analysis goes
+up, the more likely that the cause will be lost in the noise. Hence the primary
+concern for supporting PB scale filesystems is minimising the time and effort
+required for basic forensic analysis of the filesystem structure.
+
+
+Self Describing Metadata
+------------------------
+
+One of the problems with the current metadata format is that apart from the
+magic number in the metadata block, we have no other way of identifying what it
+is supposed to be. We can't even identify if it is the right place. Put simply,
+you can't look at a single metadata block in isolation and say "yes, it is
+supposed to be there and the contents are valid".
+
+Hence most of the time spent on forensic analysis is spent doing basic
+verification of metadata values, looking for values that are in range (and hence
+not detected by automated verification checks) but are not correct. Finding and
+understanding how things like cross linked block lists (e.g. sibling
+pointers in a btree end up with loops in them) are the key to understanding what
+went wrong, but it is impossible to tell what order the blocks were linked into
+each other or written to disk after the fact.
+
+Hence we need to record more information into the metadata to allow us to
+quickly determine if the metadata is intact and can be ignored for the purpose
+of analysis. We can't protect against every possible type of error, but we can
+ensure that common types of errors are easily detectable. Hence the concept of
+self describing metadata.
+
+The first, fundamental requirement of self describing metadata is that the
+metadata object contains some form of unique identifier in a well known
+location. This allows us to identify the expected contents of the block and
+hence parse and verify the metadata object. IF we can't independently identify
+the type of metadata in the object, then the metadata doesn't describe itself
+very well at all!
+
+Luckily, almost all XFS metadata has magic numbers embedded already - only the
+AGFL, remote symlinks and remote attribute blocks do not contain identifying
+magic numbers. Hence we can change the on-disk format of all these objects to
+add more identifying information and detect this simply by changing the magic
+numbers in the metadata objects. That is, if it has the current magic number,
+the metadata isn't self identifying. If it contains a new magic number, it is
+self identifying and we can do much more expansive automated verification of the
+metadata object at runtime, during forensic analysis or repair.
+
+As a primary concern, self describing metadata needs some form of overall
+integrity checking. We cannot trust the metadata if we cannot verify that it has
+not been changed as a result of external influences. Hence we need some form of
+integrity check, and this is done by adding CRC32c validation to the metadata
+block. If we can verify the block contains the metadata it was intended to
+contain, a large amount of the manual verification work can be skipped.
+
+CRC32c was selected as metadata cannot be more than 64k in length in XFS and
+hence a 32 bit CRC is more than sufficient to detect multi-bit errors in
+metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is
+fast. So while CRC32c is not the strongest of possible integrity checks that
+could be used, it is more than sufficient for our needs and has relatively
+little overhead. Adding support for larger integrity fields and/or algorithms
+does really provide any extra value over CRC32c, but it does add a lot of
+complexity and so there is no provision for changing the integrity checking
+mechanism.
+
+Self describing metadata needs to contain enough information so that the
+metadata block can be verified as being in the correct place without needing to
+look at any other metadata. This means it needs to contain location information.
+Just adding a block number to the metadata is not sufficient to protect against
+mis-directed writes - a write might be misdirected to the wrong LUN and so be
+written to the "correct block" of the wrong filesystem. Hence location
+information must contain a filesystem identifier as well as a block number.
+
+Another key information point in forensic analysis is knowing who the metadata
+block belongs to. We already know the type, the location, that it is valid
+and/or corrupted, and how long ago that it was last modified. Knowing the owner
+of the block is important as it allows us to find other related metadata to
+determine the scope of the corruption. For example, if we have a extent btree
+object, we don't know what inode it belongs to and hence have to walk the entire
+filesystem to find the owner of the block. Worse, the corruption could mean that
+no owner can be found (i.e. it's an orphan block), and so without an owner field
+in the metadata we have no idea of the scope of the corruption. If we have an
+owner field in the metadata object, we can immediately do top down validation to
+determine the scope of the problem.
+
+Different types of metadata have different owner identifiers. For example,
+directory, attribute and extent tree blocks are all owned by an inode, whilst
+freespace btree blocks are owned by an allocation group. Hence the size and
+contents of the owner field are determined by the type of metadata object we are
+looking at. The owner information can also identify misplaced writes (e.g.
+freespace btree block written to the wrong AG).
+
+Self describing metadata also needs to contain some indication of when it was
+written to the filesystem. One of the key information points when doing forensic
+analysis is how recently the block was modified. Correlation of set of corrupted
+metadata blocks based on modification times is important as it can indicate
+whether the corruptions are related, whether there's been multiple corruption
+events that lead to the eventual failure, and even whether there are corruptions
+present that the run-time verification is not detecting.
+
+For example, we can determine whether a metadata object is supposed to be free
+space or still allocated if it is still referenced by its owner by looking at
+when the free space btree block that contains the block was last written
+compared to when the metadata object itself was last written. If the free space
+block is more recent than the object and the object's owner, then there is a
+very good chance that the block should have been removed from the owner.
+
+To provide this "written timestamp", each metadata block gets the Log Sequence
+Number (LSN) of the most recent transaction it was modified on written into it.
+This number will always increase over the life of the filesystem, and the only
+thing that resets it is running xfs_repair on the filesystem. Further, by use of
+the LSN we can tell if the corrupted metadata all belonged to the same log
+checkpoint and hence have some idea of how much modification occurred between
+the first and last instance of corrupt metadata on disk and, further, how much
+modification occurred between the corruption being written and when it was
+detected.
+
+Runtime Validation
+------------------
+
+Validation of self-describing metadata takes place at runtime in two places:
+
+ - immediately after a successful read from disk
+ - immediately prior to write IO submission
+
+The verification is completely stateless - it is done independently of the
+modification process, and seeks only to check that the metadata is what it says
+it is and that the metadata fields are within bounds and internally consistent.
+As such, we cannot catch all types of corruption that can occur within a block
+as there may be certain limitations that operational state enforces of the
+metadata, or there may be corruption of interblock relationships (e.g. corrupted
+sibling pointer lists). Hence we still need stateful checking in the main code
+body, but in general most of the per-field validation is handled by the
+verifiers.
+
+For read verification, the caller needs to specify the expected type of metadata
+that it should see, and the IO completion process verifies that the metadata
+object matches what was expected. If the verification process fails, then it
+marks the object being read as EFSCORRUPTED. The caller needs to catch this
+error (same as for IO errors), and if it needs to take special action due to a
+verification error it can do so by catching the EFSCORRUPTED error value. If we
+need more discrimination of error type at higher levels, we can define new
+error numbers for different errors as necessary.
+
+The first step in read verification is checking the magic number and determining
+whether CRC validating is necessary. If it is, the CRC32c is calculated and
+compared against the value stored in the object itself. Once this is validated,
+further checks are made against the location information, followed by extensive
+object specific metadata validation. If any of these checks fail, then the
+buffer is considered corrupt and the EFSCORRUPTED error is set appropriately.
+
+Write verification is the opposite of the read verification - first the object
+is extensively verified and if it is OK we then update the LSN from the last
+modification made to the object, After this, we calculate the CRC and insert it
+into the object. Once this is done the write IO is allowed to continue. If any
+error occurs during this process, the buffer is again marked with a EFSCORRUPTED
+error for the higher layers to catch.
+
+Structures
+----------
+
+A typical on-disk structure needs to contain the following information:
+
+struct xfs_ondisk_hdr {
+ __be32 magic; /* magic number */
+ __be32 crc; /* CRC, not logged */
+ uuid_t uuid; /* filesystem identifier */
+ __be64 owner; /* parent object */
+ __be64 blkno; /* location on disk */
+ __be64 lsn; /* last modification in log, not logged */
+};
+
+Depending on the metadata, this information may be part of a header structure
+separate to the metadata contents, or may be distributed through an existing
+structure. The latter occurs with metadata that already contains some of this
+information, such as the superblock and AG headers.
+
+Other metadata may have different formats for the information, but the same
+level of information is generally provided. For example:
+
+ - short btree blocks have a 32 bit owner (ag number) and a 32 bit block
+ number for location. The two of these combined provide the same
+ information as @owner and @blkno in eh above structure, but using 8
+ bytes less space on disk.
+
+ - directory/attribute node blocks have a 16 bit magic number, and the
+ header that contains the magic number has other information in it as
+ well. hence the additional metadata headers change the overall format
+ of the metadata.
+
+A typical buffer read verifier is structured as follows:
+
+#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc)
+
+static void
+xfs_foo_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ XFS_FOO_CRC_OFF)) ||
+ !xfs_foo_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+The code ensures that the CRC is only checked if the filesystem has CRCs enabled
+by checking the superblock of the feature bit, and then if the CRC verifies OK
+(or is not needed) it verifies the actual contents of the block.
+
+The verifier function will take a couple of different forms, depending on
+whether the magic number can be used to determine the format of the block. In
+the case it can't, the code is structured as follows:
+
+static bool
+xfs_foo_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+
+ if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
+ return false;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (bp->b_bn != be64_to_cpu(hdr->blkno))
+ return false;
+ if (hdr->owner == 0)
+ return false;
+ }
+
+ /* object specific verification checks here */
+
+ return true;
+}
+
+If there are different magic numbers for the different formats, the verifier
+will look like:
+
+static bool
+xfs_foo_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+
+ if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
+ if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (bp->b_bn != be64_to_cpu(hdr->blkno))
+ return false;
+ if (hdr->owner == 0)
+ return false;
+ } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
+ return false;
+
+ /* object specific verification checks here */
+
+ return true;
+}
+
+Write verifiers are very similar to the read verifiers, they just do things in
+the opposite order to the read verifiers. A typical write verifier:
+
+static void
+xfs_foo_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_foo_verify(bp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+
+ if (bip) {
+ struct xfs_ondisk_hdr *hdr = bp->b_addr;
+ hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ }
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF);
+}
+
+This will verify the internal structure of the metadata before we go any
+further, detecting corruptions that have occurred as the metadata has been
+modified in memory. If the metadata verifies OK, and CRCs are enabled, we then
+update the LSN field (when it was last modified) and calculate the CRC on the
+metadata. Once this is done, we can issue the IO.
+
+Inodes and Dquots
+-----------------
+
+Inodes and dquots are special snowflakes. They have per-object CRC and
+self-identifiers, but they are packed so that there are multiple objects per
+buffer. Hence we do not use per-buffer verifiers to do the work of per-object
+verification and CRC calculations. The per-buffer verifiers simply perform basic
+identification of the buffer - that they contain inodes or dquots, and that
+there are magic numbers in all the expected spots. All further CRC and
+verification checks are done when each inode is read from or written back to the
+buffer.
+
+The structure of the verifiers and the identifiers checks is very similar to the
+buffer code described above. The only difference is where they are called. For
+example, inode read verification is done in xfs_iread() when the inode is first
+read out of the buffer and the struct xfs_inode is instantiated. The inode is
+already extensively verified during writeback in xfs_iflush_int, so the only
+addition here is to add the LSN and CRC to the inode as it is copied back into
+the buffer.
+
+XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of
+the unlinked list modifications check or update CRCs, neither during unlink nor
+log recovery. So, it's gone unnoticed until now. This won't matter immediately -
+repair will probably complain about it - but it needs to be fixed.
+
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5..5be51fd888b 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -18,6 +18,8 @@ Mount Options
=============
When mounting an XFS filesystem, the following options are accepted.
+For boolean mount options, the names with the (*) suffix is the
+default behaviour.
allocsize=size
Sets the buffered I/O end-of-file preallocation size when
@@ -25,87 +27,128 @@ When mounting an XFS filesystem, the following options are accepted.
Valid values for this option are page size (typically 4KiB)
through to 1GiB, inclusive, in power-of-2 increments.
- attr2/noattr2
- The options enable/disable (default is disabled for backward
- compatibility on-disk) an "opportunistic" improvement to be
- made in the way inline extended attributes are stored on-disk.
- When the new form is used for the first time (by setting or
- removing extended attributes) the on-disk superblock feature
- bit field will be updated to reflect this format being in use.
-
- barrier
- Enables the use of block layer write barriers for writes into
- the journal and unwritten extent conversion. This allows for
- drive level write caching to be enabled, for devices that
- support write barriers.
+ The default behaviour is for dynamic end-of-file
+ preallocation size, which uses a set of heuristics to
+ optimise the preallocation size based on the current
+ allocation patterns within the file and the access patterns
+ to the file. Specifying a fixed allocsize value turns off
+ the dynamic behaviour.
+
+ attr2
+ noattr2
+ The options enable/disable an "opportunistic" improvement to
+ be made in the way inline extended attributes are stored
+ on-disk. When the new form is used for the first time when
+ attr2 is selected (either when setting or removing extended
+ attributes) the on-disk superblock feature bit field will be
+ updated to reflect this format being in use.
+
+ The default behaviour is determined by the on-disk feature
+ bit indicating that attr2 behaviour is active. If either
+ mount option it set, then that becomes the new default used
+ by the filesystem.
+
+ CRC enabled filesystems always use the attr2 format, and so
+ will reject the noattr2 mount option if it is set.
+
+ barrier (*)
+ nobarrier
+ Enables/disables the use of block layer write barriers for
+ writes into the journal and for data integrity operations.
+ This allows for drive level write caching to be enabled, for
+ devices that support write barriers.
discard
- Issue command to let the block device reclaim space freed by the
- filesystem. This is useful for SSD devices, thinly provisioned
- LUNs and virtual machine images, but may have a performance
- impact. This option is incompatible with the nodelaylog option.
-
- dmapi
- Enable the DMAPI (Data Management API) event callouts.
- Use with the "mtpt" option.
-
- grpid/bsdgroups and nogrpid/sysvgroups
- These options define what group ID a newly created file gets.
- When grpid is set, it takes the group ID of the directory in
- which it is created; otherwise (the default) it takes the fsgid
- of the current process, unless the directory has the setgid bit
- set, in which case it takes the gid from the parent directory,
- and also gets the setgid bit set if it is a directory itself.
-
- ihashsize=value
- In memory inode hashes have been removed, so this option has
- no function as of August 2007. Option is deprecated.
-
- ikeep/noikeep
- When ikeep is specified, XFS does not delete empty inode clusters
- and keeps them around on disk. ikeep is the traditional XFS
- behaviour. When noikeep is specified, empty inode clusters
- are returned to the free space pool. The default is noikeep for
- non-DMAPI mounts, while ikeep is the default when DMAPI is in use.
-
- inode64
- Indicates that XFS is allowed to create inodes at any location
- in the filesystem, including those which will result in inode
- numbers occupying more than 32 bits of significance. This is
- provided for backwards compatibility, but causes problems for
- backup applications that cannot handle large inode numbers.
-
- largeio/nolargeio
+ nodiscard (*)
+ Enable/disable the issuing of commands to let the block
+ device reclaim space freed by the filesystem. This is
+ useful for SSD devices, thinly provisioned LUNs and virtual
+ machine images, but may have a performance impact.
+
+ Note: It is currently recommended that you use the fstrim
+ application to discard unused blocks rather than the discard
+ mount option because the performance impact of this option
+ is quite severe.
+
+ grpid/bsdgroups
+ nogrpid/sysvgroups (*)
+ These options define what group ID a newly created file
+ gets. When grpid is set, it takes the group ID of the
+ directory in which it is created; otherwise it takes the
+ fsgid of the current process, unless the directory has the
+ setgid bit set, in which case it takes the gid from the
+ parent directory, and also gets the setgid bit set if it is
+ a directory itself.
+
+ filestreams
+ Make the data allocator use the filestreams allocation mode
+ across the entire filesystem rather than just on directories
+ configured to use it.
+
+ ikeep
+ noikeep (*)
+ When ikeep is specified, XFS does not delete empty inode
+ clusters and keeps them around on disk. When noikeep is
+ specified, empty inode clusters are returned to the free
+ space pool.
+
+ inode32
+ inode64 (*)
+ When inode32 is specified, it indicates that XFS limits
+ inode creation to locations which will not result in inode
+ numbers with more than 32 bits of significance.
+
+ When inode64 is specified, it indicates that XFS is allowed
+ to create inodes at any location in the filesystem,
+ including those which will result in inode numbers occupying
+ more than 32 bits of significance.
+
+ inode32 is provided for backwards compatibility with older
+ systems and applications, since 64 bits inode numbers might
+ cause problems for some applications that cannot handle
+ large inode numbers. If applications are in use which do
+ not handle inode numbers bigger than 32 bits, the inode32
+ option should be specified.
+
+
+ largeio
+ nolargeio (*)
If "nolargeio" is specified, the optimal I/O reported in
- st_blksize by stat(2) will be as small as possible to allow user
- applications to avoid inefficient read/modify/write I/O.
- If "largeio" specified, a filesystem that has a "swidth" specified
- will return the "swidth" value (in bytes) in st_blksize. If the
- filesystem does not have a "swidth" specified but does specify
- an "allocsize" then "allocsize" (in bytes) will be returned
- instead.
- If neither of these two options are specified, then filesystem
- will behave as if "nolargeio" was specified.
+ st_blksize by stat(2) will be as small as possible to allow
+ user applications to avoid inefficient read/modify/write
+ I/O. This is typically the page size of the machine, as
+ this is the granularity of the page cache.
+
+ If "largeio" specified, a filesystem that was created with a
+ "swidth" specified will return the "swidth" value (in bytes)
+ in st_blksize. If the filesystem does not have a "swidth"
+ specified but does specify an "allocsize" then "allocsize"
+ (in bytes) will be returned instead. Otherwise the behaviour
+ is the same as if "nolargeio" was specified.
logbufs=value
- Set the number of in-memory log buffers. Valid numbers range
- from 2-8 inclusive.
- The default value is 8 buffers for filesystems with a
- blocksize of 64KiB, 4 buffers for filesystems with a blocksize
- of 32KiB, 3 buffers for filesystems with a blocksize of 16KiB
- and 2 buffers for all other configurations. Increasing the
- number of buffers may increase performance on some workloads
- at the cost of the memory used for the additional log buffers
- and their associated control structures.
+ Set the number of in-memory log buffers. Valid numbers
+ range from 2-8 inclusive.
+
+ The default value is 8 buffers.
+
+ If the memory cost of 8 log buffers is too high on small
+ systems, then it may be reduced at some cost to performance
+ on metadata intensive workloads. The logbsize option below
+ controls the size of each buffer and so is also relevant to
+ this case.
logbsize=value
- Set the size of each in-memory log buffer.
- Size may be specified in bytes, or in kilobytes with a "k" suffix.
- Valid sizes for version 1 and version 2 logs are 16384 (16k) and
- 32768 (32k). Valid sizes for version 2 logs also include
- 65536 (64k), 131072 (128k) and 262144 (256k).
- The default value for machines with more than 32MiB of memory
- is 32768, machines with less memory use 16384 by default.
+ Set the size of each in-memory log buffer. The size may be
+ specified in bytes, or in kilobytes with a "k" suffix.
+ Valid sizes for version 1 and version 2 logs are 16384 (16k)
+ and 32768 (32k). Valid sizes for version 2 logs also
+ include 65536 (64k), 131072 (128k) and 262144 (256k). The
+ logbsize must be an integer multiple of the log
+ stripe unit configured at mkfs time.
+
+ The default value for for version 1 logs is 32768, while the
+ default value for version 2 logs is MAX(32768, log_sunit).
logdev=device and rtdev=device
Use an external log (metadata journal) and/or real-time device.
@@ -114,16 +157,11 @@ When mounting an XFS filesystem, the following options are accepted.
optional, and the log section can be separate from the data
section or contained within it.
- mtpt=mountpoint
- Use with the "dmapi" option. The value specified here will be
- included in the DMAPI mount event, and should be the path of
- the actual mountpoint that is used.
-
noalign
- Data allocations will not be aligned at stripe unit boundaries.
-
- noatime
- Access timestamps are not updated when a file is read.
+ Data allocations will not be aligned at stripe unit
+ boundaries. This is only relevant to filesystems created
+ with non-zero data alignment parameters (sunit, swidth) by
+ mkfs.
norecovery
The filesystem will be mounted without running log recovery.
@@ -134,8 +172,14 @@ When mounting an XFS filesystem, the following options are accepted.
the mount will fail.
nouuid
- Don't check for double mounted file systems using the file system uuid.
- This is useful to mount LVM snapshot volumes.
+ Don't check for double mounted file systems using the file
+ system uuid. This is useful to mount LVM snapshot volumes,
+ and often used in combination with "norecovery" for mounting
+ read-only snapshots.
+
+ noquota
+ Forcibly turns off all quota accounting and enforcement
+ within the filesystem.
uquota/usrquota/uqnoenforce/quota
User disk quota accounting enabled, and limits (optionally)
@@ -150,24 +194,64 @@ When mounting an XFS filesystem, the following options are accepted.
enforced. Refer to xfs_quota(8) for further details.
sunit=value and swidth=value
- Used to specify the stripe unit and width for a RAID device or
- a stripe volume. "value" must be specified in 512-byte block
- units.
- If this option is not specified and the filesystem was made on
- a stripe volume or the stripe width or unit were specified for
- the RAID device at mkfs time, then the mount system call will
- restore the value from the superblock. For filesystems that
- are made directly on RAID devices, these options can be used
- to override the information in the superblock if the underlying
- disk layout changes after the filesystem has been created.
- The "swidth" option is required if the "sunit" option has been
- specified, and must be a multiple of the "sunit" value.
+ Used to specify the stripe unit and width for a RAID device
+ or a stripe volume. "value" must be specified in 512-byte
+ block units. These options are only relevant to filesystems
+ that were created with non-zero data alignment parameters.
+
+ The sunit and swidth parameters specified must be compatible
+ with the existing filesystem alignment characteristics. In
+ general, that means the only valid changes to sunit are
+ increasing it by a power-of-2 multiple. Valid swidth values
+ are any integer multiple of a valid sunit value.
+
+ Typically the only time these mount options are necessary if
+ after an underlying RAID device has had it's geometry
+ modified, such as adding a new disk to a RAID5 lun and
+ reshaping it.
swalloc
Data allocations will be rounded up to stripe width boundaries
when the current end of file is being extended and the file
size is larger than the stripe width size.
+ wsync
+ When specified, all filesystem namespace operations are
+ executed synchronously. This ensures that when the namespace
+ operation (create, unlink, etc) completes, the change to the
+ namespace is on stable storage. This is useful in HA setups
+ where failover must not result in clients seeing
+ inconsistent namespace presentation during or after a
+ failover event.
+
+
+Deprecated Mount Options
+========================
+
+ delaylog/nodelaylog
+ Delayed logging is the only logging method that XFS supports
+ now, so these mount options are now ignored.
+
+ Due for removal in 3.12.
+
+ ihashsize=value
+ In memory inode hashes have been removed, so this option has
+ no function as of August 2007. Option is deprecated.
+
+ Due for removal in 3.12.
+
+ irixsgid
+ This behaviour is now controlled by a sysctl, so the mount
+ option is ignored.
+
+ Due for removal in 3.12.
+
+ osyncisdsync
+ osyncisosync
+ O_SYNC and O_DSYNC are fully supported, so there is no need
+ for these options any more.
+
+ Due for removal in 3.12.
sysctls
=======
@@ -179,15 +263,20 @@ The following sysctls are available for the XFS filesystem:
in /proc/fs/xfs/stat. It then immediately resets to "0".
fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
- The interval at which the xfssyncd thread flushes metadata
- out to disk. This thread will flush log activity out, and
- do some processing on unlinked inodes.
+ The interval at which the filesystem flushes metadata
+ out to disk and runs internal cache cleanup routines.
- fs.xfs.xfsbufd_centisecs (Min: 50 Default: 100 Max: 3000)
- The interval at which xfsbufd scans the dirty metadata buffers list.
+ fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000)
+ The interval at which the filesystem ages filestreams cache
+ references and returns timed-out AGs back to the free stream
+ pool.
- fs.xfs.age_buffer_centisecs (Min: 100 Default: 1500 Max: 720000)
- The age at which xfsbufd flushes dirty metadata buffers to disk.
+ fs.xfs.speculative_prealloc_lifetime
+ (Units: seconds Min: 1 Default: 300 Max: 86400)
+ The interval at which the background scanning for inodes
+ with unused speculative preallocation runs. The scan
+ removes unused preallocation from clean inodes and releases
+ the unused space back to the free pool.
fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
A volume knob for error reporting when internal errors occur.
@@ -244,9 +333,31 @@ The following sysctls are available for the XFS filesystem:
by the xfs_io(8) chattr command on a directory to be
inherited by files in that directory.
+ fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nodefrag" flag set
+ by the xfs_io(8) chattr command on a directory to be
+ inherited by files in that directory.
+
fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
In "inode32" allocation mode, this option determines how many
files the allocator attempts to allocate in the same allocation
group before moving to the next allocation group. The intent
is to control the rate at which the allocator moves between
allocation groups when allocating extents for new files.
+
+Deprecated Sysctls
+==================
+
+ fs.xfs.xfsbufd_centisecs (Min: 50 Default: 100 Max: 3000)
+ Dirty metadata is now tracked by the log subsystem and
+ flushing is driven by log space and idling demands. The
+ xfsbufd no longer exists, so this syctl does nothing.
+
+ Due for removal in 3.14.
+
+ fs.xfs.age_buffer_centisecs (Min: 100 Default: 1500 Max: 720000)
+ Dirty metadata is now tracked by the log subsystem and
+ flushing is driven by log space and idling demands. The
+ xfsbufd no longer exists, so this syctl does nothing.
+
+ Due for removal in 3.14.