diff options
Diffstat (limited to 'Documentation/filesystems/Locking')
| -rw-r--r-- | Documentation/filesystems/Locking | 338 |
1 files changed, 182 insertions, 156 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b6426f15b4a..b18dd177902 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -9,51 +9,67 @@ be able to use diff(1). --------------------------- dentry_operations -------------------------- prototypes: - int (*d_revalidate)(struct dentry *, int); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); + int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_weak_revalidate)(struct dentry *, unsigned int); + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct dentry *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); + struct vfsmount *(*d_automount)(struct path *path); + int (*d_manage)(struct dentry *, bool); locking rules: - none have BKL - dcache_lock rename_lock ->d_lock may block -d_revalidate: no no no yes -d_hash no no no yes -d_compare: no yes no no -d_delete: yes no yes no -d_release: no no no yes -d_iput: no no no yes + rename_lock ->d_lock may block rcu-walk +d_revalidate: no no yes (ref-walk) maybe +d_weak_revalidate:no no yes no +d_hash no no no maybe +d_compare: yes no no maybe +d_delete: no yes no no +d_release: no no yes no +d_prune: no yes no no +d_iput: no no yes no d_dname: no no no no +d_automount: no no yes no +d_manage: no no yes (ref-walk) maybe --------------------------- inode_operations --------------------------- prototypes: - int (*create) (struct inode *,struct dentry *,int, struct nameidata *); - struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid -ata *); + int (*create) (struct inode *,struct dentry *,umode_t, bool); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); + int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); + int (*rename2) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - int (*follow_link) (struct dentry *, struct nameidata *); + void * (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); + int (*permission) (struct inode *, int, unsigned int); + int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); + void (*update_time)(struct inode *, struct timespec *, int); + int (*atomic_open)(struct inode *, struct dentry *, + struct file *, unsigned open_flag, + umode_t create_mode, int *opened); + int (*tmpfile) (struct inode *, struct dentry *, umode_t); locking rules: - all may block, none have BKL + all may block i_mutex(inode) lookup: yes create: yes @@ -64,24 +80,27 @@ mkdir: yes unlink: yes (both) rmdir: yes (both) (see below) rename: yes (all) (see below) +rename2: yes (all) (see below) readlink: no follow_link: no -truncate: yes (see below) +put_link: no setattr: yes -permission: no +permission: no (may not block if called in rcu-walk mode) +get_acl: no getattr: no setxattr: yes getxattr: no listxattr: no removexattr: yes +fiemap: no +update_time: no +atomic_open: yes +tmpfile: no + Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. - cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. - ->truncate() is never called directly - it's a callback, not a -method. It's called by vmtruncate() - library function normally used by -->setattr(). Locking information above applies to that call (i.e. is -inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been -passed). + cross-directory ->rename() and rename2() has (per-superblock) +->s_vfs_rename_sem. See Documentation/filesystems/directory-locking for more detailed discussion of the locking scheme for directory operations. @@ -90,43 +109,42 @@ of the locking scheme for directory operations. prototypes: struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); - void (*dirty_inode) (struct inode *); - int (*write_inode) (struct inode *, int); + void (*dirty_inode) (struct inode *, int flags); + int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); - void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_fs) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); - int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_options)(struct seq_file *, struct dentry *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); locking rules: All may block [not true, see below] - None have BKL s_umount alloc_inode: destroy_inode: -dirty_inode: (must not sleep) +dirty_inode: write_inode: -drop_inode: !!!inode_lock!!! +drop_inode: !!!inode->i_lock!!! evict_inode: put_super: write -write_super: read sync_fs: read -freeze_fs: read -unfreeze_fs: read +freeze_fs: write +unfreeze_fs: write statfs: maybe(read) (see below) remount_fs: write umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) quota_write: no (see below) +bdev_try_to_free_page: no (see below) ->statfs() has s_umount (shared) when called by ustat(2) (native or compat), but that's an accident of bad API; s_umount is used to pin @@ -139,19 +157,23 @@ be the only ones operating on the quota file by the quota code (via dqio_sem) (unless an admin really wants to screw up something and writes to quota files with quotas on). For other details about locking see also dquot_operations section. +->bdev_try_to_free_page is called from the ->releasepage handler of +the block device inode. See there for more details. --------------------------- file_system_type --------------------------- prototypes: int (*get_sb) (struct file_system_type *, int, const char *, void *, struct vfsmount *); + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); void (*kill_sb) (struct super_block *); locking rules: - may block BKL -get_sb yes no -kill_sb yes no + may block +mount yes +kill_sb yes -->get_sb() returns error or 0 with locked superblock attached to the vfsmount -(exclusive on ->s_umount). +->mount() returns ERR_PTR or the root dentry; its superblock should be locked +on return. ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. @@ -171,32 +193,43 @@ prototypes: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); - int (*invalidatepage) (struct page *, unsigned long); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); - int (*launder_page) (struct page *); + int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset); + int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, + unsigned long *); + int (*migratepage)(struct address_space *, struct page *, struct page *); + int (*launder_page)(struct page *); + int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); + int (*error_remove_page)(struct address_space *, struct page *); + int (*swap_activate)(struct file *); + int (*swap_deactivate)(struct file *); locking rules: All except set_page_dirty and freepage may block - BKL PageLocked(page) i_mutex -writepage: no yes, unlocks (see below) -readpage: no yes, unlocks -sync_page: no maybe -writepages: no -set_page_dirty no no -readpages: no -write_begin: no locks the page yes -write_end: no yes, unlocks yes -perform_write: no n/a yes -bmap: no -invalidatepage: no yes -releasepage: no yes -freepage: no yes -direct_IO: no -launder_page: no yes + PageLocked(page) i_mutex +writepage: yes, unlocks (see below) +readpage: yes, unlocks +sync_page: maybe +writepages: +set_page_dirty no +readpages: +write_begin: locks the page yes +write_end: yes, unlocks yes +bmap: +invalidatepage: yes +releasepage: yes +freepage: yes +direct_IO: +get_xip_mem: maybe +migratepage: yes (both) +launder_page: yes +is_partially_uptodate: yes +error_remove_page: yes +swap_activate: no +swap_deactivate: no ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -276,13 +309,12 @@ under spinlock (it cannot block) and is sometimes called with the page not locked. ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some -filesystems and by the swapper. The latter will eventually go away. All -instances do not actually need the BKL. Please, keep it that way and don't -breed new callers. +filesystems and by the swapper. The latter will eventually go away. Please, +keep it that way and don't breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop -some or all of the buffers from the page when it is being truncated. It -returns zero on success. If ->invalidatepage is zero, the kernel uses +some or all of the buffers from the page when it is being truncated. It +returns zero on success. If ->invalidatepage is zero, the kernel uses block_invalidatepage() instead. ->releasepage() is called when the kernel is about to try to drop the @@ -299,47 +331,55 @@ cleaned, or an error value if not. Note that in order to prevent the page getting mapped back in and redirtied, it needs to be kept locked across the entire operation. - Note: currently almost all instances of address_space methods are -using BKL for internal serialization and that's one of the worst sources -of contention. Normally they are calling library functions (in fs/buffer.c) -and pass foo_get_block() as a callback (on local block-based filesystems, -indeed). BKL is not needed for library stuff and is usually taken by -foo_get_block(). It's an overkill, since block bitmaps can be protected by -internal fs locking and real critical areas are much smaller than the areas -filesystems protect now. + ->swap_activate will be called with a non-zero argument on +files backing (non block device backed) swapfiles. A return value +of zero indicates success, in which case this file can be used for +backing swapspace. The swapspace operations will be proxied to the +address space operations. + + ->swap_deactivate() will be called in the sys_swapoff() +path after ->swap_activate() returned success. ----------------------- file_lock_operations ------------------------------ prototypes: - void (*fl_insert)(struct file_lock *); /* lock insertion callback */ - void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); locking rules: - BKL may block -fl_insert: yes no -fl_remove: yes no -fl_copy_lock: yes no -fl_release_private: yes yes + inode->i_lock may block +fl_copy_lock: yes no +fl_release_private: maybe no ----------------------- lock_manager_operations --------------------------- prototypes: - int (*fl_compare_owner)(struct file_lock *, struct file_lock *); - void (*fl_notify)(struct file_lock *); /* unblock callback */ - void (*fl_release_private)(struct file_lock *); - void (*fl_break)(struct file_lock *); /* break_lease callback */ + int (*lm_compare_owner)(struct file_lock *, struct file_lock *); + unsigned long (*lm_owner_key)(struct file_lock *); + void (*lm_notify)(struct file_lock *); /* unblock callback */ + int (*lm_grant)(struct file_lock *, struct file_lock *, int); + void (*lm_break)(struct file_lock *); /* break_lease callback */ + int (*lm_change)(struct file_lock **, int); locking rules: - BKL may block -fl_compare_owner: yes no -fl_notify: yes no -fl_release_private: yes yes -fl_break: yes no - - Currently only NFSD and NLM provide instances of this class. None of the -them block. If you have out-of-tree instances - please, show up. Locking -in that area will change. + + inode->i_lock blocked_lock_lock may block +lm_compare_owner: yes[1] maybe no +lm_owner_key yes[1] yes no +lm_notify: yes yes no +lm_grant: no no no +lm_break: yes no no +lm_change yes no no + +[1]: ->lm_compare_owner and ->lm_owner_key are generally called with +*an* inode->i_lock held. It may not be the i_lock of the inode +associated with either file_lock argument! This is the case with deadlock +detection, since the code has to chase down the owners of locks that may +be entirely unrelated to the one on which the lock is being acquired. +For deadlock detection however, the blocked_lock_lock is also held. The +fact that these locks are held ensures that the file_locks do not +disappear out from under you while doing the comparison or generating an +owner key. + --------------------------- buffer_head ----------------------------------- prototypes: void (*b_end_io)(struct buffer_head *bh, int uptodate); @@ -364,17 +404,17 @@ prototypes: void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_mutex -open: no yes -release: no yes -ioctl: no no -compat_ioctl: no no -direct_access: no no -media_changed: no no -unlock_native_capacity: no no -revalidate_disk: no no -getgeo: no no -swap_slot_free_notify: no no (see below) + bd_mutex +open: yes +release: yes +ioctl: no +compat_ioctl: no +direct_access: no +media_changed: no +unlock_native_capacity: no +revalidate_disk: no +getgeo: no +swap_slot_free_notify: no (see below) media_changed, unlock_native_capacity and revalidate_disk are called only from check_disk_change(). @@ -390,7 +430,9 @@ prototypes: ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - int (*readdir) (struct file *, void *, filldir_t); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); + int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); @@ -398,7 +440,7 @@ prototypes: int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -413,34 +455,20 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, + size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **); + long (*fallocate)(struct file *, int, loff_t, loff_t); }; locking rules: - All may block. - BKL -llseek: no (see below) -read: no -aio_read: no -write: no -aio_write: no -readdir: no -poll: no -unlocked_ioctl: no -compat_ioctl: no -mmap: no -open: no -flush: no -release: no -fsync: no (see below) -aio_fsync: no -fasync: no -lock: yes -readv: no -writev: no -sendfile: no -sendpage: no -get_unmapped_area: no -check_flags: no + All may block except for ->setlease. + No VFS locks held on entry except for ->setlease. + +->setlease has the file_list_lock held and must not sleep. ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -450,17 +478,10 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -Note: ext2_release() was *the* source of contention on fs-intensive -loads and dropping BKL on ->release() helps to get rid of that (we still -grab BKL for cases when we close a file that had been opened r/w, but that -can and should be done using the internal locking with smaller critical areas). -Current worst offender is ext2_get_block()... - -->fasync() is called without BKL protection, and is responsible for -maintaining the FASYNC bit in filp->f_flags. Most instances call -fasync_helper(), which does that maintenance, so it's not normally -something one needs to worry about. Return values > 0 will be mapped to -zero in the VFS layer. +->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. +Most instances call fasync_helper(), which does that maintenance, so it's +not normally something one needs to worry about. Return values > 0 will be +mapped to zero in the VFS layer. ->readdir() and ->ioctl() on directories must be changed. Ideally we would move ->readdir() to inode_operations and use a separate method for directory @@ -471,8 +492,6 @@ components. And there are other reasons why the current interface is a mess... ->read on directories probably must go away - we should just enforce -EISDIR in sys_read() and friends. -->fsync() has i_mutex on inode. - --------------------------- dquot_operations ------------------------------- prototypes: int (*write_dquot) (struct dquot *); @@ -507,12 +526,13 @@ prototypes: int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: - BKL mmap_sem PageLocked(page) -open: no yes -close: no yes -fault: no yes can return with page locked -page_mkwrite: no yes can return with page locked -access: no yes + mmap_sem PageLocked(page) +open: yes +close: yes +fault: yes can return with page locked +map_pages: yes +page_mkwrite: yes can return with page locked +access: yes ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated @@ -522,6 +542,15 @@ the page, then ensure it is not already truncated (the page lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. + ->map_pages() is called when VM asks to map easy accessible pages. +Filesystem should find and map pages associated with offsets from "pgoff" +till "max_pgoff". ->map_pages() is called with page table locked and must +not block. If it's not possible to reach a page without blocking, +filesystem should skip it. Filesystem should use do_set_pte() to setup +page table entry. Pointer to entry associated with offset "pgoff" is +passed in "pte" field in vm_fault structure. Pointers to entries for other +offsets should be calculated relative to "pte". + ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are no truncate/invalidate races, and then return with the page locked. If @@ -530,7 +559,7 @@ like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to retry the fault. ->access() is called when get_user_pages() fails in -acces_process_vm(), typically used to debug a process through +access_process_vm(), typically used to debug a process through /proc/pid/mem or ptrace. This function is needed only for VM_IO | VM_PFNMAP VMAs. @@ -539,6 +568,3 @@ VM_IO | VM_PFNMAP VMAs. (if you break something or notice that it is broken and do not fix it yourself - at least put it here) - -ipc/shm.c::shm_delete() - may need BKL. -->read() and ->write() in many drivers are (probably) missing BKL. |
