aboutsummaryrefslogtreecommitdiff
path: root/Documentation/filesystems/Locking
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems/Locking')
-rw-r--r--Documentation/filesystems/Locking501
1 files changed, 276 insertions, 225 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1045da582b9..b18dd177902 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -9,50 +9,68 @@ be able to use diff(1).
--------------------------- dentry_operations --------------------------
prototypes:
- int (*d_revalidate)(struct dentry *, int);
- int (*d_hash) (struct dentry *, struct qstr *);
- int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
+ int (*d_revalidate)(struct dentry *, unsigned int);
+ int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ int (*d_hash)(const struct dentry *, struct qstr *);
+ int (*d_compare)(const struct dentry *, const struct dentry *,
+ unsigned int, const char *, const struct qstr *);
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
+ struct vfsmount *(*d_automount)(struct path *path);
+ int (*d_manage)(struct dentry *, bool);
locking rules:
- none have BKL
- dcache_lock rename_lock ->d_lock may block
-d_revalidate: no no no yes
-d_hash no no no yes
-d_compare: no yes no no
-d_delete: yes no yes no
-d_release: no no no yes
-d_iput: no no no yes
+ rename_lock ->d_lock may block rcu-walk
+d_revalidate: no no yes (ref-walk) maybe
+d_weak_revalidate:no no yes no
+d_hash no no no maybe
+d_compare: yes no no maybe
+d_delete: no yes no no
+d_release: no no yes no
+d_prune: no yes no no
+d_iput: no no yes no
+d_dname: no no no no
+d_automount: no no yes no
+d_manage: no no yes (ref-walk) maybe
--------------------------- inode_operations ---------------------------
prototypes:
- int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
- struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameid
-ata *);
+ int (*create) (struct inode *,struct dentry *,umode_t, bool);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,int);
+ int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
- int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+ int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
+ int (*rename2) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- int (*follow_link) (struct dentry *, struct nameidata *);
+ void * (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
void (*truncate) (struct inode *);
- int (*permission) (struct inode *, int, struct nameidata *);
+ int (*permission) (struct inode *, int, unsigned int);
+ int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+ void (*update_time)(struct inode *, struct timespec *, int);
+ int (*atomic_open)(struct inode *, struct dentry *,
+ struct file *, unsigned open_flag,
+ umode_t create_mode, int *opened);
+ int (*tmpfile) (struct inode *, struct dentry *, umode_t);
locking rules:
- all may block, none have BKL
- i_sem(inode)
+ all may block
+ i_mutex(inode)
lookup: yes
create: yes
link: yes (both)
@@ -62,24 +80,27 @@ mkdir: yes
unlink: yes (both)
rmdir: yes (both) (see below)
rename: yes (all) (see below)
+rename2: yes (all) (see below)
readlink: no
follow_link: no
-truncate: yes (see below)
+put_link: no
setattr: yes
-permission: no
+permission: no (may not block if called in rcu-walk mode)
+get_acl: no
getattr: no
setxattr: yes
getxattr: no
listxattr: no
removexattr: yes
- Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_sem on
+fiemap: no
+update_time: no
+atomic_open: yes
+tmpfile: no
+
+ Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
- cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
- ->truncate() is never called directly - it's a callback, not a
-method. It's called by vmtruncate() - library function normally used by
-->setattr(). Locking information above applies to that call (i.e. is
-inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
-passed).
+ cross-directory ->rename() and rename2() has (per-superblock)
+->s_vfs_rename_sem.
See Documentation/filesystems/directory-locking for more detailed discussion
of the locking scheme for directory operations.
@@ -88,69 +109,71 @@ of the locking scheme for directory operations.
prototypes:
struct inode *(*alloc_inode)(struct super_block *sb);
void (*destroy_inode)(struct inode *);
- void (*read_inode) (struct inode *);
- void (*dirty_inode) (struct inode *);
- int (*write_inode) (struct inode *, int);
- void (*put_inode) (struct inode *);
- void (*drop_inode) (struct inode *);
- void (*delete_inode) (struct inode *);
+ void (*dirty_inode) (struct inode *, int flags);
+ int (*write_inode) (struct inode *, struct writeback_control *wbc);
+ int (*drop_inode) (struct inode *);
+ void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
- void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- void (*write_super_lockfs) (struct super_block *);
- void (*unlockfs) (struct super_block *);
- int (*statfs) (struct super_block *, struct kstatfs *);
+ int (*freeze_fs) (struct super_block *);
+ int (*unfreeze_fs) (struct super_block *);
+ int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
- void (*clear_inode) (struct inode *);
void (*umount_begin) (struct super_block *);
- int (*show_options)(struct seq_file *, struct vfsmount *);
+ int (*show_options)(struct seq_file *, struct dentry *);
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+ int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
locking rules:
- All may block.
- BKL s_lock s_umount
-alloc_inode: no no no
-destroy_inode: no
-read_inode: no (see below)
-dirty_inode: no (must not sleep)
-write_inode: no
-put_inode: no
-drop_inode: no !!!inode_lock!!!
-delete_inode: no
-put_super: yes yes no
-write_super: no yes read
-sync_fs: no no read
-write_super_lockfs: ?
-unlockfs: ?
-statfs: no no no
-remount_fs: no yes maybe (see below)
-clear_inode: no
-umount_begin: yes no no
-show_options: no (vfsmount->sem)
-quota_read: no no no (see below)
-quota_write: no no no (see below)
-
-->read_inode() is not a method - it's a callback used in iget().
-->remount_fs() will have the s_umount lock if it's already mounted.
-When called from get_sb_single, it does NOT have the s_umount lock.
+ All may block [not true, see below]
+ s_umount
+alloc_inode:
+destroy_inode:
+dirty_inode:
+write_inode:
+drop_inode: !!!inode->i_lock!!!
+evict_inode:
+put_super: write
+sync_fs: read
+freeze_fs: write
+unfreeze_fs: write
+statfs: maybe(read) (see below)
+remount_fs: write
+umount_begin: no
+show_options: no (namespace_sem)
+quota_read: no (see below)
+quota_write: no (see below)
+bdev_try_to_free_page: no (see below)
+
+->statfs() has s_umount (shared) when called by ustat(2) (native or
+compat), but that's an accident of bad API; s_umount is used to pin
+the superblock down when we only have dev_t given us by userland to
+identify the superblock. Everything else (statfs(), fstatfs(), etc.)
+doesn't hold it when calling ->statfs() - superblock is pinned down
+by resolving the pathname passed to syscall.
->quota_read() and ->quota_write() functions are both guaranteed to
be the only ones operating on the quota file by the quota code (via
dqio_sem) (unless an admin really wants to screw up something and
writes to quota files with quotas on). For other details about locking
see also dquot_operations section.
+->bdev_try_to_free_page is called from the ->releasepage handler of
+the block device inode. See there for more details.
--------------------------- file_system_type ---------------------------
prototypes:
- struct super_block *(*get_sb) (struct file_system_type *, int,
- const char *, void *);
+ int (*get_sb) (struct file_system_type *, int,
+ const char *, void *, struct vfsmount *);
+ struct dentry *(*mount) (struct file_system_type *, int,
+ const char *, void *);
void (*kill_sb) (struct super_block *);
locking rules:
- may block BKL
-get_sb yes yes
-kill_sb yes yes
+ may block
+mount yes
+kill_sb yes
-->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
+->mount() returns ERR_PTR or the root dentry; its superblock should be locked
+on return.
->kill_sb() takes a write-locked superblock, does all shutdown work on it,
unlocks and drops the reference.
@@ -163,32 +186,52 @@ prototypes:
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
- int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
- int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
- int (*invalidatepage) (struct page *, unsigned long);
+ void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
- int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
+ void (*freepage)(struct page *);
+ int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
+ int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
+ unsigned long *);
+ int (*migratepage)(struct address_space *, struct page *, struct page *);
+ int (*launder_page)(struct page *);
+ int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
+ int (*error_remove_page)(struct address_space *, struct page *);
+ int (*swap_activate)(struct file *);
+ int (*swap_deactivate)(struct file *);
locking rules:
- All except set_page_dirty may block
-
- BKL PageLocked(page)
-writepage: no yes, unlocks (see below)
-readpage: no yes, unlocks
-sync_page: no maybe
-writepages: no
-set_page_dirty no no
-readpages: no
-prepare_write: no yes
-commit_write: no yes
-bmap: yes
-invalidatepage: no yes
-releasepage: no yes
-direct_IO: no
-
- ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+ All except set_page_dirty and freepage may block
+
+ PageLocked(page) i_mutex
+writepage: yes, unlocks (see below)
+readpage: yes, unlocks
+sync_page: maybe
+writepages:
+set_page_dirty no
+readpages:
+write_begin: locks the page yes
+write_end: yes, unlocks yes
+bmap:
+invalidatepage: yes
+releasepage: yes
+freepage: yes
+direct_IO:
+get_xip_mem: maybe
+migratepage: yes (both)
+launder_page: yes
+is_partially_uptodate: yes
+error_remove_page: yes
+swap_activate: no
+swap_deactivate: no
+
+ ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
@@ -216,7 +259,7 @@ against the page the filesystem should redirty the page with
redirty_page_for_writepage(), then unlock the page and return zero.
This may also be done to avoid internal deadlocks, but rarely.
-If the filesytem is called for sync then it must wait on any
+If the filesystem is called for sync then it must wait on any
in-progress I/O and then start new I/O.
The filesystem should unlock the page synchronously, before returning to the
@@ -266,13 +309,12 @@ under spinlock (it cannot block) and is sometimes called with the page
not locked.
->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
-filesystems and by the swapper. The latter will eventually go away. All
-instances do not actually need the BKL. Please, keep it that way and don't
-breed new callers.
+filesystems and by the swapper. The latter will eventually go away. Please,
+keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
-some or all of the buffers from the page when it is being truncated. It
-returns zero on success. If ->invalidatepage is zero, the kernel uses
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->invalidatepage is zero, the kernel uses
block_invalidatepage() instead.
->releasepage() is called when the kernel is about to try to drop the
@@ -280,49 +322,64 @@ buffers from the page in preparation for freeing it. It returns zero to
indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
the kernel assumes that the fs has no private interest in the buffers.
- Note: currently almost all instances of address_space methods are
-using BKL for internal serialization and that's one of the worst sources
-of contention. Normally they are calling library functions (in fs/buffer.c)
-and pass foo_get_block() as a callback (on local block-based filesystems,
-indeed). BKL is not needed for library stuff and is usually taken by
-foo_get_block(). It's an overkill, since block bitmaps can be protected by
-internal fs locking and real critical areas are much smaller than the areas
-filesystems protect now.
+ ->freepage() is called when the kernel is done dropping the page
+from the page cache.
+
+ ->launder_page() may be called prior to releasing a page if
+it is still found to be dirty. It returns zero if the page was successfully
+cleaned, or an error value if not. Note that in order to prevent the page
+getting mapped back in and redirtied, it needs to be kept locked
+across the entire operation.
+
+ ->swap_activate will be called with a non-zero argument on
+files backing (non block device backed) swapfiles. A return value
+of zero indicates success, in which case this file can be used for
+backing swapspace. The swapspace operations will be proxied to the
+address space operations.
+
+ ->swap_deactivate() will be called in the sys_swapoff()
+path after ->swap_activate() returned success.
----------------------- file_lock_operations ------------------------------
prototypes:
- void (*fl_insert)(struct file_lock *); /* lock insertion callback */
- void (*fl_remove)(struct file_lock *); /* lock removal callback */
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
void (*fl_release_private)(struct file_lock *);
locking rules:
- BKL may block
-fl_insert: yes no
-fl_remove: yes no
-fl_copy_lock: yes no
-fl_release_private: yes yes
+ inode->i_lock may block
+fl_copy_lock: yes no
+fl_release_private: maybe no
----------------------- lock_manager_operations ---------------------------
prototypes:
- int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
- void (*fl_notify)(struct file_lock *); /* unblock callback */
- void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
- void (*fl_release_private)(struct file_lock *);
- void (*fl_break)(struct file_lock *); /* break_lease callback */
+ int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+ unsigned long (*lm_owner_key)(struct file_lock *);
+ void (*lm_notify)(struct file_lock *); /* unblock callback */
+ int (*lm_grant)(struct file_lock *, struct file_lock *, int);
+ void (*lm_break)(struct file_lock *); /* break_lease callback */
+ int (*lm_change)(struct file_lock **, int);
locking rules:
- BKL may block
-fl_compare_owner: yes no
-fl_notify: yes no
-fl_copy_lock: yes no
-fl_release_private: yes yes
-fl_break: yes no
-
- Currently only NFSD and NLM provide instances of this class. None of the
-them block. If you have out-of-tree instances - please, show up. Locking
-in that area will change.
+
+ inode->i_lock blocked_lock_lock may block
+lm_compare_owner: yes[1] maybe no
+lm_owner_key yes[1] yes no
+lm_notify: yes yes no
+lm_grant: no no no
+lm_break: yes no no
+lm_change yes no no
+
+[1]: ->lm_compare_owner and ->lm_owner_key are generally called with
+*an* inode->i_lock held. It may not be the i_lock of the inode
+associated with either file_lock argument! This is the case with deadlock
+detection, since the code has to chase down the owners of locks that may
+be entirely unrelated to the one on which the lock is being acquired.
+For deadlock detection however, the blocked_lock_lock is also held. The
+fact that these locks are held ensures that the file_locks do not
+disappear out from under you while doing the comparison or generating an
+owner key.
+
--------------------------- buffer_head -----------------------------------
prototypes:
void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -335,41 +392,55 @@ call this method upon the IO completion.
--------------------------- block_device_operations -----------------------
prototypes:
- int (*open) (struct inode *, struct file *);
- int (*release) (struct inode *, struct file *);
- int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+ int (*open) (struct block_device *, fmode_t);
+ int (*release) (struct gendisk *, fmode_t);
+ int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
int (*media_changed) (struct gendisk *);
+ void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
+ int (*getgeo)(struct block_device *, struct hd_geometry *);
+ void (*swap_slot_free_notify) (struct block_device *, unsigned long);
locking rules:
- BKL bd_sem
-open: yes yes
-release: yes yes
-ioctl: yes no
-media_changed: no no
-revalidate_disk: no no
+ bd_mutex
+open: yes
+release: yes
+ioctl: no
+compat_ioctl: no
+direct_access: no
+media_changed: no
+unlock_native_capacity: no
+revalidate_disk: no
+getgeo: no
+swap_slot_free_notify: no (see below)
+
+media_changed, unlock_native_capacity and revalidate_disk are called only from
+check_disk_change().
+
+swap_slot_free_notify is called with swap_lock and sometimes the page lock
+held.
-The last two are called only from check_disk_change().
--------------------------- file_operations -------------------------------
prototypes:
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
- ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
- ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t,
- loff_t);
- int (*readdir) (struct file *, void *, filldir_t);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+ int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
- int (*ioctl) (struct inode *, struct file *, unsigned int,
- unsigned long);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
int (*release) (struct inode *, struct file *);
- int (*fsync) (struct file *, struct dentry *, int datasync);
+ int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
@@ -384,60 +455,33 @@ prototypes:
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
- int (*dir_notify)(struct file *, unsigned long);
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
+ size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
+ size_t, unsigned int);
+ int (*setlease)(struct file *, long, struct file_lock **);
+ long (*fallocate)(struct file *, int, loff_t, loff_t);
};
locking rules:
- All except ->poll() may block.
- BKL
-llseek: no (see below)
-read: no
-aio_read: no
-write: no
-aio_write: no
-readdir: no
-poll: no
-ioctl: yes (see below)
-unlocked_ioctl: no (see below)
-compat_ioctl: no
-mmap: no
-open: maybe (see below)
-flush: no
-release: no
-fsync: no (see below)
-aio_fsync: no
-fasync: yes (see below)
-lock: yes
-readv: no
-writev: no
-sendfile: no
-sendpage: no
-get_unmapped_area: no
-check_flags: no
-dir_notify: no
+ All may block except for ->setlease.
+ No VFS locks held on entry except for ->setlease.
+
+->setlease has the file_list_lock held and must not sleep.
->llseek() locking has moved from llseek to the individual llseek
implementations. If your fs is not using generic_file_llseek, you
need to acquire and release the appropriate locks in your ->llseek().
For many filesystems, it is probably safe to acquire the inode
-semaphore. Note some filesystems (i.e. remote ones) provide no
-protection for i_size so you will need to use the BKL.
-
-->open() locking is in-transit: big lock partially moved into the methods.
-The only exception is ->open() in the instances of file_operations that never
-end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
-(chrdev_open() takes lock before replacing ->f_op and calling the secondary
-method. As soon as we fix the handling of module reference counters all
-instances of ->open() will be called without the BKL.
+mutex or just to use i_size_read() instead.
+Note: this does not protect the file->f_pos against concurrent modifications
+since this is something the userspace has to take care about.
-Note: ext2_release() was *the* source of contention on fs-intensive
-loads and dropping BKL on ->release() helps to get rid of that (we still
-grab BKL for cases when we close a file that had been opened r/w, but that
-can and should be done using the internal locking with smaller critical areas).
-Current worst offender is ext2_get_block()...
-
-->fasync() is a mess. This area needs a big cleanup and that will probably
-affect locking.
+->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
+Most instances call fasync_helper(), which does that maintenance, so it's
+not normally something one needs to worry about. Return values > 0 will be
+mapped to zero in the VFS layer.
->readdir() and ->ioctl() on directories must be changed. Ideally we would
move ->readdir() to inode_operations and use a separate method for directory
@@ -445,23 +489,11 @@ move ->readdir() to inode_operations and use a separate method for directory
anything that resembles union-mount we won't have a struct file for all
components. And there are other reasons why the current interface is a mess...
-->ioctl() on regular files is superceded by the ->unlocked_ioctl() that
-doesn't take the BKL.
-
->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.
-->fsync() has i_sem on inode.
-
--------------------------- dquot_operations -------------------------------
prototypes:
- int (*initialize) (struct inode *, int);
- int (*drop) (struct inode *);
- int (*alloc_space) (struct inode *, qsize_t, int);
- int (*alloc_inode) (const struct inode *, unsigned long);
- int (*free_space) (struct inode *, qsize_t);
- int (*free_inode) (const struct inode *, unsigned long);
- int (*transfer) (struct inode *, struct iattr *);
int (*write_dquot) (struct dquot *);
int (*acquire_dquot) (struct dquot *);
int (*release_dquot) (struct dquot *);
@@ -474,13 +506,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
What filesystem should expect from the generic quota functions:
FS recursion Held locks when called
-initialize: yes maybe dqonoff_sem
-drop: yes -
-alloc_space: ->mark_dirty() -
-alloc_inode: ->mark_dirty() -
-free_space: ->mark_dirty() -
-free_inode: ->mark_dirty() -
-transfer: yes -
write_dquot: yes dqonoff_sem or dqptr_sem
acquire_dquot: yes dqonoff_sem or dqptr_sem
release_dquot: yes dqonoff_sem or dqptr_sem
@@ -490,30 +515,56 @@ write_info: yes dqonoff_sem
FS recursion means calling ->quota_read() and ->quota_write() from superblock
operations.
-->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
More details about quota locking can be found in fs/dquot.c.
--------------------------- vm_operations_struct -----------------------------
prototypes:
void (*open)(struct vm_area_struct*);
void (*close)(struct vm_area_struct*);
- struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
+ int (*fault)(struct vm_area_struct*, struct vm_fault *);
+ int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
+ int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
locking rules:
- BKL mmap_sem
-open: no yes
-close: no yes
-nopage: no yes
+ mmap_sem PageLocked(page)
+open: yes
+close: yes
+fault: yes can return with page locked
+map_pages: yes
+page_mkwrite: yes can return with page locked
+access: yes
+
+ ->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+ ->map_pages() is called when VM asks to map easy accessible pages.
+Filesystem should find and map pages associated with offsets from "pgoff"
+till "max_pgoff". ->map_pages() is called with page table locked and must
+not block. If it's not possible to reach a page without blocking,
+filesystem should skip it. Filesystem should use do_set_pte() to setup
+page table entry. Pointer to entry associated with offset "pgoff" is
+passed in "pte" field in vm_fault structure. Pointers to entries for other
+offsets should be calculated relative to "pte".
+
+ ->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
+
+ ->access() is called when get_user_pages() fails in
+access_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace. This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
================================================================================
Dubious stuff
(if you break something or notice that it is broken and do not fix it yourself
- at least put it here)
-
-ipc/shm.c::shm_delete() - may need BKL.
-->read() and ->write() in many drivers are (probably) missing BKL.
-drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL.