43 files changed, 8521 insertions, 5017 deletions
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 1d2ad15f153..00000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1350 +0,0 @@
-ToDo/Notes:
-	- Find and fix bugs.
-	- Checkpoint or disable the user space journal ($UsnJrnl).
-	- In between ntfs_prepare/commit_write, need exclusion between
-	  simultaneous file extensions. Need perhaps an NInoResizeUnderway()
-	  flag which we can set in ntfs_prepare_write() and clear again in
-	  ntfs_commit_write(). Just have to be careful in readpage/writepage,
-	  as well as in truncate, that we play nice... We might need to have
-	  a data_size field in the ntfs_inode to store the real attribute
-	  length. Also need to be careful with initialized_size extention in
-	  ntfs_prepare_write. Basically, just be _very_ careful in this code...
-	  OTOH, perhaps i_sem, which is held accross generic_file_write is
-	  sufficient for synchronisation here. We then just need to make sure
-	  ntfs_readpage/writepage/truncate interoperate properly with us.
-	  UPDATE: The above is all ok as it is due to i_sem held.  The only
-	  thing that needs to be checked is ntfs_writepage() which does not
-	  hold i_sem.  It cannot change i_size but it needs to cope with a
-	  concurrent i_size change.
-	- Implement mft.c::sync_mft_mirror_umount().  We currently will just
-	  leave the volume dirty on umount if the final iput(vol->mft_ino)
-	  causes a write of any mirrored mft records due to the mft mirror
-	  inode having been discarded already.  Whether this can actually ever
-	  happen is unclear however so it is worth waiting until someone hits
-	  the problem.
-	- Enable the code for setting the NT4 compatibility flag when we start
-	  making NTFS 1.2 specific modifications.
-
-2.1.23-WIP
-
-	- Add printk rate limiting for ntfs_warning() and ntfs_error() when
-	  compiled without debug.  This avoids a possible denial of service
-	  attack.  Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
-	  out.
-
-2.1.22 - Many bug and race fixes and error handling improvements.
-
-	- Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
-	- Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
-	  instead of void and provide a helper ntfs_truncate_vfs() for the
-	  vfs ->truncate method.
-	- Add a new ntfs inode flag NInoTruncateFailed() and modify
-	  fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
-	- Fix min_size and max_size definitions in ATTR_DEF structure in
-	  fs/ntfs/layout.h to be signed.
-	- Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
-	  ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
-	  ntfs_attr_can_be_resident(), which in turn use the new private helper
-	  ntfs_attr_find_in_attrdef().
-	- In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
-	  mapping->private_lock around the dirtying of the buffer heads
-	  analagous to the way it is done in __set_page_dirty_buffers().
-	- Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
-	  mount time as this cannot work with the current implementation.
-	- Check for location of attribute name and improve error handling in
-	  general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
-	- In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
-	  i_size, i.e. race with truncate, invalidate the buffers on the page
-	  so that they become freeable and hence the page does not leak.
-	- Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge().  (Adrian
-	  Bunk)
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
-	  a NULL pointer dereference in the error code path when a corrupt
-	  attribute was found.  (Thanks to Domen Puncer for the bug report.)
-	- Add MODULE_VERSION() to fs/ntfs/super.c.
-	- Make several functions and variables static.  (Adrian Bunk)
-	- Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
-	  buffers for the page if they are not present and then marks the
-	  buffers belonging to the ntfs record dirty.  This causes the buffers
-	  to become busy and hence they are safe from removal until the page
-	  has been written out.
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
-	  error handling code path that resulted in a BUG() due to trying to
-	  unmap an extent mft record when the mapping of it had failed and it
-	  thus was not mapped.  (Thanks to Ken MacFerrin for the bug report.)
-	- Drop the runlist lock after the vcn has been read in
-	  fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
-	- Rewrite handling of multi sector transfer errors.  We now do not set
-	  PageError() when such errors are detected in the async i/o handler
-	  fs/ntfs/aops.c::ntfs_end_buffer_async_read().  All users of mst
-	  protected attributes now check the magic of each ntfs record as they
-	  use it and act appropriately.  This has the effect of making errors
-	  granular per ntfs record rather than per page which solves the case
-	  where we cannot access any of the ntfs records in a page when a
-	  single one of them had an mst error.  (Thanks to Ken MacFerrin for
-	  the bug report.)
-	- Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
-	  where we failed to release i_sem on the $Quota/$Q attribute inode.
-	- Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
-	- Add mapping of unmapped buffers to all remaining code paths, i.e.
-	  fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
-	  and write_mft_record_nolock().  From now on we require that the
-	  complete runlist for the mft mirror is always mapped into memory.
-	- Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
-	- Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
-	- Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
-	  resident attribute will be smaller than a page which makes the code
-	  simpler.  Also make the code more tolerant to concurrent ->truncate.
-
-2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
-
-	- Implement extent mft record deallocation
-	  fs/ntfs/mft.c::ntfs_extent_mft_record_free().
-	- Splitt runlist related functions off from attrib.[hc] to runlist.[hc].
-	- Add vol->mft_data_pos and initialize it at mount time.
-	- Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
-	  ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
-	  ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
-	  ntfs_runlists_merge() and adapt all callers.
-	- Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
-	  ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
-	  and ntfs_mapping_pairs_build(), adapted from libntfs.
-	- Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
-	  static and add a declaration for it to lcnalloc.h.
-	- Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
-	  inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
-	  cluster bitmap lock for the duration of the call.
-	- Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
-	- Implement the equivalent of memset() for an ntfs attribute in
-	  fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
-	  fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
-	- Remove unnecessary casts from LCN_* constants.
-	- Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
-	- Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
-	  change MFT_RECORD to contain the NTFS 3.1+ specific fields.
-	- Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
-	  marks all buffers belonging to an ntfs record dirty, followed by
-	  marking the page the ntfs record is in dirty and also marking the vfs
-	  inode containing the ntfs record dirty (I_DIRTY_PAGES).
-	- Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
-	  new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
-	  longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
-	- Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
-	  include errors.
-	- Move the typedefs for runlist_element and runlist from types.h to
-	  runlist.h and fix resulting include errors.
-	- Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
-	- Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
-	  mark_ntfs_record_dirty() which also changes the behaviour in that we
-	  now set the buffers belonging to the mft record dirty as well as the
-	  page itself.
-	- Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
-	  to cope with the fact that there now are dirty buffers in mft pages.
-	- Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
-	  mark_ntfs_record_dirty() and thus to set the buffers belonging to the
-	  mft record dirty as well as the page itself.
-	- Fix compiler warnings on x86-64 in fs/ntfs/dir.c.  (Randy Dunlap,
-	  slightly modified by me)
-	- Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
-	  the mft record is already locked and otherwise behaves the same way
-	  as fs/ntfs/mft.c::map_mft_record().
-	- Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
-	  writes the mft record if the buffers belonging to it are dirty.
-	  Otherwise we assume that it was written out by other means already.
-	- Attempting to write outside initialized size is _not_ a bug so remove
-	  the bug check from fs/ntfs/aops.c::ntfs_write_mst_block().  It is in
-	  fact required to write outside initialized size when preparing to
-	  extend the initialized size.
-	- Map the page instead of using page_address() before writing to it in
-	  fs/ntfs/aops.c::ntfs_mft_writepage().
-	- Provide exclusion between opening an inode / mapping an mft record
-	  and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
-	  by setting the page not uptodate throughout ntfs_mft_writepage().
-	- Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
-	  to ensure noone can see the page whilst the mst fixups are applied.
-	- Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
-	  checks if an mft record may be written out safely obtaining any
-	  necessary locks in the process.  This is used by
-	  fs/ntfs/aops.c::ntfs_write_mst_block().
-	- Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
-	  writing mft records and improve its error handling in the process.
-	  Now if any of the records in the page fail to be written out, all
-	  other records will be written out instead of aborting completely.
-	- Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
-	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
-	  ntfs_mst_aops for all inodes which are NInoMstProtected() and
-	  ntfs_aops for all other inodes.
-	- Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
-	  ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
-	  no longer require an ntfs inode to be present.  Update all callers.
-	- Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
-	- Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
-	  to ensure noone can see the page whilst the mst fixups are applied.
-	- Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
-	  fs/ntfs/mft.c::try_map_mft_record().
-	- Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
-	  with the ntfs inode which contains the page rather than the ntfs
-	  inode the mft record of which is in the page.
-	- Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
-	  index inode bitmap inode release code from there to
-	  fs/ntfs/inode.c::ntfs_clear_big_inode().  (Thanks to Christoph
-	  Hellwig for spotting this.)
-	- Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
-	  inode semaphore around the code that sets ni->itype.index.bmp_ino to
-	  NULL and reorganize the code to optimize it a bit.  (Thanks to
-	  Christoph Hellwig for spotting this.)
-	- Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
-	  ntfs inode as a parameter as this is confusing and misleading and the
-	  needed ntfs inode is available via NTFS_I(page->mapping->host).
-	  Adapt all callers to this change.
-	- Modify fs/ntfs/mft.c::write_mft_record_nolock() and
-	  fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
-	  of the first buffer in a record and to take this as the ntfs record
-	  dirty state.  We cannot look at the dirty state for subsequent
-	  buffers because we might be racing with
-	  fs/ntfs/aops.c::mark_ntfs_record_dirty().
-	- Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
-	  inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
-	  add a declaration for it to inode.h.  Fix some compilation issues
-	  that resulted due to #includes and header file interdependencies.
-	- Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
-	- Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
-	- Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
-	  record sequence number if it is specified (i.e. not zero).
-	- Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
-	  functions used by it.
-	- Update Documentation/filesystems/ntfs.txt with instructions on how to
-	  use the Device-Mapper driver with NTFS ftdisk/LDM raid.  This removes
-	  the linear raid problem with the Software RAID / MD driver when one
-	  or more of the devices has an odd number of sectors.
-
-2.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
-
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
-	  where we did not clear ctx->al_entry but it was still set due to
-	  changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
-	  particular.
-	- Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
-	  where we forgot to unmap the extent mft record when we had finished
-	  enumerating an attribute which caused a bug check to trigger when the
-	  VFS calls ->clear_inode.
-
-2.1.19 - Many cleanups, improvements, and a minor bug fix.
-
-	- Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
-	  change the uid, gid, and mode of an inode as we do not support NTFS
-	  ACLs yet.
-	- Remove BKL use from ntfs_setattr() syncing up with the rest of the
-	  kernel.
-	- Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
-	  and ntfs_filldir() as per suggestion from Al Viro.
-	- Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
-	- Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
-	  inode size has changed and to only output an error if so.
-	- Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
-	- Add le{16,32,64} as well as sle{16,32,64} data types to
-	  fs/ntfs/types.h.
-	- Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
-	- Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
-	  respectively, to fs/ntfs/types.h.
-	- Update endianness conversion macros in fs/ntfs/endian.h to use the
-	  new types as appropriate.
-	- Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
-	  and index.c.
-	- Add leMFT_REF data type to fs/ntfs/layout.h.
-	- Update all NTFS header files with the new little endian data types.
-	  Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
-	- Do proper type casting when using ntfs_is_*_recordp() in
-	  fs/ntfs/logfile.c, mft.c, and super.c. 
-	- Fix all the sparse bitwise warnings.  Had to change all the typedef
-	  enums storing little endian values to simple enums plus a typedef for
-	  the datatype to make sparse happy.
-	- Fix a bug found by the new sparse bitwise warnings where the default
-	  upcase table was defined as a pointer to wchar_t rather than ntfschar
-	  in fs/ntfs/ntfs.h and super.c.
-	- Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
-
-2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
-
-	- Remove vol->nr_mft_records as it was pretty meaningless and optimize
-	  the calculation of total/free inodes as used by statfs().
-	- Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
-	  because the code itself is using the ntfs_lock semaphore which
-	  provides safe locking.  (Ingo Molnar)
-	- Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
-	  could occur in the future for when we start closing/freeing extent
-	  inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
-	  we free it.
-	- Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
-	  find_external_attr() to ntfs_external_attr_find() to cleanup the
-	  namespace a bit and to be more consistent with libntfs.
-	- Rename {{re,}init,get,put}_attr_search_ctx() to
-	  ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
-	  attr_search_context to ntfs_attr_search_ctx.
-	- Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
-	  for the attribute list attribute itself.
-	- Fix endianness bug in ntfs_external_attr_find().
-	- Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
-	  if the attribute is not found, and -EIO on real error.  In the case
-	  of -ENOENT, the search context is updated to describe the attribute
-	  before which the attribute being searched for would need to be
-	  inserted if such an action were to be desired and in the case of
-	  ntfs_external_attr_find() the search context is also updated to
-	  indicate the attribute list entry before which the attribute list
-	  entry of the attribute being searched for would need to be inserted
-	  if such an action were to be desired.  Also make ntfs_find_attr()
-	  static and remove its prototype from attrib.h as it is not used
-	  anywhere other than attrib.c.  Update ntfs_attr_lookup() and all
-	  callers of ntfs_{external,}attr_{find,lookup}() for the new return
-	  values.
-	- Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
-
-2.1.17 - Fix bugs in mount time error code paths and other updates.
-
-	- Implement bitmap modification code (fs/ntfs/bitmap.[hc]).  This
-	  includes functions to set/clear a single bit or a run of bits.
-	- Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
-	  runlist element containing a particular vcn.  It also takes care of
-	  mapping any needed runlist fragments.
-	- Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
-	- Load attribute definition table from $AttrDef at mount time.
-	- Fix bugs in mount time error code paths involving (de)allocation of
-	  the default and volume upcase tables.
-	- Remove ntfs_nr_mounts as it is no longer used.
-
-2.1.16 - Implement access time updates, file sync, async io, and read/writev.
-
-	- Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
-	  This is done by setting the appropriate file operations pointers to
-	  the generic helper functions provided by mm/filemap.c.
-	- Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
-	  and directories (fs/ntfs/dir.c).
-	- Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
-	  Note, except for the root directory and any other system files opened
-	  by the user, the system files will not have their access times
-	  updated as they are only accessed at the inode level an hence the
-	  file level functions which cause the times to be updated are never
-	  invoked.
-
-2.1.15 - Invalidate quotas when (re)mounting read-write.
-
-	- Add new element itype.index.collation_rule to the ntfs inode
-	  structure and set it appropriately in ntfs_read_locked_inode().
-	- Implement a new inode type "index" to allow efficient access to the
-	  indices found in various system files and adapt inode handling
-	  accordingly (fs/ntfs/inode.[hc]).  An index inode is essentially an
-	  attribute inode (NInoAttr() is true) with an attribute type of
-	  AT_INDEX_ALLOCATION.  As such, it is no longer allowed to call
-	  ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
-	  there would be no way to distinguish between normal attribute inodes
-	  and index inodes.  The function to obtain an index inode is
-	  ntfs_index_iget() and it uses the helper function
-	  ntfs_read_locked_index_inode().  Note, we do not overload
-	  ntfs_attr_iget() as indices consist of multiple attributes so using
-	  ntfs_attr_iget() to obtain an index inode would be confusing.
-	- Ensure that there is no overflow when doing page->index <<
-	  PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
-	- Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
-	  and ntfs_read_block().
-	- Use case sensitive attribute lookups instead of case insensitive ones.
-	- Lock all page cache pages belonging to mst protected attributes while
-	  accessing them to ensure we never see corrupt data while the page is
-	  under writeout.
-	- Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
-	  We have ntfs_is_collation_rule_supported() to check if the collation
-	  rule you want to use is supported and ntfs_collation() which actually
-	  collates two data items.  We currently only support COLLATION_BINARY
-	  and COLLATION_NTOFS_ULONG but support for other collation rules will
-	  be added as the need arises.
-	- Add a new type, ntfs_index_context, to allow retrieval of an index
-	  entry using the corresponding index key.  To get an index context,
-	  use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
-	  This also adds a new slab cache for the index contexts.  To lookup a
-	  key in an index inode, use ntfs_index_lookup().  After modifying an
-	  index entry, call ntfs_index_entry_flush_dcache_page() followed by
-	  ntfs_index_entry_mark_dirty() to ensure the changes are written out
-	  to disk.  For details see fs/ntfs/index.[hc].  Note, at present, if
-	  an index entry is in the index allocation attribute rather than the
-	  index root attribute it will not be written out (you will get a
-	  warning message about discarded changes instead).
-	- Load the quota file ($Quota) and check if quota tracking is enabled
-	  and if so, mark the quotas out of date.  This causes windows to
-	  rescan the volume on boot and update all quota entries.
-	- Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
-	  It is simply set to __set_page_dirty_nobuffers() to make sure that
-	  running set_page_dirty() on a page containing mft/ntfs records will
-	  not affect the dirty state of the page buffers.
-	- Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
-	  buffers that are inside the ntfs record in the page dirty after which
-	  it sets the page dirty.  This allows ->writepage to only write the
-	  dirty index records rather than having to write all the records in
-	  the page.  Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
-	  use this rather than __set_page_dirty_nobuffers().
-	- Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
-	  writing of page cache pages belonging to mst protected attributes
-	  like the index allocation attribute in directory indices and other
-	  indices like $Quota/$Q, etc.  This means that the quota is now marked
-	  out of date on all volumes rather than only on ones where the quota
-	  defaults entry is in the index root attribute of the $Quota/$Q index.
-
-2.1.14 - Fix an NFSd caused deadlock reported by several users.
-
-	- Modify fs/ntfs/ntfs_readdir() to copy the index root attribute value
-	  to a buffer so that we can put the search context and unmap the mft
-	  record before calling the filldir() callback.  We need to do this
-	  because of NFSd which calls ->lookup() from its filldir callback()
-	  and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
-	  of the directory and since ntfs_readdir() has got it mapped already
-	  ntfs_lookup() deadlocks.
-
-2.1.13 - Enable overwriting of resident files and housekeeping of system files.
-
-	- Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
-	  keeping the mft mirror in sync with the mft when mirrored mft records
-	  are written.  The functions are write_mft_record{,_nolock}().  The
-	  implementation is quite rudimentary for now with lots of things not
-	  implemented yet but I am not sure any of them can actually occur so
-	  I will wait for people to hit each one and only then implement it.
-	- Commit open system inodes at umount time.  This should make it
-	  virtually impossible for sync_mft_mirror_umount() to ever be needed.
-	- Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
-	  ntfs super operations.  This gives us inode writing via the VFS inode
-	  dirty code paths.  Note:  Access time updates are not implemented yet.
-	- Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
-	  fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
-	  finally enabling resident file overwrite!  (-8  This also includes a
-	  placeholder for ->writepage (ntfs_mft_writepage()), which for now
-	  just redirties the page and returns.  Also, at umount time, we for
-	  now throw away all mft data page cache pages after the last call to
-	  ntfs_commit_inode() in the hope that all inodes will have been
-	  written out by then and hence no dirty (meta)data will be lost.  We
-	  also check for this case and emit an error message telling the user
-	  to run chkdsk.
-	- Use set_page_writeback() and end_page_writeback() in the resident
-	  attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
-	  the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
-	  page is clean.
-	- Implement ntfs_mft_writepage() so it now checks if any of the mft
-	  records in the page are dirty and if so redirties the page and
-	  returns.  Otherwise it just returns (after doing set_page_writeback(),
-	  unlock_page(), end_page_writeback() or the radix-tree tag
-	  PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
-	  alowing the VM to do with the page as it pleases.  Also, at umount
-	  time, now only throw away dirty mft (meta)data pages if dirty inodes
-	  are present and ask the user to email us if they see this happening.
-	- Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
-	  information flags (fs/ntfs/super.c).
-	- Mark the volume dirty when (re)mounting read-write and mark it clean
-	  when unmounting or remounting read-only.  If any volume errors are
-	  found, the volume is left marked dirty to force chkdsk to run.
-	- Add code to set the NT4 compatibility flag when (re)mounting
-	  read-write for newer NTFS versions but leave it commented out for now
-	  since we do not make any modifications that are NTFS 1.2 specific yet
-	  and since setting this flag breaks Captive-NTFS which is not nice.
-	  This code must be enabled once we start writing NTFS 1.2 specific
-	  changes otherwise Windows NTFS driver might crash / cause corruption.
-
-2.1.12 - Fix the second fix to the decompression engine and some cleanups.
-
-	- Add a new address space operations struct, ntfs_mst_aops, for mst
-	  protected attributes.  This is because the default ntfs_aops do not
-	  make sense with mst protected data and were they to write anything to
-	  such an attribute they would cause data corruption so we provide
-	  ntfs_mst_aops which does not have any write related operations set.
-	- Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
-	  includes an adapted ntfs_commit_inode() and an implementation of
-	  ntfs_write_inode() which for now just cleans dirty inodes without
-	  writing them (it does emit a warning that this is happening).
-	- Undo the second decompression engine fix (see 2.1.9 release ChangeLog
-	  entry) as it was only fixing a theoretical bug but at the same time
-	  it badly broke the handling of sparse and uncompressed compression
-	  blocks.
-
-2.1.11 - Driver internal cleanups.
-
-	- Only build logfile.o if building the driver with read-write support.
-	- Really final white space cleanups.
-	- Use generic_ffs() instead of ffs() in logfile.c which allows the
-	  log_page_size variable to be optimized by gcc into a constant.
-	- Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
-	  char as defined by POSIX and as found on some systems.
-
-2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
-
-	- Finish off the white space cleanups (remove trailing spaces, etc).
-	- Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
-	  the kludges around the first iget().  Instead of (re)setting ->s_op
-	  we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
-	  insert_inode_hash() / call ntfs_read_inode_mount() directly.  This
-	  kills the need for second super_operations and allows to return error
-	  from ntfs_read_inode_mount() without resorting to ugly "poisoning"
-	  tricks.  (Al Viro)
-	- Force read-only (re)mounting if any of the following bits are set in
-	  the volume information flags:
-	  	VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
-		VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
-		VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
-	  To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
-	  above bits set so the test is made easy.
-
-2.1.9 - Fix two bugs in decompression engine.
-
-	- Fix a bug where we would not always detect that we have reached the
-	  end of a compression block because we were ending at minus one byte
-	  which is effectively the same as being at the end.  The fix is to
-	  check whether the uncompressed buffer has been fully filled and if so
-	  we assume we have reached the end of the compression block.  A big
-	  thank you to Marcin Gibuła for the bug report, the assistance in
-	  tracking down the bug and testing the fix.
-	- Fix a possible bug where when a compressed read is truncated to the
-	  end of the file, the offset inside the last page was not truncated.
-
-2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
-
-	- Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
-	- Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
-	  utc2ntfs() to work with struct timespec instead of time_t on the
-	  Linux UTC time side thus preserving the full precision of the NTFS
-	  time and only loosing up to 99 nano-seconds in the Linux UTC time.
-	- Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
-	  static inline.
-	- Remove unused ntfs_dirty_inode().
-	- Cleanup super operations declaration in fs/ntfs/super.c.
-	- Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
-	- Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
-	  fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
-	- Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
-	  fs/ntfs/inode.h so they can be used elsewhere.
-	- Determine the mft mirror size as the number of mirrored mft records
-	  and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
-	- Load the mft mirror at mount time and compare the mft records stored
-	  in it to the ones in the mft.  Force a read-only mount if the two do
-	  not match (fs/ntfs/super.c).
-	- Fix type casting related warnings on 64-bit architectures.  Thanks
-	  to Meelis Roos for reporting them.
-	- Move %L to %ll as %L is floating point and %ll is integer which is
-	  what we want.
-	- Read the journal ($LogFile) and determine if the volume has been
-	  shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
-	  and fs/ntfs/logfile.c).  This is a little bit of a crude check in
-	  that we only look at the restart areas and not at the actual log
-	  records so that there will be a very small number of cases where we
-	  think that a volume is dirty when in fact it is clean.  This should
-	  only affect volumes that have not been shutdown cleanly and did not
-	  have any pending, non-check-pointed i/o.
-	- If the $LogFile indicates a clean shutdown and a read-write (re)mount
-	  is requested, empty $LogFile by overwriting it with 0xff bytes to
-	  ensure that Windows cannot cause data corruption by replaying a stale
-	  journal after Linux has written to the volume.
-
-2.1.7 - Enable NFS exporting of mounted NTFS volumes.
-
-	- Set i_generation in the VFS inode from the seq_no of the NTFS inode.
-	- Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
-	- Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
-	  default doesn't allow inode number 0 which is a valid inode on NTFS
-	  and even if it did allow that it uses iget() instead of ntfs_iget()
-	  which makes it useless for us.
-	- Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
-	  default just returns -EACCES which is not very useful.
-	- Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
-	  and set them up in the super block at mount time (super.c) this
-	  allows mounted NTFS volumes to be exported via NFS.
-	- Add missing return -EOPNOTSUPP; in
-	  fs/ntfs/aops.c::ntfs_commit_nonresident_write().
-	- Enforce no atime and no dir atime updates at mount/remount time as
-	  they are not implemented yet anyway.
-	- Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
-	  after a NULL check.  Thanks to Dave Jones for pointing this out.
-
-2.1.6 - Fix minor bug in handling of compressed directories.
-
-	- Fix bug in handling of compressed directories.  A compressed
-	  directory is not really compressed so when we set the ->i_blocks
-	  field of a compressed directory inode we were setting it from the
-	  non-existing field ni->itype.compressed.size which gave random
-	  results...  For directories we now always use ni->allocated_size.
-
-2.1.5 - Fix minor bug in attribute list attribute handling.
-
-	- Fix bug in attribute list handling.  Actually it is not as much a bug
-	  as too much protection in that we were not allowing attribute lists
-	  which waste space on disk while Windows XP clearly allows it and in
-	  fact creates such attribute lists so our driver was failing.
-	- Update NTFS documentation ready for 2.6 kernel release.
-
-2.1.4 - Reduce compiler requirements.
-
-	- Remove all uses of unnamed structs and unions in the driver to make
-	  old and newer gcc versions happy. Makes it a bit uglier IMO but at
-	  least people will stop hassling me about it.
-
-2.1.3 - Important bug fixes in corner cases.
-
-	- super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
-	  clusters. (Philipp Thomas)
-	- attrib.c::load_attribute_list(): Fix bug when initialized_size is a
-	  multiple of the block_size but not the cluster size. (Szabolcs
-	  Szakacsits <szaka@sienet.hu>)
-
-2.1.2 - Important bug fixes aleviating the hangs in statfs.
-
-	- Fix buggy free cluster and free inode determination logic.
-
-2.1.1 - Minor updates.
-
-	- Add handling for initialized_size != data_size in compressed files.
-	- Reduce function local stack usage from 0x3d4 bytes to just noise in
-	  fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.ord>)
-	- Remove compiler warnings for newer gcc.
-	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
-	  around calls to ->{prepare,commit}_write.  Adapt NTFS appropriately
-	  in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
-	  kmap_atomic(KM_USER0).
-
-2.1.0 - First steps towards write support: implement file overwrite.
-
-	- Add configuration option for developmental write support with an
-	  appropriately scary configuration help text.
-	- Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
-	  helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
-	  overwriting of existing files on ntfs. Note: Resident files are
-	  only written into memory, and not written out to disk at present, so
-	  avoid writing to files smaller than about 1kiB.
-	- Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
-	  helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
-	  counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
-	  fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
-	  add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
-	  This enables write(2) based overwriting of existing files on ntfs.
-	  Note: As with mmap(2) based overwriting, resident files are only
-	  written into memory, and not written out to disk at present, so avoid
-	  writing to files smaller than about 1kiB.
-	- Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
-	  ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
-	  files with the purpose of intercepting and aborting all i_size
-	  changes which we do not support yet. ntfs_truncate() actually only
-	  emits a warning message but AFAICS our interception of i_size changes
-	  elsewhere means ntfs_truncate() never gets called for i_size changes.
-	  It is only called from generic_file_write() when we fail in
-	  ntfs_prepare_{,nonresident_}write() in order to discard any
-	  instantiated buffers beyond i_size. Thus i_size is not actually
-	  changed so our warning message is enough. Unfortunately it is not
-	  possible to easily determine if i_size is being changed or not hence
-	  we just emit an appropriately worded error message.
-
-2.0.25 - Small bug fixes and cleanups.
-
-	- Unlock the page in an out of memory error code path in
-	  fs/ntfs/aops.c::ntfs_read_block().
-	- If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
-	  just unlock the page and return. (This can happen due to ->writepage
-	  clearing PageUptodate() during write out of MstProtected()
-	  attributes.
-	- Remove leaked write code again.
-
-2.0.24 - Cleanups.
-
-	- Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
-	  inside BUG_ON(). (Adam J. Richter)
-	- Split logical OR expressions inside BUG_ON() into individual BUG_ON()
-	  calls for improved debugging. (Adam J. Richter)
-	- Add errors flag to the ntfs volume state, accessed via
-	  NVol{,Set,Clear}Errors(vol).
-	- Do not allow read-write remounts of read-only volumes with errors.
-	- Clarify comment for ntfs file operation sendfile which was added by
-	  Christoph Hellwig a while ago (just using generic_file_sendfile())
-	  to say that ntfs ->sendfile is only used for the case where the
-	  source data is on the ntfs partition and the destination is
-	  somewhere else, i.e. nothing we need to concern ourselves with.
-	- Add generic_file_write() as our ntfs file write operation.
-
-2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
-
-	- Massive internal locking changes to mft record locking. Fixes lock
-	  recursion and replaces the mrec_lock read/write semaphore with a
-	  mutex. Also removes the now superfluous mft_count. This fixes several
-	  race conditions and deadlocks, especially in the future write code.
-	- Fix ntfs over loopback for compressed files by adding an
-	  optimization barrier. (gcc was screwing up otherwise ?)
-	- Miscellaneous cleanups all over the code and a fix or two in error
-	  handling code paths.
-	Thanks go to Christoph Hellwig for pointing out the following two:
-	- Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
-	- Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
-
-2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
-
-	- Change fs/ntfs/dir.c::ntfs_reddir() to only read/write ->f_pos once
-	  at entry/exit respectively.
-	- Use C99 initializers for structures.
-	- Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
-
-2.0.21 - Check for, and refuse to work with too large files/directories/volumes.
-
-	- Limit volume size at mount time to 2TiB on architectures where
-	  unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
-	  This is the most we can do without overflowing the 32-bit limit of
-	  the block device size imposed on us by sb_bread() and sb_getblk()
-	  for the time being.
-	- Limit file/directory size at open() time to 16TiB on architectures
-	  where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
-	  fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
-	  overflowing the page cache page index.
-
-2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
-
-	- Move the directory index bitmap to use an attribute inode instead of
-	  having special fields for it inside the ntfs inode structure. This
-	  means that the index bitmaps now use the page cache for i/o, too,
-	  and also as a side effect we get support for non-resident index
-	  bitmaps for free.
-	- Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
-	  fix a page leak that manifested itself in some cases.
-	- Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
-	  index bitmap inode on the final iput().
-
-2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
-
-	- Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
-	  to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
-	- Drop the "file" from ntfs_file_read_compressed_block().
-	- Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
-	  ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
-	- Update ntfs_end_buffer_async_read() with the improved logic from
-	  its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
-	  further logic improvements to better determine when we set PageError.
-	- Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
-	  check for the buffers being uptodate first in line with the updated
-	  fs/buffer.c::block_read_full_page(). This plugs a small race
-	  condition.
-
-2.0.18 - Fix race condition in reading of compressed files.
-
-	- There was a narrow window between checking a buffer head for being
-	  uptodate and locking it in ntfs_file_read_compressed_block(). We now
-	  lock the buffer and then check whether it is uptodate or not.
-
-2.0.17 - Cleanups and optimizations - shrinking the ToDo list.
-
-	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
-	  code and update callers, i.e. ntfs_iget(), to pass that error code
-	  up instead of just using -EIO.
-	- Modifications to super.c to ensure that both mount and remount
-	  cannot set any write related options when the driver is compiled
-	  read-only.
-	- Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
-	  cache the current runlist element. This should improve performance
-	  when reading very large and/or very fragmented data.
-
-2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
-
-	- Fix a stupid bug introduced in 2.0.15 where we were unmapping the
-	  wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
-	- Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
-	- Convert $MFT/$BITMAP access to attribute inode API and remove all
-	  remnants of the ugly mftbmp address space and operations hack. This
-	  means we finally have only one readpage function as well as only one
-	  async io completion handler. Yey! The mft bitmap is now just an
-	  attribute inode and is accessed from vol->mftbmp_ino just as if it
-	  were a normal file. Fake inodes rule. (-:
-
-2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
-
-	- Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
-	  remounts to fail when the partition had an entry in /etc/fstab and
-	  the entry specified the nls= option.
-	- Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
-	  expand all the helper functions NVolFoo(), NVolSetFoo(), and
-	  NVolClearFoo().
-	- Move copyright statement from driver initialisation message to
-	  module description (fs/super.c). This makes the initialisation
-	  message fit on one line and fits in better with rest of kernel.
-	- Update fs/ntfs/attrib.c::map_run_list() to work on both real and
-	  attribute inodes, and both for files and directories.
-	- Implement fake attribute inodes allowing all attribute i/o to go via
-	  the page cache and to use all the normal vfs/mm functionality:
-	  - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
-	    to fs/ntfs/inode.c.
-	  - Add needed cleanup code to ntfs_clear_big_inode().
-	- Merge address space operations for files and directories (aops.c),
-	  now just have ntfs_aops:
-	  - Rename:
-		end_buffer_read_attr_async() ->	ntfs_end_buffer_read_async(),
-		ntfs_attr_read_block()	     ->	ntfs_read_block(),
-		ntfs_file_read_page()	     ->	ntfs_readpage().
-	  - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
-	    attribute inodes, and both for files and directories.
-	  - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
-
-2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
-
-	- Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
-	  the locking out of super.c::get_nr_free_mft_records() and taking and
-	  dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
-	- Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
-	  current userspace ntfs library code. This means that if a merge
-	  fails the original runlists are always left unmodified instead of
-	  being silently corrupted.
-	- Misc typo fixes.
-
-2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
-
-	- Remove nr_mft_bits and the now superfluous union with nr_mft_records
-	  from ntfs_volume structure.
-	- Remove nr_lcn_bits and the now superfluous union with nr_clusters
-	  from ntfs_volume structure.
-	- Use iget5_locked() and friends instead of conventional iget(). Wrap
-	  the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
-	  to use ntfs_iget(). Leave only one iget() call at mount time so we
-	  don't need an ntfs_iget_mount().
-	- Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
-	  additional argument.
-
-2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
-
-	- Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
-	  fs/ntfs/aops.c::end_buffer_read_file_async() into one function
-	  fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
-	  to determine whether to apply mst fixups or not.
-	- Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
-	  and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
-	  fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
-	  fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
-	  the VFS readpage function prototype to the ntfs_attr_read_block()
-	  function prototype.
-
-2.0.11 - Initial preparations for fake inode based attribute i/o.
-
-	- Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
-	  do some macro magic (adapted from include/linux/buffer_head.h) to
-	  expand all the helper functions NInoFoo(), NInoSetFoo(), and
-	  NInoClearFoo().
-	- Add new flag to ntfs_inode_state_bits: NI_Sparse.
-	- Add new fields to ntfs_inode structure to allow use of fake inodes
-	  for attribute i/o: type, name, name_len. Also add new state bits:
-	  NI_Attr, which, if set, indicates the inode is a fake inode, and
-	  NI_MstProtected, which, if set, indicates the attribute uses multi
-	  sector transfer protection, i.e. fixups need to be applied after
-	  reads and before/after writes.
-	- Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
-	  ntfs_{new,clear,destroy}_extent_inode() and update callers.
-	- Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
-	  instead of ntfs_destroy_extent_inode().
-	- Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
-	- Make all operations on ntfs inode state bits use the NIno* functions.
-	- Set up the new ntfs inode fields and state bits in
-	  fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
-	  allocated memory to __ntfs_clear_inode().
-	- Cleanup ntfs_inode structure a bit for better ordering of elements
-	  w.r.t. their size to allow better packing of the structure in memory.
-
-2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
-
-	- Add check at mount time to verify that the number of inodes on the
-	  volume does not exceed 2^32 - 1, which is the maximum allowed for
-	  NTFS according to Microsoft.
-	- Change mft_no member of ntfs_inode structure to be unsigned long.
-	  Update all users. This makes ntfs_inode->mft_no just a copy of struct
-	  inode->i_ino. But we can't just always use struct inode->i_ino and
-	  remove mft_no because extent inodes do not have an attached struct
-	  inode.
-
-2.0.9 - Decompression engine now uses a single buffer and other cleanups.
-
-	- Change decompression engine to use a single buffer protected by a
-	  spin lock instead of per-CPU buffers. (Rusty Russell)
-	- Do not update cb_pos when handling a partial final page during
-	  decompression of a sparse compression block, as the value is later
-	  reset without being read/used. (Rusty Russell)
-	- Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
-	  Morton)
-	- Change buffer size in ntfs_readdir()/ntfs_filldir() to use
-	  NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
-	  it also makes everything safer so it is a good thing.
-	- Miscellaneous minor cleanups to comments.
-
-2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
-
-	Big thanks go to Al Viro and other inhabitants of #kernel for investing
-	their time to discuss the case sensitivity and dcache aliasing issues.
-
-	- Remove unused source file fs/ntfs/attraops.c.
-	- Remove show_inodes mount option(s), thus dropping support for
-	  displaying of short file names.
-	- Remove deprecated mount option posix.
-	- Restore show_sys_files mount option.
-	- Add new mount option case_sensitive, to determine if the driver
-	  treats file names as case sensitive or not. If case sensitive, create
-	  file names in the POSIX namespace. Otherwise create file names in the
-	  LONG/WIN32 namespace. Note, files remain accessible via their short
-	  file name, if it exists.
-	- Remove really dumb logic bug in boot sector recovery code.
-	- Fix dcache aliasing issues wrt short/long file names via changes
-	  to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
-	  fs/ntfs/namei.c::ntfs_lookup():
-	  - Add additional argument to ntfs_lookup_inode_by_name() in which we
-	    return information about the matching file name if the case is not
-	    matching or the match is a short file name. See comments above the
-	    function definition for details.
-	  - Change ntfs_lookup() to only create dcache entries for the correctly
-	    cased file name and only for the WIN32 namespace counterpart of DOS
-	    namespace file names. This ensures we have only one dentry per
-	    directory and also removes all dcache aliasing issues between short
-	    and long file names once we add write support. See comments above
-	    function for details.
-	- Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
-
-2.0.7 - Minor cleanups and updates for changes in core kernel code.
-
-	- Remove much of the NULL struct element initializers.
-	- Various updates to make compatible with recent kernels.
-	- Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
-	  in fs/ntfs/ntfs.h instead.
-	- Remove no longer needed KERNEL_VERSION checks. We are now in the
-	  kernel proper so they are no longer needed.
-
-2.0.6 - Major bugfix to make compatible with other kernel changes.
-
-	- Initialize the mftbmp address space properly now that there are more
-	  fields in the struct address_space. This was leading to hangs and
-	  oopses on umount since 2.5.12 because of changes to other parts of
-	  the kernel. We probably want a kernel generic init_address_space()
-	  function...
-	- Drop BKL from ntfs_readdir() after consultation with Al Viro. The
-	  only caller of ->readdir() is vfs_readdir() which holds i_sem during
-	  the call, and i_sem is sufficient protection against changes in the
-	  directory inode (including ->i_size).
-	- Use generic_file_llseek() for directories (as opposed to
-	  default_llseek()) as this downs i_sem instead of the BKL which is
-	  what we now need for exclusion against ->f_pos changes considering we
-	  no longer take the BKL in ntfs_readdir().
-
-2.0.5 - Major bugfix. Buffer overflow in extent inode handling.
-
-	- No need to set old blocksize in super.c::ntfs_fill_super() as the
-	  VFS does so via invocation of deactivate_super() calling
-	  fs->fill_super() calling block_kill_super() which does it.
-	- BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
-	  -> Do we really need it? I don't think so as we have exclusion on
-	  the directory ntfs_inode rw_semaphore mrec_lock. We mmight have to
-	  move the ->f_pos accesses under the mrec_lock though. Check this...
-	- Fix really, really, really stupid buffer overflow in extent inode
-	  handling in mft.c::map_extent_mft_record().
-
-2.0.4 - Cleanups and updates for kernel 2.5.11.
-
-	- Add documentation on how to use the MD driver to be able to use NTFS
-	  stripe and volume sets in Linux and generally cleanup documentation
-	  a bit.
-	Remove all uses of kdev_t in favour of struct block_device *:
-	- Change compress.c::ntfs_file_read_compressed_block() to use
-	  sb_getblk() instead of getblk().
-	- Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
-	  of get_hardsect_size().
-	- No need to get old blocksize in super.c::ntfs_fill_super() as
-	  fs/super.c::get_sb_bdev() already does this.
-	- Set bh->b_bdev instead of bh->b_dev throughout aops.c.
-
-2.0.3 - Small bug fixes, cleanups, and performance improvements.
-
-	- Remove some dead code from mft.c.
-	- Optimize readpage and read_block functions throughout aops.c so that
-	  only initialized blocks are read. Non-initialized ones have their
-	  buffer head mapped, zeroed, and set up to date, without scheduling
-	  any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
-	Thanks go to Andrew Morton for spotting the below:
-	- Fix buglet in allocate_compression_buffers() error code path.
-	- Call flush_dcache_page() after modifying page cache page contents in
-	  ntfs_file_readpage().
-	- Check for existence of page buffers throughout aops.c before calling
-	  create_empty_buffers(). This happens when an I/O error occurs and the
-	  read is retried. (It also happens once writing is implemented so that
-	  needed doing anyway but I had left it for later...)
-	- Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
-	  readpage and read_block functions. Reasoning same as above (i.e. I/O
-	  error retries and future write code paths.)
-
-2.0.2 - Minor updates and cleanups.
-
-	- Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
-	  and cleanup the code a bit, removing the unused size parameter.
-	- Change default fmask to 0177 and update documentation.
-	- Change attrib.c::get_attr_search_ctx() to return the search context
-	  directly instead of taking the address of a pointer. A return value
-	  of NULL means the allocation failed. Updated all callers
-	  appropriately.
-	- Update to 2.5.9 kernel (preserving backwards compatibility) by
-	  replacing all occurences of page->buffers with page_buffers(page).
-	- Fix minor bugs in runlist merging, also minor cleanup.
-	- Updates to bootsector layout and mft mirror contents descriptions.
-	- Small bug fix in error detection in unistr.c and some cleanups.
-	- Grow name buffer allocations in unistr.c in aligned mutlipled of 64
-	  bytes.
-
-2.0.1 - Minor updates.
-
-	- Make default umask correspond to documentation.
-	- Improve documentation.
-	- Set default mode to include execute bit. The {u,f,d}mask can be used
-	  to take it away if desired. This allows binaries to be executed from
-	  a mounted ntfs partition.
-
-2.0.0 - New version number. Remove TNG from the name. Now in the kernel.
-
-	- Add kill_super, just keeping up with the vfs changes in the kernel.
-	- Repeat some changes from tng-0.0.8 that somehow got lost on the way
-	  from the CVS import into BitKeeper.
-	- Begin to implement proper handling of allocated_size vs
-	  initialized_size vs data_size (i.e. i_size). Done are
-	  mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
-	  and attrib.c::load_attribute_list().
-	- Lock the runlist in attrib.c::load_attribute_list() while using it.
-	- Fix memory leak in ntfs_file_read_compressed_block() and generally
-	  clean up compress.c a little, removing some uncommented/unused debug
-	  code.
-	- Tidy up dir.c a little bit.
-	- Don't bother getting the runlist in inode.c::ntfs_read_inode().
-	- Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
-	  creating aops.c::ntfs_mst_readpage(), improving the handling of
-	  holes and overflow in the process and implementing the correct
-	  equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
-	  I am aiming for correctness at the moment. Modularisation can come
-	  later.
-	- Rename aops.c::end_buffer_read_index_async() to
-	  end_buffer_read_mst_async() and optimize the overflow checking and
-	  handling.
-	- Use the host of the mftbmp address space mapping to hold the ntfs
-	  volume. This is needed so the async i/o completion handler can
-	  retrieve a pointer to the volume. Hopefully this will not cause
-	  problems elsewhere in the kernel... Otherwise will need to use a
-	  fake inode.
-	- Complete implementation of proper handling of allocated_size vs
-	  initialized_size vs data_size (i.e. i_size) in whole driver.
-	  Basically aops.c is now completely rewritten.
-	- Change NTFS driver name to just NTFS and set version number to 2.0.0
-	  to make a clear distinction from the old driver which is still on
-	  version 1.1.22.
-
-tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
-
-	- Replace bdevname(sb->s_dev) with sb->s_id.
-	- Remove now superfluous new-line characters in all callers of
-	  ntfs_debug().
-	- Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
-	  directories. Without this the "find" utility gets very upset which is
-	  fair enough as Linux/Unix do not support directory hard links.
-	- Further runlist merging work. (Richard Russon)
-	- Backwards compatibility for gcc-2.95. (Richard Russon)
-	- Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
-	- Convert to new file system declaration using ->ntfs_get_sb() and
-	  replacing ntfs_read_super() with ntfs_fill_super().
-	- Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
-	  overflow on 32-bit architectures.
-	- Cleanup upcase loading code to use ntfs_(un)map_page().
-	- Disable/reenable preemtion in critical sections of compession engine.
-	- Replace device size determination in ntfs_fill_super() with
-	  sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
-	  function super.c::get_nr_blocks().
-	- Implement a mount time option (show_inodes) allowing choice of which
-	  types of inode names readdir() returns and modify ntfs_filldir()
-	  accordingly. There are several parameters to show_inodes:
-		system:	system files
-		win32:	long file names (including POSIX file names) [DEFAULT]
-		long:	same as win32
-		dos:	short file names only (excluding POSIX file names)
-		short:	same as dos
-		posix:	same as both win32 and dos
-		all:	all file names
-	  Note that the options are additive, i.e. specifying:
-		-o show_inodes=system,show_inodes=win32,show_inodes=dos
-	  is the same as specifying:
-		-o show_inodes=all
-	  Note that the "posix" and "all" options will show all directory
-	  names, BUT the link count on each directory inode entry is set to 1,
-	  due to Linux not supporting directory hard links. This may well
-	  confuse some userspace applications, since the directory names will
-	  have the same inode numbers. Thus it is NOT advisable to use the
-	  "posix" or "all" options. We provide them only for completeness sake.
-	- Add copies of allocated_size, initialized_size, and compressed_size to
-	  the ntfs inode structure and set them up in
-	  inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
-	  for files and the index allocation attribute for directories.
-	- Add copies of allocated_size and initialized_size to ntfs inode for
-	  $BITMAP attribute of large directories and set them up in
-	  inode.c::ntfs_read_inode().
-	- Add copies of allocated_size and initialized_size to ntfs volume for
-	  $BITMAP attribute of $MFT and set them up in
-	  super.c::load_system_files().
-	- Parse deprecated ntfs driver options (iocharset, show_sys_files,
-	  posix, and utf8) and tell user what the new options to use are. Note
-	  we still do support them but they will be removed with kernel 2.7.x.
-	- Change all occurences of integer long long printf formatting to hex
-	  as printk() will not support long long integer format if/when the
-	  div64 patch goes into the kernel.
-	- Make slab caches have stable names and change the names to what they
-	  were intended to be. These changes are required/made possible by the
-	  new slab cache name handling which removes the length limitation by
-	  requiring the caller of kmem_cache_create() to supply a stable name
-	  which is then referenced but not copied.
-	- Rename run_list structure to run_list_element and create a new
-	  run_list structure containing a pointer to a run_list_element
-	  structure and a read/write semaphore. Adapt all users of runlists
-	  to new scheme and take and release the lock as needed. This fixes a
-	  nasty race as the run_list changes even when inodes are locked for
-	  reading and even when the inode isn't locked at all, so we really
-	  needed the serialization. We use a semaphore rather than a spinlock
-	  as memory allocations can sleep and doing everything GFP_ATOMIC
-	  would be silly.
-	- Cleanup read_inode() removing all code checking for lowest_vcn != 0.
-	  This can never happen due to the nature of lookup_attr() and how we
-	  support attribute lists. If it did happen it would imply the inode
-	  being corrupt.
-	- Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
-	  bad if found.
-	- Update to 2.5.6-pre2 changes in struct address_space.
-	- Use parent_ino() when accessing d_parent inode number in dir.c.
-	- Import Sourceforge CVS repository into BitKeeper repository:
-		http://linux-ntfs.bkbits.net/ntfs-tng-2.5
-	- Update fs/Makefile, fs/Config.help, fs/Config.in, and
-	  Documentation/filesystems/ntfs.txt for NTFS TNG.
-	- Create kernel configuration option controlling whether debugging
-	  is enabled or not.
-	- Add the required export of end_buffer_io_sync() from the patches
-	  directory to the kernel code.
-	- Update inode.c::ntfs_show_options() with show_inodes mount option.
-	- Update errors mount option.
-
-tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
-
-	- Cleanup mft.c and it's debug/error output in particular. Fix a minor
-	  bug in mapping of extent inodes. Update all the comments to fit all
-	  the recent code changes.
-	- Modify vcn_to_lcn() to cope with entirely unmapped runlists.
-	- Cleanups in compress.c, mostly comments and folding help.
-	- Implement attrib.c::map_run_list() as a generic helper.
-	- Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
-	  thus making code shorter and enabling attribute list support.
-	- Cleanup incorrect use of [su]64 with %L printf format specifier in
-	  all source files. Type casts to [unsigned] long long added to correct
-	  the mismatches (important for architectures which have long long not
-	  being 64 bits).
-	- Merge async io completion handlers for directory indexes and $MFT
-	  data into one by setting the index_block_size{_bits} of the ntfs
-	  inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
-	- Cleanup aops.c, update comments.
-	- Make ntfs_file_get_block() use map_run_list() so all files now
-	  support attribute lists.
-	- Make ntfs_dir_readpage() almost verbatim copy of
-	  block_read_full_page() by using ntfs_file_get_block() with only real
-	  difference being the use of our own async io completion handler
-	  rather than the default one, thus reducing the amount of code and
-	  automatically enabling attribute list support for directory indices.
-	- Fix bug in load_attribute_list() - forgot to call brelse in error
-	  code path.
-	- Change parameters to find_attr() and lookup_attr(). We no longer
-	  pass in the upcase table and its length. These can be gotten from
-	  ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
-	- Cleanups in attrib.c.
-	- Implement merging of runlists, attrib.c::merge_run_lists() and its
-	  helpers. (Richard Russon)
-	- Attribute lists part 2, attribute extents and multi part runlists:
-	  enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
-	  further runlist parts via attrib.c::map_run_list().
-	- Tiny endianness bug fix in decompress_mapping_pairs().
-
-tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
-
-	- Enable encrypted directories. (Their index root is marked encrypted
-	  to indicate that new files in that directory should be created
-	  encrypted.)
-	- Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
-	- Enable $Extend system directory. Most (if not all) extended system
-	  files do not have unnamed data attributes so ntfs_read_inode() had to
-	  special case them but that is ok, as the special casing recovery
-	  happens inside an error code path so there is zero slow down in the
-	  normal fast path. The special casing is done by introducing a new
-	  function inode.c::ntfs_is_extended_system_file() which checks if any
-	  of the hard links in the inode point to $Extend as being their parent
-	  directory and if they do we assume this is an extended system file.
-	- Create a sysctl/proc interface to allow {dis,en}abling of debug output
-	  when compiled with -DDEBUG. Default is debug messages to be disabled.
-	  To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
-	  (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
-	  interface is enabled). Inspired by old ntfs driver.
-	- Add debug_msgs insmod/kernel boot parameter to set whether debug
-	  messages are {dis,en}abled. This is useful to enable debug messages
-	  during ntfs initialization and is the only way to activate debugging
-	  when the sysctl interface is not enabled.
-	- Cleanup debug output in various places.
-	- Remove all dollar signs ($) from the source (except comments) to
-	  enable compilation on architectures whose gcc compiler does not
-	  support dollar signs in the names of variables/constants. Attribute
-	  types now start with AT_ instead of $ and $I30 is now just I30.
-	- Cleanup ntfs_lookup() and add consistency check of sequence numbers.
-	- Load complete runlist for $MFT/$BITMAP during mount and cleanup
-	  access functions. This means we now cope with $MFT/$BITMAP being
-	  spread accross several mft records.
-	- Disable modification of mft_zone_multiplier on remount. We can always
-	  reenable this later on if we really want to, but we will need to make
-	  sure we readjust the mft_zone size / layout accordingly.
-
-tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
-
-	- Use sb_set_blocksize() instead of set_blocksize() and verify the
-	  return value.
-	- Use sb_bread() instead of bread() throughout.
-	- Add index_vcn_size{_bits} to ntfs_inode structure to store the size
-	  of a directory index block vcn. Apply resulting simplifications in
-	  dir.c everywhere.
-	- Fix a small bug somewhere (but forgot what it was).
-	- Change ntfs_{debug,error,warning} to enable gcc to do type checking
-	  on the printf-format parameter list and fix bugs reported by gcc
-	  as a result. (Richard Russon)
-	- Move inode allocation strategy to Al's new stuff but maintain the
-	  divorce of ntfs_inode from struct inode. To achieve this we have two
-	  separate slab caches, one for big ntfs inodes containing a struct
-	  inode and pure ntfs inodes and at the same time fix some faulty
-	  error code paths in ntfs_read_inode().
-	- Show mount options in proc (inode.c::ntfs_show_options()).
-
-tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
-
-	- Modified (un)map_mft_record functions to be common for read and write
-	  case. To specify which is which, added extra parameter at front of
-	  parameter list. Pass either READ or WRITE to this, each has the
-	  obvious meaning.
-	- General cleanups to allow for easier folding in vi.
-	- attrib.c::decompress_mapping_pairs() now accepts the old runlist
-	  argument, and invokes attrib.c::merge_run_lists() to merge the old
-	  and the new runlists.
-	- Removed attrib.c::find_first_attr().
-	- Implemented loading of attribute list and complete runlist for $MFT.
-	  This means we now cope with $MFT being spread across several mft
-	  records.
-	- Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
-	- Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
-	- Make ntfs_volume be allocated via kmalloc() instead of using a slab
-	  cache. There are too little ntfs_volume structures at any one time
-	  to justify a private slab cache.
-	- Fix bogus kmap() use in async io completion. Now use kmap_atomic().
-	  Use KM_BIO_IRQ on advice from IRC/kernel...
-	- Use ntfs_map_page() in map_mft_record() and create ->readpage method
-	  for reading $MFT (ntfs_mft_readpage). In the process create dedicated
-	  address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
-	  removed the now superfluous exports from the kernel core patch.
-	- Fix a bug where kfree() was used insted of ntfs_free().
-	- Change map_mft_record() to take ntfs_inode as argument instead of
-	  vfs inode. Dito for unmap_mft_record(). Adapt all callers.
-	- Add pointer to ntfs_volume to ntfs_inode.
-	- Add mft record number and sequence number to ntfs_inode. Stop using
-	  i_ino and i_generation for in-driver purposes.
-	- Implement attrib.c::merge_run_lists(). (Richard Russon)
-	- Remove use of proper inodes by extent inodes. Move i_ino and
-	  i_generation to ntfs_inode to do this. Apply simplifications that
-	  result and remove iget_no_wait(), etc.
-	- Pass ntfs_inode everywhere in the driver (used to be struct inode).
-	- Add reference counting in ntfs_inode for the ntfs inode itself and
-	  for the mapped mft record.
-	- Extend mft record mapping so we can (un)map extent mft records (new
-	  functions (un)map_extent_mft_record), and so mappings are reference
-	  counted and don't have to happen twice if already mapped - just ref
-	  count increases.
-	- Add -o iocharset as alias to -o nls for backwards compatibility.
-	- The latest core patch is now tiny. In fact just a single additional
-	  export is necessary over the base kernel.
-
-tng-0.0.3 - Cleanups, enhancements, bug fixes.
-
-	- Work on attrib.c::decompress_mapping_pairs() to detect base extents
-	  and setup the runlist appropriately using knowledge provided by the
-	  sizes in the base attribute record.
-	- Balance the get_/put_attr_search_ctx() calls so we don't leak memory
-	  any more.
-	- Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
-	  page or use vmalloc depending on the amount of memory requested.
-	- Cleanup error output. The __FUNCTION__ "(): " is now added
-	  automatically. Introduced a new header file debug.h to support this
-	  and also moved ntfs_debug() function into it.
-	- Make reading of compressed files more intelligent and especially get
-	  rid of the vmalloc_nofs() from readpage(). This now uses per CPU
-	  buffers (allocated at first mount with cluster size <= 4kiB and
-	  deallocated on last umount with cluster size <= 4kiB), and
-	  asynchronous io for the compressed data using a list of buffer heads.
-	  Er, we use synchronous io as async io only works on whole pages
-	  covered by buffers and not on individual buffer heads...
-	- Bug fix for reading compressed files with sparse compression blocks.
-
-tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
-
-	- Fixed handling of directories when cluster size exceeds index block
-	  size.
-	- Hide DOS only name space directory entries from readdir() but allow
-	  them in lookup(). This should fix the problem that Linux doesn't
-	  support directory hard links, while still allowing access to entries
-	  via their short file name. This also has the benefit of mimicking
-	  what Windows users are used to, so it is the ideal solution.
-	- Implemented sync_page everywhere so no more hangs in D state when
-	  waiting for a page.
-	- Stop using bforget() in favour of brelse().
-	- Stop locking buffers unnecessarily.
-	- Implemented compressed files (inode->mapping contains uncompressed
-	  data, raw compressed data is currently bread() into a vmalloc()ed
-	  memory buffer).
-	- Enable compressed directories. (Their index root is marked compressed
-	  to indicate that new files in that directory should be created
-	  compressed.)
-	- Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
-	  functions. (Thanks to Will Dyson for pointing this out.)
-	- Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
-	  ntfs_sb_info) out of the common inode and super_block structures and
-	  started using the generic_ip and generic_sbp pointers instead. This
-	  makes ntfs entirely private with respect to the kernel tree.
-	- Detect compiler version and abort with error message if gcc less than
-	  2.96 is used.
-	- Fix bug in name comparison function in unistr.c.
-	- Implement attribute lists part 1, the infrastructure: search contexts
-	  and operations, find_external_attr(), lookup_attr()) and make the
-	  code use the infrastructure.
-	- Fix stupid buffer overflow bug that became apparent on larger run
-	  list containing attributes.
-	- Fix bugs in readdir() that became apparent on larger directories.
-
-	The driver is now really useful and survives the test
-		find . -type f -exec md5sum "{}" \;
-	without any error messages on a over 1GiB sized partition with >16k
-	files on it, including compressed files and directories and many files
-	and directories with attribute lists.
-
-tng-0.0.1 - The first useful version.
-
-	- Added ntfs_lookup().
-	- Added default upcase generation and handling.
-	- Added compile options to be shown on module init.
-	- Many bug fixes that were "hidden" before.
-	- Update to latest kernel.
-	- Added ntfs_readdir().
-	- Added file operations for mmap(), read(), open() and llseek(). We just
-	  use the generic ones. The whole point of going through implementing
-	  readpage() methods and where possible get_block() call backs is that
-	  this allows us to make use of the generic high level methods provided
-	  by the kernel.
-
-	The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
-	though and it doesn't implement accesssing compressed files yet. Also,
-	accessing files with attribute list attributes is not implemented yet
-	either. But for small or simple file systems it should work and allow
-	you to list directories, use stat on directory entries and the file
-	system, open, read, mmap and llseek around in files. A big mile stone
-	has been reached!
-
-tng-0.0.0 - Initial version tag.
-
-	Initial driver implementation. The driver can mount and umount simple
-	NTFS file systems (i.e. ones without attribute lists in the system
-	files). If the mount fails there might be problems in the error handling
-	code paths, so be warned. Otherwise it seems to be loading the system
-	files nicely and the mft record read mapping/unmapping seems to be
-	working nicely, too. Proof of inode metadata in the page cache and non-
-	resident file unnamed stream data in the page cache concepts is thus
-	complete.
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 00000000000..f5a868cc915
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
+config NTFS_FS
+	tristate "NTFS file system support"
+	select NLS
+	help
+	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+
+	  Saying Y or M here enables read support.  There is partial, but
+	  safe, write support available.  For write support you must also
+	  say Y to "NTFS write support" below.
+
+	  There are also a number of user-space tools available, called
+	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
+	  without NTFS support enabled in the kernel.
+
+	  This is a rewrite from scratch of Linux NTFS support and replaced
+	  the old NTFS code starting with Linux 2.5.11.  A backport to
+	  the Linux 2.4 kernel series is separately available as a patch
+	  from the project web site.
+
+	  For more information see <file:Documentation/filesystems/ntfs.txt>
+	  and <http://www.linux-ntfs.org/>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ntfs.
+
+	  If you are not using Windows NT, 2000, XP or 2003 in addition to
+	  Linux on your computer it is safe to say N.
+
+config NTFS_DEBUG
+	bool "NTFS debugging support"
+	depends on NTFS_FS
+	help
+	  If you are experiencing any problems with the NTFS file system, say
+	  Y here.  This will result in additional consistency checks to be
+	  performed by the driver as well as additional debugging messages to
+	  be written to the system log.  Note that debugging messages are
+	  disabled by default.  To enable them, supply the option debug_msgs=1
+	  at the kernel command line when booting the kernel or as an option
+	  to insmod when loading the ntfs module.  Once the driver is active,
+	  you can enable debugging messages by doing (as root):
+	  echo 1 > /proc/sys/fs/ntfs-debug
+	  Replacing the "1" with "0" would disable debug messages.
+
+	  If you leave debugging messages disabled, this results in little
+	  overhead, but enabling debug messages results in very significant
+	  slowdown of the system.
+
+	  When reporting bugs, please try to have available a full dump of
+	  debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+	bool "NTFS write support"
+	depends on NTFS_FS
+	help
+	  This enables the partial, but safe, write support in the NTFS driver.
+
+	  The only supported operation is overwriting existing files, without
+	  changing the file length.  No file or directory creation, deletion or
+	  renaming is possible.  Note only non-resident files can be written to
+	  so you may find that some very small files (<500 bytes or so) cannot
+	  be written to.
+
+	  While we cannot guarantee that it will not damage any data, we have
+	  so far not received a single report where the driver would have
+	  damaged someones data so we assume it is perfectly safe to use.
+
+	  Note:  While write support is safe in this version (a rewrite from
+	  scratch of the NTFS support), it should be noted that the old NTFS
+	  write support, included in Linux 2.5.10 and before (since 1997),
+	  is not safe.
+
+	  This is currently useful with TopologiLinux.  TopologiLinux is run
+	  on top of any DOS/Microsoft Windows system without partitioning your
+	  hard disk.  Unlike other Linux distributions TopologiLinux does not
+	  need its own partition.  For more information see
+	  <http://topologi-linux.sourceforge.net/>
+
+	  It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 7b66381a0b0..30206b23843 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
 
 obj-$(CONFIG_NTFS_FS) += ntfs.o
 
-ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
-	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
-	     unistr.o upcase.o
+ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
+	  index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+	  unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.22\"
+ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
 
-ifeq ($(CONFIG_NTFS_DEBUG),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-y := -DNTFS_VERSION=\"2.1.30\"
+ccflags-$(CONFIG_NTFS_DEBUG)	+= -DDEBUG
+ccflags-$(CONFIG_NTFS_RW)	+= -DNTFS_RW
 
-ifeq ($(CONFIG_NTFS_RW),y)
-EXTRA_CFLAGS += -DNTFS_RW
-
-ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o
-endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 45d56e41ed9..d267ea6aa1a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
  * aops.c - NTFS kernel address space operations and page cache handling.
  *	    Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -22,11 +22,14 @@
  */
 
 #include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
 
 #include "aops.h"
 #include "attrib.h"
@@ -55,42 +58,58 @@
  */
 static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
-	static DEFINE_SPINLOCK(page_uptodate_lock);
 	unsigned long flags;
-	struct buffer_head *tmp;
+	struct buffer_head *first, *tmp;
 	struct page *page;
+	struct inode *vi;
 	ntfs_inode *ni;
 	int page_uptodate = 1;
 
 	page = bh->b_page;
-	ni = NTFS_I(page->mapping->host);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
 
 	if (likely(uptodate)) {
-		s64 file_ofs;
+		loff_t i_size;
+		s64 file_ofs, init_size;
 
 		set_buffer_uptodate(bh);
 
 		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
 				bh_offset(bh);
+		read_lock_irqsave(&ni->size_lock, flags);
+		init_size = ni->initialized_size;
+		i_size = i_size_read(vi);
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (unlikely(init_size > i_size)) {
+			/* Race with shrinking truncate. */
+			init_size = i_size;
+		}
 		/* Check for the current buffer head overflowing. */
-		if (file_ofs + bh->b_size > ni->initialized_size) {
-			char *addr;
-			int ofs = 0;
-
-			if (file_ofs < ni->initialized_size)
-				ofs = ni->initialized_size - file_ofs;
-			addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
-			memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
+		if (unlikely(file_ofs + bh->b_size > init_size)) {
+			int ofs;
+			void *kaddr;
+
+			ofs = 0;
+			if (file_ofs < init_size)
+				ofs = init_size - file_ofs;
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page);
+			memset(kaddr + bh_offset(bh) + ofs, 0,
+					bh->b_size - ofs);
 			flush_dcache_page(page);
-			kunmap_atomic(addr, KM_BIO_SRC_IRQ);
+			kunmap_atomic(kaddr);
+			local_irq_restore(flags);
 		}
 	} else {
 		clear_buffer_uptodate(bh);
-		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
-				(unsigned long long)bh->b_blocknr);
 		SetPageError(page);
+		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
+				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
-	spin_lock_irqsave(&page_uptodate_lock, flags);
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -105,7 +124,8 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -118,7 +138,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	} else {
-		char *addr;
+		u8 *kaddr;
 		unsigned int i, recs;
 		u32 rec_size;
 
@@ -126,19 +146,22 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		recs = PAGE_CACHE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
-		addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+		local_irq_save(flags);
+		kaddr = kmap_atomic(page);
 		for (i = 0; i < recs; i++)
-			post_read_mst_fixup((NTFS_RECORD*)(addr +
+			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
 					i * rec_size), rec_size);
+		kunmap_atomic(kaddr);
+		local_irq_restore(flags);
 		flush_dcache_page(page);
-		kunmap_atomic(addr, KM_BIO_SRC_IRQ);
-		if (likely(!PageError(page) && page_uptodate))
+		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	}
 	unlock_page(page);
 	return;
 still_busy:
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	return;
 }
 
@@ -161,43 +184,69 @@ still_busy:
  */
 static int ntfs_read_block(struct page *page)
 {
+	loff_t i_size;
 	VCN vcn;
 	LCN lcn;
+	s64 init_size;
+	struct inode *vi;
 	ntfs_inode *ni;
 	ntfs_volume *vol;
 	runlist_element *rl;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 	sector_t iblock, lblock, zblock;
+	unsigned long flags;
 	unsigned int blocksize, vcn_ofs;
 	int i, nr;
 	unsigned char blocksize_bits;
 
-	ni = NTFS_I(page->mapping->host);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
 	vol = ni->vol;
 
 	/* $MFT/$DATA must have its complete runlist in memory at all times. */
 	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
 
-	blocksize_bits = VFS_I(ni)->i_blkbits;
-	blocksize = 1 << blocksize_bits;
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
 
-	if (!page_has_buffers(page))
+	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, blocksize, 0);
-	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		unlock_page(page);
-		return -ENOMEM;
+		if (unlikely(!page_has_buffers(page))) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
 	}
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 
+	/*
+	 * We may be racing with truncate.  To avoid some of the problems we
+	 * now take a snapshot of the various sizes and use those for the whole
+	 * of the function.  In case of an extending truncate it just means we
+	 * may leave some buffers unmapped which are now allocated.  This is
+	 * not a problem since these buffers will just get mapped when a write
+	 * occurs.  In case of a shrinking truncate, we will detect this later
+	 * on due to the runlist being incomplete and if the page is being
+	 * fully truncated, truncate will throw it away as soon as we unlock
+	 * it so no need to worry what we do with it.
+	 */
 	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	read_lock_irqsave(&ni->size_lock, flags);
 	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
-	zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
+	init_size = ni->initialized_size;
+	i_size = i_size_read(vi);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(init_size > i_size)) {
+		/* Race with shrinking truncate. */
+		init_size = i_size;
+	}
+	zblock = (init_size + blocksize - 1) >> blocksize_bits;
 
 	/* Loop through all the buffers in the page. */
 	rl = NULL;
 	nr = i = 0;
 	do {
-		u8 *kaddr;
+		int err = 0;
 
 		if (unlikely(buffer_uptodate(bh)))
 			continue;
@@ -208,7 +257,7 @@ static int ntfs_read_block(struct page *page)
 		bh->b_bdev = vol->sb->s_bdev;
 		/* Is the block within the allowed limits? */
 		if (iblock < lblock) {
-			BOOL is_retry = FALSE;
+			bool is_retry = false;
 
 			/* Convert iblock into corresponding vcn and offset. */
 			vcn = (VCN)iblock << blocksize_bits >>
@@ -246,8 +295,7 @@ lock_retry_remap:
 				goto handle_hole;
 			/* If first try and runlist unmapped, map and retry. */
 			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				int err;
-				is_retry = TRUE;
+				is_retry = true;
 				/*
 				 * Attempt to map runlist, dropping lock for
 				 * the duration.
@@ -257,19 +305,30 @@ lock_retry_remap:
 				if (likely(!err))
 					goto lock_retry_remap;
 				rl = NULL;
-				lcn = err;
+			} else if (!rl)
+				up_read(&ni->runlist.lock);
+			/*
+			 * If buffer is outside the runlist, treat it as a
+			 * hole.  This can happen due to concurrent truncate
+			 * for example.
+			 */
+			if (err == -ENOENT || lcn == LCN_ENOENT) {
+				err = 0;
+				goto handle_hole;
 			}
 			/* Hard error, zero out region. */
+			if (!err)
+				err = -EIO;
 			bh->b_blocknr = -1;
 			SetPageError(page);
 			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
 					"attribute type 0x%x, vcn 0x%llx, "
 					"offset 0x%x because its location on "
 					"disk could not be determined%s "
-					"(error code %lli).", ni->mft_no,
+					"(error code %i).", ni->mft_no,
 					ni->type, (unsigned long long)vcn,
 					vcn_ofs, is_retry ? " even after "
-					"retrying" : "", (long long)lcn);
+					"retrying" : "", err);
 		}
 		/*
 		 * Either iblock was outside lblock limits or
@@ -280,11 +339,9 @@ handle_hole:
 		bh->b_blocknr = -1UL;
 		clear_buffer_mapped(bh);
 handle_zblock:
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + i * blocksize, 0, blocksize);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
-		set_buffer_uptodate(bh);
+		zero_user(page, i * blocksize, blocksize);
+		if (likely(!err))
+			set_buffer_uptodate(bh);
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
 
 	/* Release the lock if we took it. */
@@ -342,14 +399,26 @@ handle_zblock:
 static int ntfs_readpage(struct file *file, struct page *page)
 {
 	loff_t i_size;
+	struct inode *vi;
 	ntfs_inode *ni, *base_ni;
-	u8 *kaddr;
+	u8 *addr;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *mrec;
+	unsigned long flags;
 	u32 attr_len;
 	int err = 0;
 
+retry_readpage:
 	BUG_ON(!PageLocked(page));
+	vi = page->mapping->host;
+	i_size = i_size_read(vi);
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		ntfs_debug("Read outside i_size - truncated?");
+		goto done;
+	}
 	/*
 	 * This can potentially happen because we clear PageUptodate() during
 	 * ntfs_writepage() of MstProtected() attributes.
@@ -358,38 +427,44 @@ static int ntfs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 		return 0;
 	}
-	ni = NTFS_I(page->mapping->host);
-
+	ni = NTFS_I(vi);
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If attribute is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			err = -EACCES;
+			goto err_out;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			return ntfs_read_compressed_block(page);
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed or
-		 * encrypted.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				err = -EACCES;
-				goto err_out;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni))
-				return ntfs_read_compressed_block(page);
-		}
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_read_block(page);
 	}
 	/*
 	 * Attribute is resident, implying it is not compressed or encrypted.
 	 * This also means the attribute is smaller than an mft record and
 	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.  We can also do this if the file size is 0.
+	 * index above 0.  Note the attribute can actually be marked compressed
+	 * but if it is resident the actual data is not compressed so we are
+	 * ok to ignore the compressed flag here.
 	 */
-	if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr, 0, PAGE_CACHE_SIZE);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+	if (unlikely(page->index > 0)) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
 		goto done;
 	}
 	if (!NInoAttr(ni))
@@ -402,6 +477,14 @@ static int ntfs_readpage(struct file *file, struct page *page)
 		err = PTR_ERR(mrec);
 		goto err_out;
 	}
+	/*
+	 * If a parallel write made the attribute non-resident, drop the mft
+	 * record and retry the readpage.
+	 */
+	if (unlikely(NInoNonResident(ni))) {
+		unmap_mft_record(base_ni);
+		goto retry_readpage;
+	}
 	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
 	if (unlikely(!ctx)) {
 		err = -ENOMEM;
@@ -412,18 +495,24 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	if (unlikely(err))
 		goto put_unm_err_out;
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(VFS_I(ni));
-	if (unlikely(attr_len > i_size))
+	read_lock_irqsave(&ni->size_lock, flags);
+	if (unlikely(attr_len > ni->initialized_size))
+		attr_len = ni->initialized_size;
+	i_size = i_size_read(vi);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate. */
 		attr_len = i_size;
-	kaddr = kmap_atomic(page, KM_USER0);
+	}
+	addr = kmap_atomic(page);
 	/* Copy the data to the page. */
-	memcpy(kaddr, (u8*)ctx->attr +
+	memcpy(addr, (u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
 			attr_len);
 	/* Zero the remainder of the page. */
-	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
 	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(addr);
 put_unm_err_out:
 	ntfs_attr_put_search_ctx(ctx);
 unm_err_out:
@@ -463,15 +552,18 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 {
 	VCN vcn;
 	LCN lcn;
+	s64 initialized_size;
+	loff_t i_size;
 	sector_t block, dblock, iblock;
 	struct inode *vi;
 	ntfs_inode *ni;
 	ntfs_volume *vol;
 	runlist_element *rl;
 	struct buffer_head *bh, *head;
+	unsigned long flags;
 	unsigned int blocksize, vcn_ofs;
 	int err;
-	BOOL need_end_writeback;
+	bool need_end_writeback;
 	unsigned char blocksize_bits;
 
 	vi = page->mapping->host;
@@ -483,38 +575,43 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(NInoMstProtected(ni));
-
-	blocksize_bits = vi->i_blkbits;
-	blocksize = 1 << blocksize_bits;
-
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
 	if (!page_has_buffers(page)) {
 		BUG_ON(!PageUptodate(page));
 		create_empty_buffers(page, blocksize,
 				(1 << BH_Uptodate) | (1 << BH_Dirty));
+		if (unlikely(!page_has_buffers(page))) {
+			ntfs_warning(vol->sb, "Error allocating page "
+					"buffers.  Redirtying page so we try "
+					"again later.");
+			/*
+			 * Put the page back on mapping->dirty_pages, but leave
+			 * its buffers' dirty state as-is.
+			 */
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
 	}
 	bh = head = page_buffers(page);
-	if (unlikely(!bh)) {
-		ntfs_warning(vol->sb, "Error allocating page buffers. "
-				"Redirtying page so we try again later.");
-		/*
-		 * Put the page back on mapping->dirty_pages, but leave its
-		 * buffer's dirty state as-is.
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
+	BUG_ON(!bh);
 
 	/* NOTE: Different naming scheme to ntfs_read_block()! */
 
 	/* The first block in the page. */
 	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 
+	read_lock_irqsave(&ni->size_lock, flags);
+	i_size = i_size_read(vi);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+
 	/* The first out of bounds block for the data size. */
-	dblock = (vi->i_size + blocksize - 1) >> blocksize_bits;
+	dblock = (i_size + blocksize - 1) >> blocksize_bits;
 
 	/* The last (fully or partially) initialized block. */
-	iblock = ni->initialized_size >> blocksize_bits;
+	iblock = initialized_size >> blocksize_bits;
 
 	/*
 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
@@ -534,7 +631,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 	rl = NULL;
 	err = 0;
 	do {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (unlikely(block >= dblock)) {
 			/*
@@ -559,7 +656,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 
 		/* Make sure we have enough initialized size. */
 		if (unlikely((block >= iblock) &&
-				(ni->initialized_size < vi->i_size))) {
+				(initialized_size < i_size))) {
 			/*
 			 * If this page is fully outside initialized size, zero
 			 * out all pages between the current initialized size
@@ -645,6 +742,27 @@ lock_retry_remap:
 		}
 		/* It is a hole, need to instantiate it. */
 		if (lcn == LCN_HOLE) {
+			u8 *kaddr;
+			unsigned long *bpos, *bend;
+
+			/* Check if the buffer is zero. */
+			kaddr = kmap_atomic(page);
+			bpos = (unsigned long *)(kaddr + bh_offset(bh));
+			bend = (unsigned long *)((u8*)bpos + blocksize);
+			do {
+				if (unlikely(*bpos))
+					break;
+			} while (likely(++bpos < bend));
+			kunmap_atomic(kaddr);
+			if (bpos == bend) {
+				/*
+				 * Buffer is zero and sparse, no need to write
+				 * it.
+				 */
+				bh->b_blocknr = -1;
+				clear_buffer_dirty(bh);
+				continue;
+			}
 			// TODO: Instantiate the hole.
 			// clear_buffer_new(bh);
 			// unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
@@ -655,7 +773,7 @@ lock_retry_remap:
 		}
 		/* If first try and runlist unmapped, map and retry. */
 		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-			is_retry = TRUE;
+			is_retry = true;
 			/*
 			 * Attempt to map runlist, dropping lock for
 			 * the duration.
@@ -665,19 +783,32 @@ lock_retry_remap:
 			if (likely(!err))
 				goto lock_retry_remap;
 			rl = NULL;
-			lcn = err;
+		} else if (!rl)
+			up_read(&ni->runlist.lock);
+		/*
+		 * If buffer is outside the runlist, truncate has cut it out
+		 * of the runlist.  Just clean and clear the buffer and set it
+		 * uptodate so it can get discarded by the VM.
+		 */
+		if (err == -ENOENT || lcn == LCN_ENOENT) {
+			bh->b_blocknr = -1;
+			clear_buffer_dirty(bh);
+			zero_user(page, bh_offset(bh), blocksize);
+			set_buffer_uptodate(bh);
+			err = 0;
+			continue;
 		}
 		/* Failed to map the buffer, even after retrying. */
+		if (!err)
+			err = -EIO;
 		bh->b_blocknr = -1;
 		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
 				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
 				"because its location on disk could not be "
-				"determined%s (error code %lli).", ni->mft_no,
+				"determined%s (error code %i).", ni->mft_no,
 				ni->type, (unsigned long long)vcn,
 				vcn_ofs, is_retry ? " even after "
-				"retrying" : "", (long long)lcn);
-		if (!err)
-			err = -EIO;
+				"retrying" : "", err);
 		break;
 	} while (block++, (bh = bh->b_this_page) != head);
 
@@ -688,7 +819,7 @@ lock_retry_remap:
 	/* For the error case, need to reset bh to the beginning. */
 	bh = head;
 
-	/* Just an optimization, so ->readpage() isn't called later. */
+	/* Just an optimization, so ->readpage() is not called later. */
 	if (unlikely(!PageUptodate(page))) {
 		int uptodate = 1;
 		do {
@@ -704,7 +835,6 @@ lock_retry_remap:
 
 	/* Setup all mapped, dirty buffers for async write i/o. */
 	do {
-		get_bh(bh);
 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
@@ -742,24 +872,18 @@ lock_retry_remap:
 
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
-	unlock_page(page);
 
-	/*
-	 * Submit the prepared buffers for i/o. Note the page is unlocked,
-	 * and the async write i/o completion handler can end_page_writeback()
-	 * at any time after the *first* submit_bh(). So the buffers can then
-	 * disappear...
-	 */
-	need_end_writeback = TRUE;
+	/* Submit the prepared buffers for i/o. */
+	need_end_writeback = true;
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			submit_bh(WRITE, bh);
-			need_end_writeback = FALSE;
+			need_end_writeback = false;
 		}
-		put_bh(bh);
 		bh = next;
 	} while (bh != head);
+	unlock_page(page);
 
 	/* If no i/o was started, need to end_page_writeback(). */
 	if (unlikely(need_end_writeback))
@@ -801,17 +925,15 @@ static int ntfs_write_mst_block(struct page *page,
 	ntfs_inode *ni = NTFS_I(vi);
 	ntfs_volume *vol = ni->vol;
 	u8 *kaddr;
-	unsigned char bh_size_bits = vi->i_blkbits;
-	unsigned int bh_size = 1 << bh_size_bits;
 	unsigned int rec_size = ni->itype.index.block_size;
 	ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
 	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
-	int max_bhs = PAGE_CACHE_SIZE / bh_size;
-	struct buffer_head *bhs[max_bhs];
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	runlist_element *rl;
-	int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err, err2;
-	unsigned rec_size_bits;
-	BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
+	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
+	unsigned bh_size, rec_size_bits;
+	bool sync, is_mft, page_is_dirty, rec_is_dirty;
+	unsigned char bh_size_bits;
 
 	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
 			"0x%lx.", vi->i_ino, ni->type, page->index);
@@ -826,13 +948,16 @@ static int ntfs_write_mst_block(struct page *page,
 	 */
 	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
 			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
+	bh_size = vol->sb->s_blocksize;
+	bh_size_bits = vol->sb->s_blocksize_bits;
+	max_bhs = PAGE_CACHE_SIZE / bh_size;
 	BUG_ON(!max_bhs);
+	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
 
 	/* Were we called for sync purposes? */
 	sync = (wbc->sync_mode == WB_SYNC_ALL);
 
 	/* Make sure we have mapped buffers. */
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 
@@ -846,18 +971,19 @@ static int ntfs_write_mst_block(struct page *page,
 			(PAGE_CACHE_SHIFT - bh_size_bits);
 
 	/* The first out of bounds block for the data size. */
-	dblock = (vi->i_size + bh_size - 1) >> bh_size_bits;
+	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
 
 	rl = NULL;
 	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
-	page_is_dirty = rec_is_dirty = FALSE;
+	page_is_dirty = rec_is_dirty = false;
 	rec_start_bh = NULL;
 	do {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (likely(block < rec_block)) {
 			if (unlikely(block >= dblock)) {
 				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
 				continue;
 			}
 			/*
@@ -883,10 +1009,10 @@ static int ntfs_write_mst_block(struct page *page,
 			}
 			if (!buffer_dirty(bh)) {
 				/* Clean records are not written out. */
-				rec_is_dirty = FALSE;
+				rec_is_dirty = false;
 				continue;
 			}
-			rec_is_dirty = TRUE;
+			rec_is_dirty = true;
 			rec_start_bh = bh;
 		}
 		/* Need to map the buffer if it is not mapped already. */
@@ -895,6 +1021,7 @@ static int ntfs_write_mst_block(struct page *page,
 			LCN lcn;
 			unsigned int vcn_ofs;
 
+			bh->b_bdev = vol->sb->s_bdev;
 			/* Obtain the vcn and offset of the current block. */
 			vcn = (VCN)block << bh_size_bits;
 			vcn_ofs = vcn & vol->cluster_size_mask;
@@ -926,7 +1053,7 @@ lock_retry_remap:
 				 */
 				if (!is_mft && !is_retry &&
 						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = TRUE;
+					is_retry = true;
 					/*
 					 * Attempt to map runlist, dropping
 					 * lock for the duration.
@@ -936,10 +1063,13 @@ lock_retry_remap:
 					if (likely(!err2))
 						goto lock_retry_remap;
 					if (err2 == -ENOMEM)
-						page_is_dirty = TRUE;
+						page_is_dirty = true;
 					lcn = err2;
-				} else
+				} else {
 					err2 = -EIO;
+					if (!rl)
+						up_read(&ni->runlist.lock);
+				}
 				/* Hard error.  Abort writing this record. */
 				if (!err || err == -ENOMEM)
 					err = err2;
@@ -949,7 +1079,8 @@ lock_retry_remap:
 						"attribute type 0x%x) because "
 						"its location on disk could "
 						"not be determined (error "
-						"code %lli).", (s64)block <<
+						"code %lli).",
+						(long long)block <<
 						bh_size_bits >>
 						vol->mft_record_size_bits,
 						ni->mft_no, ni->type,
@@ -1014,7 +1145,7 @@ lock_retry_remap:
 				 * means we need to redirty the page before
 				 * returning.
 				 */
-				page_is_dirty = TRUE;
+				page_is_dirty = true;
 				/*
 				 * Remove the buffers in this mft record from
 				 * the list of buffers to write.
@@ -1064,7 +1195,7 @@ lock_retry_remap:
 		tbh = bhs[i];
 		if (!tbh)
 			continue;
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			BUG();
 		/* The buffer dirty state is now irrelevant, just clean it. */
 		clear_buffer_dirty(tbh);
@@ -1147,18 +1278,18 @@ unm_done:
 		
 		tni = locked_nis[nr_locked_nis];
 		/* Get the base inode. */
-		down(&tni->extent_lock);
+		mutex_lock(&tni->extent_lock);
 		if (tni->nr_extents >= 0)
 			base_tni = tni;
 		else {
 			base_tni = tni->ext.base_ntfs_ino;
 			BUG_ON(!base_tni);
 		}
-		up(&tni->extent_lock);
+		mutex_unlock(&tni->extent_lock);
 		ntfs_debug("Unlocking %s inode 0x%lx.",
 				tni == base_tni ? "base" : "extent",
 				tni->mft_no);
-		up(&tni->mrec_lock);
+		mutex_unlock(&tni->mrec_lock);
 		atomic_dec(&tni->count);
 		iput(VFS_I(base_tni));
 	}
@@ -1223,19 +1354,17 @@ done:
 static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	loff_t i_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	char *kaddr;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
+	struct inode *vi = page->mapping->host;
+	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
+	char *addr;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *m = NULL;
 	u32 attr_len;
 	int err;
 
+retry_writepage:
 	BUG_ON(!PageLocked(page));
-
-	vi = page->mapping->host;
 	i_size = i_size_read(vi);
-
 	/* Is the page fully outside i_size? (truncate in progress) */
 	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
 			PAGE_CACHE_SHIFT)) {
@@ -1243,65 +1372,67 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 		 * The page may have dirty, unmapped buffers.  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		block_invalidatepage(page, 0);
+		block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;
 	}
-	ni = NTFS_I(vi);
-
+	/*
+	 * Only $DATA attributes can be encrypted and only unnamed $DATA
+	 * attributes can be compressed.  Index root can have the flags set but
+	 * this means to create compressed/encrypted files, not that the
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If file is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			unlock_page(page);
+			BUG_ON(ni->type != AT_DATA);
+			ntfs_debug("Denying write access to encrypted file.");
+			return -EACCES;
+		}
+		/* Compressed data streams are handled in compress.c. */
+		if (NInoNonResident(ni) && NInoCompressed(ni)) {
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			// TODO: Implement and replace this with
+			// return ntfs_write_compressed_block(page);
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to compressed files is "
+					"not supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+		// TODO: Implement and remove this check.
+		if (NInoNonResident(ni) && NInoSparse(ni)) {
+			unlock_page(page);
+			ntfs_error(vi->i_sb, "Writing to sparse files is not "
+					"supported yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+	}
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed, encrypted,
-		 * and/or sparse.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				unlock_page(page);
-				ntfs_debug("Denying write access to encrypted "
-						"file.");
-				return -EACCES;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement and replace this check with
-				// return ntfs_write_compressed_block(page);
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to compressed "
-						"files is not supported yet. "
-						"Sorry.");
-				return -EOPNOTSUPP;
-			}
-			// TODO: Implement and remove this check.
-			if (NInoSparse(ni)) {
-				unlock_page(page);
-				ntfs_error(vi->i_sb, "Writing to sparse files "
-						"is not supported yet. Sorry.");
-				return -EOPNOTSUPP;
-			}
-		}
 		/* We have to zero every time due to mmap-at-end-of-file. */
 		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
 			/* The page straddles i_size. */
 			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
 		}
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
 			return ntfs_write_mst_block(page, wbc);
-		/* Normal data stream. */
+		/* Normal, non-resident data stream. */
 		return ntfs_write_block(page, wbc);
 	}
 	/*
-	 * Attribute is resident, implying it is not compressed, encrypted,
-	 * sparse, or mst protected.  This also means the attribute is smaller
-	 * than an mft record and hence smaller than a page, so can simply
-	 * return error on any pages with index above 0.
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * mst protected.  This also means the attribute is smaller than an mft
+	 * record and hence smaller than a page, so can simply return error on
+	 * any pages with index above 0.  Note the attribute can actually be
+	 * marked compressed but if it is resident the actual data is not
+	 * compressed so we are ok to ignore the compressed flag here.
 	 */
 	BUG_ON(page_has_buffers(page));
 	BUG_ON(!PageUptodate(page));
@@ -1326,6 +1457,14 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 		ctx = NULL;
 		goto err_out;
 	}
+	/*
+	 * If a parallel write made the attribute non-resident, drop the mft
+	 * record and retry the writepage.
+	 */
+	if (unlikely(NInoNonResident(ni))) {
+		unmap_mft_record(base_ni);
+		goto retry_writepage;
+	}
 	ctx = ntfs_attr_get_search_ctx(base_ni, m);
 	if (unlikely(!ctx)) {
 		err = -ENOMEM;
@@ -1342,53 +1481,33 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	unlock_page(page);
-
-	/*
-	 * Here, we don't need to zero the out of bounds area everytime because
-	 * the below memcpy() already takes care of the mmap-at-end-of-file
-	 * requirements. If the file is converted to a non-resident one, then
-	 * the code path use is switched to the non-resident one where the
-	 * zeroing happens on each ntfs_writepage() invocation.
-	 *
-	 * The above also applies nicely when i_size is decreased.
-	 *
-	 * When i_size is increased, the memory between the old and new i_size
-	 * _must_ be zeroed (or overwritten with new data). Otherwise we will
-	 * expose data to userspace/disk which should never have been exposed.
-	 *
-	 * FIXME: Ensure that i_size increases do the zeroing/overwriting and
-	 * if we cannot guarantee that, then enable the zeroing below.  If the
-	 * zeroing below is enabled, we MUST move the unlock_page() from above
-	 * to after the kunmap_atomic(), i.e. just before the
-	 * end_page_writeback().
-	 * UPDATE: ntfs_prepare/commit_write() do the zeroing on i_size
-	 * increases for resident attributes so those are ok.
-	 * TODO: ntfs_truncate(), others?
-	 */
-
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(VFS_I(ni));
-	kaddr = kmap_atomic(page, KM_USER0);
+	i_size = i_size_read(vi);
 	if (unlikely(attr_len > i_size)) {
-		/* Zero out of bounds area in the mft record. */
-		memset((u8*)ctx->attr + le16_to_cpu(
-				ctx->attr->data.resident.value_offset) +
-				i_size, 0, attr_len - i_size);
+		/* Race with shrinking truncate or a failed truncate. */
 		attr_len = i_size;
+		/*
+		 * If the truncate failed, fix it up now.  If a concurrent
+		 * truncate, we do its job, so it does not have to do anything.
+		 */
+		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
+				attr_len);
+		/* Shrinking cannot fail. */
+		BUG_ON(err);
 	}
+	addr = kmap_atomic(page);
 	/* Copy the data from the page to the mft record. */
 	memcpy((u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			kaddr, attr_len);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
+			addr, attr_len);
 	/* Zero out of bounds area in the page cache page. */
-	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	kunmap_atomic(addr);
 	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	/* We are done with the page. */
 	end_page_writeback(page);
-
-	/* Mark the mft record dirty, so it gets written back. */
+	/* Finally, mark the mft record dirty, so it gets written back. */
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(base_ni);
@@ -1405,818 +1524,11 @@ err_out:
 		err = 0;
 	} else {
 		ntfs_error(vi->i_sb, "Resident attribute write failed with "
-				"error %i.  Setting page error flag.", err);
+				"error %i.", err);
 		SetPageError(page);
-	}
-	unlock_page(page);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-/**
- * ntfs_prepare_nonresident_write -
- *
- */
-static int ntfs_prepare_nonresident_write(struct page *page,
-		unsigned from, unsigned to)
-{
-	VCN vcn;
-	LCN lcn;
-	sector_t block, ablock, iblock;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-	unsigned int vcn_ofs, block_start, block_end, blocksize;
-	int err;
-	BOOL is_retry;
-	unsigned char blocksize_bits;
-
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
-			page->index, from, to);
-
-	BUG_ON(!NInoNonResident(ni));
-
-	blocksize_bits = vi->i_blkbits;
-	blocksize = 1 << blocksize_bits;
-
-	/*
-	 * create_empty_buffers() will create uptodate/dirty buffers if the
-	 * page is uptodate/dirty.
-	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	bh = head = page_buffers(page);
-	if (unlikely(!bh))
-		return -ENOMEM;
-
-	/* The first block in the page. */
-	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
-
-	/*
-	 * The first out of bounds block for the allocated size. No need to
-	 * round up as allocated_size is in multiples of cluster size and the
-	 * minimum cluster size is 512 bytes, which is equal to the smallest
-	 * blocksize.
-	 */
-	ablock = ni->allocated_size >> blocksize_bits;
-
-	/* The last (fully or partially) initialized block. */
-	iblock = ni->initialized_size >> blocksize_bits;
-
-	/* Loop through all the buffers in the page. */
-	block_start = 0;
-	rl = NULL;
-	err = 0;
-	do {
-		block_end = block_start + blocksize;
-		/*
-		 * If buffer @bh is outside the write, just mark it uptodate
-		 * if the page is uptodate and continue with the next buffer.
-		 */
-		if (block_end <= from || block_start >= to) {
-			if (PageUptodate(page)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-			}
-			continue;
-		}
-		/*
-		 * @bh is at least partially being written to.
-		 * Make sure it is not marked as new.
-		 */
-		//if (buffer_new(bh))
-		//	clear_buffer_new(bh);
-
-		if (block >= ablock) {
-			// TODO: block is above allocated_size, need to
-			// allocate it. Best done in one go to accommodate not
-			// only block but all above blocks up to and including:
-			// ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
-			// - 1) >> blobksize_bits. Obviously will need to round
-			// up to next cluster boundary, too. This should be
-			// done with a helper function, so it can be reused.
-			ntfs_error(vol->sb, "Writing beyond allocated size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			goto err_out;
-			// Need to update ablock.
-			// Need to set_buffer_new() on all block bhs that are
-			// newly allocated.
-		}
-		/*
-		 * Now we have enough allocated size to fulfill the whole
-		 * request, i.e. block < ablock is true.
-		 */
-		if (unlikely((block >= iblock) &&
-				(ni->initialized_size < vi->i_size))) {
-			/*
-			 * If this page is fully outside initialized size, zero
-			 * out all pages between the current initialized size
-			 * and the current page. Just use ntfs_readpage() to do
-			 * the zeroing transparently.
-			 */
-			if (block > iblock) {
-				// TODO:
-				// For each page do:
-				// - read_cache_page()
-				// Again for each page do:
-				// - wait_on_page_locked()
-				// - Check (PageUptodate(page) &&
-				//			!PageError(page))
-				// Update initialized size in the attribute and
-				// in the inode.
-				// Again, for each page do:
-				//	__set_page_dirty_buffers();
-				// page_cache_release()
-				// We don't need to wait on the writes.
-				// Update iblock.
-			}
-			/*
-			 * The current page straddles initialized size. Zero
-			 * all non-uptodate buffers and set them uptodate (and
-			 * dirty?). Note, there aren't any non-uptodate buffers
-			 * if the page is uptodate.
-			 * FIXME: For an uptodate page, the buffers may need to
-			 * be written out because they were not initialized on
-			 * disk before.
-			 */
-			if (!PageUptodate(page)) {
-				// TODO:
-				// Zero any non-uptodate buffers up to i_size.
-				// Set them uptodate and dirty.
-			}
-			// TODO:
-			// Update initialized size in the attribute and in the
-			// inode (up to i_size).
-			// Update iblock.
-			// FIXME: This is inefficient. Try to batch the two
-			// size changes to happen in one go.
-			ntfs_error(vol->sb, "Writing beyond initialized size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			goto err_out;
-			// Do NOT set_buffer_new() BUT DO clear buffer range
-			// outside write request range.
-			// set_buffer_uptodate() on complete buffers as well as
-			// set_buffer_dirty().
-		}
-
-		/* Need to map unmapped buffers. */
-		if (!buffer_mapped(bh)) {
-			/* Unmapped buffer. Need to map it. */
-			bh->b_bdev = vol->sb->s_bdev;
-
-			/* Convert block into corresponding vcn and offset. */
-			vcn = (VCN)block << blocksize_bits >>
-					vol->cluster_size_bits;
-			vcn_ofs = ((VCN)block << blocksize_bits) &
-					vol->cluster_size_mask;
-
-			is_retry = FALSE;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			if (unlikely(lcn < 0)) {
-				/*
-				 * We extended the attribute allocation above.
-				 * If we hit an ENOENT here it means that the
-				 * allocation was insufficient which is a bug.
-				 */
-				BUG_ON(lcn == LCN_ENOENT);
-
-				/* It is a hole, need to instantiate it. */
-				if (lcn == LCN_HOLE) {
-					// TODO: Instantiate the hole.
-					// clear_buffer_new(bh);
-					// unmap_underlying_metadata(bh->b_bdev,
-					//		bh->b_blocknr);
-					// For non-uptodate buffers, need to
-					// zero out the region outside the
-					// request in this bh or all bhs,
-					// depending on what we implemented
-					// above.
-					// Need to flush_dcache_page().
-					// Or could use set_buffer_new()
-					// instead?
-					ntfs_error(vol->sb, "Writing into "
-							"sparse regions is "
-							"not supported yet. "
-							"Sorry.");
-					err = -EOPNOTSUPP;
-					goto err_out;
-				} else if (!is_retry &&
-						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = TRUE;
-					/*
-					 * Attempt to map runlist, dropping
-					 * lock for the duration.
-					 */
-					up_read(&ni->runlist.lock);
-					err = ntfs_map_runlist(ni, vcn);
-					if (likely(!err))
-						goto lock_retry_remap;
-					rl = NULL;
-					lcn = err;
-				}
-				/*
-				 * Failed to map the buffer, even after
-				 * retrying.
-				 */
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Failed to write to inode "
-						"0x%lx, attribute type 0x%x, "
-						"vcn 0x%llx, offset 0x%x "
-						"because its location on disk "
-						"could not be determined%s "
-						"(error code %lli).",
-						ni->mft_no, ni->type,
-						(unsigned long long)vcn,
-						vcn_ofs, is_retry ? " even "
-						"after retrying" : "",
-						(long long)lcn);
-				if (!err)
-					err = -EIO;
-				goto err_out;
-			}
-			/* We now have a successful remap, i.e. lcn >= 0. */
-
-			/* Setup buffer head to correct block. */
-			bh->b_blocknr = ((lcn << vol->cluster_size_bits)
-					+ vcn_ofs) >> blocksize_bits;
-			set_buffer_mapped(bh);
-
-			// FIXME: Something analogous to this is needed for
-			// each newly allocated block, i.e. BH_New.
-			// FIXME: Might need to take this out of the
-			// if (!buffer_mapped(bh)) {}, depending on how we
-			// implement things during the allocated_size and
-			// initialized_size extension code above.
-			if (buffer_new(bh)) {
-				clear_buffer_new(bh);
-				unmap_underlying_metadata(bh->b_bdev,
-						bh->b_blocknr);
-				if (PageUptodate(page)) {
-					set_buffer_uptodate(bh);
-					continue;
-				}
-				/*
-				 * Page is _not_ uptodate, zero surrounding
-				 * region. NOTE: This is how we decide if to
-				 * zero or not!
-				 */
-				if (block_end > to || block_start < from) {
-					void *kaddr;
-
-					kaddr = kmap_atomic(page, KM_USER0);
-					if (block_end > to)
-						memset(kaddr + to, 0,
-								block_end - to);
-					if (block_start < from)
-						memset(kaddr + block_start, 0,
-								from -
-								block_start);
-					flush_dcache_page(page);
-					kunmap_atomic(kaddr, KM_USER0);
-				}
-				continue;
-			}
-		}
-		/* @bh is mapped, set it uptodate if the page is uptodate. */
-		if (PageUptodate(page)) {
-			if (!buffer_uptodate(bh))
-				set_buffer_uptodate(bh);
-			continue;
-		}
-		/*
-		 * The page is not uptodate. The buffer is mapped. If it is not
-		 * uptodate, and it is only partially being written to, we need
-		 * to read the buffer in before the write, i.e. right now.
-		 */
-		if (!buffer_uptodate(bh) &&
-				(block_start < from || block_end > to)) {
-			ll_rw_block(READ, 1, &bh);
-			*wait_bh++ = bh;
-		}
-	} while (block++, block_start = block_end,
-			(bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl) {
-		up_read(&ni->runlist.lock);
-		rl = NULL;
-	}
-
-	/* If we issued read requests, let them complete. */
-	while (wait_bh > wait) {
-		wait_on_buffer(*--wait_bh);
-		if (!buffer_uptodate(*wait_bh))
-			return -EIO;
-	}
-
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale data.
-	 * If BH_New is set, we know that the block was newly allocated in the
-	 * above loop.
-	 * FIXME: What about initialized_size increments? Have we done all the
-	 * required zeroing above? If not this error handling is broken, and
-	 * in particular the if (block_end <= from) check is completely bogus.
-	 */
-	bh = head;
-	block_start = 0;
-	is_retry = FALSE;
-	do {
-		block_end = block_start + blocksize;
-		if (block_end <= from)
-			continue;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			void *kaddr;
-
-			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + block_start, 0, bh->b_size);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			is_retry = TRUE;
-		}
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (is_retry)
-		flush_dcache_page(page);
-	if (rl)
-		up_read(&ni->runlist.lock);
-	return err;
-}
-
-/**
- * ntfs_prepare_write - prepare a page for receiving data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host).  The @page is locked but not kmap()ped.  The source
- * data has not yet been copied into the @page.
- *
- * Need to extend the attribute/fill in holes if necessary, create blocks and
- * make partially overwritten blocks uptodate,
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using block_prepare_write() [support for sparse files] or
- * cont_prepare_write() [no support for sparse files].  Cannot do that due to
- * ntfs specifics but can look at them for implementation guidance.
- *
- * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
- * the first byte in the page that will be written to and @to is the first byte
- * after the last byte that will be written to.
- */
-static int ntfs_prepare_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
-{
-	s64 new_size;
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	u8 *kaddr;
-	u32 attr_len;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
-			page->index, from, to);
-	BUG_ON(!PageLocked(page));
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from > to);
-	BUG_ON(NInoMstProtected(ni));
-	/*
-	 * If a previous ntfs_truncate() failed, repeat it and abort if it
-	 * fails again.
-	 */
-	if (unlikely(NInoTruncateFailed(ni))) {
-		down_write(&vi->i_alloc_sem);
-		err = ntfs_truncate(vi);
-		up_write(&vi->i_alloc_sem);
-		if (err || NInoTruncateFailed(ni)) {
-			if (!err)
-				err = -EIO;
-			goto err_out;
-		}
-	}
-	/* If the attribute is not resident, deal with it elsewhere. */
-	if (NInoNonResident(ni)) {
-		/*
-		 * Only unnamed $DATA attributes can be compressed, encrypted,
-		 * and/or sparse.
-		 */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* If file is encrypted, deny access, just like NT4. */
-			if (NInoEncrypted(ni)) {
-				ntfs_debug("Denying write access to encrypted "
-						"file.");
-				return -EACCES;
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement and replace this check with
-				// return ntfs_write_compressed_block(page);
-				ntfs_error(vi->i_sb, "Writing to compressed "
-						"files is not supported yet. "
-						"Sorry.");
-				return -EOPNOTSUPP;
-			}
-			// TODO: Implement and remove this check.
-			if (NInoSparse(ni)) {
-				ntfs_error(vi->i_sb, "Writing to sparse files "
-						"is not supported yet. Sorry.");
-				return -EOPNOTSUPP;
-			}
-		}
-		/* Normal data stream. */
-		return ntfs_prepare_nonresident_write(page, from, to);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	BUG_ON(page_has_buffers(page));
-	new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
-	/* If we do not need to resize the attribute allocation we are done. */
-	if (new_size <= vi->i_size)
-		goto done;
-
-	// FIXME: We abort for now as this code is not safe.
-	ntfs_error(vi->i_sb, "Changing the file size is not supported yet.  "
-			"Sorry.");
-	return -EOPNOTSUPP;
-
-	/* Map, pin, and lock the (base) mft record. */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(vi->i_size != attr_len);
-	/* Check if new size is allowed in $AttrDef. */
-	err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
-	if (unlikely(err)) {
-		if (err == -ERANGE) {
-			ntfs_error(vol->sb, "Write would cause the inode "
-					"0x%lx to exceed the maximum size for "
-					"its attribute type (0x%x).  Aborting "
-					"write.", vi->i_ino,
-					le32_to_cpu(ni->type));
-		} else {
-			ntfs_error(vol->sb, "Inode 0x%lx has unknown "
-					"attribute type 0x%x.  Aborting "
-					"write.", vi->i_ino,
-					le32_to_cpu(ni->type));
-			err = -EIO;
-		}
-		goto err_out2;
-	}
-	/*
-	 * Extend the attribute record to be able to store the new attribute
-	 * size.
-	 */
-	if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) +
-			new_size)) {
-		/* Not enough space in the mft record. */
-		ntfs_error(vol->sb, "Not enough space in the mft record for "
-				"the resized attribute value.  This is not "
-				"supported yet.  Aborting write.");
-		err = -EOPNOTSUPP;
-		goto err_out2;
-	}
-	/*
-	 * We have enough space in the mft record to fit the write.  This
-	 * implies the attribute is smaller than the mft record and hence the
-	 * attribute must be in a single page and hence page->index must be 0.
-	 */
-	BUG_ON(page->index);
-	/*
-	 * If the beginning of the write is past the old size, enlarge the
-	 * attribute value up to the beginning of the write and fill it with
-	 * zeroes.
-	 */
-	if (from > attr_len) {
-		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
-				attr_len, 0, from - attr_len);
-		a->data.resident.value_length = cpu_to_le32(from);
-		/* Zero the corresponding area in the page as well. */
-		if (PageUptodate(page)) {
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + attr_len, 0, from - attr_len);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page);
-		}
-	}
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	/*
-	 * Because resident attributes are handled by memcpy() to/from the
-	 * corresponding MFT record, and because this form of i/o is byte
-	 * aligned rather than block aligned, there is no need to bring the
-	 * page uptodate here as in the non-resident case where we need to
-	 * bring the buffers straddled by the write uptodate before
-	 * generic_file_write() does the copying from userspace.
-	 *
-	 * We thus defer the uptodate bringing of the page region outside the
-	 * region written to to ntfs_commit_write(), which makes the code
-	 * simpler and saves one atomic kmap which is good.
-	 */
-done:
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM)
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"prepare the write.");
-	else {
-		ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
-				"with error %i.", err);
-		NVolSetErrors(vol);
-		make_bad_inode(vi);
-	}
-err_out2:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-/**
- * ntfs_commit_nonresident_write -
- *
- */
-static int ntfs_commit_nonresident_write(struct page *page,
-		unsigned from, unsigned to)
-{
-	s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
-	struct inode *vi = page->mapping->host;
-	struct buffer_head *bh, *head;
-	unsigned int block_start, block_end, blocksize;
-	BOOL partial;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino,
-			NTFS_I(vi)->type, page->index, from, to);
-	blocksize = 1 << vi->i_blkbits;
-
-	// FIXME: We need a whole slew of special cases in here for compressed
-	// files for example...
-	// For now, we know ntfs_prepare_write() would have failed so we can't
-	// get here in any of the cases which we have to special case, so we
-	// are just a ripped off, unrolled generic_commit_write().
-
-	bh = head = page_buffers(page);
-	block_start = 0;
-	partial = FALSE;
-	do {
-		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (!buffer_uptodate(bh))
-				partial = TRUE;
-		} else {
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	/*
-	 * If this is a partial write which happened to make all buffers
-	 * uptodate then we can optimize away a bogus ->readpage() for the next
-	 * read().  Here we 'discover' whether the page went uptodate as a
-	 * result of this (potentially partial) write.
-	 */
-	if (!partial)
-		SetPageUptodate(page);
-	/*
-	 * Not convinced about this at all.  See disparity comment above.  For
-	 * now we know ntfs_prepare_write() would have failed in the write
-	 * exceeds i_size case, so this will never trigger which is fine.
-	 */
-	if (pos > vi->i_size) {
-		ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
-				"not supported yet.  Sorry.");
-		return -EOPNOTSUPP;
-		// vi->i_size = pos;
-		// mark_inode_dirty(vi);
-	}
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_commit_write - commit the received data
- *
- * This is called from generic_file_write() with i_sem held on the inode
- * (@page->mapping->host).  The @page is locked but not kmap()ped.  The source
- * data has already been copied into the @page.  ntfs_prepare_write() has been
- * called before the data copied and it returned success so we can take the
- * results of various BUG checks and some error handling for granted.
- *
- * Need to mark modified blocks dirty so they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Return 0 on success or -errno on error.
- *
- * Should be using generic_commit_write().  This marks buffers uptodate and
- * dirty, sets the page uptodate if all buffers in the page are uptodate, and
- * updates i_size if the end of io is beyond i_size.  In that case, it also
- * marks the inode dirty.
- *
- * Cannot use generic_commit_write() due to ntfs specialities but can look at
- * it for implementation guidance.
- *
- * If things have gone as outlined in ntfs_prepare_write(), then we do not
- * need to do any page content modifications here at all, except in the write
- * to resident attribute case, where we need to do the uptodate bringing here
- * which we combine with the copying into the mft record which means we save
- * one atomic kmap.
- */
-static int ntfs_commit_write(struct file *file, struct page *page,
-		unsigned from, unsigned to)
-{
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *base_ni, *ni = NTFS_I(vi);
-	char *kaddr, *kattr;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	u32 attr_len;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
-			page->index, from, to);
-	/* If the attribute is not resident, deal with it elsewhere. */
-	if (NInoNonResident(ni)) {
-		/* Only unnamed $DATA attributes can be compressed/encrypted. */
-		if (ni->type == AT_DATA && !ni->name_len) {
-			/* Encrypted files need separate handling. */
-			if (NInoEncrypted(ni)) {
-				// We never get here at present!
-				BUG();
-			}
-			/* Compressed data streams are handled in compress.c. */
-			if (NInoCompressed(ni)) {
-				// TODO: Implement this!
-				// return ntfs_write_compressed_block(page);
-				// We never get here at present!
-				BUG();
-			}
-		}
-		/* Normal data stream. */
-		return ntfs_commit_nonresident_write(page, from, to);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(from > attr_len);
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	kaddr = kmap_atomic(page, KM_USER0);
-	/* Copy the received data from the page to the mft record. */
-	memcpy(kattr + from, kaddr + from, to - from);
-	/* Update the attribute length if necessary. */
-	if (to > attr_len) {
-		attr_len = to;
-		a->data.resident.value_length = cpu_to_le32(attr_len);
-	}
-	/*
-	 * If the page is not uptodate, bring the out of bounds area(s)
-	 * uptodate by copying data from the mft record to the page.
-	 */
-	if (!PageUptodate(page)) {
-		if (from > 0)
-			memcpy(kaddr, kattr, from);
-		if (to < attr_len)
-			memcpy(kaddr + to, kattr + to, attr_len - to);
-		/* Zero the region outside the end of the attribute value. */
-		if (attr_len < PAGE_CACHE_SIZE)
-			memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
-		/*
-		 * The probability of not having done any of the above is
-		 * extremely small, so we just flush unconditionally.
-		 */
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	kunmap_atomic(kaddr, KM_USER0);
-	/* Update i_size if necessary. */
-	if (vi->i_size < attr_len) {
-		ni->allocated_size = ni->initialized_size = attr_len;
-		i_size_write(vi, attr_len);
-	}
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"commit the write.");
-		if (PageUptodate(page)) {
-			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
-					"dirty so the write will be retried "
-					"later on by the VM.");
-			/*
-			 * Put the page on mapping->dirty_pages, but leave its
-			 * buffers' dirty state as-is.
-			 */
-			__set_page_dirty_nobuffers(page);
-			err = 0;
-		} else
-			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
-					"data has been lost.");
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
-				"with error %i.", err);
 		NVolSetErrors(ni->vol);
-		make_bad_inode(vi);
 	}
+	unlock_page(page);
 	if (ctx)
 		ntfs_attr_put_search_ctx(ctx);
 	if (m)
@@ -2229,32 +1541,33 @@ err_out:
 /**
  * ntfs_aops - general address space operations for inodes and attributes
  */
-struct address_space_operations ntfs_aops = {
+const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
-	.prepare_write	= ntfs_prepare_write,	/* Prepare page and buffers
-						   ready to receive data. */
-	.commit_write	= ntfs_commit_write,	/* Commit received data. */
 #endif /* NTFS_RW */
+	.migratepage	= buffer_migrate_page,	/* Move a page cache page from
+						   one physical page to an
+						   other. */
+	.error_remove_page = generic_error_remove_page,
 };
 
 /**
  * ntfs_mst_aops - general address space operations for mst protecteed inodes
  *		   and attributes
  */
-struct address_space_operations ntfs_mst_aops = {
+const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 	.set_page_dirty	= __set_page_dirty_nobuffers,	/* Set the page dirty
 						   without touching the buffers
 						   belonging to the page. */
 #endif /* NTFS_RW */
+	.migratepage	= buffer_migrate_page,	/* Move a page cache page from
+						   one physical page to an
+						   other. */
+	.error_remove_page = generic_error_remove_page,
 };
 
 #ifdef NTFS_RW
@@ -2282,7 +1595,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 
 	BUG_ON(!PageUptodate(page));
 	end = ofs + ni->itype.index.block_size;
-	bh_size = 1 << VFS_I(ni)->i_blkbits;
+	bh_size = VFS_I(ni)->i_sb->s_blocksize;
 	spin_lock(&mapping->private_lock);
 	if (unlikely(!page_has_buffers(page))) {
 		spin_unlock(&mapping->private_lock);
@@ -2302,6 +1615,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 			buffers_to_free = bh;
 	}
 	bh = head = page_buffers(page);
+	BUG_ON(!bh);
 	do {
 		bh_ofs = bh_offset(bh);
 		if (bh_ofs + bh_size <= ofs)
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2f..caecc58f529 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -80,19 +80,17 @@ static inline void ntfs_unmap_page(struct page *page)
  *
  * The unlocked and uptodate page is returned on success or an encoded error
  * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to TRUE, the negative error code can be
+ * return value. If that evaluates to 'true', the negative error code can be
  * obtained using PTR_ERR() on the return value of ntfs_map_page().
  */
 static inline struct page *ntfs_map_page(struct address_space *mapping,
 		unsigned long index)
 {
-	struct page *page = read_cache_page(mapping, index,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (PageUptodate(page) && !PageError(page))
+		if (!PageError(page))
 			return page;
 		ntfs_unmap_page(page);
 		return ERR_PTR(-EIO);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1ff7f90a18b..250ed5b20c8 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
 /**
  * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -21,89 +21,417 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
 
 #include "attrib.h"
 #include "debug.h"
 #include "layout.h"
+#include "lcnalloc.h"
+#include "malloc.h"
 #include "mft.h"
 #include "ntfs.h"
 #include "types.h"
 
 /**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
  * @ni:		ntfs inode for which to map (part of) a runlist
  * @vcn:	map runlist part containing this vcn
+ * @ctx:	active attribute search context if present or NULL if not
  *
  * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
  *
- * Return 0 on success and -errno on error.
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_map_runlist_nolock() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
+ * will perform the necessary mapping and unmapping.
  *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- *	    - This function takes the lock for writing and modifies the runlist.
+ * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
+ * restores it before returning.  Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry.  However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_map_runlist_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Return 0 on success and -errno on error.  There is one special error code
+ * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist will be modified.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
  */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 {
+	VCN end_vcn;
+	unsigned long flags;
 	ntfs_inode *base_ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	runlist_element *rl;
+	struct page *put_this_page = NULL;
 	int err = 0;
+	bool ctx_is_temporary, ctx_needs_reset;
+	ntfs_attr_search_ctx old_ctx = { NULL, };
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
 			(unsigned long long)vcn);
-
 	if (!NInoAttr(ni))
 		base_ni = ni;
 	else
 		base_ni = ni->ext.base_ntfs_ino;
+	if (!ctx) {
+		ctx_is_temporary = ctx_needs_reset = true;
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m))
+			return PTR_ERR(m);
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+	} else {
+		VCN allocated_size_vcn;
 
-	mrec = map_mft_record(base_ni);
-	if (IS_ERR(mrec))
-		return PTR_ERR(mrec);
-	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
+		BUG_ON(IS_ERR(ctx->mrec));
+		a = ctx->attr;
+		BUG_ON(!a->non_resident);
+		ctx_is_temporary = false;
+		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size_vcn = ni->allocated_size >>
+				ni->vol->cluster_size_bits;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
+			end_vcn = allocated_size_vcn - 1;
+		/*
+		 * If we already have the attribute extent containing @vcn in
+		 * @ctx, no need to look it up again.  We slightly cheat in
+		 * that if vcn exceeds the allocated size, we will refuse to
+		 * map the runlist below, so there is definitely no need to get
+		 * the right attribute extent.
+		 */
+		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
+				a->name_length == ni->name_len &&
+				!memcmp((u8*)a + le16_to_cpu(a->name_offset),
+				ni->name, ni->name_len) &&
+				sle64_to_cpu(a->data.non_resident.lowest_vcn)
+				<= vcn && end_vcn >= vcn))
+			ctx_needs_reset = false;
+		else {
+			/* Save the old search context. */
+			old_ctx = *ctx;
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * base inode we will unmap it when we reinitialize the
+			 * search context which means we need to get a
+			 * reference to the page containing the mapped mft
+			 * record so we do not accidentally drop changes to the
+			 * mft record when it has not been marked dirty yet.
+			 */
+			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
+					old_ctx.base_ntfs_ino) {
+				put_this_page = old_ctx.ntfs_ino->page;
+				page_cache_get(put_this_page);
+			}
+			/*
+			 * Reinitialize the search context so we can lookup the
+			 * needed attribute extent.
+			 */
+			ntfs_attr_reinit_search_ctx(ctx);
+			ctx_needs_reset = true;
+		}
+	}
+	if (ctx_needs_reset) {
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, vcn, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			goto err_out;
+		}
+		BUG_ON(!ctx->attr->non_resident);
+	}
+	a = ctx->attr;
+	/*
+	 * Only decompress the mapping pairs if @vcn is inside it.  Otherwise
+	 * we get into problems when we try to map an out of bounds vcn because
+	 * we then try to map the already mapped runlist fragment and
+	 * ntfs_mapping_pairs_decompress() fails.
+	 */
+	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
+	if (unlikely(vcn && vcn >= end_vcn)) {
+		err = -ENOENT;
 		goto err_out;
 	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, vcn, NULL, 0, ctx);
-	if (unlikely(err))
-		goto put_err_out;
+	rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
+	if (IS_ERR(rl))
+		err = PTR_ERR(rl);
+	else
+		ni->runlist.rl = rl;
+err_out:
+	if (ctx_is_temporary) {
+		if (likely(ctx))
+			ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+	} else if (ctx_needs_reset) {
+		/*
+		 * If there is no attribute list, restoring the search context
+		 * is accomplished simply by copying the saved context back over
+		 * the caller supplied context.  If there is an attribute list,
+		 * things are more complicated as we need to deal with mapping
+		 * of mft records and resulting potential changes in pointers.
+		 */
+		if (NInoAttrList(base_ni)) {
+			/*
+			 * If the currently mapped (extent) inode is not the
+			 * one we had before, we need to unmap it and map the
+			 * old one.
+			 */
+			if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
+				/*
+				 * If the currently mapped inode is not the
+				 * base inode, unmap it.
+				 */
+				if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
+						ctx->base_ntfs_ino) {
+					unmap_extent_mft_record(ctx->ntfs_ino);
+					ctx->mrec = ctx->base_mrec;
+					BUG_ON(!ctx->mrec);
+				}
+				/*
+				 * If the old mapped inode is not the base
+				 * inode, map it.
+				 */
+				if (old_ctx.base_ntfs_ino &&
+						old_ctx.ntfs_ino !=
+						old_ctx.base_ntfs_ino) {
+retry_map:
+					ctx->mrec = map_mft_record(
+							old_ctx.ntfs_ino);
+					/*
+					 * Something bad has happened.  If out
+					 * of memory retry till it succeeds.
+					 * Any other errors are fatal and we
+					 * return the error code in ctx->mrec.
+					 * Let the caller deal with it...  We
+					 * just need to fudge things so the
+					 * caller can reinit and/or put the
+					 * search context safely.
+					 */
+					if (IS_ERR(ctx->mrec)) {
+						if (PTR_ERR(ctx->mrec) ==
+								-ENOMEM) {
+							schedule();
+							goto retry_map;
+						} else
+							old_ctx.ntfs_ino =
+								old_ctx.
+								base_ntfs_ino;
+					}
+				}
+			}
+			/* Update the changed pointers in the saved context. */
+			if (ctx->mrec != old_ctx.mrec) {
+				if (!IS_ERR(ctx->mrec))
+					old_ctx.attr = (ATTR_RECORD*)(
+							(u8*)ctx->mrec +
+							((u8*)old_ctx.attr -
+							(u8*)old_ctx.mrec));
+				old_ctx.mrec = ctx->mrec;
+			}
+		}
+		/* Restore the search context to the saved one. */
+		*ctx = old_ctx;
+		/*
+		 * We drop the reference on the page we took earlier.  In the
+		 * case that IS_ERR(ctx->mrec) is true this means we might lose
+		 * some changes to the mft record that had been made between
+		 * the last time it was marked dirty/written out and now.  This
+		 * at this stage is not a problem as the mapping error is fatal
+		 * enough that the mft record cannot be written out anyway and
+		 * the caller is very likely to shutdown the whole inode
+		 * immediately and mark the volume dirty for chkdsk to pick up
+		 * the pieces anyway.
+		 */
+		if (put_this_page)
+			page_cache_release(put_this_page);
+	}
+	return err;
+}
+
+/**
+ * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * @ni:		ntfs inode for which to map (part of) a runlist
+ * @vcn:	map runlist part containing this vcn
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * Return 0 on success and -errno on error.  There is one special error code
+ * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Locking: - The runlist must be unlocked on entry and is unlocked on return.
+ *	    - This function takes the runlist lock for writing and may modify
+ *	      the runlist.
+ */
+int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+{
+	int err = 0;
 
 	down_write(&ni->runlist.lock);
 	/* Make sure someone else didn't do the work while we were sleeping. */
 	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
-			LCN_RL_NOT_MAPPED)) {
-		runlist_element *rl;
+			LCN_RL_NOT_MAPPED))
+		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+	up_write(&ni->runlist.lock);
+	return err;
+}
 
-		rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr,
-				ni->runlist.rl);
-		if (IS_ERR(rl))
-			err = PTR_ERR(rl);
-		else
-			ni->runlist.rl = rl;
+/**
+ * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
+ * @ni:			ntfs inode of the attribute whose runlist to search
+ * @vcn:		vcn to convert
+ * @write_locked:	true if the runlist is locked for writing
+ *
+ * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
+ * described by the ntfs inode @ni and return the corresponding logical cluster
+ * number (lcn).
+ *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @write_locked is true the caller has locked the runlist for writing and
+ * if false for reading.
+ *
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
+ *
+ * Return code	Meaning / Description
+ * ==========================================
+ *  LCN_HOLE	Hole / not allocated on disk.
+ *  LCN_ENOENT	There is no such vcn in the runlist, i.e. @vcn is out of bounds.
+ *  LCN_ENOMEM	Not enough memory to map runlist.
+ *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * Locking: - The runlist must be locked on entry and is left locked on return.
+ *	    - If @write_locked is 'false', i.e. the runlist is locked for reading,
+ *	      the lock may be dropped inside the function so you cannot rely on
+ *	      the runlist still being the same when this function returns.
+ */
+LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const bool write_locked)
+{
+	LCN lcn;
+	unsigned long flags;
+	bool is_retry = false;
+
+	BUG_ON(!ni);
+	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
+			ni->mft_no, (unsigned long long)vcn,
+			write_locked ? "write" : "read");
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(vcn < 0);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return LCN_ENOENT;
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
 	}
-	up_write(&ni->runlist.lock);
+retry_remap:
+	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
+	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
+	if (likely(lcn >= LCN_HOLE)) {
+		ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
+		return lcn;
+	}
+	if (lcn != LCN_RL_NOT_MAPPED) {
+		if (lcn != LCN_ENOENT)
+			lcn = LCN_EIO;
+	} else if (!is_retry) {
+		int err;
 
-put_err_out:
-	ntfs_attr_put_search_ctx(ctx);
-err_out:
-	unmap_mft_record(base_ni);
-	return err;
+		if (!write_locked) {
+			up_read(&ni->runlist.lock);
+			down_write(&ni->runlist.lock);
+			if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
+					LCN_RL_NOT_MAPPED)) {
+				up_write(&ni->runlist.lock);
+				down_read(&ni->runlist.lock);
+				goto retry_remap;
+			}
+		}
+		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+		if (!write_locked) {
+			up_write(&ni->runlist.lock);
+			down_read(&ni->runlist.lock);
+		}
+		if (likely(!err)) {
+			is_retry = true;
+			goto retry_remap;
+		}
+		if (err == -ENOENT)
+			lcn = LCN_ENOENT;
+		else if (err == -ENOMEM)
+			lcn = LCN_ENOMEM;
+		else
+			lcn = LCN_EIO;
+	}
+	if (lcn != LCN_ENOENT)
+		ntfs_error(ni->vol->sb, "Failed with error code %lli.",
+				(long long)lcn);
+	return lcn;
 }
 
 /**
- * ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode
+ * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
  * @ni:		ntfs inode describing the runlist to search
  * @vcn:	vcn to find
- * @need_write:	if false, lock for reading and if true, lock for writing
+ * @ctx:	active attribute search context if present or NULL if not
  *
  * Find the virtual cluster number @vcn in the runlist described by the ntfs
  * inode @ni and return the address of the runlist element containing the @vcn.
- * The runlist is left locked and the caller has to unlock it.  If @need_write
- * is true, the runlist is locked for writing and if @need_write is false, the
- * runlist is locked for reading.  In the error case, the runlist is not left
- * locked.
  *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
+ * restores it before returning.  Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry.  However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_attr_find_vcn_nolock(), you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
  * Note you need to distinguish between the lcn of the returned runlist element
  * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
  * read and allocate clusters on write.
@@ -118,34 +446,47 @@ err_out:
  *	-ENOMEM - Not enough memory to map runlist.
  *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
  *
- * Locking: - The runlist must be unlocked on entry.
- *	    - On failing return, the runlist is unlocked.
- *	    - On successful return, the runlist is locked.  If @need_write us
- *	      true, it is locked for writing.  Otherwise is is locked for
- *	      reading.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
  */
-runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
-		const BOOL need_write)
+runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+		ntfs_attr_search_ctx *ctx)
 {
+	unsigned long flags;
 	runlist_element *rl;
 	int err = 0;
-	BOOL is_retry = FALSE;
+	bool is_retry = false;
 
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, lock for %sing.",
-			ni->mft_no, (unsigned long long)vcn,
-			!need_write ? "read" : "writ");
 	BUG_ON(!ni);
+	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
+			ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
-lock_retry_remap:
-	if (!need_write)
-		down_read(&ni->runlist.lock);
-	else
-		down_write(&ni->runlist.lock);
+	if (!ni->runlist.rl) {
+		read_lock_irqsave(&ni->size_lock, flags);
+		if (!ni->allocated_size) {
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			return ERR_PTR(-ENOENT);
+		}
+		read_unlock_irqrestore(&ni->size_lock, flags);
+	}
+retry_remap:
 	rl = ni->runlist.rl;
 	if (likely(rl && vcn >= rl[0].vcn)) {
 		while (likely(rl->length)) {
-			if (likely(vcn < rl[1].vcn)) {
+			if (unlikely(vcn < rl[1].vcn)) {
 				if (likely(rl->lcn >= LCN_HOLE)) {
 					ntfs_debug("Done.");
 					return rl;
@@ -161,30 +502,30 @@ lock_retry_remap:
 				err = -EIO;
 		}
 	}
-	if (!need_write)
-		up_read(&ni->runlist.lock);
-	else
-		up_write(&ni->runlist.lock);
 	if (!err && !is_retry) {
 		/*
-		 * The @vcn is in an unmapped region, map the runlist and
-		 * retry.
+		 * If the search context is invalid we cannot map the unmapped
+		 * region.
 		 */
-		err = ntfs_map_runlist(ni, vcn);
-		if (likely(!err)) {
-			is_retry = TRUE;
-			goto lock_retry_remap;
+		if (IS_ERR(ctx->mrec))
+			err = PTR_ERR(ctx->mrec);
+		else {
+			/*
+			 * The @vcn is in an unmapped region, map the runlist
+			 * and retry.
+			 */
+			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
+			if (likely(!err)) {
+				is_retry = true;
+				goto retry_remap;
+			}
 		}
-		/*
-		 * -EINVAL and -ENOENT coming from a failed mapping attempt are
-		 * equivalent to i/o errors for us as they should not happen in
-		 * our code paths.
-		 */
-		if (err == -EINVAL || err == -ENOENT)
+		if (err == -EINVAL)
 			err = -EIO;
 	} else if (!err)
 		err = -EIO;
-	ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
+	if (err != -ENOENT)
+		ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
 	return ERR_PTR(err);
 }
 
@@ -215,8 +556,8 @@ lock_retry_remap:
  * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
  * undefined and in particular do not rely on it not changing.
  *
- * If @ctx->is_first is TRUE, the search begins with @ctx->attr itself.  If it
- * is FALSE, the search begins after @ctx->attr.
+ * If @ctx->is_first is 'true', the search begins with @ctx->attr itself.  If it
+ * is 'false', the search begins after @ctx->attr.
  *
  * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
  * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
@@ -256,11 +597,11 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 
 	/*
 	 * Iterate over attributes in mft record starting at @ctx->attr, or the
-	 * attribute following that, if @ctx->is_first is TRUE.
+	 * attribute following that, if @ctx->is_first is 'true'.
 	 */
 	if (ctx->is_first) {
 		a = ctx->attr;
-		ctx->is_first = FALSE;
+		ctx->is_first = false;
 	} else
 		a = (ATTR_RECORD*)((u8*)ctx->attr +
 				le32_to_cpu(ctx->attr->length));
@@ -393,6 +734,11 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
 	block_size_bits = sb->s_blocksize_bits;
 	down_read(&runlist->lock);
 	rl = runlist->rl;
+	if (!rl) {
+		ntfs_error(sb, "Cannot read attribute list since runlist is "
+				"missing.");
+		goto err_out;	
+	}
 	/* Read all clusters specified by the runlist one run at a time. */
 	while (rl->length) {
 		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
@@ -542,11 +888,11 @@ static int ntfs_external_attr_find(const ATTR_TYPE type,
 		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
 	/*
 	 * Iterate over entries in attribute list starting at @ctx->al_entry,
-	 * or the entry following that, if @ctx->is_first is TRUE.
+	 * or the entry following that, if @ctx->is_first is 'true'.
 	 */
 	if (ctx->is_first) {
 		al_entry = ctx->al_entry;
-		ctx->is_first = FALSE;
+		ctx->is_first = false;
 	} else
 		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
 				le16_to_cpu(ctx->al_entry->length));
@@ -700,7 +1046,7 @@ do_next_attr_loop:
 				le32_to_cpu(ctx->mrec->bytes_allocated))
 			break;
 		if (a->type == AT_END)
-			continue;
+			break;
 		if (!a->length)
 			break;
 		if (al_entry->instance != a->instance)
@@ -779,7 +1125,7 @@ not_found:
 	ctx->mrec = ctx->base_mrec;
 	ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
 			le16_to_cpu(ctx->mrec->attrs_offset));
-	ctx->is_first = TRUE;
+	ctx->is_first = true;
 	ctx->ntfs_ino = base_ni;
 	ctx->base_ntfs_ino = NULL;
 	ctx->base_mrec = NULL;
@@ -835,7 +1181,7 @@ not_found:
  * for, i.e. if one wants to add the attribute to the mft record this is the
  * correct place to insert its attribute list entry into.
  *
- * When -errno != -ENOENT, an error occured during the lookup.  @ctx->attr is
+ * When -errno != -ENOENT, an error occurred during the lookup.  @ctx->attr is
  * then undefined and in particular you should not rely on it not changing.
  */
 int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
@@ -846,6 +1192,7 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
 	ntfs_inode *base_ni;
 
 	ntfs_debug("Entering.");
+	BUG_ON(IS_ERR(ctx->mrec));
 	if (ctx->base_ntfs_ino)
 		base_ni = ctx->base_ntfs_ino;
 	else
@@ -870,15 +1217,14 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
 static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
 		ntfs_inode *ni, MFT_RECORD *mrec)
 {
-	ctx->mrec = mrec;
-	/* Sanity checks are performed elsewhere. */
-	ctx->attr = (ATTR_RECORD*)((u8*)mrec + le16_to_cpu(mrec->attrs_offset));
-	ctx->is_first = TRUE;
-	ctx->ntfs_ino = ni;
-	ctx->al_entry = NULL;
-	ctx->base_ntfs_ino = NULL;
-	ctx->base_mrec = NULL;
-	ctx->base_attr = NULL;
+	*ctx = (ntfs_attr_search_ctx) {
+		.mrec = mrec,
+		/* Sanity checks are performed elsewhere. */
+		.attr = (ATTR_RECORD*)((u8*)mrec +
+				le16_to_cpu(mrec->attrs_offset)),
+		.is_first = true,
+		.ntfs_ino = ni,
+	};
 }
 
 /**
@@ -895,7 +1241,7 @@ void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
 {
 	if (likely(!ctx->base_ntfs_ino)) {
 		/* No attribute list. */
-		ctx->is_first = TRUE;
+		ctx->is_first = true;
 		/* Sanity checks are performed elsewhere. */
 		ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
 				le16_to_cpu(ctx->mrec->attrs_offset));
@@ -924,7 +1270,7 @@ ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
 {
 	ntfs_attr_search_ctx *ctx;
 
-	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, SLAB_NOFS);
+	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
 	if (ctx)
 		ntfs_attr_init_search_ctx(ctx, ni, mrec);
 	return ctx;
@@ -945,6 +1291,8 @@ void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
 	return;
 }
 
+#ifdef NTFS_RW
+
 /**
  * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
  * @vol:	ntfs volume to which the attribute belongs
@@ -1024,27 +1372,21 @@ int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
  * Check whether the attribute of @type on the ntfs volume @vol is allowed to
  * be non-resident.  This information is obtained from $AttrDef system file.
  *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, or
+ * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
  * -ENOENT if the attribute is not listed in $AttrDef.
  */
 int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
 {
 	ATTR_DEF *ad;
 
-	/*
-	 * $DATA is always allowed to be non-resident even if $AttrDef does not
-	 * specify this in the flags of the $DATA attribute definition record.
-	 */
-	if (type == AT_DATA)
-		return 0;
 	/* Find the attribute definition record in $AttrDef. */
 	ad = ntfs_attr_find_in_attrdef(vol, type);
 	if (unlikely(!ad))
 		return -ENOENT;
 	/* Check the flags and return the result. */
-	if (ad->flags & CAN_BE_NON_RESIDENT)
-		return 0;
-	return -EPERM;
+	if (ad->flags & ATTR_DEF_RESIDENT)
+		return -EPERM;
+	return 0;
 }
 
 /**
@@ -1067,9 +1409,9 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
  */
 int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
 {
-	if (type != AT_INDEX_ALLOCATION && type != AT_EA)
-		return 0;
-	return -EPERM;
+	if (type == AT_INDEX_ALLOCATION)
+		return -EPERM;
+	return 0;
 }
 
 /**
@@ -1117,6 +1459,1018 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
 }
 
 /**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m:		mft record containing attribute record
+ * @a:		attribute record whose value to resize
+ * @new_size:	new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly allocated space is cleared.
+ *
+ * Return 0 on success and -errno on error.  The following error codes are
+ * defined:
+ *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ *	    are interested in the data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size)
+{
+	u32 old_size;
+
+	/* Resize the resident part of the attribute record. */
+	if (ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) + new_size))
+		return -ENOSPC;
+	/*
+	 * The resize succeeded!  If we made the attribute value bigger, clear
+	 * the area between the old size and @new_size.
+	 */
+	old_size = le32_to_cpu(a->data.resident.value_length);
+	if (new_size > old_size)
+		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+				old_size, 0, new_size - old_size);
+	/* Finally update the length of the attribute value. */
+	a->data.resident.value_length = cpu_to_le32(new_size);
+	return 0;
+}
+
+/**
+ * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
+ * @ni:		ntfs inode describing the attribute to convert
+ * @data_size:	size of the resident data to copy to the non-resident attribute
+ *
+ * Convert the resident ntfs attribute described by the ntfs inode @ni to a
+ * non-resident one.
+ *
+ * @data_size must be equal to the attribute value size.  This is needed since
+ * we need to know the size before we can map the mft record and our callers
+ * always know it.  The reason we cannot simply read the size from the vfs
+ * inode i_size is that this is not necessarily uptodate.  This happens when
+ * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
+ *
+ * Return 0 on success and -errno on error.  The following error return codes
+ * are defined:
+ *	-EPERM	- The attribute is not allowed to be non-resident.
+ *	-ENOMEM	- Not enough memory.
+ *	-ENOSPC	- Not enough disk space.
+ *	-EINVAL	- Attribute not defined on the volume.
+ *	-EIO	- I/o error or other error.
+ * Note that -ENOSPC is also returned in the case that there is not enough
+ * space in the mft record to do the conversion.  This can happen when the mft
+ * record is already very full.  The caller is responsible for trying to make
+ * space in the mft record and trying again.  FIXME: Do we need a separate
+ * error return code for this kind of -ENOSPC or is it always worth trying
+ * again in case the attribute may then fit in a resident state so no need to
+ * make it non-resident at all?  Ho-hum...  (AIA)
+ *
+ * NOTE to self: No changes in the attribute list are required to move from
+ *		 a resident to a non-resident attribute.
+ *
+ * Locking: - The caller must hold i_mutex on the inode.
+ */
+int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
+{
+	s64 new_size;
+	struct inode *vi = VFS_I(ni);
+	ntfs_volume *vol = ni->vol;
+	ntfs_inode *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	struct page *page;
+	runlist_element *rl;
+	u8 *kaddr;
+	unsigned long flags;
+	int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
+	u32 attr_size;
+	u8 old_res_attr_flags;
+
+	/* Check that the attribute is allowed to be non-resident. */
+	err = ntfs_attr_can_be_non_resident(vol, ni->type);
+	if (unlikely(err)) {
+		if (err == -EPERM)
+			ntfs_debug("Attribute is not allowed to be "
+					"non-resident.");
+		else
+			ntfs_debug("Attribute not defined on the NTFS "
+					"volume!");
+		return err;
+	}
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
+	/*
+	 * The size needs to be aligned to a cluster boundary for allocation
+	 * purposes.
+	 */
+	new_size = (data_size + vol->cluster_size - 1) &
+			~(vol->cluster_size - 1);
+	if (new_size > 0) {
+		/*
+		 * Will need the page later and since the page lock nests
+		 * outside all ntfs locks, we need to get the page now.
+		 */
+		page = find_or_create_page(vi->i_mapping, 0,
+				mapping_gfp_mask(vi->i_mapping));
+		if (unlikely(!page))
+			return -ENOMEM;
+		/* Start by allocating clusters to hold the attribute value. */
+		rl = ntfs_cluster_alloc(vol, 0, new_size >>
+				vol->cluster_size_bits, -1, DATA_ZONE, true);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			ntfs_debug("Failed to allocate cluster%s, error code "
+					"%i.", (new_size >>
+					vol->cluster_size_bits) > 1 ? "s" : "",
+					err);
+			goto page_err_out;
+		}
+	} else {
+		rl = NULL;
+		page = NULL;
+	}
+	/* Determine the size of the mapping pairs array. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
+	if (unlikely(mp_size < 0)) {
+		err = mp_size;
+		ntfs_debug("Failed to get size for mapping pairs array, error "
+				"code %i.", err);
+		goto rl_err_out;
+	}
+	down_write(&ni->runlist.lock);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(NInoNonResident(ni));
+	BUG_ON(a->non_resident);
+	/*
+	 * Calculate new offsets for the name and the mapping pairs array.
+	 */
+	if (NInoSparse(ni) || NInoCompressed(ni))
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) +
+				sizeof(a->data.non_resident.compressed_size) +
+				7) & ~7;
+	else
+		name_ofs = (offsetof(ATTR_REC,
+				data.non_resident.compressed_size) + 7) & ~7;
+	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+	/*
+	 * Determine the size of the resident part of the now non-resident
+	 * attribute record.
+	 */
+	arec_size = (mp_ofs + mp_size + 7) & ~7;
+	/*
+	 * If the page is not uptodate bring it uptodate by copying from the
+	 * attribute value.
+	 */
+	attr_size = le32_to_cpu(a->data.resident.value_length);
+	BUG_ON(attr_size != data_size);
+	if (page && !PageUptodate(page)) {
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, (u8*)a +
+				le16_to_cpu(a->data.resident.value_offset),
+				attr_size);
+		memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
+		kunmap_atomic(kaddr);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+	/* Backup the attribute flag. */
+	old_res_attr_flags = a->data.resident.flags;
+	/* Resize the resident part of the attribute record. */
+	err = ntfs_attr_record_resize(m, a, arec_size);
+	if (unlikely(err))
+		goto err_out;
+	/*
+	 * Convert the resident part of the attribute record to describe a
+	 * non-resident attribute.
+	 */
+	a->non_resident = 1;
+	/* Move the attribute name if it exists and update the offset. */
+	if (a->name_length)
+		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+				a->name_length * sizeof(ntfschar));
+	a->name_offset = cpu_to_le16(name_ofs);
+	/* Setup the fields specific to non-resident attributes. */
+	a->data.non_resident.lowest_vcn = 0;
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
+			vol->cluster_size_bits);
+	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
+	memset(&a->data.non_resident.reserved, 0,
+			sizeof(a->data.non_resident.reserved));
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
+	a->data.non_resident.data_size =
+			a->data.non_resident.initialized_size =
+			cpu_to_sle64(attr_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		a->data.non_resident.compression_unit = 0;
+		if (NInoCompressed(ni) || vol->major_ver < 3)
+			a->data.non_resident.compression_unit = 4;
+		a->data.non_resident.compressed_size =
+				a->data.non_resident.allocated_size;
+	} else
+		a->data.non_resident.compression_unit = 0;
+	/* Generate the mapping pairs array into the attribute record. */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
+			arec_size - mp_ofs, rl, 0, -1, NULL);
+	if (unlikely(err)) {
+		ntfs_debug("Failed to build mapping pairs, error code %i.",
+				err);
+		goto undo_err_out;
+	}
+	/* Setup the in-memory attribute structure to be non-resident. */
+	ni->runlist.rl = rl;
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_size;
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size = ni->allocated_size;
+		if (a->data.non_resident.compression_unit) {
+			ni->itype.compressed.block_size = 1U << (a->data.
+					non_resident.compression_unit +
+					vol->cluster_size_bits);
+			ni->itype.compressed.block_size_bits =
+					ffs(ni->itype.compressed.block_size) -
+					1;
+			ni->itype.compressed.block_clusters = 1U <<
+					a->data.non_resident.compression_unit;
+		} else {
+			ni->itype.compressed.block_size = 0;
+			ni->itype.compressed.block_size_bits = 0;
+			ni->itype.compressed.block_clusters = 0;
+		}
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	} else
+		vi->i_blocks = ni->allocated_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * This needs to be last since the address space operations ->readpage
+	 * and ->writepage can run concurrently with us as they are not
+	 * serialized on i_mutex.  Note, we are not allowed to fail once we flip
+	 * this switch, which is another reason to do this last.
+	 */
+	NInoSetNonResident(ni);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	if (page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	ntfs_debug("Done.");
+	return 0;
+undo_err_out:
+	/* Convert the attribute back into a resident attribute. */
+	a->non_resident = 0;
+	/* Move the attribute name if it exists and update the offset. */
+	name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
+			sizeof(a->data.resident.reserved) + 7) & ~7;
+	if (a->name_length)
+		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+				a->name_length * sizeof(ntfschar));
+	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+	a->name_offset = cpu_to_le16(name_ofs);
+	arec_size = (mp_ofs + attr_size + 7) & ~7;
+	/* Resize the resident part of the attribute record. */
+	err2 = ntfs_attr_record_resize(m, a, arec_size);
+	if (unlikely(err2)) {
+		/*
+		 * This cannot happen (well if memory corruption is at work it
+		 * could happen in theory), but deal with it as well as we can.
+		 * If the old size is too small, truncate the attribute,
+		 * otherwise simply give it a larger allocated size.
+		 * FIXME: Should check whether chkdsk complains when the
+		 * allocated size is much bigger than the resident value size.
+		 */
+		arec_size = le32_to_cpu(a->length);
+		if ((mp_ofs + attr_size) > arec_size) {
+			err2 = attr_size;
+			attr_size = arec_size - mp_ofs;
+			ntfs_error(vol->sb, "Failed to undo partial resident "
+					"to non-resident attribute "
+					"conversion.  Truncating inode 0x%lx, "
+					"attribute type 0x%x from %i bytes to "
+					"%i bytes to maintain metadata "
+					"consistency.  THIS MEANS YOU ARE "
+					"LOSING %i BYTES DATA FROM THIS %s.",
+					vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type),
+					err2, attr_size, err2 - attr_size,
+					((ni->type == AT_DATA) &&
+					!ni->name_len) ? "FILE": "ATTRIBUTE");
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->initialized_size = attr_size;
+			i_size_write(vi, attr_size);
+			write_unlock_irqrestore(&ni->size_lock, flags);
+		}
+	}
+	/* Setup the fields specific to resident attributes. */
+	a->data.resident.value_length = cpu_to_le32(attr_size);
+	a->data.resident.value_offset = cpu_to_le16(mp_ofs);
+	a->data.resident.flags = old_res_attr_flags;
+	memset(&a->data.resident.reserved, 0,
+			sizeof(a->data.resident.reserved));
+	/* Copy the data from the page back to the attribute value. */
+	if (page) {
+		kaddr = kmap_atomic(page);
+		memcpy((u8*)a + mp_ofs, kaddr, attr_size);
+		kunmap_atomic(kaddr);
+	}
+	/* Setup the allocated size in the ntfs inode in case it changed. */
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = arec_size - mp_ofs;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ni->runlist.rl = NULL;
+	up_write(&ni->runlist.lock);
+rl_err_out:
+	if (rl) {
+		if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
+			ntfs_error(vol->sb, "Failed to release allocated "
+					"cluster(s) in error code path.  Run "
+					"chkdsk to recover the lost "
+					"cluster(s).");
+			NVolSetErrors(vol);
+		}
+		ntfs_free(rl);
+page_err_out:
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (err == -EINVAL)
+		err = -EIO;
+	return err;
+}
+
+/**
+ * ntfs_attr_extend_allocation - extend the allocated space of an attribute
+ * @ni:			ntfs inode of the attribute whose allocation to extend
+ * @new_alloc_size:	new size in bytes to which to extend the allocation to
+ * @new_data_size:	new size in bytes to which to extend the data to
+ * @data_start:		beginning of region which is required to be non-sparse
+ *
+ * Extend the allocated space of an attribute described by the ntfs inode @ni
+ * to @new_alloc_size bytes.  If @data_start is -1, the whole extension may be
+ * implemented as a hole in the file (as long as both the volume and the ntfs
+ * inode @ni have sparse support enabled).  If @data_start is >= 0, then the
+ * region between the old allocated size and @data_start - 1 may be made sparse
+ * but the regions between @data_start and @new_alloc_size must be backed by
+ * actual clusters.
+ *
+ * If @new_data_size is -1, it is ignored.  If it is >= 0, then the data size
+ * of the attribute is extended to @new_data_size.  Note that the i_size of the
+ * vfs inode is not updated.  Only the data size in the base attribute record
+ * is updated.  The caller has to update i_size separately if this is required.
+ * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
+ * size as well as for @new_data_size to be greater than @new_alloc_size.
+ *
+ * For resident attributes this involves resizing the attribute record and if
+ * necessary moving it and/or other attributes into extent mft records and/or
+ * converting the attribute to a non-resident attribute which in turn involves
+ * extending the allocation of a non-resident attribute as described below.
+ *
+ * For non-resident attributes this involves allocating clusters in the data
+ * zone on the volume (except for regions that are being made sparse) and
+ * extending the run list to describe the allocated clusters as well as
+ * updating the mapping pairs array of the attribute.  This in turn involves
+ * resizing the attribute record and if necessary moving it and/or other
+ * attributes into extent mft records and/or splitting the attribute record
+ * into multiple extent attribute records.
+ *
+ * Also, the attribute list attribute is updated if present and in some of the
+ * above cases (the ones where extent mft records/attributes come into play),
+ * an attribute list attribute is created if not already present.
+ *
+ * Return the new allocated size on success and -errno on error.  In the case
+ * that an error is encountered but a partial extension at least up to
+ * @data_start (if present) is possible, the allocation is partially extended
+ * and this is returned.  This means the caller must check the returned size to
+ * determine if the extension was partial.  If @data_start is -1 then partial
+ * allocations are not performed.
+ *
+ * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
+ *
+ * Locking: This function takes the runlist lock of @ni for writing as well as
+ * locking the mft record of the base ntfs inode.  These locks are maintained
+ * throughout execution of the function.  These locks are required so that the
+ * attribute can be resized safely and so that it can for example be converted
+ * from resident to non-resident safely.
+ *
+ * TODO: At present attribute list attribute handling is not implemented.
+ *
+ * TODO: At present it is not safe to call this function for anything other
+ * than the $DATA attribute(s) of an uncompressed and unencrypted file.
+ */
+s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+		const s64 new_data_size, const s64 data_start)
+{
+	VCN vcn;
+	s64 ll, allocated_size, start = data_start;
+	struct inode *vi = VFS_I(ni);
+	ntfs_volume *vol = ni->vol;
+	ntfs_inode *base_ni;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx;
+	runlist_element *rl, *rl2;
+	unsigned long flags;
+	int err, mp_size;
+	u32 attr_len = 0; /* Silence stupid gcc warning. */
+	bool mp_rebuilt;
+
+#ifdef DEBUG
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+			"old_allocated_size 0x%llx, "
+			"new_allocated_size 0x%llx, new_data_size 0x%llx, "
+			"data_start 0x%llx.", vi->i_ino,
+			(unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)allocated_size,
+			(unsigned long long)new_alloc_size,
+			(unsigned long long)new_data_size,
+			(unsigned long long)start);
+#endif
+retry_extend:
+	/*
+	 * For non-resident attributes, @start and @new_size need to be aligned
+	 * to cluster boundaries for allocation purposes.
+	 */
+	if (NInoNonResident(ni)) {
+		if (start > 0)
+			start &= ~(s64)vol->cluster_size_mask;
+		new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
+				~(s64)vol->cluster_size_mask;
+	}
+	BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
+	/* Check if new size is allowed in $AttrDef. */
+	err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
+	if (unlikely(err)) {
+		/* Only emit errors when the write will fail completely. */
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (start < 0 || start >= allocated_size) {
+			if (err == -ERANGE) {
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because the new "
+						"allocation would exceed the "
+						"maximum allowed size for "
+						"this attribute type.",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+			} else {
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because this "
+						"attribute type is not "
+						"defined on the NTFS volume.  "
+						"Possible corruption!  You "
+						"should run chkdsk!",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+			}
+		}
+		/* Translate error code to be POSIX conformant for write(2). */
+		if (err == -ERANGE)
+			err = -EFBIG;
+		else
+			err = -EIO;
+		return err;
+	}
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/*
+	 * We will be modifying both the runlist (if non-resident) and the mft
+	 * record so lock them both down.
+	 */
+	down_write(&ni->runlist.lock);
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * If non-resident, seek to the last extent.  If resident, there is
+	 * only one extent, so seek to that.
+	 */
+	vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
+			0;
+	/*
+	 * Abort if someone did the work whilst we waited for the locks.  If we
+	 * just converted the attribute from resident to non-resident it is
+	 * likely that exactly this has happened already.  We cannot quite
+	 * abort if we need to update the data size.
+	 */
+	if (unlikely(new_alloc_size <= allocated_size)) {
+		ntfs_debug("Allocated size already exceeds requested size.");
+		new_alloc_size = allocated_size;
+		if (new_data_size < 0)
+			goto done;
+		/*
+		 * We want the first attribute extent so that we can update the
+		 * data size.
+		 */
+		vcn = 0;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, vcn, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/* Use goto to reduce indentation. */
+	if (a->non_resident)
+		goto do_non_resident_extend;
+	BUG_ON(NInoNonResident(ni));
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	/*
+	 * Extend the attribute record to be able to store the new attribute
+	 * size.  ntfs_attr_record_resize() will not do anything if the size is
+	 * not changing.
+	 */
+	if (new_alloc_size < vol->mft_record_size &&
+			!ntfs_attr_record_resize(m, a,
+			le16_to_cpu(a->data.resident.value_offset) +
+			new_alloc_size)) {
+		/* The resize succeeded! */
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		if (new_data_size >= 0) {
+			BUG_ON(new_data_size < attr_len);
+			a->data.resident.value_length =
+					cpu_to_le32((u32)new_data_size);
+		}
+		goto flush_done;
+	}
+	/*
+	 * We have to drop all the locks so we can call
+	 * ntfs_attr_make_non_resident().  This could be optimised by try-
+	 * locking the first page cache page and only if that fails dropping
+	 * the locks, locking the page, and redoing all the locking and
+	 * lookups.  While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path.
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	/*
+	 * Not enough space in the mft record, try to make the attribute
+	 * non-resident and if successful restart the extension process.
+	 */
+	err = ntfs_attr_make_non_resident(ni, attr_len);
+	if (likely(!err))
+		goto retry_extend;
+	/*
+	 * Could not make non-resident.  If this is due to this not being
+	 * permitted for this attribute type or there not being enough space,
+	 * try to make other attributes non-resident.  Otherwise fail.
+	 */
+	if (unlikely(err != -EPERM && err != -ENOSPC)) {
+		/* Only emit errors when the write will fail completely. */
+		read_lock_irqsave(&ni->size_lock, flags);
+		allocated_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the conversion from resident "
+					"to non-resident attribute failed "
+					"with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		goto conv_err_out;
+	}
+	/* TODO: Not implemented from here, abort. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	allocated_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (start < 0 || start >= allocated_size) {
+		if (err == -ENOSPC)
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record/on disk for the non-resident "
+					"attribute value.  This case is not "
+					"implemented yet.");
+		else /* if (err == -EPERM) */
+			ntfs_error(vol->sb, "This attribute type may not be "
+					"non-resident.  This case is not "
+					"implemented yet.");
+	}
+	err = -EOPNOTSUPP;
+	goto conv_err_out;
+#if 0
+	// TODO: Attempt to make other attributes non-resident.
+	if (!err)
+		goto do_resident_extend;
+	/*
+	 * Both the attribute list attribute and the standard information
+	 * attribute must remain in the base inode.  Thus, if this is one of
+	 * these attributes, we have to try to move other attributes out into
+	 * extent mft records instead.
+	 */
+	if (ni->type == AT_ATTRIBUTE_LIST ||
+			ni->type == AT_STANDARD_INFORMATION) {
+		// TODO: Attempt to move other attributes into extent mft
+		// records.
+		err = -EOPNOTSUPP;
+		if (!err)
+			goto do_resident_extend;
+		goto err_out;
+	}
+	// TODO: Attempt to move this attribute to an extent mft record, but
+	// only if it is not already the only attribute in an mft record in
+	// which case there would be nothing to gain.
+	err = -EOPNOTSUPP;
+	if (!err)
+		goto do_resident_extend;
+	/* There is nothing we can do to make enough space. )-: */
+	goto err_out;
+#endif
+do_non_resident_extend:
+	BUG_ON(!NInoNonResident(ni));
+	if (new_alloc_size == allocated_size) {
+		BUG_ON(vcn);
+		goto alloc_done;
+	}
+	/*
+	 * If the data starts after the end of the old allocation, this is a
+	 * $DATA attribute and sparse attributes are enabled on the volume and
+	 * for this inode, then create a sparse region between the old
+	 * allocated size and the start of the data.  Otherwise simply proceed
+	 * with filling the whole space between the old allocated size and the
+	 * new allocated size with clusters.
+	 */
+	if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
+			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
+		goto skip_sparse;
+	// TODO: This is not implemented yet.  We just fill in with real
+	// clusters for now...
+	ntfs_debug("Inserting holes is not-implemented yet.  Falling back to "
+			"allocating real clusters instead.");
+skip_sparse:
+	rl = ni->runlist.rl;
+	if (likely(rl)) {
+		/* Seek to the end of the runlist. */
+		while (rl->length)
+			rl++;
+	}
+	/* If this attribute extent is not mapped, map it now. */
+	if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
+			(rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
+			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
+		if (!rl && !allocated_size)
+			goto first_alloc;
+		rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			if (start < 0 || start >= allocated_size)
+				ntfs_error(vol->sb, "Cannot extend allocation "
+						"of inode 0x%lx, attribute "
+						"type 0x%x, because the "
+						"mapping of a runlist "
+						"fragment failed with error "
+						"code %i.", vi->i_ino,
+						(unsigned)le32_to_cpu(ni->type),
+						err);
+			if (err != -ENOMEM)
+				err = -EIO;
+			goto err_out;
+		}
+		ni->runlist.rl = rl;
+		/* Seek to the end of the runlist. */
+		while (rl->length)
+			rl++;
+	}
+	/*
+	 * We now know the runlist of the last extent is mapped and @rl is at
+	 * the end of the runlist.  We want to begin allocating clusters
+	 * starting at the last allocated cluster to reduce fragmentation.  If
+	 * there are no valid LCNs in the attribute we let the cluster
+	 * allocator choose the starting cluster.
+	 */
+	/* If the last LCN is a hole or simillar seek back to last real LCN. */
+	while (rl->lcn < 0 && rl > ni->runlist.rl)
+		rl--;
+first_alloc:
+	// FIXME: Need to implement partial allocations so at least part of the
+	// write can be performed when start >= 0.  (Needed for POSIX write(2)
+	// conformance.)
+	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
+			(new_alloc_size - allocated_size) >>
+			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
+			rl->lcn + rl->length : -1, DATA_ZONE, true);
+	if (IS_ERR(rl2)) {
+		err = PTR_ERR(rl2);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the allocation of clusters "
+					"failed with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM && err != -ENOSPC)
+			err = -EIO;
+		goto err_out;
+	}
+	rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+	if (IS_ERR(rl)) {
+		err = PTR_ERR(rl);
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because the runlist merge failed "
+					"with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		if (ntfs_cluster_free_from_rl(vol, rl2)) {
+			ntfs_error(vol->sb, "Failed to release allocated "
+					"cluster(s) in error code path.  Run "
+					"chkdsk to recover the lost "
+					"cluster(s).");
+			NVolSetErrors(vol);
+		}
+		ntfs_free(rl2);
+		goto err_out;
+	}
+	ni->runlist.rl = rl;
+	ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
+			allocated_size) >> vol->cluster_size_bits);
+	/* Find the runlist element with which the attribute extent starts. */
+	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+	rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
+	BUG_ON(!rl2);
+	BUG_ON(!rl2->length);
+	BUG_ON(rl2->lcn < LCN_HOLE);
+	mp_rebuilt = false;
+	/* Get the size for the new mapping pairs array for this extent. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+	if (unlikely(mp_size <= 0)) {
+		err = mp_size;
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because determining the size for the "
+					"mapping pairs failed with error code "
+					"%i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		err = -EIO;
+		goto undo_alloc;
+	}
+	/* Extend the attribute record to fit the bigger mapping pairs array. */
+	attr_len = le32_to_cpu(a->length);
+	err = ntfs_attr_record_resize(m, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	if (unlikely(err)) {
+		BUG_ON(err != -ENOSPC);
+		// TODO: Deal with this by moving this extent to a new mft
+		// record or by starting a new extent in a new mft record,
+		// possibly by extending this extent partially and filling it
+		// and creating a new extent for the remainder, or by making
+		// other attributes non-resident and/or by moving other
+		// attributes out of this mft record.
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record for the extended attribute "
+					"record.  This case is not "
+					"implemented yet.");
+		err = -EOPNOTSUPP;
+		goto undo_alloc;
+	}
+	mp_rebuilt = true;
+	/* Generate the mapping pairs array directly into the attr record. */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, rl2, ll, -1, NULL);
+	if (unlikely(err)) {
+		if (start < 0 || start >= allocated_size)
+			ntfs_error(vol->sb, "Cannot extend allocation of "
+					"inode 0x%lx, attribute type 0x%x, "
+					"because building the mapping pairs "
+					"failed with error code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+		err = -EIO;
+		goto undo_alloc;
+	}
+	/* Update the highest_vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+			vol->cluster_size_bits) - 1);
+	/*
+	 * We now have extended the allocated size of the attribute.  Reflect
+	 * this in the ntfs_inode structure and the attribute record.
+	 */
+	if (a->data.non_resident.lowest_vcn) {
+		/*
+		 * We are not in the first attribute extent, switch to it, but
+		 * first ensure the changes will make it to disk later.
+		 */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_reinit_search_ctx(ctx);
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (unlikely(err))
+			goto restore_undo_alloc;
+		/* @m is not used any more so no need to set it. */
+		a = ctx->attr;
+	}
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_alloc_size;
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+	/*
+	 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
+	 * since those can have sparse/compressed set.  For example can be
+	 * set compressed even though it is not compressed itself and in that
+	 * case the bit means that files are to be created compressed in the
+	 * directory...  At present this is ok as this code is only called for
+	 * regular files, and only for their $DATA attribute(s).
+	 * FIXME: The calculation is wrong if we created a hole above.  For now
+	 * it does not matter as we never create holes.
+	 */
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		ni->itype.compressed.size += new_alloc_size - allocated_size;
+		a->data.non_resident.compressed_size =
+				cpu_to_sle64(ni->itype.compressed.size);
+		vi->i_blocks = ni->itype.compressed.size >> 9;
+	} else
+		vi->i_blocks = new_alloc_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+alloc_done:
+	if (new_data_size >= 0) {
+		BUG_ON(new_data_size <
+				sle64_to_cpu(a->data.non_resident.data_size));
+		a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
+	}
+flush_done:
+	/* Ensure the changes make it to disk. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+done:
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	ntfs_debug("Done, new_allocated_size 0x%llx.",
+			(unsigned long long)new_alloc_size);
+	return new_alloc_size;
+restore_undo_alloc:
+	if (start < 0 || start >= allocated_size)
+		ntfs_error(vol->sb, "Cannot complete extension of allocation "
+				"of inode 0x%lx, attribute type 0x%x, because "
+				"lookup of first attribute extent failed with "
+				"error code %i.", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), err);
+	if (err == -ENOENT)
+		err = -EIO;
+	ntfs_attr_reinit_search_ctx(ctx);
+	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
+			allocated_size >> vol->cluster_size_bits, NULL, 0,
+			ctx)) {
+		ntfs_error(vol->sb, "Failed to find last attribute extent of "
+				"attribute in error code path.  Run chkdsk to "
+				"recover.");
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->allocated_size = new_alloc_size;
+		/*
+		 * FIXME: This would fail if @ni is a directory...  See above.
+		 * FIXME: The calculation is wrong if we created a hole above.
+		 * For now it does not matter as we never create holes.
+		 */
+		if (NInoSparse(ni) || NInoCompressed(ni)) {
+			ni->itype.compressed.size += new_alloc_size -
+					allocated_size;
+			vi->i_blocks = ni->itype.compressed.size >> 9;
+		} else
+			vi->i_blocks = new_alloc_size >> 9;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+		up_write(&ni->runlist.lock);
+		/*
+		 * The only thing that is now wrong is the allocated size of the
+		 * base attribute extent which chkdsk should be able to fix.
+		 */
+		NVolSetErrors(vol);
+		return err;
+	}
+	ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
+			(allocated_size >> vol->cluster_size_bits) - 1);
+undo_alloc:
+	ll = allocated_size >> vol->cluster_size_bits;
+	if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
+		ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
+				"in error code path.  Run chkdsk to recover "
+				"the lost cluster(s).");
+		NVolSetErrors(vol);
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/*
+	 * If the runlist truncation fails and/or the search context is no
+	 * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array thus we mark the inode bad so that no access to
+	 * the freed clusters can happen.
+	 */
+	if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
+		ntfs_error(vol->sb, "Failed to %s in error code path.  Run "
+				"chkdsk to recover.", IS_ERR(m) ?
+				"restore attribute search context" :
+				"truncate attribute runlist");
+		NVolSetErrors(vol);
+	} else if (mp_rebuilt) {
+		if (ntfs_attr_record_resize(m, a, attr_len)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record in error code path.  Run "
+					"chkdsk to recover.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+					a->data.non_resident.
+					mapping_pairs_offset), attr_len -
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), rl2, ll, -1,
+					NULL)) {
+				ntfs_error(vol->sb, "Failed to restore "
+						"mapping pairs array in error "
+						"code path.  Run chkdsk to "
+						"recover.");
+				NVolSetErrors(vol);
+			}
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+		}
+	}
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+conv_err_out:
+	ntfs_debug("Failed.  Returning error code %i.", err);
+	return err;
+}
+
+/**
  * ntfs_attr_set - fill (a part of) an attribute with a byte
  * @ni:		ntfs inode describing the attribute to fill
  * @ofs:	offset inside the attribute at which to start to fill
@@ -1127,6 +2481,10 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
  * byte offset @ofs inside the attribute with the constant byte @val.
  *
  * This function is effectively like memset() applied to an ntfs attribute.
+ * Note thie function actually only operates on the page cache pages belonging
+ * to the ntfs attribute and it marks them dirty after doing the memset().
+ * Thus it relies on the vm dirty page write code paths to cause the modified
+ * pages to be written to the mft record/disk.
  *
  * Return 0 on success and -errno on error.  An error code of -ESPIPE means
  * that @ofs + @cnt were outside the end of the attribute and no write was
@@ -1139,7 +2497,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	struct page *page;
 	u8 *kaddr;
 	pgoff_t idx, end;
-	unsigned int start_ofs, end_ofs, size;
+	unsigned start_ofs, end_ofs, size;
 
 	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
 			(long long)ofs, (long long)cnt, val);
@@ -1147,6 +2505,12 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	BUG_ON(cnt < 0);
 	if (!cnt)
 		goto done;
+	/*
+	 * FIXME: Compressed and encrypted attributes are not supported when
+	 * writing and we should never have gotten here for them.
+	 */
+	BUG_ON(NInoCompressed(ni));
+	BUG_ON(NInoEncrypted(ni));
 	mapping = VFS_I(ni)->i_mapping;
 	/* Work out the starting index and page offset. */
 	idx = ofs >> PAGE_CACHE_SHIFT;
@@ -1155,25 +2519,17 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	end = ofs + cnt;
 	end_ofs = end & ~PAGE_CACHE_MASK;
 	/* If the end is outside the inode size return -ESPIPE. */
-	if (unlikely(end > VFS_I(ni)->i_size)) {
+	if (unlikely(end > i_size_read(VFS_I(ni)))) {
 		ntfs_error(vol->sb, "Request exceeds end of attribute.");
 		return -ESPIPE;
 	}
 	end >>= PAGE_CACHE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
-					"page (sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read first partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"page (error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
 		/*
@@ -1183,12 +2539,14 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		size = PAGE_CACHE_SIZE;
 		if (idx == end)
 			size = end_ofs;
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		memset(kaddr + start_ofs, val, size - start_ofs);
 		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 		set_page_dirty(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
 		if (idx == end)
 			goto done;
 		idx++;
@@ -1202,10 +2560,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 					"page (index 0x%lx).", idx);
 			return -ENOMEM;
 		}
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		memset(kaddr, val, PAGE_CACHE_SIZE);
 		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 		/*
 		 * If the page has buffers, mark them uptodate since buffer
 		 * state and not page state is definitive in 2.6 kernels.
@@ -1228,31 +2586,29 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		/* Finally unlock and release the page. */
 		unlock_page(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
 	}
 	/* If there is a last partial page, need to do it the slow way. */
 	if (end_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"(error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		memset(kaddr, val, end_ofs);
 		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 		set_page_dirty(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
 	}
 done:
 	ntfs_debug("Done.");
 	return 0;
 }
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index e0c2c6c81bc..3c8b74c99b8 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -2,7 +2,7 @@
  * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
  *	      Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -40,10 +40,10 @@
  * Structure must be initialized to zero before the first call to one of the
  * attribute search functions. Initialize @mrec to point to the mft record to
  * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to TRUE (not necessary
+ * if calling the _first() functions), and set @is_first to 'true' (not necessary
  * if calling the _first() functions).
  *
- * If @is_first is TRUE, the search begins with @attr. If @is_first is FALSE,
+ * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
  * the search begins after @attr. This is so that, after the first call to one
  * of the search attribute functions, we can call the function again, without
  * any modification of the search context, to automagically get the next
@@ -52,7 +52,7 @@
 typedef struct {
 	MFT_RECORD *mrec;
 	ATTR_RECORD *attr;
-	BOOL is_first;
+	bool is_first;
 	ntfs_inode *ntfs_ino;
 	ATTR_LIST_ENTRY *al_entry;
 	ntfs_inode *base_ntfs_ino;
@@ -60,10 +60,15 @@ typedef struct {
 	ATTR_RECORD *base_attr;
 } ntfs_attr_search_ctx;
 
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
+		ntfs_attr_search_ctx *ctx);
 extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
 
-extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
-		const BOOL need_write);
+extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const bool write_locked);
+
+extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
+		const VCN vcn, ntfs_attr_search_ctx *ctx);
 
 int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
 		const u32 name_len, const IGNORE_CASE_BOOL ic,
@@ -85,6 +90,8 @@ extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
 		MFT_RECORD *mrec);
 extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
 
+#ifdef NTFS_RW
+
 extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
 		const ATTR_TYPE type, const s64 size);
 extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
@@ -93,8 +100,17 @@ extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
 		const ATTR_TYPE type);
 
 extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+		const u32 new_size);
+
+extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
+
+extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+		const s64 new_data_size, const s64 data_start);
 
 extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
 		const u8 val);
 
+#endif /* NTFS_RW */
+
 #endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 12cf2e30c7d..0809cf87609 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -1,7 +1,7 @@
 /*
  * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -34,18 +34,18 @@
  * @start_bit:		first bit to set
  * @count:		number of bits to set
  * @value:		value to set the bits to (i.e. 0 or 1)
- * @is_rollback:	if TRUE this is a rollback operation
+ * @is_rollback:	if 'true' this is a rollback operation
  *
  * Set @count bits starting at bit @start_bit in the bitmap described by the
  * vfs inode @vi to @value, where @value is either 0 or 1.
  *
- * @is_rollback should always be FALSE, it is for internal use to rollback
+ * @is_rollback should always be 'false', it is for internal use to rollback
  * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
  *
  * Return 0 on success and -errno on error.
  */
 int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const BOOL is_rollback)
+		const s64 count, const u8 value, const bool is_rollback)
 {
 	s64 cnt = count;
 	pgoff_t index, end_index;
@@ -90,7 +90,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	/* If the first byte is partial, modify the appropriate bits in it. */
 	if (bit) {
 		u8 *byte = kaddr + pos;
-		while ((bit & 7) && cnt--) {
+		while ((bit & 7) && cnt) {
+			cnt--;
 			if (value)
 				*byte |= 1 << bit++;
 			else
@@ -171,7 +172,7 @@ rollback:
 		return PTR_ERR(page);
 	if (count != cnt)
 		pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
-				value ? 0 : 1, TRUE);
+				value ? 0 : 1, true);
 	else
 		pos = 0;
 	if (!pos) {
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
index bb50d6bc921..72c9ad8be70 100644
--- a/fs/ntfs/bitmap.h
+++ b/fs/ntfs/bitmap.h
@@ -30,7 +30,7 @@
 #include "types.h"
 
 extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const BOOL is_rollback);
+		const s64 count, const u8 value, const bool is_rollback);
 
 /**
  * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
@@ -48,7 +48,7 @@ static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
 		const s64 start_bit, const s64 count, const u8 value)
 {
 	return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
-			FALSE);
+			false);
 }
 
 /**
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
index e027f36fcc2..aba83347e5f 100644
--- a/fs/ntfs/collate.h
+++ b/fs/ntfs/collate.h
@@ -26,7 +26,7 @@
 #include "types.h"
 #include "volume.h"
 
-static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
 	int i;
 
 	/*
@@ -35,12 +35,12 @@ static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
 	 * now.
 	 */
 	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
-		return FALSE;
+		return false;
 	i = le32_to_cpu(cr);
 	if (likely(((i >= 0) && (i <= 0x02)) ||
 			((i >= 0x10) && (i <= 0x13))))
-		return TRUE;
-	return FALSE;
+		return true;
+	return false;
 }
 
 extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index ee5ae706f86..f82498c35e7 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "inode.h"
@@ -57,7 +58,7 @@ typedef enum {
 /**
  * ntfs_compression_buffer - one buffer for the decompression engine
  */
-static u8 *ntfs_compression_buffer = NULL;
+static u8 *ntfs_compression_buffer;
 
 /**
  * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
@@ -67,7 +68,7 @@ static DEFINE_SPINLOCK(ntfs_cb_lock);
 /**
  * allocate_compression_buffers - allocate the decompression buffers
  *
- * Caller has to hold the ntfs_lock semaphore.
+ * Caller has to hold the ntfs_lock mutex.
  *
  * Return 0 on success or -ENOMEM if the allocations failed.
  */
@@ -84,7 +85,7 @@ int allocate_compression_buffers(void)
 /**
  * free_compression_buffers - free the decompression buffers
  *
- * Caller has to hold the ntfs_lock semaphore.
+ * Caller has to hold the ntfs_lock mutex.
  */
 void free_compression_buffers(void)
 {
@@ -96,13 +97,14 @@ void free_compression_buffers(void)
 /**
  * zero_partial_compressed_page - zero out of bounds compressed page region
  */
-static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page)
+static void zero_partial_compressed_page(struct page *page,
+		const s64 initialized_size)
 {
 	u8 *kp = page_address(page);
 	unsigned int kp_ofs;
 
 	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_CACHE_SHIFT) >= ni->initialized_size) {
+	if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) {
 		/*
 		 * FIXME: Using clear_page() will become wrong when we get
 		 * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
@@ -110,7 +112,7 @@ static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page)
 		clear_page(kp);
 		return;
 	}
-	kp_ofs = ni->initialized_size & ~PAGE_CACHE_MASK;
+	kp_ofs = initialized_size & ~PAGE_CACHE_MASK;
 	memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
 	return;
 }
@@ -118,12 +120,12 @@ static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page)
 /**
  * handle_bounds_compressed_page - test for&handle out of bounds compressed page
  */
-static inline void handle_bounds_compressed_page(ntfs_inode *ni,
-		struct page *page)
+static inline void handle_bounds_compressed_page(struct page *page,
+		const loff_t i_size, const s64 initialized_size)
 {
-	if ((page->index >= (ni->initialized_size >> PAGE_CACHE_SHIFT)) &&
-			(ni->initialized_size < VFS_I(ni)->i_size))
-		zero_partial_compressed_page(ni, page);
+	if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) &&
+			(initialized_size < i_size))
+		zero_partial_compressed_page(page, initialized_size);
 	return;
 }
 
@@ -138,6 +140,8 @@ static inline void handle_bounds_compressed_page(ntfs_inode *ni,
  * @xpage_done:		set to 1 if xpage was completed successfully (IN/OUT)
  * @cb_start:		compression block to decompress (IN)
  * @cb_size:		size of compression block @cb_start in bytes (IN)
+ * @i_size:		file size when we started the read (IN)
+ * @initialized_size:	initialized file size when we started the read (IN)
  *
  * The caller must have disabled preemption. ntfs_decompress() reenables it when
  * the critical section is finished.
@@ -165,7 +169,8 @@ static inline void handle_bounds_compressed_page(ntfs_inode *ni,
 static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
 		int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
 		const int xpage, char *xpage_done, u8 *const cb_start,
-		const u32 cb_size)
+		const u32 cb_size, const loff_t i_size,
+		const s64 initialized_size)
 {
 	/*
 	 * Pointers into the compressed data, i.e. the compression block (cb),
@@ -219,9 +224,6 @@ return_error:
 		spin_unlock(&ntfs_cb_lock);
 		/* Second stage: finalize completed pages. */
 		if (nr_completed_pages > 0) {
-			struct page *page = dest_pages[completed_pages[0]];
-			ntfs_inode *ni = NTFS_I(page->mapping->host);
-
 			for (i = 0; i < nr_completed_pages; i++) {
 				int di = completed_pages[i];
 
@@ -230,7 +232,8 @@ return_error:
 				 * If we are outside the initialized size, zero
 				 * the out of bounds page range.
 				 */
-				handle_bounds_compressed_page(ni, dp);
+				handle_bounds_compressed_page(dp, i_size,
+						initialized_size);
 				flush_dcache_page(dp);
 				kunmap(dp);
 				SetPageUptodate(dp);
@@ -478,12 +481,14 @@ return_overflow:
  */
 int ntfs_read_compressed_block(struct page *page)
 {
+	loff_t i_size;
+	s64 initialized_size;
 	struct address_space *mapping = page->mapping;
 	ntfs_inode *ni = NTFS_I(mapping->host);
 	ntfs_volume *vol = ni->vol;
 	struct super_block *sb = vol->sb;
 	runlist_element *rl;
-	unsigned long block_size = sb->s_blocksize;
+	unsigned long flags, block_size = sb->s_blocksize;
 	unsigned char block_size_bits = sb->s_blocksize_bits;
 	u8 *cb, *cb_pos, *cb_end;
 	struct buffer_head **bhs;
@@ -496,7 +501,7 @@ int ntfs_read_compressed_block(struct page *page)
 	VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
 			vol->cluster_size_bits;
 	/*
-	 * The first vcn after the last wanted vcn (minumum alignment is again
+	 * The first vcn after the last wanted vcn (minimum alignment is again
 	 * PAGE_CACHE_SIZE.
 	 */
 	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
@@ -535,7 +540,6 @@ int ntfs_read_compressed_block(struct page *page)
 	if (unlikely(!pages || !bhs)) {
 		kfree(bhs);
 		kfree(pages);
-		SetPageError(page);
 		unlock_page(page);
 		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
 		return -ENOMEM;
@@ -552,8 +556,22 @@ int ntfs_read_compressed_block(struct page *page)
 	 * The remaining pages need to be allocated and inserted into the page
 	 * cache, alignment guarantees keep all the below much simpler. (-8
 	 */
-	max_page = ((VFS_I(ni)->i_size + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT) - offset;
+	read_lock_irqsave(&ni->size_lock, flags);
+	i_size = i_size_read(VFS_I(ni));
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+			offset;
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (xpage >= max_page) {
+		kfree(bhs);
+		kfree(pages);
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		ntfs_debug("Compressed read outside i_size - truncated?");
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
 	if (nr_pages < max_page)
 		max_page = nr_pages;
 	for (i = 0; i < max_page; i++, offset++) {
@@ -593,7 +611,7 @@ do_next_cb:
 	rl = NULL;
 	for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
 			vcn++) {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (!rl) {
 lock_retry_remap:
@@ -619,7 +637,7 @@ lock_retry_remap:
 				break;
 			if (is_retry || lcn != LCN_RL_NOT_MAPPED)
 				goto rl_err;
-			is_retry = TRUE;
+			is_retry = true;
 			/*
 			 * Attempt to map runlist, dropping lock for the
 			 * duration.
@@ -648,7 +666,7 @@ lock_retry_remap:
 	for (i = 0; i < nr_bhs; i++) {
 		struct buffer_head *tbh = bhs[i];
 
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			continue;
 		if (unlikely(buffer_uptodate(tbh))) {
 			unlock_buffer(tbh);
@@ -680,8 +698,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_address_space(mapping);
-			schedule();
+			io_schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
 				goto read_err;
@@ -824,7 +841,8 @@ lock_retry_remap:
 				 * If we are outside the initialized size, zero
 				 * the out of bounds page range.
 				 */
-				handle_bounds_compressed_page(ni, page);
+				handle_bounds_compressed_page(page, i_size,
+						initialized_size);
 				flush_dcache_page(page);
 				kunmap(page);
 				SetPageUptodate(page);
@@ -847,7 +865,8 @@ lock_retry_remap:
 		ntfs_debug("Found compressed compression block.");
 		err = ntfs_decompress(pages, &cur_page, &cur_ofs,
 				cb_max_page, cb_max_ofs, xpage, &xpage_done,
-				cb_pos,	cb_size - (cb_pos - cb));
+				cb_pos,	cb_size - (cb_pos - cb), i_size,
+				initialized_size);
 		/*
 		 * We can sleep from now on, lock already dropped by
 		 * ntfs_decompress().
@@ -861,9 +880,6 @@ lock_retry_remap:
 			for (; prev_cur_page < cur_page; prev_cur_page++) {
 				page = pages[prev_cur_page];
 				if (page) {
-					if (prev_cur_page == xpage &&
-							!xpage_done)
-						SetPageError(page);
 					flush_dcache_page(page);
 					kunmap(page);
 					unlock_page(page);
@@ -894,8 +910,6 @@ lock_retry_remap:
 					"Terminating them with extreme "
 					"prejudice.  Inode 0x%lx, page index "
 					"0x%lx.", ni->mft_no, page->index);
-			if (cur_page == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
@@ -913,7 +927,7 @@ lock_retry_remap:
 		return 0;
 
 	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-			"EOVERFLOW" : (!err ? "EIO" : "unkown error"));
+			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
 	return err < 0 ? err : -EIO;
 
 read_err:
@@ -943,8 +957,6 @@ err_out:
 	for (i = cur_page; i < max_page; i++) {
 		page = pages[i];
 		if (page) {
-			if (i == xpage && !xpage_done)
-				SetPageError(page);
 			flush_dcache_page(page);
 			kunmap(page);
 			unlock_page(page);
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 6fb6bb5e372..dd6103cc93c 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
  * distribution in the file COPYING); if not, write to the Free Software
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include "debug.h"
 
-/*
- * A static buffer to hold the error string being displayed and a spinlock
- * to protect concurrent accesses to it.
- */
-static char err_buf[1024];
-static DEFINE_SPINLOCK(err_buf_lock);
-
 /**
  * __ntfs_warning - output a warning to the syslog
  * @function:	name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
 void __ntfs_warning(const char *function, const struct super_block *sb,
 		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 	int flen = 0;
 
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
 #endif
 	if (function)
 		flen = strlen(function);
-	spin_lock(&err_buf_lock);
 	va_start(args, fmt);
-	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
-	va_end(args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 	if (sb)
-		printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n",
-				sb->s_id, flen ? function : "", err_buf);
+		pr_warn("(device %s): %s(): %pV\n",
+			sb->s_id, flen ? function : "", &vaf);
 	else
-		printk(KERN_ERR "NTFS-fs warning: %s(): %s\n",
-				flen ? function : "", err_buf);
-	spin_unlock(&err_buf_lock);
+		pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
 }
 
 /**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
 void __ntfs_error(const char *function, const struct super_block *sb,
 		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 	int flen = 0;
 
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
 #endif
 	if (function)
 		flen = strlen(function);
-	spin_lock(&err_buf_lock);
 	va_start(args, fmt);
-	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
-	va_end(args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
 	if (sb)
-		printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n",
-				sb->s_id, flen ? function : "", err_buf);
+		pr_err("(device %s): %s(): %pV\n",
+		       sb->s_id, flen ? function : "", &vaf);
 	else
-		printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
-				flen ? function : "", err_buf);
-	spin_unlock(&err_buf_lock);
+		pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+	va_end(args);
 }
 
 #ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
 void __ntfs_debug (const char *file, int line, const char *function,
 		const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 	int flen = 0;
 
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
 		return;
 	if (function)
 		flen = strlen(function);
-	spin_lock(&err_buf_lock);
 	va_start(args, fmt);
-	vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
 	va_end(args);
-	printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
-			flen ? function : "", err_buf);
-	spin_unlock(&err_buf_lock);
 }
 
 /* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
 
 	if (!debug_msgs)
 		return;
-	printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n");
+	pr_debug("Dumping runlist (values in hex):\n");
 	if (!rl) {
-		printk(KERN_DEBUG "Run list not present.\n");
+		pr_debug("Run list not present.\n");
 		return;
 	}
-	printk(KERN_DEBUG "VCN              LCN               Run length\n");
+	pr_debug("VCN              LCN               Run length\n");
 	for (i = 0; ; i++) {
 		LCN lcn = (rl + i)->lcn;
 
@@ -163,15 +153,18 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
 
 			if (index > -LCN_ENOENT - 1)
 				index = 3;
-			printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
-					(rl + i)->vcn, lcn_str[index],
-					(rl + i)->length, (rl + i)->length ?
-					"" : " (runlist end)");
+			pr_debug("%-16Lx %s %-16Lx%s\n",
+					(long long)(rl + i)->vcn, lcn_str[index],
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
 		} else
-			printk(KERN_DEBUG "%-16Lx %-16Lx  %-16Lx%s\n",
-					(rl + i)->vcn, (rl + i)->lcn,
-					(rl + i)->length, (rl + i)->length ?
-					"" : " (runlist end)");
+			pr_debug("%-16Lx %-16Lx  %-16Lx%s\n",
+					(long long)(rl + i)->vcn,
+					(long long)(rl + i)->lcn,
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
 		if (!(rl + i)->length)
 			break;
 	}
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 8ac37c33d12..61bf091e32a 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,7 +30,9 @@
 
 extern int debug_msgs;
 
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+extern __printf(4, 5)
+void __ntfs_debug(const char *file, int line, const char *function,
+		  const char *format, ...);
 /**
  * ntfs_debug - write a debug level message to syslog
  * @f:		a printf format string containing the message
@@ -39,29 +41,31 @@ extern int debug_msgs;
  * ntfs_debug() writes a DEBUG level message to the syslog but only if the
  * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
  */
-static void ntfs_debug(const char *f, ...);
-#endif
-
-extern void __ntfs_debug (const char *file, int line, const char *function,
-	const char *format, ...) __attribute__ ((format (printf, 4, 5)));
 #define ntfs_debug(f, a...)						\
-	__ntfs_debug(__FILE__, __LINE__, __FUNCTION__, f, ##a)
+	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
 
 extern void ntfs_debug_dump_runlist(const runlist_element *rl);
 
 #else	/* !DEBUG */
 
-#define ntfs_debug(f, a...)		do {} while (0)
+#define ntfs_debug(fmt, ...)						\
+do {									\
+	if (0)								\
+		no_printk(fmt, ##__VA_ARGS__);				\
+} while (0)
+
 #define ntfs_debug_dump_runlist(rl)	do {} while (0)
 
 #endif	/* !DEBUG */
 
-extern void __ntfs_warning(const char *function, const struct super_block *sb,
-		const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
-#define ntfs_warning(sb, f, a...)	__ntfs_warning(__FUNCTION__, sb, f, ##a)
+extern  __printf(3, 4)
+void __ntfs_warning(const char *function, const struct super_block *sb,
+		    const char *fmt, ...);
+#define ntfs_warning(sb, f, a...)	__ntfs_warning(__func__, sb, f, ##a)
 
-extern void __ntfs_error(const char *function, const struct super_block *sb,
-		const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
-#define ntfs_error(sb, f, a...)		__ntfs_error(__FUNCTION__, sb, f, ##a)
+extern  __printf(3, 4)
+void __ntfs_error(const char *function, const struct super_block *sb,
+		  const char *fmt, ...);
+#define ntfs_error(sb, f, a...)		__ntfs_error(__func__, sb, f, ##a)
 
 #endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 93577561cdb..9e38dafa3bc 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1,7 +1,7 @@
 /**
  * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -20,8 +20,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 
 #include "dir.h"
 #include "aops.h"
@@ -33,8 +33,8 @@
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-		const_cpu_to_le16('3'),	const_cpu_to_le16('0'), 0 };
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
@@ -69,7 +69,7 @@ ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
  * work but we don't care for how quickly one can access them. This also fixes
  * the dcache aliasing issues.
  *
- * Locking:  - Caller must hold i_sem on the directory.
+ * Locking:  - Caller must hold i_mutex on the directory.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
@@ -183,8 +183,7 @@ found_it:
 				name->len = 0;
 				*res = name;
 			} else {
-				if (name)
-					kfree(name);
+				kfree(name);
 				*res = NULL;
 			}
 			mref = le64_to_cpu(ie->data.dir.indexed_file);
@@ -444,8 +443,7 @@ found_it2:
 				name->len = 0;
 				*res = name;
 			} else {
-				if (name)
-					kfree(name);
+				kfree(name);
 				*res = NULL;
 			}
 			mref = le64_to_cpu(ie->data.dir.indexed_file);
@@ -610,7 +608,7 @@ dir_err_out:
 // TODO: (AIA)
 // The algorithm embedded in this code will be required for the time when we
 // want to support adding of entries to directories, where we require correct
-// collation of file names in order not to cause corruption of the file system.
+// collation of file names in order not to cause corruption of the filesystem.
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
@@ -1006,13 +1004,11 @@ dir_err_out:
 /**
  * ntfs_filldir - ntfs specific filldir method
  * @vol:	current ntfs volume
- * @fpos:	position in the directory
  * @ndir:	ntfs inode of current directory
  * @ia_page:	page in which the index allocation buffer @ie is in resides
  * @ie:		current index entry
  * @name:	buffer to use for the converted name
- * @dirent:	vfs filldir callback context
- * @filldir:	vfs filldir callback
+ * @actor:	what to feed the entries to
  *
  * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
  * callback.
@@ -1026,12 +1022,12 @@ dir_err_out:
  * retake the lock if we are returning a non-zero value as ntfs_readdir()
  * would need to drop the lock immediately anyway.
  */
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+static inline int ntfs_filldir(ntfs_volume *vol,
 		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
-		u8 *name, void *dirent, filldir_t filldir)
+		u8 *name, struct dir_context *actor)
 {
 	unsigned long mref;
-	int name_len, rc;
+	int name_len;
 	unsigned dt_type;
 	FILE_NAME_TYPE_FLAGS name_type;
 
@@ -1053,7 +1049,8 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
 			ie->key.file_name.file_name_length, &name,
 			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
 	if (name_len <= 0) {
-		ntfs_debug("Skipping unrepresentable file.");
+		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+				(long long)MREF_LE(ie->data.dir.indexed_file));
 		return 0;
 	}
 	if (ie->key.file_name.file_attributes &
@@ -1069,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
 	if (ia_page)
 		unlock_page(ia_page);
 	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-			"0x%lx, DT_%s.", name, name_len, fpos, mref,
+			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
 			dt_type == DT_DIR ? "DIR" : "REG");
-	rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+	if (!dir_emit(actor, name, name_len, mref, dt_type))
+		return 1;
 	/* Relock the page but not if we are aborting ->readdir. */
-	if (!rc && ia_page)
+	if (ia_page)
 		lock_page(ia_page);
-	return rc;
+	return 0;
 }
 
 /*
@@ -1086,11 +1084,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
  * While this will return the names in random order this doesn't matter for
  * ->readdir but OTOH results in a faster ->readdir.
  *
- * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
+ * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
  * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
  * modifications).
  *
- * Locking:  - Caller must hold i_sem on the directory.
+ * Locking:  - Caller must hold i_mutex on the directory.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
@@ -1098,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
  *	       removes them again after the write is complete after which it 
  *	       unlocks the page.
  */
-static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
 {
 	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
-	loff_t fpos;
-	struct inode *bmp_vi, *vdir = filp->f_dentry->d_inode;
+	loff_t i_size;
+	struct inode *bmp_vi, *vdir = file_inode(file);
 	struct super_block *sb = vdir->i_sb;
 	ntfs_inode *ndir = NTFS_I(vdir);
 	ntfs_volume *vol = NTFS_SB(sb);
@@ -1117,46 +1115,29 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u8 *kaddr, *bmp, *index_end;
 	ntfs_attr_search_ctx *ctx;
 
-	fpos = filp->f_pos;
 	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
-			vdir->i_ino, fpos);
+			vdir->i_ino, actor->pos);
 	rc = err = 0;
 	/* Are we at end of dir yet? */
-	if (fpos >= vdir->i_size + vol->mft_record_size)
-		goto done;
+	i_size = i_size_read(vdir);
+	if (actor->pos >= i_size + vol->mft_record_size)
+		return 0;
 	/* Emulate . and .. for all directories. */
-	if (!fpos) {
-		ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
-				"inode 0x%lx, DT_DIR.", vdir->i_ino);
-		rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
-	if (fpos == 1) {
-		ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
-				"inode 0x%lx, DT_DIR.",
-				parent_ino(filp->f_dentry));
-		rc = filldir(dirent, "..", 2, fpos,
-				parent_ino(filp->f_dentry), DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
+	if (!dir_emit_dots(file, actor))
+		return 0;
 	m = NULL;
 	ctx = NULL;
 	/*
 	 * Allocate a buffer to store the current name being processed
 	 * converted to format determined by current NLS.
 	 */
-	name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1,
-			GFP_NOFS);
+	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
 	if (unlikely(!name)) {
 		err = -ENOMEM;
 		goto err_out;
 	}
 	/* Are we jumping straight into the index allocation attribute? */
-	if (fpos >= vol->mft_record_size)
+	if (actor->pos >= vol->mft_record_size)
 		goto skip_index_root;
 	/* Get hold of the mft record for the directory. */
 	m = map_mft_record(ndir);
@@ -1171,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err_out;
 	}
 	/* Get the offset into the index root attribute. */
-	ir_pos = (s64)fpos;
+	ir_pos = (s64)actor->pos;
 	/* Find the index root attribute in the mft record. */
 	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
 			0, ctx);
@@ -1191,7 +1172,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	 * map the mft record without deadlocking.
 	 */
 	rc = le32_to_cpu(ctx->attr->data.resident.value_length);
-	ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS);
+	ir = kmalloc(rc, GFP_NOFS);
 	if (unlikely(!ir)) {
 		err = -ENOMEM;
 		goto err_out;
@@ -1227,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (ir_pos > (u8*)ie - (u8*)ir)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ir;
+		actor->pos = (u8*)ie - (u8*)ir;
 		/* Submit the name to the filldir callback. */
-		rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
 		if (rc) {
 			kfree(ir);
 			goto abort;
@@ -1243,31 +1223,27 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (!NInoIndexAllocPresent(ndir))
 		goto EOD;
 	/* Advance fpos to the beginning of the index allocation. */
-	fpos = vol->mft_record_size;
+	actor->pos = vol->mft_record_size;
 skip_index_root:
 	kaddr = NULL;
 	prev_ia_pos = -1LL;
 	/* Get the offset into the index allocation attribute. */
-	ia_pos = (s64)fpos - vol->mft_record_size;
+	ia_pos = (s64)actor->pos - vol->mft_record_size;
 	ia_mapping = vdir->i_mapping;
-	bmp_vi = ndir->itype.index.bmp_ino;
-	if (unlikely(!bmp_vi)) {
-		ntfs_debug("Inode 0x%lx, regetting index bitmap.", vdir->i_ino);
-		bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
-		if (IS_ERR(bmp_vi)) {
-			ntfs_error(sb, "Failed to get bitmap attribute.");
-			err = PTR_ERR(bmp_vi);
-			goto err_out;
-		}
-		ndir->itype.index.bmp_ino = bmp_vi;
+	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
+	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
+	if (IS_ERR(bmp_vi)) {
+		ntfs_error(sb, "Failed to get bitmap attribute.");
+		err = PTR_ERR(bmp_vi);
+		goto err_out;
 	}
 	bmp_mapping = bmp_vi->i_mapping;
 	/* Get the starting bitmap bit position and sanity check it. */
 	bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
-	if (unlikely(bmp_pos >> 3 >= bmp_vi->i_size)) {
+	if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
 		ntfs_error(sb, "Current index allocation position exceeds "
 				"index bitmap size.");
-		goto err_out;
+		goto iput_err_out;
 	}
 	/* Get the starting bit position in the current bitmap page. */
 	cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
@@ -1283,7 +1259,7 @@ get_next_bmp_page:
 		ntfs_error(sb, "Reading index bitmap failed.");
 		err = PTR_ERR(bmp_page);
 		bmp_page = NULL;
-		goto err_out;
+		goto iput_err_out;
 	}
 	bmp = (u8*)page_address(bmp_page);
 	/* Find next index block in use. */
@@ -1301,7 +1277,7 @@ find_next_index_buffer:
 			goto get_next_bmp_page;
 		}
 		/* If we have reached the end of the bitmap, we are done. */
-		if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= vdir->i_size))
+		if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
 			goto unm_EOD;
 		ia_pos = (bmp_pos + cur_bmp_pos) <<
 				ndir->itype.index.block_size_bits;
@@ -1309,7 +1285,8 @@ find_next_index_buffer:
 	ntfs_debug("Handling index buffer 0x%llx.",
 			(unsigned long long)bmp_pos + cur_bmp_pos);
 	/* If the current index buffer is in the same page we reuse the page. */
-	if ((prev_ia_pos & PAGE_CACHE_MASK) != (ia_pos & PAGE_CACHE_MASK)) {
+	if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) !=
+			(ia_pos & (s64)PAGE_CACHE_MASK)) {
 		prev_ia_pos = ia_pos;
 		if (likely(ia_page != NULL)) {
 			unlock_page(ia_page);
@@ -1413,7 +1390,7 @@ find_next_index_buffer:
 		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ia +
+		actor->pos = (u8*)ie - (u8*)ia +
 				(sle64_to_cpu(ia->index_block_vcn) <<
 				ndir->itype.index.vcn_size_bits) +
 				vol->mft_record_size;
@@ -1423,12 +1400,12 @@ find_next_index_buffer:
 		 * before returning, unless a non-zero value is returned in
 		 * which case the page is left unlocked.
 		 */
-		rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
 		if (rc) {
 			/* @ia_page is already unlocked in this case. */
 			ntfs_unmap_page(ia_page);
 			ntfs_unmap_page(bmp_page);
+			iput(bmp_vi);
 			goto abort;
 		}
 	}
@@ -1439,32 +1416,25 @@ unm_EOD:
 		ntfs_unmap_page(ia_page);
 	}
 	ntfs_unmap_page(bmp_page);
+	iput(bmp_vi);
 EOD:
 	/* We are finished, set fpos to EOD. */
-	fpos = vdir->i_size + vol->mft_record_size;
+	actor->pos = i_size + vol->mft_record_size;
 abort:
 	kfree(name);
-done:
-#ifdef DEBUG
-	if (!rc)
-		ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
-	else
-		ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
-				rc, fpos);
-#endif
-	filp->f_pos = fpos;
 	return 0;
 err_out:
-	if (bmp_page)
+	if (bmp_page) {
 		ntfs_unmap_page(bmp_page);
+iput_err_out:
+		iput(bmp_vi);
+	}
 	if (ia_page) {
 		unlock_page(ia_page);
 		ntfs_unmap_page(ia_page);
 	}
-	if (ir)
-		kfree(ir);
-	if (name)
-		kfree(name);
+	kfree(ir);
+	kfree(name);
 	if (ctx)
 		ntfs_attr_put_search_ctx(ctx);
 	if (m)
@@ -1472,7 +1442,6 @@ err_out:
 	if (!err)
 		err = -EIO;
 	ntfs_debug("Failed. Returning error code %i.", -err);
-	filp->f_pos = fpos;
 	return err;
 }
 
@@ -1495,7 +1464,7 @@ err_out:
 static int ntfs_dir_open(struct inode *vi, struct file *filp)
 {
 	if (sizeof(unsigned long) < 8) {
-		if (vi->i_size > MAX_LFS_FILESIZE)
+		if (i_size_read(vi) > MAX_LFS_FILESIZE)
 			return -EFBIG;
 	}
 	return 0;
@@ -1521,25 +1490,39 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
  * Note: In the past @filp could be NULL so we ignore it as we don't need it
  * anyway.
  *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
  * this problem for now.  We do write the $BITMAP attribute if it is present
  * which is the important one for a directory so things are not too bad.
  */
-static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
-		int datasync)
+static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+			  int datasync)
 {
-	struct inode *vi = dentry->d_inode;
-	ntfs_inode *ni = NTFS_I(vi);
+	struct inode *bmp_vi, *vi = filp->f_mapping->host;
 	int err, ret;
+	ntfs_attr na;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
 	BUG_ON(!S_ISDIR(vi->i_mode));
-	if (NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino)
-		write_inode_now(ni->itype.index.bmp_ino, !datasync);
-	ret = ntfs_write_inode(vi, 1);
+	/* If the bitmap attribute inode is in memory sync it, too. */
+	na.mft_no = vi->i_ino;
+	na.type = AT_BITMAP;
+	na.name = I30;
+	na.name_len = 4;
+	bmp_vi = ilookup5(vi->i_sb, vi->i_ino, (test_t)ntfs_test_inode, &na);
+	if (bmp_vi) {
+ 		write_inode_now(bmp_vi, !datasync);
+		iput(bmp_vi);
+	}
+	ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
@@ -1549,15 +1532,16 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
 	return ret;
 }
 
 #endif /* NTFS_RW */
 
-struct file_operations ntfs_dir_ops = {
+const struct file_operations ntfs_dir_ops = {
 	.llseek		= generic_file_llseek,	/* Seek inside directory. */
 	.read		= generic_read_dir,	/* Return -EISDIR. */
-	.readdir	= ntfs_readdir,		/* Read directory contents. */
+	.iterate	= ntfs_readdir,		/* Read directory contents. */
 #ifdef NTFS_RW
 	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
 	/*.aio_fsync	= ,*/			/* Sync all outstanding async
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index db8713ea0d2..5c9e2c81cb1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
- * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -19,11 +19,26 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/pagemap.h>
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <linux/aio.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
 
+#include "attrib.h"
+#include "bitmap.h"
 #include "inode.h"
 #include "debug.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
 #include "ntfs.h"
 
 /**
@@ -47,8 +62,8 @@
 static int ntfs_file_open(struct inode *vi, struct file *filp)
 {
 	if (sizeof(unsigned long) < 8) {
-		if (vi->i_size > MAX_LFS_FILESIZE)
-			return -EFBIG;
+		if (i_size_read(vi) > MAX_LFS_FILESIZE)
+			return -EOVERFLOW;
 	}
 	return generic_file_open(vi, filp);
 }
@@ -56,9 +71,2075 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
 #ifdef NTFS_RW
 
 /**
+ * ntfs_attr_extend_initialized - extend the initialized size of an attribute
+ * @ni:			ntfs inode of the attribute to extend
+ * @new_init_size:	requested new initialized size in bytes
+ * @cached_page:	store any allocated but unused page here
+ * @lru_pvec:		lru-buffering pagevec of the caller
+ *
+ * Extend the initialized size of an attribute described by the ntfs inode @ni
+ * to @new_init_size bytes.  This involves zeroing any non-sparse space between
+ * the old initialized size and @new_init_size both in the page cache and on
+ * disk (if relevant complete pages are already uptodate in the page cache then
+ * these are simply marked dirty).
+ *
+ * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
+ * in the resident attribute case, it is tied to the initialized size and, in
+ * the non-resident attribute case, it may not fall below the initialized size.
+ *
+ * Note that if the attribute is resident, we do not need to touch the page
+ * cache at all.  This is because if the page cache page is not uptodate we
+ * bring it uptodate later, when doing the write to the mft record since we
+ * then already have the page mapped.  And if the page is uptodate, the
+ * non-initialized region will already have been zeroed when the page was
+ * brought uptodate and the region may in fact already have been overwritten
+ * with new data via mmap() based writes, so we cannot just zero it.  And since
+ * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
+ * is unspecified, we choose not to do zeroing and thus we do not need to touch
+ * the page at all.  For a more detailed explanation see ntfs_truncate() in
+ * fs/ntfs/inode.c.
+ *
+ * Return 0 on success and -errno on error.  In the case that an error is
+ * encountered it is possible that the initialized size will already have been
+ * incremented some way towards @new_init_size but it is guaranteed that if
+ * this is the case, the necessary zeroing will also have happened and that all
+ * metadata is self-consistent.
+ *
+ * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
+ *	    held by the caller.
+ */
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
+{
+	s64 old_init_size;
+	loff_t old_i_size;
+	pgoff_t index, end_index;
+	unsigned long flags;
+	struct inode *vi = VFS_I(ni);
+	ntfs_inode *base_ni;
+	MFT_RECORD *m = NULL;
+	ATTR_RECORD *a;
+	ntfs_attr_search_ctx *ctx = NULL;
+	struct address_space *mapping;
+	struct page *page = NULL;
+	u8 *kattr;
+	int err;
+	u32 attr_len;
+
+	read_lock_irqsave(&ni->size_lock, flags);
+	old_init_size = ni->initialized_size;
+	old_i_size = i_size_read(vi);
+	BUG_ON(new_init_size > ni->allocated_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+			"old_initialized_size 0x%llx, "
+			"new_initialized_size 0x%llx, i_size 0x%llx.",
+			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)old_init_size,
+			(unsigned long long)new_init_size, old_i_size);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Use goto to reduce indentation and we need the label below anyway. */
+	if (NInoNonResident(ni))
+		goto do_non_resident_extend;
+	BUG_ON(old_init_size != old_i_size);
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(a->non_resident);
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	BUG_ON(old_i_size != (loff_t)attr_len);
+	/*
+	 * Do the zeroing in the mft record and update the attribute size in
+	 * the mft record.
+	 */
+	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+	memset(kattr + attr_len, 0, new_init_size - attr_len);
+	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
+	/* Finally, update the sizes in the vfs and ntfs inodes. */
+	write_lock_irqsave(&ni->size_lock, flags);
+	i_size_write(vi, new_init_size);
+	ni->initialized_size = new_init_size;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	goto done;
+do_non_resident_extend:
+	/*
+	 * If the new initialized size @new_init_size exceeds the current file
+	 * size (vfs inode->i_size), we need to extend the file size to the
+	 * new initialized size.
+	 */
+	if (new_init_size > old_i_size) {
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m)) {
+			err = PTR_ERR(m);
+			m = NULL;
+			goto err_out;
+		}
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, 0, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			goto err_out;
+		}
+		m = ctx->mrec;
+		a = ctx->attr;
+		BUG_ON(!a->non_resident);
+		BUG_ON(old_i_size != (loff_t)
+				sle64_to_cpu(a->data.non_resident.data_size));
+		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		/* Update the file size in the vfs inode. */
+		i_size_write(vi, new_init_size);
+		ntfs_attr_put_search_ctx(ctx);
+		ctx = NULL;
+		unmap_mft_record(base_ni);
+		m = NULL;
+	}
+	mapping = vi->i_mapping;
+	index = old_init_size >> PAGE_CACHE_SHIFT;
+	end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	do {
+		/*
+		 * Read the page.  If the page is not present, this will zero
+		 * the uninitialized regions for us.
+		 */
+		page = read_mapping_page(mapping, index, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto init_err_out;
+		}
+		if (unlikely(PageError(page))) {
+			page_cache_release(page);
+			err = -EIO;
+			goto init_err_out;
+		}
+		/*
+		 * Update the initialized size in the ntfs inode.  This is
+		 * enough to make ntfs_writepage() work.
+		 */
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
+		if (ni->initialized_size > new_init_size)
+			ni->initialized_size = new_init_size;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		/* Set the page dirty so it gets written out. */
+		set_page_dirty(page);
+		page_cache_release(page);
+		/*
+		 * Play nice with the vm and the rest of the system.  This is
+		 * very much needed as we can potentially be modifying the
+		 * initialised size from a very small value to a really huge
+		 * value, e.g.
+		 *	f = open(somefile, O_TRUNC);
+		 *	truncate(f, 10GiB);
+		 *	seek(f, 10GiB);
+		 *	write(f, 1);
+		 * And this would mean we would be marking dirty hundreds of
+		 * thousands of pages or as in the above example more than
+		 * two and a half million pages!
+		 *
+		 * TODO: For sparse pages could optimize this workload by using
+		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
+		 * would be set in readpage for sparse pages and here we would
+		 * not need to mark dirty any pages which have this bit set.
+		 * The only caveat is that we have to clear the bit everywhere
+		 * where we allocate any clusters that lie in the page or that
+		 * contain the page.
+		 *
+		 * TODO: An even greater optimization would be for us to only
+		 * call readpage() on pages which are not in sparse regions as
+		 * determined from the runlist.  This would greatly reduce the
+		 * number of pages we read and make dirty in the case of sparse
+		 * files.
+		 */
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (++index < end_index);
+	read_lock_irqsave(&ni->size_lock, flags);
+	BUG_ON(ni->initialized_size != new_init_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/* Now bring in sync the initialized_size in the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		goto init_err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto init_err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto init_err_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	BUG_ON(!a->non_resident);
+	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
+done:
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
+			(unsigned long long)new_init_size, i_size_read(vi));
+	return 0;
+init_err_out:
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->initialized_size = old_init_size;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_debug("Failed.  Returning error code %i.", err);
+	return err;
+}
+
+/**
+ * ntfs_fault_in_pages_readable -
+ *
+ * Fault a number of userspace pages into pagetables.
+ *
+ * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
+ * with more than two userspace pages as well as handling the single page case
+ * elegantly.
+ *
+ * If you find this difficult to understand, then think of the while loop being
+ * the following code, except that we do without the integer variable ret:
+ *
+ *	do {
+ *		ret = __get_user(c, uaddr);
+ *		uaddr += PAGE_SIZE;
+ *	} while (!ret && uaddr < end);
+ *
+ * Note, the final __get_user() may well run out-of-bounds of the user buffer,
+ * but _not_ out-of-bounds of the page the user buffer belongs to, and since
+ * this is only a read and not a write, and since it is still in the same page,
+ * it should not matter and this makes the code much simpler.
+ */
+static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
+		int bytes)
+{
+	const char __user *end;
+	volatile char c;
+
+	/* Set @end to the first byte outside the last page we care about. */
+	end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
+
+	while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
+		;
+}
+
+/**
+ * ntfs_fault_in_pages_readable_iovec -
+ *
+ * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
+ */
+static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
+		size_t iov_ofs, int bytes)
+{
+	do {
+		const char __user *buf;
+		unsigned len;
+
+		buf = iov->iov_base + iov_ofs;
+		len = iov->iov_len - iov_ofs;
+		if (len > bytes)
+			len = bytes;
+		ntfs_fault_in_pages_readable(buf, len);
+		bytes -= len;
+		iov++;
+		iov_ofs = 0;
+	} while (bytes);
+}
+
+/**
+ * __ntfs_grab_cache_pages - obtain a number of locked pages
+ * @mapping:	address space mapping from which to obtain page cache pages
+ * @index:	starting index in @mapping at which to begin obtaining pages
+ * @nr_pages:	number of page cache pages to obtain
+ * @pages:	array of pages in which to return the obtained page cache pages
+ * @cached_page: allocated but as yet unused page
+ * @lru_pvec:	lru-buffering pagevec of caller
+ *
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
+ * starting at index @index.
+ *
+ * If a page is newly created, add it to lru list
+ *
+ * Note, the page locks are obtained in ascending page index order.
+ */
+static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
+		pgoff_t index, const unsigned nr_pages, struct page **pages,
+		struct page **cached_page)
+{
+	int err, nr;
+
+	BUG_ON(!nr_pages);
+	err = nr = 0;
+	do {
+		pages[nr] = find_lock_page(mapping, index);
+		if (!pages[nr]) {
+			if (!*cached_page) {
+				*cached_page = page_cache_alloc(mapping);
+				if (unlikely(!*cached_page)) {
+					err = -ENOMEM;
+					goto err_out;
+				}
+			}
+			err = add_to_page_cache_lru(*cached_page, mapping, index,
+					GFP_KERNEL);
+			if (unlikely(err)) {
+				if (err == -EEXIST)
+					continue;
+				goto err_out;
+			}
+			pages[nr] = *cached_page;
+			*cached_page = NULL;
+		}
+		index++;
+		nr++;
+	} while (nr < nr_pages);
+out:
+	return err;
+err_out:
+	while (nr > 0) {
+		unlock_page(pages[--nr]);
+		page_cache_release(pages[nr]);
+	}
+	goto out;
+}
+
+static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	get_bh(bh);
+	bh->b_end_io = end_buffer_read_sync;
+	return submit_bh(READ, bh);
+}
+
+/**
+ * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * This is called for non-resident attributes from ntfs_file_buffered_write()
+ * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
+ * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
+ * data has not yet been copied into the @pages.
+ * 
+ * Need to fill any holes with actual clusters, allocate buffers if necessary,
+ * ensure all the buffers are mapped, and bring uptodate any buffers that are
+ * only partially being written to.
+ *
+ * If @nr_pages is greater than one, we are guaranteed that the cluster size is
+ * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
+ * the same cluster and that they are the entirety of that cluster, and that
+ * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
+ *
+ * i_size is not to be modified yet.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
+		unsigned nr_pages, s64 pos, size_t bytes)
+{
+	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
+	LCN lcn;
+	s64 bh_pos, vcn_len, end, initialized_size;
+	sector_t lcn_block;
+	struct page *page;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni = NULL;
+	ntfs_volume *vol;
+	runlist_element *rl, *rl2;
+	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+	ntfs_attr_search_ctx *ctx = NULL;
+	MFT_RECORD *m = NULL;
+	ATTR_RECORD *a = NULL;
+	unsigned long flags;
+	u32 attr_rec_len = 0;
+	unsigned blocksize, u;
+	int err, mp_size;
+	bool rl_write_locked, was_hole, is_retry;
+	unsigned char blocksize_bits;
+	struct {
+		u8 runlist_merged:1;
+		u8 mft_attr_mapped:1;
+		u8 mp_rebuilt:1;
+		u8 attr_switched:1;
+	} status = { 0, 0, 0, 0 };
+
+	BUG_ON(!nr_pages);
+	BUG_ON(!pages);
+	BUG_ON(!*pages);
+	vi = pages[0]->mapping->host;
+	ni = NTFS_I(vi);
+	vol = ni->vol;
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+			vi->i_ino, ni->type, pages[0]->index, nr_pages,
+			(long long)pos, bytes);
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
+	u = 0;
+	do {
+		page = pages[u];
+		BUG_ON(!page);
+		/*
+		 * create_empty_buffers() will create uptodate/dirty buffers if
+		 * the page is uptodate/dirty.
+		 */
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, blocksize, 0);
+			if (unlikely(!page_has_buffers(page)))
+				return -ENOMEM;
+		}
+	} while (++u < nr_pages);
+	rl_write_locked = false;
+	rl = NULL;
+	err = 0;
+	vcn = lcn = -1;
+	vcn_len = 0;
+	lcn_block = -1;
+	was_hole = false;
+	cpos = pos >> vol->cluster_size_bits;
+	end = pos + bytes;
+	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
+	/*
+	 * Loop over each page and for each page over each buffer.  Use goto to
+	 * reduce indentation.
+	 */
+	u = 0;
+do_next_page:
+	page = pages[u];
+	bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+	bh = head = page_buffers(page);
+	do {
+		VCN cdelta;
+		s64 bh_end;
+		unsigned bh_cofs;
+
+		/* Clear buffer_new on all buffers to reinitialise state. */
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+		bh_end = bh_pos + blocksize;
+		bh_cpos = bh_pos >> vol->cluster_size_bits;
+		bh_cofs = bh_pos & vol->cluster_size_mask;
+		if (buffer_mapped(bh)) {
+			/*
+			 * The buffer is already mapped.  If it is uptodate,
+			 * ignore it.
+			 */
+			if (buffer_uptodate(bh))
+				continue;
+			/*
+			 * The buffer is not uptodate.  If the page is uptodate
+			 * set the buffer uptodate and otherwise ignore it.
+			 */
+			if (PageUptodate(page)) {
+				set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * Neither the page nor the buffer are uptodate.  If
+			 * the buffer is only partially being written to, we
+			 * need to read it in before the write, i.e. now.
+			 */
+			if ((bh_pos < pos && bh_end > pos) ||
+					(bh_pos < end && bh_end > end)) {
+				/*
+				 * If the buffer is fully or partially within
+				 * the initialized size, do an actual read.
+				 * Otherwise, simply zero the buffer.
+				 */
+				read_lock_irqsave(&ni->size_lock, flags);
+				initialized_size = ni->initialized_size;
+				read_unlock_irqrestore(&ni->size_lock, flags);
+				if (bh_pos < initialized_size) {
+					ntfs_submit_bh_for_read(bh);
+					*wait_bh++ = bh;
+				} else {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+			}
+			continue;
+		}
+		/* Unmapped buffer.  Need to map it. */
+		bh->b_bdev = vol->sb->s_bdev;
+		/*
+		 * If the current buffer is in the same clusters as the map
+		 * cache, there is no need to check the runlist again.  The
+		 * map cache is made up of @vcn, which is the first cached file
+		 * cluster, @vcn_len which is the number of cached file
+		 * clusters, @lcn is the device cluster corresponding to @vcn,
+		 * and @lcn_block is the block number corresponding to @lcn.
+		 */
+		cdelta = bh_cpos - vcn;
+		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
+map_buffer_cached:
+			BUG_ON(lcn < 0);
+			bh->b_blocknr = lcn_block +
+					(cdelta << (vol->cluster_size_bits -
+					blocksize_bits)) +
+					(bh_cofs >> blocksize_bits);
+			set_buffer_mapped(bh);
+			/*
+			 * If the page is uptodate so is the buffer.  If the
+			 * buffer is fully outside the write, we ignore it if
+			 * it was already allocated and we mark it dirty so it
+			 * gets written out if we allocated it.  On the other
+			 * hand, if we allocated the buffer but we are not
+			 * marking it dirty we set buffer_new so we can do
+			 * error recovery.
+			 */
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+				if (unlikely(was_hole)) {
+					/* We allocated the buffer. */
+					unmap_underlying_metadata(bh->b_bdev,
+							bh->b_blocknr);
+					if (bh_end <= pos || bh_pos >= end)
+						mark_buffer_dirty(bh);
+					else
+						set_buffer_new(bh);
+				}
+				continue;
+			}
+			/* Page is _not_ uptodate. */
+			if (likely(!was_hole)) {
+				/*
+				 * Buffer was already allocated.  If it is not
+				 * uptodate and is only partially being written
+				 * to, we need to read it in before the write,
+				 * i.e. now.
+				 */
+				if (!buffer_uptodate(bh) && bh_pos < end &&
+						bh_end > pos &&
+						(bh_pos < pos ||
+						bh_end > end)) {
+					/*
+					 * If the buffer is fully or partially
+					 * within the initialized size, do an
+					 * actual read.  Otherwise, simply zero
+					 * the buffer.
+					 */
+					read_lock_irqsave(&ni->size_lock,
+							flags);
+					initialized_size = ni->initialized_size;
+					read_unlock_irqrestore(&ni->size_lock,
+							flags);
+					if (bh_pos < initialized_size) {
+						ntfs_submit_bh_for_read(bh);
+						*wait_bh++ = bh;
+					} else {
+						zero_user(page, bh_offset(bh),
+								blocksize);
+						set_buffer_uptodate(bh);
+					}
+				}
+				continue;
+			}
+			/* We allocated the buffer. */
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+			/*
+			 * If the buffer is fully outside the write, zero it,
+			 * set it uptodate, and mark it dirty so it gets
+			 * written out.  If it is partially being written to,
+			 * zero region surrounding the write but leave it to
+			 * commit write to do anything else.  Finally, if the
+			 * buffer is fully being overwritten, do nothing.
+			 */
+			if (bh_end <= pos || bh_pos >= end) {
+				if (!buffer_uptodate(bh)) {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+				mark_buffer_dirty(bh);
+				continue;
+			}
+			set_buffer_new(bh);
+			if (!buffer_uptodate(bh) &&
+					(bh_pos < pos || bh_end > end)) {
+				u8 *kaddr;
+				unsigned pofs;
+					
+				kaddr = kmap_atomic(page);
+				if (bh_pos < pos) {
+					pofs = bh_pos & ~PAGE_CACHE_MASK;
+					memset(kaddr + pofs, 0, pos - bh_pos);
+				}
+				if (bh_end > end) {
+					pofs = end & ~PAGE_CACHE_MASK;
+					memset(kaddr + pofs, 0, bh_end - end);
+				}
+				kunmap_atomic(kaddr);
+				flush_dcache_page(page);
+			}
+			continue;
+		}
+		/*
+		 * Slow path: this is the first buffer in the cluster.  If it
+		 * is outside allocated size and is not uptodate, zero it and
+		 * set it uptodate.
+		 */
+		read_lock_irqsave(&ni->size_lock, flags);
+		initialized_size = ni->allocated_size;
+		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (bh_pos > initialized_size) {
+			if (PageUptodate(page)) {
+				if (!buffer_uptodate(bh))
+					set_buffer_uptodate(bh);
+			} else if (!buffer_uptodate(bh)) {
+				zero_user(page, bh_offset(bh), blocksize);
+				set_buffer_uptodate(bh);
+			}
+			continue;
+		}
+		is_retry = false;
+		if (!rl) {
+			down_read(&ni->runlist.lock);
+retry_remap:
+			rl = ni->runlist.rl;
+		}
+		if (likely(rl != NULL)) {
+			/* Seek to element containing target cluster. */
+			while (rl->length && rl[1].vcn <= bh_cpos)
+				rl++;
+			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
+			if (likely(lcn >= 0)) {
+				/*
+				 * Successful remap, setup the map cache and
+				 * use that to deal with the buffer.
+				 */
+				was_hole = false;
+				vcn = bh_cpos;
+				vcn_len = rl[1].vcn - vcn;
+				lcn_block = lcn << (vol->cluster_size_bits -
+						blocksize_bits);
+				cdelta = 0;
+				/*
+				 * If the number of remaining clusters touched
+				 * by the write is smaller or equal to the
+				 * number of cached clusters, unlock the
+				 * runlist as the map cache will be used from
+				 * now on.
+				 */
+				if (likely(vcn + vcn_len >= cend)) {
+					if (rl_write_locked) {
+						up_write(&ni->runlist.lock);
+						rl_write_locked = false;
+					} else
+						up_read(&ni->runlist.lock);
+					rl = NULL;
+				}
+				goto map_buffer_cached;
+			}
+		} else
+			lcn = LCN_RL_NOT_MAPPED;
+		/*
+		 * If it is not a hole and not out of bounds, the runlist is
+		 * probably unmapped so try to map it now.
+		 */
+		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
+			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
+				/* Attempt to map runlist. */
+				if (!rl_write_locked) {
+					/*
+					 * We need the runlist locked for
+					 * writing, so if it is locked for
+					 * reading relock it now and retry in
+					 * case it changed whilst we dropped
+					 * the lock.
+					 */
+					up_read(&ni->runlist.lock);
+					down_write(&ni->runlist.lock);
+					rl_write_locked = true;
+					goto retry_remap;
+				}
+				err = ntfs_map_runlist_nolock(ni, bh_cpos,
+						NULL);
+				if (likely(!err)) {
+					is_retry = true;
+					goto retry_remap;
+				}
+				/*
+				 * If @vcn is out of bounds, pretend @lcn is
+				 * LCN_ENOENT.  As long as the buffer is out
+				 * of bounds this will work fine.
+				 */
+				if (err == -ENOENT) {
+					lcn = LCN_ENOENT;
+					err = 0;
+					goto rl_not_mapped_enoent;
+				}
+			} else
+				err = -EIO;
+			/* Failed to map the buffer, even after retrying. */
+			bh->b_blocknr = -1;
+			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+					"attribute type 0x%x, vcn 0x%llx, "
+					"vcn offset 0x%x, because its "
+					"location on disk could not be "
+					"determined%s (error code %i).",
+					ni->mft_no, ni->type,
+					(unsigned long long)bh_cpos,
+					(unsigned)bh_pos &
+					vol->cluster_size_mask,
+					is_retry ? " even after retrying" : "",
+					err);
+			break;
+		}
+rl_not_mapped_enoent:
+		/*
+		 * The buffer is in a hole or out of bounds.  We need to fill
+		 * the hole, unless the buffer is in a cluster which is not
+		 * touched by the write, in which case we just leave the buffer
+		 * unmapped.  This can only happen when the cluster size is
+		 * less than the page cache size.
+		 */
+		if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+			bh_cend = (bh_end + vol->cluster_size - 1) >>
+					vol->cluster_size_bits;
+			if ((bh_cend <= cpos || bh_cpos >= cend)) {
+				bh->b_blocknr = -1;
+				/*
+				 * If the buffer is uptodate we skip it.  If it
+				 * is not but the page is uptodate, we can set
+				 * the buffer uptodate.  If the page is not
+				 * uptodate, we can clear the buffer and set it
+				 * uptodate.  Whether this is worthwhile is
+				 * debatable and this could be removed.
+				 */
+				if (PageUptodate(page)) {
+					if (!buffer_uptodate(bh))
+						set_buffer_uptodate(bh);
+				} else if (!buffer_uptodate(bh)) {
+					zero_user(page, bh_offset(bh),
+						blocksize);
+					set_buffer_uptodate(bh);
+				}
+				continue;
+			}
+		}
+		/*
+		 * Out of bounds buffer is invalid if it was not really out of
+		 * bounds.
+		 */
+		BUG_ON(lcn != LCN_HOLE);
+		/*
+		 * We need the runlist locked for writing, so if it is locked
+		 * for reading relock it now and retry in case it changed
+		 * whilst we dropped the lock.
+		 */
+		BUG_ON(!rl);
+		if (!rl_write_locked) {
+			up_read(&ni->runlist.lock);
+			down_write(&ni->runlist.lock);
+			rl_write_locked = true;
+			goto retry_remap;
+		}
+		/* Find the previous last allocated cluster. */
+		BUG_ON(rl->lcn != LCN_HOLE);
+		lcn = -1;
+		rl2 = rl;
+		while (--rl2 >= ni->runlist.rl) {
+			if (rl2->lcn >= 0) {
+				lcn = rl2->lcn + rl2->length;
+				break;
+			}
+		}
+		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
+				false);
+		if (IS_ERR(rl2)) {
+			err = PTR_ERR(rl2);
+			ntfs_debug("Failed to allocate cluster, error code %i.",
+					err);
+			break;
+		}
+		lcn = rl2->lcn;
+		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+		if (IS_ERR(rl)) {
+			err = PTR_ERR(rl);
+			if (err != -ENOMEM)
+				err = -EIO;
+			if (ntfs_cluster_free_from_rl(vol, rl2)) {
+				ntfs_error(vol->sb, "Failed to release "
+						"allocated cluster in error "
+						"code path.  Run chkdsk to "
+						"recover the lost cluster.");
+				NVolSetErrors(vol);
+			}
+			ntfs_free(rl2);
+			break;
+		}
+		ni->runlist.rl = rl;
+		status.runlist_merged = 1;
+		ntfs_debug("Allocated cluster, lcn 0x%llx.",
+				(unsigned long long)lcn);
+		/* Map and lock the mft record and get the attribute record. */
+		if (!NInoAttr(ni))
+			base_ni = ni;
+		else
+			base_ni = ni->ext.base_ntfs_ino;
+		m = map_mft_record(base_ni);
+		if (IS_ERR(m)) {
+			err = PTR_ERR(m);
+			break;
+		}
+		ctx = ntfs_attr_get_search_ctx(base_ni, m);
+		if (unlikely(!ctx)) {
+			err = -ENOMEM;
+			unmap_mft_record(base_ni);
+			break;
+		}
+		status.mft_attr_mapped = 1;
+		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
+		if (unlikely(err)) {
+			if (err == -ENOENT)
+				err = -EIO;
+			break;
+		}
+		m = ctx->mrec;
+		a = ctx->attr;
+		/*
+		 * Find the runlist element with which the attribute extent
+		 * starts.  Note, we cannot use the _attr_ version because we
+		 * have mapped the mft record.  That is ok because we know the
+		 * runlist fragment must be mapped already to have ever gotten
+		 * here, so we can just use the _rl_ version.
+		 */
+		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
+		BUG_ON(!rl2);
+		BUG_ON(!rl2->length);
+		BUG_ON(rl2->lcn < LCN_HOLE);
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		/*
+		 * If @highest_vcn is zero, calculate the real highest_vcn
+		 * (which can really be zero).
+		 */
+		if (!highest_vcn)
+			highest_vcn = (sle64_to_cpu(
+					a->data.non_resident.allocated_size) >>
+					vol->cluster_size_bits) - 1;
+		/*
+		 * Determine the size of the mapping pairs array for the new
+		 * extent, i.e. the old extent with the hole filled.
+		 */
+		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
+				highest_vcn);
+		if (unlikely(mp_size <= 0)) {
+			if (!(err = mp_size))
+				err = -EIO;
+			ntfs_debug("Failed to get size for mapping pairs "
+					"array, error code %i.", err);
+			break;
+		}
+		/*
+		 * Resize the attribute record to fit the new mapping pairs
+		 * array.
+		 */
+		attr_rec_len = le32_to_cpu(a->length);
+		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset));
+		if (unlikely(err)) {
+			BUG_ON(err != -ENOSPC);
+			// TODO: Deal with this by using the current attribute
+			// and fill it with as much of the mapping pairs
+			// array as possible.  Then loop over each attribute
+			// extent rewriting the mapping pairs arrays as we go
+			// along and if when we reach the end we have not
+			// enough space, try to resize the last attribute
+			// extent and if even that fails, add a new attribute
+			// extent.
+			// We could also try to resize at each step in the hope
+			// that we will not need to rewrite every single extent.
+			// Note, we may need to decompress some extents to fill
+			// the runlist as we are walking the extents...
+			ntfs_error(vol->sb, "Not enough space in the mft "
+					"record for the extended attribute "
+					"record.  This case is not "
+					"implemented yet.");
+			err = -EOPNOTSUPP;
+			break ;
+		}
+		status.mp_rebuilt = 1;
+		/*
+		 * Generate the mapping pairs array directly into the attribute
+		 * record.
+		 */
+		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset),
+				mp_size, rl2, vcn, highest_vcn, NULL);
+		if (unlikely(err)) {
+			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
+					"attribute type 0x%x, because building "
+					"the mapping pairs failed with error "
+					"code %i.", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+			err = -EIO;
+			break;
+		}
+		/* Update the highest_vcn but only if it was not set. */
+		if (unlikely(!a->data.non_resident.highest_vcn))
+			a->data.non_resident.highest_vcn =
+					cpu_to_sle64(highest_vcn);
+		/*
+		 * If the attribute is sparse/compressed, update the compressed
+		 * size in the ntfs_inode structure and the attribute record.
+		 */
+		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
+			/*
+			 * If we are not in the first attribute extent, switch
+			 * to it, but first ensure the changes will make it to
+			 * disk later.
+			 */
+			if (a->data.non_resident.lowest_vcn) {
+				flush_dcache_mft_record_page(ctx->ntfs_ino);
+				mark_mft_record_dirty(ctx->ntfs_ino);
+				ntfs_attr_reinit_search_ctx(ctx);
+				err = ntfs_attr_lookup(ni->type, ni->name,
+						ni->name_len, CASE_SENSITIVE,
+						0, NULL, 0, ctx);
+				if (unlikely(err)) {
+					status.attr_switched = 1;
+					break;
+				}
+				/* @m is not used any more so do not set it. */
+				a = ctx->attr;
+			}
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->itype.compressed.size += vol->cluster_size;
+			a->data.non_resident.compressed_size =
+					cpu_to_sle64(ni->itype.compressed.size);
+			write_unlock_irqrestore(&ni->size_lock, flags);
+		}
+		/* Ensure the changes make it to disk. */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+		/* Successfully filled the hole. */
+		status.runlist_merged = 0;
+		status.mft_attr_mapped = 0;
+		status.mp_rebuilt = 0;
+		/* Setup the map cache and use that to deal with the buffer. */
+		was_hole = true;
+		vcn = bh_cpos;
+		vcn_len = 1;
+		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
+		cdelta = 0;
+		/*
+		 * If the number of remaining clusters in the @pages is smaller
+		 * or equal to the number of cached clusters, unlock the
+		 * runlist as the map cache will be used from now on.
+		 */
+		if (likely(vcn + vcn_len >= cend)) {
+			up_write(&ni->runlist.lock);
+			rl_write_locked = false;
+			rl = NULL;
+		}
+		goto map_buffer_cached;
+	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+	/* If there are no errors, do the next page. */
+	if (likely(!err && ++u < nr_pages))
+		goto do_next_page;
+	/* If there are no errors, release the runlist lock if we took it. */
+	if (likely(!err)) {
+		if (unlikely(rl_write_locked)) {
+			up_write(&ni->runlist.lock);
+			rl_write_locked = false;
+		} else if (unlikely(rl))
+			up_read(&ni->runlist.lock);
+		rl = NULL;
+	}
+	/* If we issued read requests, let them complete. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	while (wait_bh > wait) {
+		bh = *--wait_bh;
+		wait_on_buffer(bh);
+		if (likely(buffer_uptodate(bh))) {
+			page = bh->b_page;
+			bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+					bh_offset(bh);
+			/*
+			 * If the buffer overflows the initialized size, need
+			 * to zero the overflowing region.
+			 */
+			if (unlikely(bh_pos + blocksize > initialized_size)) {
+				int ofs = 0;
+
+				if (likely(bh_pos < initialized_size))
+					ofs = initialized_size - bh_pos;
+				zero_user_segment(page, bh_offset(bh) + ofs,
+						blocksize);
+			}
+		} else /* if (unlikely(!buffer_uptodate(bh))) */
+			err = -EIO;
+	}
+	if (likely(!err)) {
+		/* Clear buffer_new on all buffers. */
+		u = 0;
+		do {
+			bh = head = page_buffers(pages[u]);
+			do {
+				if (buffer_new(bh))
+					clear_buffer_new(bh);
+			} while ((bh = bh->b_this_page) != head);
+		} while (++u < nr_pages);
+		ntfs_debug("Done.");
+		return err;
+	}
+	if (status.attr_switched) {
+		/* Get back to the attribute extent we modified. */
+		ntfs_attr_reinit_search_ctx(ctx);
+		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
+			ntfs_error(vol->sb, "Failed to find required "
+					"attribute extent of attribute in "
+					"error code path.  Run chkdsk to "
+					"recover.");
+			write_lock_irqsave(&ni->size_lock, flags);
+			ni->itype.compressed.size += vol->cluster_size;
+			write_unlock_irqrestore(&ni->size_lock, flags);
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+			/*
+			 * The only thing that is now wrong is the compressed
+			 * size of the base attribute extent which chkdsk
+			 * should be able to fix.
+			 */
+			NVolSetErrors(vol);
+		} else {
+			m = ctx->mrec;
+			a = ctx->attr;
+			status.attr_switched = 0;
+		}
+	}
+	/*
+	 * If the runlist has been modified, need to restore it by punching a
+	 * hole into it and we then need to deallocate the on-disk cluster as
+	 * well.  Note, we only modify the runlist if we are able to generate a
+	 * new mapping pairs array, i.e. only when the mapped attribute extent
+	 * is not switched.
+	 */
+	if (status.runlist_merged && !status.attr_switched) {
+		BUG_ON(!rl_write_locked);
+		/* Make the file cluster we allocated sparse in the runlist. */
+		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
+			ntfs_error(vol->sb, "Failed to punch hole into "
+					"attribute runlist in error code "
+					"path.  Run chkdsk to recover the "
+					"lost cluster.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			status.runlist_merged = 0;
+			/*
+			 * Deallocate the on-disk cluster we allocated but only
+			 * if we succeeded in punching its vcn out of the
+			 * runlist.
+			 */
+			down_write(&vol->lcnbmp_lock);
+			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+				ntfs_error(vol->sb, "Failed to release "
+						"allocated cluster in error "
+						"code path.  Run chkdsk to "
+						"recover the lost cluster.");
+				NVolSetErrors(vol);
+			}
+			up_write(&vol->lcnbmp_lock);
+		}
+	}
+	/*
+	 * Resize the attribute record to its old size and rebuild the mapping
+	 * pairs array.  Note, we only can do this if the runlist has been
+	 * restored to its old state which also implies that the mapped
+	 * attribute extent is not switched.
+	 */
+	if (status.mp_rebuilt && !status.runlist_merged) {
+		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
+			ntfs_error(vol->sb, "Failed to restore attribute "
+					"record in error code path.  Run "
+					"chkdsk to recover.");
+			NVolSetErrors(vol);
+		} else /* if (success) */ {
+			if (ntfs_mapping_pairs_build(vol, (u8*)a +
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), attr_rec_len -
+					le16_to_cpu(a->data.non_resident.
+					mapping_pairs_offset), ni->runlist.rl,
+					vcn, highest_vcn, NULL)) {
+				ntfs_error(vol->sb, "Failed to restore "
+						"mapping pairs array in error "
+						"code path.  Run chkdsk to "
+						"recover.");
+				NVolSetErrors(vol);
+			}
+			flush_dcache_mft_record_page(ctx->ntfs_ino);
+			mark_mft_record_dirty(ctx->ntfs_ino);
+		}
+	}
+	/* Release the mft record and the attribute. */
+	if (status.mft_attr_mapped) {
+		ntfs_attr_put_search_ctx(ctx);
+		unmap_mft_record(base_ni);
+	}
+	/* Release the runlist lock. */
+	if (rl_write_locked)
+		up_write(&ni->runlist.lock);
+	else if (rl)
+		up_read(&ni->runlist.lock);
+	/*
+	 * Zero out any newly allocated blocks to avoid exposing stale data.
+	 * If BH_New is set, we know that the block was newly allocated above
+	 * and that it has not been fully zeroed and marked dirty yet.
+	 */
+	nr_pages = u;
+	u = 0;
+	end = bh_cpos << vol->cluster_size_bits;
+	do {
+		page = pages[u];
+		bh = head = page_buffers(page);
+		do {
+			if (u == nr_pages &&
+					((s64)page->index << PAGE_CACHE_SHIFT) +
+					bh_offset(bh) >= end)
+				break;
+			if (!buffer_new(bh))
+				continue;
+			clear_buffer_new(bh);
+			if (!buffer_uptodate(bh)) {
+				if (PageUptodate(page))
+					set_buffer_uptodate(bh);
+				else {
+					zero_user(page, bh_offset(bh),
+							blocksize);
+					set_buffer_uptodate(bh);
+				}
+			}
+			mark_buffer_dirty(bh);
+		} while ((bh = bh->b_this_page) != head);
+	} while (++u <= nr_pages);
+	ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
+	return err;
+}
+
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static inline size_t ntfs_copy_from_user(struct page **pages,
+		unsigned nr_pages, unsigned ofs, const char __user *buf,
+		size_t bytes)
+{
+	struct page **last_page = pages + nr_pages;
+	char *addr;
+	size_t total = 0;
+	unsigned len;
+	int left;
+
+	do {
+		len = PAGE_CACHE_SIZE - ofs;
+		if (len > bytes)
+			len = bytes;
+		addr = kmap_atomic(*pages);
+		left = __copy_from_user_inatomic(addr + ofs, buf, len);
+		kunmap_atomic(addr);
+		if (unlikely(left)) {
+			/* Do it the slow way. */
+			addr = kmap(*pages);
+			left = __copy_from_user(addr + ofs, buf, len);
+			kunmap(*pages);
+			if (unlikely(left))
+				goto err_out;
+		}
+		total += len;
+		bytes -= len;
+		if (!bytes)
+			break;
+		buf += len;
+		ofs = 0;
+	} while (++pages < last_page);
+out:
+	return total;
+err_out:
+	total += len - left;
+	/* Zero the rest of the target like __copy_from_user(). */
+	while (++pages < last_page) {
+		bytes -= len;
+		if (!bytes)
+			break;
+		len = PAGE_CACHE_SIZE;
+		if (len > bytes)
+			len = bytes;
+		zero_user(*pages, 0, len);
+	}
+	goto out;
+}
+
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
+		const struct iovec *iov, size_t iov_ofs, size_t bytes)
+{
+	size_t total = 0;
+
+	while (1) {
+		const char __user *buf = iov->iov_base + iov_ofs;
+		unsigned len;
+		size_t left;
+
+		len = iov->iov_len - iov_ofs;
+		if (len > bytes)
+			len = bytes;
+		left = __copy_from_user_inatomic(vaddr, buf, len);
+		total += len;
+		bytes -= len;
+		vaddr += len;
+		if (unlikely(left)) {
+			total -= left;
+			break;
+		}
+		if (!bytes)
+			break;
+		iov++;
+		iov_ofs = 0;
+	}
+	return total;
+}
+
+static inline void ntfs_set_next_iovec(const struct iovec **iovp,
+		size_t *iov_ofsp, size_t bytes)
+{
+	const struct iovec *iov = *iovp;
+	size_t iov_ofs = *iov_ofsp;
+
+	while (bytes) {
+		unsigned len;
+
+		len = iov->iov_len - iov_ofs;
+		if (len > bytes)
+			len = bytes;
+		bytes -= len;
+		iov_ofs += len;
+		if (iov->iov_len == iov_ofs) {
+			iov++;
+			iov_ofs = 0;
+		}
+	}
+	*iovp = iov;
+	*iov_ofsp = iov_ofs;
+}
+
+/*
+ * This has the same side-effects and return value as ntfs_copy_from_user().
+ * The difference is that on a fault we need to memset the remainder of the
+ * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
+ * single-segment behaviour.
+ *
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
+ * atomic and when not atomic.  This is ok because it calls
+ * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
+ * fact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many architectures
+ * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
+ * makes no difference at all on those architectures.
+ */
+static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
+		unsigned nr_pages, unsigned ofs, const struct iovec **iov,
+		size_t *iov_ofs, size_t bytes)
+{
+	struct page **last_page = pages + nr_pages;
+	char *addr;
+	size_t copied, len, total = 0;
+
+	do {
+		len = PAGE_CACHE_SIZE - ofs;
+		if (len > bytes)
+			len = bytes;
+		addr = kmap_atomic(*pages);
+		copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
+				*iov, *iov_ofs, len);
+		kunmap_atomic(addr);
+		if (unlikely(copied != len)) {
+			/* Do it the slow way. */
+			addr = kmap(*pages);
+			copied = __ntfs_copy_from_user_iovec_inatomic(addr +
+					ofs, *iov, *iov_ofs, len);
+			if (unlikely(copied != len))
+				goto err_out;
+			kunmap(*pages);
+		}
+		total += len;
+		ntfs_set_next_iovec(iov, iov_ofs, len);
+		bytes -= len;
+		if (!bytes)
+			break;
+		ofs = 0;
+	} while (++pages < last_page);
+out:
+	return total;
+err_out:
+	BUG_ON(copied > len);
+	/* Zero the rest of the target like __copy_from_user(). */
+	memset(addr + ofs + copied, 0, len - copied);
+	kunmap(*pages);
+	total += copied;
+	ntfs_set_next_iovec(iov, iov_ofs, copied);
+	while (++pages < last_page) {
+		bytes -= len;
+		if (!bytes)
+			break;
+		len = PAGE_CACHE_SIZE;
+		if (len > bytes)
+			len = bytes;
+		zero_user(*pages, 0, len);
+	}
+	goto out;
+}
+
+static inline void ntfs_flush_dcache_pages(struct page **pages,
+		unsigned nr_pages)
+{
+	BUG_ON(!nr_pages);
+	/*
+	 * Warning: Do not do the decrement at the same time as the call to
+	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
+	 * decrement never happens so the loop never terminates.
+	 */
+	do {
+		--nr_pages;
+		flush_dcache_page(pages[nr_pages]);
+	} while (nr_pages > 0);
+}
+
+/**
+ * ntfs_commit_pages_after_non_resident_write - commit the received data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * See description of ntfs_commit_pages_after_write(), below.
+ */
+static inline int ntfs_commit_pages_after_non_resident_write(
+		struct page **pages, const unsigned nr_pages,
+		s64 pos, size_t bytes)
+{
+	s64 end, initialized_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	struct buffer_head *bh, *head;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	unsigned long flags;
+	unsigned blocksize, u;
+	int err;
+
+	vi = pages[0]->mapping->host;
+	ni = NTFS_I(vi);
+	blocksize = vi->i_sb->s_blocksize;
+	end = pos + bytes;
+	u = 0;
+	do {
+		s64 bh_pos;
+		struct page *page;
+		bool partial;
+
+		page = pages[u];
+		bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+		bh = head = page_buffers(page);
+		partial = false;
+		do {
+			s64 bh_end;
+
+			bh_end = bh_pos + blocksize;
+			if (bh_end <= pos || bh_pos >= end) {
+				if (!buffer_uptodate(bh))
+					partial = true;
+			} else {
+				set_buffer_uptodate(bh);
+				mark_buffer_dirty(bh);
+			}
+		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+		/*
+		 * If all buffers are now uptodate but the page is not, set the
+		 * page uptodate.
+		 */
+		if (!partial && !PageUptodate(page))
+			SetPageUptodate(page);
+	} while (++u < nr_pages);
+	/*
+	 * Finally, if we do not need to update initialized_size or i_size we
+	 * are finished.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (end <= initialized_size) {
+		ntfs_debug("Done.");
+		return 0;
+	}
+	/*
+	 * Update initialized_size/i_size as appropriate, both in the inode and
+	 * the mft record.
+	 */
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	BUG_ON(!NInoNonResident(ni));
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	a = ctx->attr;
+	BUG_ON(!a->non_resident);
+	write_lock_irqsave(&ni->size_lock, flags);
+	BUG_ON(end > ni->allocated_size);
+	ni->initialized_size = end;
+	a->data.non_resident.initialized_size = cpu_to_sle64(end);
+	if (end > i_size_read(vi)) {
+		i_size_write(vi, end);
+		a->data.non_resident.data_size =
+				a->data.non_resident.initialized_size;
+	}
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	ntfs_debug("Done.");
+	return 0;
+err_out:
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
+			"code %i).", err);
+	if (err != -ENOMEM)
+		NVolSetErrors(ni->vol);
+	return err;
+}
+
+/**
+ * ntfs_commit_pages_after_write - commit the received data
+ * @pages:	array of destination pages
+ * @nr_pages:	number of pages in @pages
+ * @pos:	byte position in file at which the write begins
+ * @bytes:	number of bytes to be written
+ *
+ * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
+ * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
+ * locked but not kmap()ped.  The source data has already been copied into the
+ * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
+ * the data was copied (for non-resident attributes only) and it returned
+ * success.
+ *
+ * Need to set uptodate and mark dirty all buffers within the boundary of the
+ * write.  If all buffers in a page are uptodate we set the page uptodate, too.
+ *
+ * Setting the buffers dirty ensures that they get written out later when
+ * ntfs_writepage() is invoked by the VM.
+ *
+ * Finally, we need to update i_size and initialized_size as appropriate both
+ * in the inode and the mft record.
+ *
+ * This is modelled after fs/buffer.c::generic_commit_write(), which marks
+ * buffers uptodate and dirty, sets the page uptodate if all buffers in the
+ * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
+ * that case, it also marks the inode dirty.
+ *
+ * If things have gone as outlined in
+ * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
+ * content modifications here for non-resident attributes.  For resident
+ * attributes we need to do the uptodate bringing here which we combine with
+ * the copying into the mft record which means we save one atomic kmap.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_commit_pages_after_write(struct page **pages,
+		const unsigned nr_pages, s64 pos, size_t bytes)
+{
+	s64 end, initialized_size;
+	loff_t i_size;
+	struct inode *vi;
+	ntfs_inode *ni, *base_ni;
+	struct page *page;
+	ntfs_attr_search_ctx *ctx;
+	MFT_RECORD *m;
+	ATTR_RECORD *a;
+	char *kattr, *kaddr;
+	unsigned long flags;
+	u32 attr_len;
+	int err;
+
+	BUG_ON(!nr_pages);
+	BUG_ON(!pages);
+	page = pages[0];
+	BUG_ON(!page);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+			vi->i_ino, ni->type, page->index, nr_pages,
+			(long long)pos, bytes);
+	if (NInoNonResident(ni))
+		return ntfs_commit_pages_after_non_resident_write(pages,
+				nr_pages, pos, bytes);
+	BUG_ON(nr_pages > 1);
+	/*
+	 * Attribute is resident, implying it is not compressed, encrypted, or
+	 * sparse.
+	 */
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	BUG_ON(NInoNonResident(ni));
+	/* Map, pin, and lock the mft record. */
+	m = map_mft_record(base_ni);
+	if (IS_ERR(m)) {
+		err = PTR_ERR(m);
+		m = NULL;
+		ctx = NULL;
+		goto err_out;
+	}
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
+	if (unlikely(!ctx)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+			CASE_SENSITIVE, 0, NULL, 0, ctx);
+	if (unlikely(err)) {
+		if (err == -ENOENT)
+			err = -EIO;
+		goto err_out;
+	}
+	a = ctx->attr;
+	BUG_ON(a->non_resident);
+	/* The total length of the attribute value. */
+	attr_len = le32_to_cpu(a->data.resident.value_length);
+	i_size = i_size_read(vi);
+	BUG_ON(attr_len != i_size);
+	BUG_ON(pos > attr_len);
+	end = pos + bytes;
+	BUG_ON(end > le32_to_cpu(a->length) -
+			le16_to_cpu(a->data.resident.value_offset));
+	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+	kaddr = kmap_atomic(page);
+	/* Copy the received data from the page to the mft record. */
+	memcpy(kattr + pos, kaddr + pos, bytes);
+	/* Update the attribute length if necessary. */
+	if (end > attr_len) {
+		attr_len = end;
+		a->data.resident.value_length = cpu_to_le32(attr_len);
+	}
+	/*
+	 * If the page is not uptodate, bring the out of bounds area(s)
+	 * uptodate by copying data from the mft record to the page.
+	 */
+	if (!PageUptodate(page)) {
+		if (pos > 0)
+			memcpy(kaddr, kattr, pos);
+		if (end < attr_len)
+			memcpy(kaddr + end, kattr + end, attr_len - end);
+		/* Zero the region outside the end of the attribute value. */
+		memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+	kunmap_atomic(kaddr);
+	/* Update initialized_size/i_size if necessary. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	initialized_size = ni->initialized_size;
+	BUG_ON(end > ni->allocated_size);
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	BUG_ON(initialized_size != i_size);
+	if (end > initialized_size) {
+		write_lock_irqsave(&ni->size_lock, flags);
+		ni->initialized_size = end;
+		i_size_write(vi, end);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+	}
+	/* Mark the mft record dirty, so it gets written back. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	ntfs_debug("Done.");
+	return 0;
+err_out:
+	if (err == -ENOMEM) {
+		ntfs_warning(vi->i_sb, "Error allocating memory required to "
+				"commit the write.");
+		if (PageUptodate(page)) {
+			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
+					"dirty so the write will be retried "
+					"later on by the VM.");
+			/*
+			 * Put the page on mapping->dirty_pages, but leave its
+			 * buffers' dirty state as-is.
+			 */
+			__set_page_dirty_nobuffers(page);
+			err = 0;
+		} else
+			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
+					"data has been lost.");
+	} else {
+		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
+				"with error %i.", err);
+		NVolSetErrors(ni->vol);
+	}
+	if (ctx)
+		ntfs_attr_put_search_ctx(ctx);
+	if (m)
+		unmap_mft_record(base_ni);
+	return err;
+}
+
+static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, inode->i_size);
+		ntfs_truncate_vfs(inode);
+	}
+}
+
+/**
+ * ntfs_file_buffered_write -
+ *
+ * Locking: The vfs is holding ->i_mutex on the inode.
+ */
+static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs,
+		loff_t pos, loff_t *ppos, size_t count)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *vi = mapping->host;
+	ntfs_inode *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
+	struct page *cached_page = NULL;
+	char __user *buf = NULL;
+	s64 end, ll;
+	VCN last_vcn;
+	LCN lcn;
+	unsigned long flags;
+	size_t bytes, iov_ofs = 0;	/* Offset in the current iovec. */
+	ssize_t status, written;
+	unsigned nr_pages;
+	int err;
+
+	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+			"pos 0x%llx, count 0x%lx.",
+			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+			(unsigned long long)pos, (unsigned long)count);
+	if (unlikely(!count))
+		return 0;
+	BUG_ON(NInoMstProtected(ni));
+	/*
+	 * If the attribute is not an index root and it is encrypted or
+	 * compressed, we cannot write to it yet.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
+	 */
+	if (ni->type != AT_INDEX_ALLOCATION) {
+		/* If file is encrypted, deny access, just like NT4. */
+		if (NInoEncrypted(ni)) {
+			/*
+			 * Reminder for later: Encrypted files are _always_
+			 * non-resident so that the content can always be
+			 * encrypted.
+			 */
+			ntfs_debug("Denying write access to encrypted file.");
+			return -EACCES;
+		}
+		if (NInoCompressed(ni)) {
+			/* Only unnamed $DATA attribute can be compressed. */
+			BUG_ON(ni->type != AT_DATA);
+			BUG_ON(ni->name_len);
+			/*
+			 * Reminder for later: If resident, the data is not
+			 * actually compressed.  Only on the switch to non-
+			 * resident does compression kick in.  This is in
+			 * contrast to encrypted files (see above).
+			 */
+			ntfs_error(vi->i_sb, "Writing to compressed files is "
+					"not implemented yet.  Sorry.");
+			return -EOPNOTSUPP;
+		}
+	}
+	/*
+	 * If a previous ntfs_truncate() failed, repeat it and abort if it
+	 * fails again.
+	 */
+	if (unlikely(NInoTruncateFailed(ni))) {
+		inode_dio_wait(vi);
+		err = ntfs_truncate(vi);
+		if (err || NInoTruncateFailed(ni)) {
+			if (!err)
+				err = -EIO;
+			ntfs_error(vol->sb, "Cannot perform write to inode "
+					"0x%lx, attribute type 0x%x, because "
+					"ntfs_truncate() failed (error code "
+					"%i).", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+			return err;
+		}
+	}
+	/* The first byte after the write. */
+	end = pos + count;
+	/*
+	 * If the write goes beyond the allocated size, extend the allocation
+	 * to cover the whole of the write, rounded up to the nearest cluster.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	ll = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (end > ll) {
+		/* Extend the allocation without changing the data size. */
+		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+		if (likely(ll >= 0)) {
+			BUG_ON(pos >= ll);
+			/* If the extension was partial truncate the write. */
+			if (end > ll) {
+				ntfs_debug("Truncating write to inode 0x%lx, "
+						"attribute type 0x%x, because "
+						"the allocation was only "
+						"partially extended.",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type));
+				end = ll;
+				count = ll - pos;
+			}
+		} else {
+			err = ll;
+			read_lock_irqsave(&ni->size_lock, flags);
+			ll = ni->allocated_size;
+			read_unlock_irqrestore(&ni->size_lock, flags);
+			/* Perform a partial write if possible or fail. */
+			if (pos < ll) {
+				ntfs_debug("Truncating write to inode 0x%lx, "
+						"attribute type 0x%x, because "
+						"extending the allocation "
+						"failed (error code %i).",
+						vi->i_ino, (unsigned)
+						le32_to_cpu(ni->type), err);
+				end = ll;
+				count = ll - pos;
+			} else {
+				ntfs_error(vol->sb, "Cannot perform write to "
+						"inode 0x%lx, attribute type "
+						"0x%x, because extending the "
+						"allocation failed (error "
+						"code %i).", vi->i_ino,
+						(unsigned)
+						le32_to_cpu(ni->type), err);
+				return err;
+			}
+		}
+	}
+	written = 0;
+	/*
+	 * If the write starts beyond the initialized size, extend it up to the
+	 * beginning of the write and initialize all non-sparse space between
+	 * the old initialized size and the new one.  This automatically also
+	 * increments the vfs inode->i_size to keep it above or equal to the
+	 * initialized_size.
+	 */
+	read_lock_irqsave(&ni->size_lock, flags);
+	ll = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (pos > ll) {
+		err = ntfs_attr_extend_initialized(ni, pos);
+		if (err < 0) {
+			ntfs_error(vol->sb, "Cannot perform write to inode "
+					"0x%lx, attribute type 0x%x, because "
+					"extending the initialized size "
+					"failed (error code %i).", vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type), err);
+			status = err;
+			goto err_out;
+		}
+	}
+	/*
+	 * Determine the number of pages per cluster for non-resident
+	 * attributes.
+	 */
+	nr_pages = 1;
+	if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
+		nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
+	/* Finally, perform the actual write. */
+	last_vcn = -1;
+	if (likely(nr_segs == 1))
+		buf = iov->iov_base;
+	do {
+		VCN vcn;
+		pgoff_t idx, start_idx;
+		unsigned ofs, do_pages, u;
+		size_t copied;
+
+		start_idx = idx = pos >> PAGE_CACHE_SHIFT;
+		ofs = pos & ~PAGE_CACHE_MASK;
+		bytes = PAGE_CACHE_SIZE - ofs;
+		do_pages = 1;
+		if (nr_pages > 1) {
+			vcn = pos >> vol->cluster_size_bits;
+			if (vcn != last_vcn) {
+				last_vcn = vcn;
+				/*
+				 * Get the lcn of the vcn the write is in.  If
+				 * it is a hole, need to lock down all pages in
+				 * the cluster.
+				 */
+				down_read(&ni->runlist.lock);
+				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
+						vol->cluster_size_bits, false);
+				up_read(&ni->runlist.lock);
+				if (unlikely(lcn < LCN_HOLE)) {
+					status = -EIO;
+					if (lcn == LCN_ENOMEM)
+						status = -ENOMEM;
+					else
+						ntfs_error(vol->sb, "Cannot "
+							"perform write to "
+							"inode 0x%lx, "
+							"attribute type 0x%x, "
+							"because the attribute "
+							"is corrupt.",
+							vi->i_ino, (unsigned)
+							le32_to_cpu(ni->type));
+					break;
+				}
+				if (lcn == LCN_HOLE) {
+					start_idx = (pos & ~(s64)
+							vol->cluster_size_mask)
+							>> PAGE_CACHE_SHIFT;
+					bytes = vol->cluster_size - (pos &
+							vol->cluster_size_mask);
+					do_pages = nr_pages;
+				}
+			}
+		}
+		if (bytes > count)
+			bytes = count;
+		/*
+		 * Bring in the user page(s) that we will copy from _first_.
+		 * Otherwise there is a nasty deadlock on copying from the same
+		 * page(s) as we are writing to, without it/them being marked
+		 * up-to-date.  Note, at present there is nothing to stop the
+		 * pages being swapped out between us bringing them into memory
+		 * and doing the actual copying.
+		 */
+		if (likely(nr_segs == 1))
+			ntfs_fault_in_pages_readable(buf, bytes);
+		else
+			ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+		/* Get and lock @do_pages starting at index @start_idx. */
+		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
+				pages, &cached_page);
+		if (unlikely(status))
+			break;
+		/*
+		 * For non-resident attributes, we need to fill any holes with
+		 * actual clusters and ensure all bufferes are mapped.  We also
+		 * need to bring uptodate any buffers that are only partially
+		 * being written to.
+		 */
+		if (NInoNonResident(ni)) {
+			status = ntfs_prepare_pages_for_non_resident_write(
+					pages, do_pages, pos, bytes);
+			if (unlikely(status)) {
+				loff_t i_size;
+
+				do {
+					unlock_page(pages[--do_pages]);
+					page_cache_release(pages[do_pages]);
+				} while (do_pages);
+				/*
+				 * The write preparation may have instantiated
+				 * allocated space outside i_size.  Trim this
+				 * off again.  We can ignore any errors in this
+				 * case as we will just be waisting a bit of
+				 * allocated space, which is not a disaster.
+				 */
+				i_size = i_size_read(vi);
+				if (pos + bytes > i_size) {
+					ntfs_write_failed(mapping, pos + bytes);
+				}
+				break;
+			}
+		}
+		u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+		if (likely(nr_segs == 1)) {
+			copied = ntfs_copy_from_user(pages + u, do_pages - u,
+					ofs, buf, bytes);
+			buf += copied;
+		} else
+			copied = ntfs_copy_from_user_iovec(pages + u,
+					do_pages - u, ofs, &iov, &iov_ofs,
+					bytes);
+		ntfs_flush_dcache_pages(pages + u, do_pages - u);
+		status = ntfs_commit_pages_after_write(pages, do_pages, pos,
+				bytes);
+		if (likely(!status)) {
+			written += copied;
+			count -= copied;
+			pos += copied;
+			if (unlikely(copied != bytes))
+				status = -EFAULT;
+		}
+		do {
+			unlock_page(pages[--do_pages]);
+			page_cache_release(pages[do_pages]);
+		} while (do_pages);
+		if (unlikely(status))
+			break;
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
+	} while (count);
+err_out:
+	*ppos = pos;
+	if (cached_page)
+		page_cache_release(cached_page);
+	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
+			written ? "written" : "status", (unsigned long)written,
+			(long)status);
+	return written ? written : status;
+}
+
+/**
+ * ntfs_file_aio_write_nolock -
+ */
+static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos;
+	size_t count;		/* after file limit checks */
+	ssize_t written, err;
+
+	count = iov_length(iov, nr_segs);
+	pos = *ppos;
+	/* We can write back this queue in page reclaim. */
+	current->backing_dev_info = mapping->backing_dev_info;
+	written = 0;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+	if (!count)
+		goto out;
+	err = file_remove_suid(file);
+	if (err)
+		goto out;
+	err = file_update_time(file);
+	if (err)
+		goto out;
+	written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
+			count);
+out:
+	current->backing_dev_info = NULL;
+	return written ? written : err;
+}
+
+/**
+ * ntfs_file_aio_write -
+ */
+static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	mutex_lock(&inode->i_mutex);
+	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+	if (ret > 0) {
+		int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
  * ntfs_file_fsync - sync a file to disk
  * @filp:	file to be synced
- * @dentry:	dentry describing the file to sync
  * @datasync:	if non-zero only flush user data and not metadata
  *
  * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
@@ -74,26 +2155,34 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * Also, if @datasync is true, we do not wait on the inode to be written out
  * but we always wait on the page cache pages to be written out.
  *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
- * Locking: Caller must hold i_sem on the inode.
+ * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
  * this problem for now.
  */
-static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
-		int datasync)
+static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+			   int datasync)
 {
-	struct inode *vi = dentry->d_inode;
+	struct inode *vi = filp->f_mapping->host;
 	int err, ret = 0;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+	if (err)
+		return err;
+	mutex_lock(&vi->i_mutex);
+
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = ntfs_write_inode(vi, 1);
+		ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
+	/*
+	 * NOTE: If we were to use mapping->private_list (see ext2 and
+	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
+	 * sync_mapping_buffers(vi->i_mapping).
+	 */
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
 		ret = err;
@@ -102,54 +2191,52 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
+	mutex_unlock(&vi->i_mutex);
 	return ret;
 }
 
 #endif /* NTFS_RW */
 
-struct file_operations ntfs_file_ops = {
-	.llseek		= generic_file_llseek,	  /* Seek inside file. */
-	.read		= generic_file_read,	  /* Read from file. */
-	.aio_read	= generic_file_aio_read,  /* Async read from file. */
-	.readv		= generic_file_readv,	  /* Read from file. */
+const struct file_operations ntfs_file_ops = {
+	.llseek		= generic_file_llseek,	 /* Seek inside file. */
+	.read		= new_sync_read,	 /* Read from file. */
+	.read_iter	= generic_file_read_iter, /* Async read from file. */
 #ifdef NTFS_RW
-	.write		= generic_file_write,	  /* Write to file. */
-	.aio_write	= generic_file_aio_write, /* Async write to file. */
-	.writev		= generic_file_writev,	  /* Write to file. */
-	/*.release	= ,*/			  /* Last file is closed.  See
-						     fs/ext2/file.c::
-						     ext2_release_file() for
-						     how to use this to discard
-						     preallocated space for
-						     write opened files. */
-	.fsync		= ntfs_file_fsync,	  /* Sync a file to disk. */
-	/*.aio_fsync	= ,*/			  /* Sync all outstanding async
-						     i/o operations on a
-						     kiocb. */
+	.write		= do_sync_write,	 /* Write to file. */
+	.aio_write	= ntfs_file_aio_write,	 /* Async write to file. */
+	/*.release	= ,*/			 /* Last file is closed.  See
+						    fs/ext2/file.c::
+						    ext2_release_file() for
+						    how to use this to discard
+						    preallocated space for
+						    write opened files. */
+	.fsync		= ntfs_file_fsync,	 /* Sync a file to disk. */
+	/*.aio_fsync	= ,*/			 /* Sync all outstanding async
+						    i/o operations on a
+						    kiocb. */
 #endif /* NTFS_RW */
-	/*.ioctl	= ,*/			  /* Perform function on the
-						     mounted filesystem. */
-	.mmap		= generic_file_mmap,	  /* Mmap file. */
-	.open		= ntfs_file_open,	  /* Open file. */
-	.sendfile	= generic_file_sendfile,  /* Zero-copy data send with
-						     the data source being on
-						     the ntfs partition.  We
-						     do not need to care about
-						     the data destination. */
-	/*.sendpage	= ,*/			  /* Zero-copy data send with
-						     the data destination being
-						     on the ntfs partition.  We
-						     do not need to care about
-						     the data source. */
+	/*.ioctl	= ,*/			 /* Perform function on the
+						    mounted filesystem. */
+	.mmap		= generic_file_mmap,	 /* Mmap file. */
+	.open		= ntfs_file_open,	 /* Open file. */
+	.splice_read	= generic_file_splice_read /* Zero-copy data send with
+						    the data source being on
+						    the ntfs partition.  We do
+						    not need to care about the
+						    data destination. */
+	/*.sendpage	= ,*/			 /* Zero-copy data send with
+						    the data destination being
+						    on the ntfs partition.  We
+						    do not need to care about
+						    the data source. */
 };
 
-struct inode_operations ntfs_file_inode_ops = {
+const struct inode_operations ntfs_file_inode_ops = {
 #ifdef NTFS_RW
-	.truncate	= ntfs_truncate_vfs,
 	.setattr	= ntfs_setattr,
 #endif /* NTFS_RW */
 };
 
-struct file_operations ntfs_empty_file_ops = {};
+const struct file_operations ntfs_empty_file_ops = {};
 
-struct inode_operations ntfs_empty_inode_ops = {};
+const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 71bd2cd7a4d..096c135691a 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -1,7 +1,7 @@
 /*
  * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -19,6 +19,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/slab.h>
+
 #include "aops.h"
 #include "collate.h"
 #include "debug.h"
@@ -32,25 +34,15 @@
  * Allocate a new index context, initialize it with @idx_ni and return it.
  * Return NULL if allocation failed.
  *
- * Locking:  Caller must hold i_sem on the index inode.
+ * Locking:  Caller must hold i_mutex on the index inode.
  */
 ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
 {
 	ntfs_index_context *ictx;
 
-	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, SLAB_NOFS);
-	if (ictx) {
-		ictx->idx_ni = idx_ni;
-		ictx->entry = NULL;
-		ictx->data = NULL;
-		ictx->data_len = 0;
-		ictx->is_in_root = 0;
-		ictx->ir = NULL;
-		ictx->actx = NULL;
-		ictx->base_ni = NULL;
-		ictx->ia = NULL;
-		ictx->page = NULL;
-	}
+	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
+	if (ictx)
+		*ictx = (ntfs_index_context){ .idx_ni = idx_ni };
 	return ictx;
 }
 
@@ -60,7 +52,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
  *
  * Release the index context @ictx, releasing all associated resources.
  *
- * Locking:  Caller must hold i_sem on the index inode.
+ * Locking:  Caller must hold i_mutex on the index inode.
  */
 void ntfs_index_ctx_put(ntfs_index_context *ictx)
 {
@@ -116,7 +108,7 @@ void ntfs_index_ctx_put(ntfs_index_context *ictx)
  * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
  * ensure that the changes are written to disk.
  *
- * Locking:  - Caller must hold i_sem on the index inode.
+ * Locking:  - Caller must hold i_mutex on the index inode.
  *	     - Each page cache page in the index allocation mapping must be
  *	       locked whilst being accessed otherwise we may find a corrupt
  *	       page due to it being under ->writepage at the moment which
@@ -214,7 +206,8 @@ int ntfs_index_lookup(const void *key, const int key_len,
 		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
 				&ie->key, key_len)) {
 ir_done:
-			ictx->is_in_root = TRUE;
+			ictx->is_in_root = true;
+			ictx->ir = ir;
 			ictx->actx = actx;
 			ictx->base_ni = base_ni;
 			ictx->ia = NULL;
@@ -383,7 +376,7 @@ fast_descend_into_child_node:
 		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
 				&ie->key, key_len)) {
 ia_done:
-			ictx->is_in_root = FALSE;
+			ictx->is_in_root = false;
 			ictx->actx = NULL;
 			ictx->base_ni = NULL;
 			ictx->ia = ia;
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
index 846a489e869..8745469c398 100644
--- a/fs/ntfs/index.h
+++ b/fs/ntfs/index.h
@@ -37,12 +37,12 @@
  * @entry:	index entry (points into @ir or @ia)
  * @data:	index entry data (points into @entry)
  * @data_len:	length in bytes of @data
- * @is_in_root:	TRUE if @entry is in @ir and FALSE if it is in @ia
+ * @is_in_root:	'true' if @entry is in @ir and 'false' if it is in @ia
  * @ir:		index root if @is_in_root and NULL otherwise
  * @actx:	attribute search context if @is_in_root and NULL otherwise
  * @base_ni:	base inode if @is_in_root and NULL otherwise
- * @ia:		index block if @is_in_root is FALSE and NULL otherwise
- * @page:	page if @is_in_root is FALSE and NULL otherwise
+ * @ia:		index block if @is_in_root is 'false' and NULL otherwise
+ * @page:	page if @is_in_root is 'false' and NULL otherwise
  *
  * @idx_ni is the index inode this context belongs to.
  *
@@ -50,11 +50,11 @@
  * are the index entry data and its length in bytes, respectively.  @data
  * simply points into @entry.  This is probably what the user is interested in.
  *
- * If @is_in_root is TRUE, @entry is in the index root attribute @ir described
+ * If @is_in_root is 'true', @entry is in the index root attribute @ir described
  * by the attribute search context @actx and the base inode @base_ni.  @ia and
  * @page are NULL in this case.
  *
- * If @is_in_root is FALSE, @entry is in the index allocation attribute and @ia
+ * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
  * and @page point to the index allocation block and the mapped, locked page it
  * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
  *
@@ -77,7 +77,7 @@ typedef struct {
 	INDEX_ENTRY *entry;
 	void *data;
 	u16 data_len;
-	BOOL is_in_root;
+	bool is_in_root;
 	INDEX_ROOT *ir;
 	ntfs_attr_search_ctx *actx;
 	ntfs_inode *base_ni;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 31840ba0b38..f47af5e6e23 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1,7 +1,7 @@
 /**
  * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -19,17 +19,24 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
-#include <linux/quotaops.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/aio.h>
 
 #include "aops.h"
+#include "attrib.h"
+#include "bitmap.h"
 #include "dir.h"
 #include "debug.h"
 #include "inode.h"
-#include "attrib.h"
+#include "lcnalloc.h"
 #include "malloc.h"
 #include "mft.h"
 #include "time.h"
@@ -48,7 +55,7 @@
  *
  * Return 1 if the attributes match and 0 if not.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
  * allowed to sleep.
  */
 int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -88,11 +95,11 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
  * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
  * In that case, @na->name and @na->name_len should be set to NULL and 0,
  * respectively. Although that is not strictly necessary as
- * ntfs_read_inode_locked() will fill them in later.
+ * ntfs_read_locked_inode() will fill them in later.
  *
  * Return 0 on success and -errno on error.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
  */
 static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
@@ -130,11 +137,11 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
 
 		BUG_ON(!na->name);
 		i = na->name_len * sizeof(ntfschar);
-		ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
+		ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
 		if (!ni->name)
 			return -ENOMEM;
 		memcpy(ni->name, na->name, i);
-		ni->name[i] = 0;
+		ni->name[na->name_len] = 0;
 	}
 	return 0;
 }
@@ -164,8 +171,8 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi,
 struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
 {
 	struct inode *vi;
-	ntfs_attr na;
 	int err;
+	ntfs_attr na;
 
 	na.mft_no = mft_no;
 	na.type = AT_UNUSED;
@@ -174,7 +181,7 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
 
 	vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode,
 			(set_t)ntfs_init_locked_inode, &na);
-	if (!vi)
+	if (unlikely(!vi))
 		return ERR_PTR(-ENOMEM);
 
 	err = 0;
@@ -188,7 +195,7 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
 	 * There is no point in keeping bad inodes around if the failure was
 	 * due to ENOMEM. We want to be able to retry again later.
 	 */
-	if (err == -ENOMEM) {
+	if (unlikely(err == -ENOMEM)) {
 		iput(vi);
 		vi = ERR_PTR(err);
 	}
@@ -222,8 +229,8 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
 		ntfschar *name, u32 name_len)
 {
 	struct inode *vi;
-	ntfs_attr na;
 	int err;
+	ntfs_attr na;
 
 	/* Make sure no one calls ntfs_attr_iget() for indices. */
 	BUG_ON(type == AT_INDEX_ALLOCATION);
@@ -235,7 +242,7 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
 
 	vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
 			(set_t)ntfs_init_locked_inode, &na);
-	if (!vi)
+	if (unlikely(!vi))
 		return ERR_PTR(-ENOMEM);
 
 	err = 0;
@@ -250,7 +257,7 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
 	 * simplifies things in that we never need to check for bad attribute
 	 * inodes elsewhere.
 	 */
-	if (err) {
+	if (unlikely(err)) {
 		iput(vi);
 		vi = ERR_PTR(err);
 	}
@@ -280,8 +287,8 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 		u32 name_len)
 {
 	struct inode *vi;
-	ntfs_attr na;
 	int err;
+	ntfs_attr na;
 
 	na.mft_no = base_vi->i_ino;
 	na.type = AT_INDEX_ALLOCATION;
@@ -290,7 +297,7 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 
 	vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode,
 			(set_t)ntfs_init_locked_inode, &na);
-	if (!vi)
+	if (unlikely(!vi))
 		return ERR_PTR(-ENOMEM);
 
 	err = 0;
@@ -305,7 +312,7 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 	 * simplifies things in that we never need to check for bad index
 	 * inodes elsewhere.
 	 */
-	if (err) {
+	if (unlikely(err)) {
 		iput(vi);
 		vi = ERR_PTR(err);
 	}
@@ -317,8 +324,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
 	ntfs_inode *ni;
 
 	ntfs_debug("Entering.");
-	ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
-			SLAB_NOFS);
+	ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return VFS_I(ni);
@@ -327,6 +333,12 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
 	return NULL;
 }
 
+static void ntfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
+
 void ntfs_destroy_big_inode(struct inode *inode)
 {
 	ntfs_inode *ni = NTFS_I(inode);
@@ -335,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
 	BUG_ON(ni->page);
 	if (!atomic_dec_and_test(&ni->count))
 		BUG();
-	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+	call_rcu(&inode->i_rcu, ntfs_i_callback);
 }
 
 static inline ntfs_inode *ntfs_alloc_extent_inode(void)
@@ -343,7 +355,7 @@ static inline ntfs_inode *ntfs_alloc_extent_inode(void)
 	ntfs_inode *ni;
 
 	ntfs_debug("Entering.");
-	ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return ni;
@@ -361,6 +373,12 @@ static void ntfs_destroy_extent_inode(ntfs_inode *ni)
 	kmem_cache_free(ntfs_inode_cache, ni);
 }
 
+/*
+ * The attribute runlist lock has separate locking rules from the
+ * normal runlist lock, so split the two lock-classes:
+ */
+static struct lock_class_key attr_list_rl_lock_class;
+
 /**
  * __ntfs_init_inode - initialize ntfs specific part of an inode
  * @sb:		super block of mounted volume
@@ -376,28 +394,37 @@ static void ntfs_destroy_extent_inode(ntfs_inode *ni)
 void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
 {
 	ntfs_debug("Entering.");
+	rwlock_init(&ni->size_lock);
 	ni->initialized_size = ni->allocated_size = 0;
 	ni->seq_no = 0;
 	atomic_set(&ni->count, 1);
 	ni->vol = NTFS_SB(sb);
 	ntfs_init_runlist(&ni->runlist);
-	init_MUTEX(&ni->mrec_lock);
+	mutex_init(&ni->mrec_lock);
 	ni->page = NULL;
 	ni->page_ofs = 0;
 	ni->attr_list_size = 0;
 	ni->attr_list = NULL;
 	ntfs_init_runlist(&ni->attr_list_rl);
-	ni->itype.index.bmp_ino = NULL;
+	lockdep_set_class(&ni->attr_list_rl.lock,
+				&attr_list_rl_lock_class);
 	ni->itype.index.block_size = 0;
 	ni->itype.index.vcn_size = 0;
 	ni->itype.index.collation_rule = 0;
 	ni->itype.index.block_size_bits = 0;
 	ni->itype.index.vcn_size_bits = 0;
-	init_MUTEX(&ni->extent_lock);
+	mutex_init(&ni->extent_lock);
 	ni->nr_extents = 0;
 	ni->ext.base_ntfs_ino = NULL;
 }
 
+/*
+ * Extent inodes get MFT-mapped in a nested way, while the base inode
+ * is still mapped. Teach this nesting to the lock validator by creating
+ * a separate class for nested inode's mrec_lock's:
+ */
+static struct lock_class_key extent_inode_mrec_lock_key;
+
 inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
 		unsigned long mft_no)
 {
@@ -406,6 +433,7 @@ inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
 	ntfs_debug("Entering.");
 	if (likely(ni != NULL)) {
 		__ntfs_init_inode(sb, ni);
+		lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
 		ni->mft_no = mft_no;
 		ni->type = AT_UNUSED;
 		ni->name = NULL;
@@ -509,7 +537,7 @@ err_corrupt_attr:
  * the ntfs inode.
  *
  * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
  *    i_count is set to 1, so it is not going to go away
  *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
  *    is allowed to write to them. We should of course be honouring them but
@@ -523,7 +551,9 @@ static int ntfs_read_locked_inode(struct inode *vi)
 {
 	ntfs_volume *vol = NTFS_SB(vi->i_sb);
 	ntfs_inode *ni;
+	struct inode *bvi;
 	MFT_RECORD *m;
+	ATTR_RECORD *a;
 	STANDARD_INFORMATION *si;
 	ntfs_attr_search_ctx *ctx;
 	int err = 0;
@@ -532,8 +562,6 @@ static int ntfs_read_locked_inode(struct inode *vi)
 
 	/* Setup the generic vfs inode parts now. */
 
-	/* This is the optimal IO size (for stat), not the fs block size. */
-	vi->i_blksize = PAGE_CACHE_SIZE;
 	/*
 	 * This is for checking whether an inode has changed w.r.t. a file so
 	 * that the file can be updated if necessary (compare with f_version).
@@ -584,7 +612,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 	 * might be tricky due to vfs interactions. Need to think about this
 	 * some more when implementing the unlink command.
 	 */
-	vi->i_nlink = le16_to_cpu(m->link_count);
+	set_nlink(vi, le16_to_cpu(m->link_count));
 	/*
 	 * FIXME: Reparse points can have the directory bit set even though
 	 * they would be S_IFLNK. Need to deal with this further below when we
@@ -594,7 +622,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 	 */
 	/* Everyone gets all permissions. */
 	vi->i_mode |= S_IRWXUGO;
-	/* If read-only, noone gets write permissions. */
+	/* If read-only, no one gets write permissions. */
 	if (IS_RDONLY(vi))
 		vi->i_mode &= ~S_IWUGO;
 	if (m->flags & MFT_RECORD_IS_DIRECTORY) {
@@ -606,7 +634,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 		vi->i_mode &= ~vol->dmask;
 		/* Things break without this kludge! */
 		if (vi->i_nlink > 1)
-			vi->i_nlink = 1;
+			set_nlink(vi, 1);
 	} else {
 		vi->i_mode |= S_IFREG;
 		/* Apply the file permissions mask set in the mount options. */
@@ -632,9 +660,10 @@ static int ntfs_read_locked_inode(struct inode *vi)
 		}
 		goto unm_err_out;
 	}
+	a = ctx->attr;
 	/* Get the standard information attribute value. */
-	si = (STANDARD_INFORMATION*)((char*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
+	si = (STANDARD_INFORMATION*)((u8*)a +
+			le16_to_cpu(a->data.resident.value_offset));
 
 	/* Transfer information from the standard information into vi. */
 	/*
@@ -673,15 +702,31 @@ static int ntfs_read_locked_inode(struct inode *vi)
 			goto skip_attr_list_load;
 		ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
 		NInoSetAttrList(ni);
-		if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
-				ctx->attr->flags & ATTR_COMPRESSION_MASK ||
-				ctx->attr->flags & ATTR_IS_SPARSE) {
+		a = ctx->attr;
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			ntfs_error(vi->i_sb, "Attribute list attribute is "
-					"compressed/encrypted/sparse.");
+					"compressed.");
 			goto unm_err_out;
 		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(vi->i_sb, "Non-resident attribute "
+						"list attribute is encrypted/"
+						"sparse.");
+				goto unm_err_out;
+			}
+			ntfs_warning(vi->i_sb, "Resident attribute list "
+					"attribute in inode 0x%lx is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.",
+					vi->i_ino);
+		}
 		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
+		ni->attr_list_size = (u32)ntfs_attr_size(a);
 		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
 		if (!ni->attr_list) {
 			ntfs_error(vi->i_sb, "Not enough memory to allocate "
@@ -689,9 +734,9 @@ static int ntfs_read_locked_inode(struct inode *vi)
 			err = -ENOMEM;
 			goto unm_err_out;
 		}
-		if (ctx->attr->non_resident) {
+		if (a->non_resident) {
 			NInoSetAttrListNonResident(ni);
-			if (ctx->attr->data.non_resident.lowest_vcn) {
+			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(vi->i_sb, "Attribute list has non "
 						"zero lowest_vcn.");
 				goto unm_err_out;
@@ -701,7 +746,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
 			 * exclusive access to the inode at this time.
 			 */
 			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					ctx->attr, NULL);
+					a, NULL);
 			if (IS_ERR(ni->attr_list_rl.rl)) {
 				err = PTR_ERR(ni->attr_list_rl.rl);
 				ni->attr_list_rl.rl = NULL;
@@ -712,27 +757,26 @@ static int ntfs_read_locked_inode(struct inode *vi)
 			/* Now load the attribute list. */
 			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
 					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(ctx->attr->data.
-					non_resident.initialized_size)))) {
+					sle64_to_cpu(a->data.non_resident.
+					initialized_size)))) {
 				ntfs_error(vi->i_sb, "Failed to load "
 						"attribute list attribute.");
 				goto unm_err_out;
 			}
-		} else /* if (!ctx.attr->non_resident) */ {
-			if ((u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset) +
-					le32_to_cpu(
-					ctx->attr->data.resident.value_length) >
+		} else /* if (!a->non_resident) */ {
+			if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
+					+ le32_to_cpu(
+					a->data.resident.value_length) >
 					(u8*)ctx->mrec + vol->mft_record_size) {
 				ntfs_error(vi->i_sb, "Corrupt attribute list "
 						"in inode.");
 				goto unm_err_out;
 			}
 			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset),
+			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+					a->data.resident.value_offset),
 					le32_to_cpu(
-					ctx->attr->data.resident.value_length));
+					a->data.resident.value_length));
 		}
 	}
 skip_attr_list_load:
@@ -741,10 +785,10 @@ skip_attr_list_load:
 	 * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
 	 */
 	if (S_ISDIR(vi->i_mode)) {
-		struct inode *bvi;
+		loff_t bvi_size;
 		ntfs_inode *bni;
 		INDEX_ROOT *ir;
-		char *ir_end, *index_end;
+		u8 *ir_end, *index_end;
 
 		/* It is a directory, find index root attribute. */
 		ntfs_attr_reinit_search_ctx(ctx);
@@ -760,17 +804,16 @@ skip_attr_list_load:
 			}
 			goto unm_err_out;
 		}
+		a = ctx->attr;
 		/* Set up the state. */
-		if (unlikely(ctx->attr->non_resident)) {
+		if (unlikely(a->non_resident)) {
 			ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
 					"resident.");
 			goto unm_err_out;
 		}
 		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(ctx->attr->name_length &&
-				(le16_to_cpu(ctx->attr->name_offset) >=
-				le16_to_cpu(ctx->attr->data.resident.
-				value_offset)))) {
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(a->data.resident.value_offset)))) {
 			ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
 					"placed after the attribute value.");
 			goto unm_err_out;
@@ -781,28 +824,27 @@ skip_attr_list_load:
 		 * encrypted. However index root cannot be both compressed and
 		 * encrypted.
 		 */
-		if (ctx->attr->flags & ATTR_COMPRESSION_MASK)
+		if (a->flags & ATTR_COMPRESSION_MASK)
 			NInoSetCompressed(ni);
-		if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
-			if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
 				ntfs_error(vi->i_sb, "Found encrypted and "
 						"compressed attribute.");
 				goto unm_err_out;
 			}
 			NInoSetEncrypted(ni);
 		}
-		if (ctx->attr->flags & ATTR_IS_SPARSE)
+		if (a->flags & ATTR_IS_SPARSE)
 			NInoSetSparse(ni);
-		ir = (INDEX_ROOT*)((char*)ctx->attr + le16_to_cpu(
-				ctx->attr->data.resident.value_offset));
-		ir_end = (char*)ir + le32_to_cpu(
-				ctx->attr->data.resident.value_length);
-		if (ir_end > (char*)ctx->mrec + vol->mft_record_size) {
+		ir = (INDEX_ROOT*)((u8*)a +
+				le16_to_cpu(a->data.resident.value_offset));
+		ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
+		if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
 			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
 					"corrupt.");
 			goto unm_err_out;
 		}
-		index_end = (char*)&ir->index +
+		index_end = (u8*)&ir->index +
 				le32_to_cpu(ir->index.index_length);
 		if (index_end > ir_end) {
 			ntfs_error(vi->i_sb, "Directory index is corrupt.");
@@ -889,7 +931,8 @@ skip_attr_list_load:
 						"attribute.");
 			goto unm_err_out;
 		}
-		if (!ctx->attr->non_resident) {
+		a = ctx->attr;
+		if (!a->non_resident) {
 			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
 					"is resident.");
 			goto unm_err_out;
@@ -898,42 +941,40 @@ skip_attr_list_load:
 		 * Ensure the attribute name is placed before the mapping pairs
 		 * array.
 		 */
-		if (unlikely(ctx->attr->name_length &&
-				(le16_to_cpu(ctx->attr->name_offset) >=
-				le16_to_cpu(ctx->attr->data.non_resident.
-				mapping_pairs_offset)))) {
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset)))) {
 			ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
 					"is placed after the mapping pairs "
 					"array.");
 			goto unm_err_out;
 		}
-		if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+		if (a->flags & ATTR_IS_ENCRYPTED) {
 			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
 					"is encrypted.");
 			goto unm_err_out;
 		}
-		if (ctx->attr->flags & ATTR_IS_SPARSE) {
+		if (a->flags & ATTR_IS_SPARSE) {
 			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
 					"is sparse.");
 			goto unm_err_out;
 		}
-		if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
 					"is compressed.");
 			goto unm_err_out;
 		}
-		if (ctx->attr->data.non_resident.lowest_vcn) {
+		if (a->data.non_resident.lowest_vcn) {
 			ntfs_error(vi->i_sb, "First extent of "
 					"$INDEX_ALLOCATION attribute has non "
 					"zero lowest_vcn.");
 			goto unm_err_out;
 		}
-		vi->i_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.data_size);
+		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
 		ni->initialized_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.initialized_size);
+				a->data.non_resident.initialized_size);
 		ni->allocated_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.allocated_size);
+				a->data.non_resident.allocated_size);
 		/*
 		 * We are done with the mft record, so we release it. Otherwise
 		 * we would deadlock in ntfs_attr_iget().
@@ -949,22 +990,24 @@ skip_attr_list_load:
 			err = PTR_ERR(bvi);
 			goto unm_err_out;
 		}
-		ni->itype.index.bmp_ino = bvi;
 		bni = NTFS_I(bvi);
 		if (NInoCompressed(bni) || NInoEncrypted(bni) ||
 				NInoSparse(bni)) {
 			ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
 					"and/or encrypted and/or sparse.");
-			goto unm_err_out;
+			goto iput_unm_err_out;
 		}
 		/* Consistency check bitmap size vs. index allocation size. */
-		if ((bvi->i_size << 3) < (vi->i_size >>
+		bvi_size = i_size_read(bvi);
+		if ((bvi_size << 3) < (vi->i_size >>
 				ni->itype.index.block_size_bits)) {
 			ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
 					"for index allocation (0x%llx).",
-					bvi->i_size << 3, vi->i_size);
-			goto unm_err_out;
+					bvi_size << 3, vi->i_size);
+			goto iput_unm_err_out;
 		}
+		/* No longer need the bitmap attribute inode. */
+		iput(bvi);
 skip_large_dir_stuff:
 		/* Setup the operations for this inode. */
 		vi->i_op = &ntfs_dir_inode_ops;
@@ -1010,87 +1053,103 @@ skip_large_dir_stuff:
 			ntfs_error(vi->i_sb, "$DATA attribute is missing.");
 			goto unm_err_out;
 		}
+		a = ctx->attr;
 		/* Setup the state. */
-		if (ctx->attr->non_resident) {
-			NInoSetNonResident(ni);
-			if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+			if (a->flags & ATTR_COMPRESSION_MASK) {
 				NInoSetCompressed(ni);
 				if (vol->cluster_size > 4096) {
 					ntfs_error(vi->i_sb, "Found "
-						"compressed data but "
-						"compression is disabled due "
-						"to cluster size (%i) > 4kiB.",
-						vol->cluster_size);
+							"compressed data but "
+							"compression is "
+							"disabled due to "
+							"cluster size (%i) > "
+							"4kiB.",
+							vol->cluster_size);
 					goto unm_err_out;
 				}
-				if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
+				if ((a->flags & ATTR_COMPRESSION_MASK)
 						!= ATTR_IS_COMPRESSED) {
-					ntfs_error(vi->i_sb, "Found "
-						"unknown compression method or "
-						"corrupt file.");
+					ntfs_error(vi->i_sb, "Found unknown "
+							"compression method "
+							"or corrupt file.");
 					goto unm_err_out;
 				}
-				ni->itype.compressed.block_clusters = 1U <<
-						ctx->attr->data.non_resident.
-						compression_unit;
-				if (ctx->attr->data.non_resident.
+			}
+			if (a->flags & ATTR_IS_SPARSE)
+				NInoSetSparse(ni);
+		}
+		if (a->flags & ATTR_IS_ENCRYPTED) {
+			if (NInoCompressed(ni)) {
+				ntfs_error(vi->i_sb, "Found encrypted and "
+						"compressed data.");
+				goto unm_err_out;
+			}
+			NInoSetEncrypted(ni);
+		}
+		if (a->non_resident) {
+			NInoSetNonResident(ni);
+			if (NInoCompressed(ni) || NInoSparse(ni)) {
+				if (NInoCompressed(ni) && a->data.non_resident.
 						compression_unit != 4) {
 					ntfs_error(vi->i_sb, "Found "
-						"nonstandard compression unit "
-						"(%u instead of 4).  Cannot "
-						"handle this.",
-						ctx->attr->data.non_resident.
-						compression_unit);
+							"non-standard "
+							"compression unit (%u "
+							"instead of 4).  "
+							"Cannot handle this.",
+							a->data.non_resident.
+							compression_unit);
 					err = -EOPNOTSUPP;
 					goto unm_err_out;
 				}
-				ni->itype.compressed.block_size = 1U << (
-						ctx->attr->data.non_resident.
-						compression_unit +
-						vol->cluster_size_bits);
-				ni->itype.compressed.block_size_bits = ffs(
-					ni->itype.compressed.block_size) - 1;
-			}
-			if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
-				if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
-					ntfs_error(vi->i_sb, "Found encrypted "
-							"and compressed data.");
-					goto unm_err_out;
+				if (a->data.non_resident.compression_unit) {
+					ni->itype.compressed.block_size = 1U <<
+							(a->data.non_resident.
+							compression_unit +
+							vol->cluster_size_bits);
+					ni->itype.compressed.block_size_bits =
+							ffs(ni->itype.
+							compressed.
+							block_size) - 1;
+					ni->itype.compressed.block_clusters =
+							1U << a->data.
+							non_resident.
+							compression_unit;
+				} else {
+					ni->itype.compressed.block_size = 0;
+					ni->itype.compressed.block_size_bits =
+							0;
+					ni->itype.compressed.block_clusters =
+							0;
 				}
-				NInoSetEncrypted(ni);
+				ni->itype.compressed.size = sle64_to_cpu(
+						a->data.non_resident.
+						compressed_size);
 			}
-			if (ctx->attr->flags & ATTR_IS_SPARSE)
-				NInoSetSparse(ni);
-			if (ctx->attr->data.non_resident.lowest_vcn) {
+			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(vi->i_sb, "First extent of $DATA "
 						"attribute has non zero "
 						"lowest_vcn.");
 				goto unm_err_out;
 			}
-			/* Setup all the sizes. */
 			vi->i_size = sle64_to_cpu(
-					ctx->attr->data.non_resident.data_size);
+					a->data.non_resident.data_size);
 			ni->initialized_size = sle64_to_cpu(
-					ctx->attr->data.non_resident.
-					initialized_size);
+					a->data.non_resident.initialized_size);
 			ni->allocated_size = sle64_to_cpu(
-					ctx->attr->data.non_resident.
-					allocated_size);
-			if (NInoCompressed(ni)) {
-				ni->itype.compressed.size = sle64_to_cpu(
-						ctx->attr->data.non_resident.
-						compressed_size);
-			}
+					a->data.non_resident.allocated_size);
 		} else { /* Resident attribute. */
-			/*
-			 * Make all sizes equal for simplicity in read code
-			 * paths. FIXME: Need to keep this in mind when
-			 * converting to non-resident attribute in write code
-			 * path. (Probably only affects truncate().)
-			 */
-			vi->i_size = ni->initialized_size = ni->allocated_size =
-					le32_to_cpu(
-					ctx->attr->data.resident.value_length);
+			vi->i_size = ni->initialized_size = le32_to_cpu(
+					a->data.resident.value_length);
+			ni->allocated_size = le32_to_cpu(a->length) -
+					le16_to_cpu(
+					a->data.resident.value_offset);
+			if (vi->i_size > ni->allocated_size) {
+				ntfs_error(vi->i_sb, "Resident data attribute "
+						"is corrupt (size exceeds "
+						"allocation).");
+				goto unm_err_out;
+			}
 		}
 no_data_attr_special_case:
 		/* We are done with the mft record, so we release it. */
@@ -1117,14 +1176,14 @@ no_data_attr_special_case:
 	 * sizes of all non-resident attributes present to give us the Linux
 	 * correct size that should go into i_blocks (after division by 512).
 	 */
-	if (S_ISDIR(vi->i_mode) || !NInoCompressed(ni))
-		vi->i_blocks = ni->allocated_size >> 9;
-	else
+	if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
 		vi->i_blocks = ni->itype.compressed.size >> 9;
-
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
 	ntfs_debug("Done.");
 	return 0;
-
+iput_unm_err_out:
+	iput(bvi);
 unm_err_out:
 	if (!err)
 		err = -EIO;
@@ -1155,17 +1214,20 @@ err_out:
  * necessary fields in @vi as well as initializing the ntfs inode.
  *
  * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
  *    i_count is set to 1, so it is not going to go away
  *
  * Return 0 on success and -errno on error.  In the error case, the inode will
  * have had make_bad_inode() executed on it.
+ *
+ * Note this cannot be called for AT_INDEX_ALLOCATION.
  */
 static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 {
 	ntfs_volume *vol = NTFS_SB(vi->i_sb);
 	ntfs_inode *ni, *base_ni;
 	MFT_RECORD *m;
+	ATTR_RECORD *a;
 	ntfs_attr_search_ctx *ctx;
 	int err = 0;
 
@@ -1177,11 +1239,10 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	base_ni = NTFS_I(base_vi);
 
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
-	vi->i_nlink	= base_vi->i_nlink;
+	set_nlink(vi, base_vi->i_nlink);
 	vi->i_mtime	= base_vi->i_mtime;
 	vi->i_ctime	= base_vi->i_ctime;
 	vi->i_atime	= base_vi->i_atime;
@@ -1200,61 +1261,14 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		err = -ENOMEM;
 		goto unm_err_out;
 	}
-
 	/* Find the attribute. */
 	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 			CASE_SENSITIVE, 0, NULL, 0, ctx);
 	if (unlikely(err))
 		goto unm_err_out;
-
-	if (!ctx->attr->non_resident) {
-		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(ctx->attr->name_length &&
-				(le16_to_cpu(ctx->attr->name_offset) >=
-				le16_to_cpu(ctx->attr->data.resident.
-				value_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the attribute value.");
-			goto unm_err_out;
-		}
-		if (NInoMstProtected(ni) || ctx->attr->flags) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"or attribute with non-zero flags but "
-					"the attribute is resident.  Please "
-					"report you saw this message to "
-					"linux-ntfs-dev@lists.sourceforge.net");
-			goto unm_err_out;
-		}
-		/*
-		 * Resident attribute. Make all sizes equal for simplicity in
-		 * read code paths.
-		 */
-		vi->i_size = ni->initialized_size = ni->allocated_size =
-			le32_to_cpu(ctx->attr->data.resident.value_length);
-	} else {
-		NInoSetNonResident(ni);
-		/*
-		 * Ensure the attribute name is placed before the mapping pairs
-		 * array.
-		 */
-		if (unlikely(ctx->attr->name_length &&
-				(le16_to_cpu(ctx->attr->name_offset) >=
-				le16_to_cpu(ctx->attr->data.non_resident.
-				mapping_pairs_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the mapping pairs array.");
-			goto unm_err_out;
-		}
-		if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is compressed.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
+	a = ctx->attr;
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			NInoSetCompressed(ni);
 			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
 					ni->name_len)) {
@@ -1274,95 +1288,142 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 						vol->cluster_size);
 				goto unm_err_out;
 			}
-			if ((ctx->attr->flags & ATTR_COMPRESSION_MASK)
-					!= ATTR_IS_COMPRESSED) {
+			if ((a->flags & ATTR_COMPRESSION_MASK) !=
+					ATTR_IS_COMPRESSED) {
 				ntfs_error(vi->i_sb, "Found unknown "
 						"compression method.");
 				goto unm_err_out;
 			}
-			ni->itype.compressed.block_clusters = 1U <<
-					ctx->attr->data.non_resident.
-					compression_unit;
-			if (ctx->attr->data.non_resident.compression_unit !=
-					4) {
-				ntfs_error(vi->i_sb, "Found nonstandard "
+		}
+		/*
+		 * The compressed/sparse flag set in an index root just means
+		 * to compress all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is %s.  Please "
+					"report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net",
+					NInoCompressed(ni) ? "compressed" :
+					"sparse");
+			goto unm_err_out;
+		}
+		if (a->flags & ATTR_IS_SPARSE)
+			NInoSetSparse(ni);
+	}
+	if (a->flags & ATTR_IS_ENCRYPTED) {
+		if (NInoCompressed(ni)) {
+			ntfs_error(vi->i_sb, "Found encrypted and compressed "
+					"data.");
+			goto unm_err_out;
+		}
+		/*
+		 * The encryption flag set in an index root just means to
+		 * encrypt all files.
+		 */
+		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is encrypted.  "
+					"Please report you saw this message "
+					"to linux-ntfs-dev@lists.sourceforge."
+					"net");
+			goto unm_err_out;
+		}
+		if (ni->type != AT_DATA) {
+			ntfs_error(vi->i_sb, "Found encrypted non-data "
+					"attribute.");
+			goto unm_err_out;
+		}
+		NInoSetEncrypted(ni);
+	}
+	if (!a->non_resident) {
+		/* Ensure the attribute name is placed before the value. */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(a->data.resident.value_offset)))) {
+			ntfs_error(vol->sb, "Attribute name is placed after "
+					"the attribute value.");
+			goto unm_err_out;
+		}
+		if (NInoMstProtected(ni)) {
+			ntfs_error(vi->i_sb, "Found mst protected attribute "
+					"but the attribute is resident.  "
+					"Please report you saw this message to "
+					"linux-ntfs-dev@lists.sourceforge.net");
+			goto unm_err_out;
+		}
+		vi->i_size = ni->initialized_size = le32_to_cpu(
+				a->data.resident.value_length);
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		if (vi->i_size > ni->allocated_size) {
+			ntfs_error(vi->i_sb, "Resident attribute is corrupt "
+					"(size exceeds allocation).");
+			goto unm_err_out;
+		}
+	} else {
+		NInoSetNonResident(ni);
+		/*
+		 * Ensure the attribute name is placed before the mapping pairs
+		 * array.
+		 */
+		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+				le16_to_cpu(
+				a->data.non_resident.mapping_pairs_offset)))) {
+			ntfs_error(vol->sb, "Attribute name is placed after "
+					"the mapping pairs array.");
+			goto unm_err_out;
+		}
+		if (NInoCompressed(ni) || NInoSparse(ni)) {
+			if (NInoCompressed(ni) && a->data.non_resident.
+					compression_unit != 4) {
+				ntfs_error(vi->i_sb, "Found non-standard "
 						"compression unit (%u instead "
 						"of 4).  Cannot handle this.",
-						ctx->attr->data.non_resident.
+						a->data.non_resident.
 						compression_unit);
 				err = -EOPNOTSUPP;
 				goto unm_err_out;
 			}
-			ni->itype.compressed.block_size = 1U << (
-					ctx->attr->data.non_resident.
-					compression_unit +
-					vol->cluster_size_bits);
-			ni->itype.compressed.block_size_bits = ffs(
-				ni->itype.compressed.block_size) - 1;
-		}
-		if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
-			if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
-				ntfs_error(vi->i_sb, "Found encrypted "
-						"and compressed data.");
-				goto unm_err_out;
-			}
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is encrypted.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
-		if (ctx->attr->flags & ATTR_IS_SPARSE) {
-			if (NInoMstProtected(ni)) {
-				ntfs_error(vi->i_sb, "Found mst protected "
-						"attribute but the attribute "
-						"is sparse.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
+			if (a->data.non_resident.compression_unit) {
+				ni->itype.compressed.block_size = 1U <<
+						(a->data.non_resident.
+						compression_unit +
+						vol->cluster_size_bits);
+				ni->itype.compressed.block_size_bits =
+						ffs(ni->itype.compressed.
+						block_size) - 1;
+				ni->itype.compressed.block_clusters = 1U <<
+						a->data.non_resident.
+						compression_unit;
+			} else {
+				ni->itype.compressed.block_size = 0;
+				ni->itype.compressed.block_size_bits = 0;
+				ni->itype.compressed.block_clusters = 0;
 			}
-			NInoSetSparse(ni);
+			ni->itype.compressed.size = sle64_to_cpu(
+					a->data.non_resident.compressed_size);
 		}
-		if (ctx->attr->data.non_resident.lowest_vcn) {
+		if (a->data.non_resident.lowest_vcn) {
 			ntfs_error(vi->i_sb, "First extent of attribute has "
 					"non-zero lowest_vcn.");
 			goto unm_err_out;
 		}
-		/* Setup all the sizes. */
-		vi->i_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.data_size);
+		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
 		ni->initialized_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.initialized_size);
+				a->data.non_resident.initialized_size);
 		ni->allocated_size = sle64_to_cpu(
-				ctx->attr->data.non_resident.allocated_size);
-		if (NInoCompressed(ni)) {
-			ni->itype.compressed.size = sle64_to_cpu(
-					ctx->attr->data.non_resident.
-					compressed_size);
-		}
+				a->data.non_resident.allocated_size);
 	}
-
-	/* Setup the operations for this attribute inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	if (NInoMstProtected(ni))
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
 		vi->i_mapping->a_ops = &ntfs_aops;
-
-	if (!NInoCompressed(ni))
-		vi->i_blocks = ni->allocated_size >> 9;
-	else
+	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
 		vi->i_blocks = ni->itype.compressed.size >> 9;
-
+	else
+		vi->i_blocks = ni->allocated_size >> 9;
 	/*
-	 * Make sure the base inode doesn't go away and attach it to the
+	 * Make sure the base inode does not go away and attach it to the
 	 * attribute inode.
 	 */
 	igrab(base_vi);
@@ -1388,7 +1449,6 @@ err_out:
 			"Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
 			base_vi->i_ino);
 	make_bad_inode(vi);
-	make_bad_inode(base_vi);
 	if (err != -ENOMEM)
 		NVolSetErrors(vol);
 	return err;
@@ -1421,7 +1481,7 @@ err_out:
  * normal directory inodes.
  *
  * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
  *    i_count is set to 1, so it is not going to go away
  *
  * Return 0 on success and -errno on error.  In the error case, the inode will
@@ -1429,10 +1489,12 @@ err_out:
  */
 static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 {
+	loff_t bvi_size;
 	ntfs_volume *vol = NTFS_SB(vi->i_sb);
 	ntfs_inode *ni, *base_ni, *bni;
 	struct inode *bvi;
 	MFT_RECORD *m;
+	ATTR_RECORD *a;
 	ntfs_attr_search_ctx *ctx;
 	INDEX_ROOT *ir;
 	u8 *ir_end, *index_end;
@@ -1443,11 +1505,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	ni	= NTFS_I(vi);
 	base_ni = NTFS_I(base_vi);
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
-	vi->i_nlink	= base_vi->i_nlink;
+	set_nlink(vi, base_vi->i_nlink);
 	vi->i_mtime	= base_vi->i_mtime;
 	vi->i_ctime	= base_vi->i_ctime;
 	vi->i_atime	= base_vi->i_atime;
@@ -1474,30 +1535,31 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 					"missing.");
 		goto unm_err_out;
 	}
+	a = ctx->attr;
 	/* Set up the state. */
-	if (unlikely(ctx->attr->non_resident)) {
+	if (unlikely(a->non_resident)) {
 		ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
 		goto unm_err_out;
 	}
 	/* Ensure the attribute name is placed before the value. */
-	if (unlikely(ctx->attr->name_length &&
-			(le16_to_cpu(ctx->attr->name_offset) >=
-			le16_to_cpu(ctx->attr->data.resident.
-			value_offset)))) {
+	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+			le16_to_cpu(a->data.resident.value_offset)))) {
 		ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
 				"after the attribute value.");
 		goto unm_err_out;
 	}
-	/* Compressed/encrypted/sparse index root is not allowed. */
-	if (ctx->attr->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
+	/*
+	 * Compressed/encrypted/sparse index root is not allowed, except for
+	 * directories of course but those are not dealt with here.
+	 */
+	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
 			ATTR_IS_SPARSE)) {
 		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
 				"root attribute.");
 		goto unm_err_out;
 	}
-	ir = (INDEX_ROOT*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	ir_end = (u8*)ir + le32_to_cpu(ctx->attr->data.resident.value_length);
+	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
+	ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
 	if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
 		ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
 		goto unm_err_out;
@@ -1516,7 +1578,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	ntfs_debug("Index collation rule is 0x%x.",
 			le32_to_cpu(ir->collation_rule));
 	ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-	if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) {
+	if (!is_power_of_2(ni->itype.index.block_size)) {
 		ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
 				"two.", ni->itype.index.block_size);
 		goto unm_err_out;
@@ -1570,7 +1632,8 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 					"$INDEX_ALLOCATION attribute.");
 		goto unm_err_out;
 	}
-	if (!ctx->attr->non_resident) {
+	a = ctx->attr;
+	if (!a->non_resident) {
 		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
 				"resident.");
 		goto unm_err_out;
@@ -1578,37 +1641,36 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	/*
 	 * Ensure the attribute name is placed before the mapping pairs array.
 	 */
-	if (unlikely(ctx->attr->name_length && (le16_to_cpu(
-			ctx->attr->name_offset) >= le16_to_cpu(
-			ctx->attr->data.non_resident.mapping_pairs_offset)))) {
+	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+			le16_to_cpu(
+			a->data.non_resident.mapping_pairs_offset)))) {
 		ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
 				"placed after the mapping pairs array.");
 		goto unm_err_out;
 	}
-	if (ctx->attr->flags & ATTR_IS_ENCRYPTED) {
+	if (a->flags & ATTR_IS_ENCRYPTED) {
 		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
 				"encrypted.");
 		goto unm_err_out;
 	}
-	if (ctx->attr->flags & ATTR_IS_SPARSE) {
+	if (a->flags & ATTR_IS_SPARSE) {
 		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
 		goto unm_err_out;
 	}
-	if (ctx->attr->flags & ATTR_COMPRESSION_MASK) {
+	if (a->flags & ATTR_COMPRESSION_MASK) {
 		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
 				"compressed.");
 		goto unm_err_out;
 	}
-	if (ctx->attr->data.non_resident.lowest_vcn) {
+	if (a->data.non_resident.lowest_vcn) {
 		ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
 				"attribute has non zero lowest_vcn.");
 		goto unm_err_out;
 	}
-	vi->i_size = sle64_to_cpu(ctx->attr->data.non_resident.data_size);
+	vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
 	ni->initialized_size = sle64_to_cpu(
-			ctx->attr->data.non_resident.initialized_size);
-	ni->allocated_size = sle64_to_cpu(
-			ctx->attr->data.non_resident.allocated_size);
+			a->data.non_resident.initialized_size);
+	ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
 	/*
 	 * We are done with the mft record, so we release it.  Otherwise
 	 * we would deadlock in ntfs_attr_iget().
@@ -1632,21 +1694,18 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 		goto iput_unm_err_out;
 	}
 	/* Consistency check bitmap size vs. index allocation size. */
-	if ((bvi->i_size << 3) < (vi->i_size >>
-			ni->itype.index.block_size_bits)) {
+	bvi_size = i_size_read(bvi);
+	if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
 		ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
-				"index allocation (0x%llx).", bvi->i_size << 3,
+				"index allocation (0x%llx).", bvi_size << 3,
 				vi->i_size);
 		goto iput_unm_err_out;
 	}
-	ni->itype.index.bmp_ino = bvi;
+	iput(bvi);
 skip_large_index_stuff:
 	/* Setup the operations for this index inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	vi->i_mapping->a_ops = &ntfs_mst_aops;
 	vi->i_blocks = ni->allocated_size >> 9;
-
 	/*
 	 * Make sure the base inode doesn't go away and attach it to the
 	 * index inode.
@@ -1657,7 +1716,6 @@ skip_large_index_stuff:
 
 	ntfs_debug("Done.");
 	return 0;
-
 iput_unm_err_out:
 	iput(bvi);
 unm_err_out:
@@ -1677,6 +1735,15 @@ err_out:
 	return err;
 }
 
+/*
+ * The MFT inode has special locking, so teach the lock validator
+ * about this by splitting off the locking rules of the MFT from
+ * the locking rules of other inodes. The MFT inode can never be
+ * accessed from the VFS side (or even internally), only by the
+ * map_mft functions.
+ */
+static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
+
 /**
  * ntfs_read_inode_mount - special read_inode for mount time use only
  * @vi:		inode to read
@@ -1712,7 +1779,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 	struct buffer_head *bh;
 	ntfs_inode *ni;
 	MFT_RECORD *m = NULL;
-	ATTR_RECORD *attr;
+	ATTR_RECORD *a;
 	ntfs_attr_search_ctx *ctx;
 	unsigned int i, nr_blocks;
 	int err;
@@ -1727,10 +1794,10 @@ int ntfs_read_inode_mount(struct inode *vi)
 	/* Setup the data attribute. It is special as it is mst protected. */
 	NInoSetNonResident(ni);
 	NInoSetMstProtected(ni);
+	NInoSetSparseDisabled(ni);
 	ni->type = AT_DATA;
 	ni->name = NULL;
 	ni->name_len = 0;
-
 	/*
 	 * This sets up our little cheat allowing us to reuse the async read io
 	 * completion handler for directories.
@@ -1805,29 +1872,44 @@ int ntfs_read_inode_mount(struct inode *vi)
 	} else /* if (!err) */ {
 		ATTR_LIST_ENTRY *al_entry, *next_al_entry;
 		u8 *al_end;
+		static const char *es = "  Not allowed.  $MFT is corrupt.  "
+				"You should run chkdsk.";
 
 		ntfs_debug("Attribute list attribute found in $MFT.");
 		NInoSetAttrList(ni);
-		if (ctx->attr->flags & ATTR_IS_ENCRYPTED ||
-				ctx->attr->flags & ATTR_COMPRESSION_MASK ||
-				ctx->attr->flags & ATTR_IS_SPARSE) {
+		a = ctx->attr;
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			ntfs_error(sb, "Attribute list attribute is "
-					"compressed/encrypted/sparse. Not "
-					"allowed. $MFT is corrupt. You should "
-					"run chkdsk.");
+					"compressed.%s", es);
 			goto put_err_out;
 		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(sb, "Non-resident attribute list "
+						"attribute is encrypted/"
+						"sparse.%s", es);
+				goto put_err_out;
+			}
+			ntfs_warning(sb, "Resident attribute list attribute "
+					"in $MFT system file is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.");
+		}
 		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr);
+		ni->attr_list_size = (u32)ntfs_attr_size(a);
 		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
 		if (!ni->attr_list) {
 			ntfs_error(sb, "Not enough memory to allocate buffer "
 					"for attribute list.");
 			goto put_err_out;
 		}
-		if (ctx->attr->non_resident) {
+		if (a->non_resident) {
 			NInoSetAttrListNonResident(ni);
-			if (ctx->attr->data.non_resident.lowest_vcn) {
+			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(sb, "Attribute list has non zero "
 						"lowest_vcn. $MFT is corrupt. "
 						"You should run chkdsk.");
@@ -1835,7 +1917,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 			}
 			/* Setup the runlist. */
 			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					ctx->attr, NULL);
+					a, NULL);
 			if (IS_ERR(ni->attr_list_rl.rl)) {
 				err = PTR_ERR(ni->attr_list_rl.rl);
 				ni->attr_list_rl.rl = NULL;
@@ -1847,7 +1929,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 			/* Now load the attribute list. */
 			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
 					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(ctx->attr->data.
+					sle64_to_cpu(a->data.
 					non_resident.initialized_size)))) {
 				ntfs_error(sb, "Failed to load attribute list "
 						"attribute with error code %i.",
@@ -1855,20 +1937,20 @@ int ntfs_read_inode_mount(struct inode *vi)
 				goto put_err_out;
 			}
 		} else /* if (!ctx.attr->non_resident) */ {
-			if ((u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset) +
+			if ((u8*)a + le16_to_cpu(
+					a->data.resident.value_offset) +
 					le32_to_cpu(
-					ctx->attr->data.resident.value_length) >
+					a->data.resident.value_length) >
 					(u8*)ctx->mrec + vol->mft_record_size) {
 				ntfs_error(sb, "Corrupt attribute list "
 						"attribute.");
 				goto put_err_out;
 			}
 			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset),
+			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+					a->data.resident.value_offset),
 					le32_to_cpu(
-					ctx->attr->data.resident.value_length));
+					a->data.resident.value_length));
 		}
 		/* The attribute list is now setup in memory. */
 		/*
@@ -1899,8 +1981,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 				goto em_put_err_out;
 			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
 					le16_to_cpu(al_entry->length));
-			if (le32_to_cpu(al_entry->type) >
-					const_le32_to_cpu(AT_DATA))
+			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
 				goto em_put_err_out;
 			if (AT_DATA != al_entry->type)
 				continue;
@@ -1934,25 +2015,25 @@ int ntfs_read_inode_mount(struct inode *vi)
 	ntfs_attr_reinit_search_ctx(ctx);
 
 	/* Now load all attribute extents. */
-	attr = NULL;
+	a = NULL;
 	next_vcn = last_vcn = highest_vcn = 0;
 	while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
 			ctx))) {
 		runlist_element *nrl;
 
 		/* Cache the current attribute. */
-		attr = ctx->attr;
+		a = ctx->attr;
 		/* $MFT must be non-resident. */
-		if (!attr->non_resident) {
+		if (!a->non_resident) {
 			ntfs_error(sb, "$MFT must be non-resident but a "
 					"resident extent was found. $MFT is "
 					"corrupt. Run chkdsk.");
 			goto put_err_out;
 		}
 		/* $MFT must be uncompressed and unencrypted. */
-		if (attr->flags & ATTR_COMPRESSION_MASK ||
-				attr->flags & ATTR_IS_ENCRYPTED ||
-				attr->flags & ATTR_IS_SPARSE) {
+		if (a->flags & ATTR_COMPRESSION_MASK ||
+				a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
 			ntfs_error(sb, "$MFT must be uncompressed, "
 					"non-sparse, and unencrypted but a "
 					"compressed/sparse/encrypted extent "
@@ -1966,7 +2047,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 		 * as we have exclusive access to the inode at this time and we
 		 * are a mount in progress task, too.
 		 */
-		nrl = ntfs_mapping_pairs_decompress(vol, attr, ni->runlist.rl);
+		nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
 		if (IS_ERR(nrl)) {
 			ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
 					"failed with error code %ld.  $MFT is "
@@ -1977,7 +2058,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 
 		/* Are we in the first extent? */
 		if (!next_vcn) {
-			if (attr->data.non_resident.lowest_vcn) {
+			if (a->data.non_resident.lowest_vcn) {
 				ntfs_error(sb, "First extent of $DATA "
 						"attribute has non zero "
 						"lowest_vcn. $MFT is corrupt. "
@@ -1986,15 +2067,15 @@ int ntfs_read_inode_mount(struct inode *vi)
 			}
 			/* Get the last vcn in the $DATA attribute. */
 			last_vcn = sle64_to_cpu(
-					attr->data.non_resident.allocated_size)
+					a->data.non_resident.allocated_size)
 					>> vol->cluster_size_bits;
 			/* Fill in the inode size. */
 			vi->i_size = sle64_to_cpu(
-					attr->data.non_resident.data_size);
-			ni->initialized_size = sle64_to_cpu(attr->data.
-					non_resident.initialized_size);
+					a->data.non_resident.data_size);
+			ni->initialized_size = sle64_to_cpu(
+					a->data.non_resident.initialized_size);
 			ni->allocated_size = sle64_to_cpu(
-					attr->data.non_resident.allocated_size);
+					a->data.non_resident.allocated_size);
 			/*
 			 * Verify the number of mft records does not exceed
 			 * 2^32 - 1.
@@ -2042,7 +2123,8 @@ int ntfs_read_inode_mount(struct inode *vi)
 			 * ntfs_read_inode() will have set up the default ones.
 			 */
 			/* Set uid and gid to root. */
-			vi->i_uid = vi->i_gid = 0;
+			vi->i_uid = GLOBAL_ROOT_UID;
+			vi->i_gid = GLOBAL_ROOT_GID;
 			/* Regular file. No access for anyone. */
 			vi->i_mode = S_IFREG;
 			/* No VFS initiated operations allowed for $MFT. */
@@ -2051,7 +2133,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 		}
 
 		/* Get the lowest vcn for the next extent. */
-		highest_vcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
 		next_vcn = highest_vcn + 1;
 
 		/* Only one extent or error, which we catch below. */
@@ -2060,7 +2142,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 
 		/* Avoid endless loops due to corruption. */
 		if (next_vcn < sle64_to_cpu(
-				attr->data.non_resident.lowest_vcn)) {
+				a->data.non_resident.lowest_vcn)) {
 			ntfs_error(sb, "$MFT has corrupt attribute list "
 					"attribute. Run chkdsk.");
 			goto put_err_out;
@@ -2071,7 +2153,7 @@ int ntfs_read_inode_mount(struct inode *vi)
 				"$MFT is corrupt. Run chkdsk.");
 		goto put_err_out;
 	}
-	if (!attr) {
+	if (!a) {
 		ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
 				"corrupt. Run chkdsk.");
 		goto put_err_out;
@@ -2088,6 +2170,14 @@ int ntfs_read_inode_mount(struct inode *vi)
 	ntfs_attr_put_search_ctx(ctx);
 	ntfs_debug("Done.");
 	ntfs_free(m);
+
+	/*
+	 * Split the locking rules of the MFT inode from the
+	 * locking rules of other inodes:
+	 */
+	lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
+	lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
+
 	return 0;
 
 em_put_err_out:
@@ -2102,37 +2192,6 @@ err_out:
 	return -1;
 }
 
-/**
- * ntfs_put_inode - handler for when the inode reference count is decremented
- * @vi:		vfs inode
- *
- * The VFS calls ntfs_put_inode() every time the inode reference count (i_count)
- * is about to be decremented (but before the decrement itself.
- *
- * If the inode @vi is a directory with two references, one of which is being
- * dropped, we need to put the attribute inode for the directory index bitmap,
- * if it is present, otherwise the directory inode would remain pinned for
- * ever.
- */
-void ntfs_put_inode(struct inode *vi)
-{
-	if (S_ISDIR(vi->i_mode) && atomic_read(&vi->i_count) == 2) {
-		ntfs_inode *ni = NTFS_I(vi);
-		if (NInoIndexAllocPresent(ni)) {
-			struct inode *bvi = NULL;
-			down(&vi->i_sem);
-			if (atomic_read(&vi->i_count) == 2) {
-				bvi = ni->itype.index.bmp_ino;
-				if (bvi)
-					ni->itype.index.bmp_ino = NULL;
-			}
-			up(&vi->i_sem);
-			if (bvi)
-				iput(bvi);
-		}
-	}
-}
-
 static void __ntfs_clear_inode(ntfs_inode *ni)
 {
 	/* Free all alocated memory. */
@@ -2185,7 +2244,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
 }
 
 /**
- * ntfs_clear_big_inode - clean up the ntfs specific part of an inode
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
  * @vi:		vfs inode pending annihilation
  *
  * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2194,25 +2253,16 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
  *
  * If the MFT record is dirty, we commit it before doing anything else.
  */
-void ntfs_clear_big_inode(struct inode *vi)
+void ntfs_evict_big_inode(struct inode *vi)
 {
 	ntfs_inode *ni = NTFS_I(vi);
 
-	/*
-	 * If the inode @vi is an index inode we need to put the attribute
-	 * inode for the index bitmap, if it is present, otherwise the index
-	 * inode would disappear and the attribute inode for the index bitmap
-	 * would no longer be referenced from anywhere and thus it would remain
-	 * pinned for ever.
-	 */
-	if (NInoAttr(ni) && (ni->type == AT_INDEX_ALLOCATION) &&
-			NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) {
-		iput(ni->itype.index.bmp_ino);
-		ni->itype.index.bmp_ino = NULL;
-	}
+	truncate_inode_pages_final(&vi->i_data);
+	clear_inode(vi);
+
 #ifdef NTFS_RW
 	if (NInoDirty(ni)) {
-		BOOL was_bad = (is_bad_inode(vi));
+		bool was_bad = (is_bad_inode(vi));
 
 		/* Committing the inode also commits all extent inodes. */
 		ntfs_commit_inode(vi);
@@ -2250,20 +2300,20 @@ void ntfs_clear_big_inode(struct inode *vi)
 /**
  * ntfs_show_options - show mount options in /proc/mounts
  * @sf:		seq_file in which to write our mount options
- * @mnt:	vfs mount whose mount options to display
+ * @root:	root of the mounted tree whose mount options to display
  *
  * Called by the VFS once for each mounted ntfs volume when someone reads
  * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of the vfs mount @mnt are written to the seq file
+ * mount. The mount options of fs specified by @root are written to the seq file
  * @sf and success is returned.
  */
-int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
+int ntfs_show_options(struct seq_file *sf, struct dentry *root)
 {
-	ntfs_volume *vol = NTFS_SB(mnt->mnt_sb);
+	ntfs_volume *vol = NTFS_SB(root->d_sb);
 	int i;
 
-	seq_printf(sf, ",uid=%i", vol->uid);
-	seq_printf(sf, ",gid=%i", vol->gid);
+	seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
+	seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
 	if (vol->fmask == vol->dmask)
 		seq_printf(sf, ",umask=0%o", vol->fmask);
 	else {
@@ -2275,6 +2325,8 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
 		seq_printf(sf, ",case_sensitive");
 	if (NVolShowSystemFiles(vol))
 		seq_printf(sf, ",show_sys_files");
+	if (!NVolSparseEnabled(vol))
+		seq_printf(sf, ",disable_sparse");
 	for (i = 0; on_errors_arr[i].val; i++) {
 		if (on_errors_arr[i].val & vol->on_errors)
 			seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
@@ -2285,11 +2337,16 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
 
 #ifdef NTFS_RW
 
+static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
+		"chkdsk.";
+
 /**
  * ntfs_truncate - called when the i_size of an ntfs inode is changed
  * @vi:		inode for which the i_size was changed
  *
- * We do not support i_size changes yet.
+ * We only support i_size changes for normal files at present, i.e. not
+ * compressed and not encrypted.  This is enforced in ntfs_setattr(), see
+ * below.
  *
  * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
  * that the change is allowed.
@@ -2299,79 +2356,505 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
  *
  * Returns 0 on success or -errno on error.
  *
- * Called with ->i_sem held.  In all but one case ->i_alloc_sem is held for
- * writing.  The only case where ->i_alloc_sem is not held is
- * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
- * with the current i_size as the offset which means that it is a noop as far
- * as ntfs_truncate() is concerned.
+ * Called with ->i_mutex held.
  */
 int ntfs_truncate(struct inode *vi)
 {
-	ntfs_inode *ni = NTFS_I(vi);
+	s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
+	VCN highest_vcn;
+	unsigned long flags;
+	ntfs_inode *base_ni, *ni = NTFS_I(vi);
 	ntfs_volume *vol = ni->vol;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *m;
+	ATTR_RECORD *a;
 	const char *te = "  Leaving file length out of sync with i_size.";
-	int err;
+	int err, mp_size, size_change, alloc_change;
+	u32 attr_len;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
 	BUG_ON(NInoAttr(ni));
+	BUG_ON(S_ISDIR(vi->i_mode));
+	BUG_ON(NInoMstProtected(ni));
 	BUG_ON(ni->nr_extents < 0);
-	m = map_mft_record(ni);
+retry_truncate:
+	/*
+	 * Lock the runlist for writing and map the mft record to ensure it is
+	 * safe to mess with the attribute runlist and sizes.
+	 */
+	down_write(&ni->runlist.lock);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	m = map_mft_record(base_ni);
 	if (IS_ERR(m)) {
 		err = PTR_ERR(m);
 		ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
 				"(error code %d).%s", vi->i_ino, err, te);
 		ctx = NULL;
 		m = NULL;
-		goto err_out;
+		goto old_bad_out;
 	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
 	if (unlikely(!ctx)) {
 		ntfs_error(vi->i_sb, "Failed to allocate a search context for "
 				"inode 0x%lx (not enough memory).%s",
 				vi->i_ino, te);
 		err = -ENOMEM;
-		goto err_out;
+		goto old_bad_out;
 	}
 	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 			CASE_SENSITIVE, 0, NULL, 0, ctx);
 	if (unlikely(err)) {
-		if (err == -ENOENT)
+		if (err == -ENOENT) {
 			ntfs_error(vi->i_sb, "Open attribute is missing from "
 					"mft record.  Inode 0x%lx is corrupt.  "
-					"Run chkdsk.", vi->i_ino);
-		else
+					"Run chkdsk.%s", vi->i_ino, te);
+			err = -EIO;
+		} else
 			ntfs_error(vi->i_sb, "Failed to lookup attribute in "
-					"inode 0x%lx (error code %d).",
-					vi->i_ino, err);
-		goto err_out;
+					"inode 0x%lx (error code %d).%s",
+					vi->i_ino, err, te);
+		goto old_bad_out;
 	}
-	/* If the size has not changed there is nothing to do. */
-	if (ntfs_attr_size(ctx->attr) == i_size_read(vi))
-		goto done;
-	// TODO: Implement the truncate...
-	ntfs_error(vi->i_sb, "Inode size has changed but this is not "
-			"implemented yet.  Resetting inode size to old value. "
-			" This is most likely a bug in the ntfs driver!");
-	i_size_write(vi, ntfs_attr_size(ctx->attr)); 
-done:
+	m = ctx->mrec;
+	a = ctx->attr;
+	/*
+	 * The i_size of the vfs inode is the new size for the attribute value.
+	 */
+	new_size = i_size_read(vi);
+	/* The current size of the attribute value is the old size. */
+	old_size = ntfs_attr_size(a);
+	/* Calculate the new allocated size. */
+	if (NInoNonResident(ni))
+		new_alloc_size = (new_size + vol->cluster_size - 1) &
+				~(s64)vol->cluster_size_mask;
+	else
+		new_alloc_size = (new_size + 7) & ~7;
+	/* The current allocated size is the old allocated size. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	old_alloc_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * The change in the file size.  This will be 0 if no change, >0 if the
+	 * size is growing, and <0 if the size is shrinking.
+	 */
+	size_change = -1;
+	if (new_size - old_size >= 0) {
+		size_change = 1;
+		if (new_size == old_size)
+			size_change = 0;
+	}
+	/* As above for the allocated size. */
+	alloc_change = -1;
+	if (new_alloc_size - old_alloc_size >= 0) {
+		alloc_change = 1;
+		if (new_alloc_size == old_alloc_size)
+			alloc_change = 0;
+	}
+	/*
+	 * If neither the size nor the allocation are being changed there is
+	 * nothing to do.
+	 */
+	if (!size_change && !alloc_change)
+		goto unm_done;
+	/* If the size is changing, check if new size is allowed in $AttrDef. */
+	if (size_change) {
+		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+		if (unlikely(err)) {
+			if (err == -ERANGE) {
+				ntfs_error(vol->sb, "Truncate would cause the "
+						"inode 0x%lx to %simum size "
+						"for its attribute type "
+						"(0x%x).  Aborting truncate.",
+						vi->i_ino,
+						new_size > old_size ? "exceed "
+						"the max" : "go under the min",
+						le32_to_cpu(ni->type));
+				err = -EFBIG;
+			} else {
+				ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+						"attribute type 0x%x.  "
+						"Aborting truncate.",
+						vi->i_ino,
+						le32_to_cpu(ni->type));
+				err = -EIO;
+			}
+			/* Reset the vfs inode size to the old size. */
+			i_size_write(vi, old_size);
+			goto err_out;
+		}
+	}
+	if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+		ntfs_warning(vi->i_sb, "Changes in inode size are not "
+				"supported yet for %s files, ignoring.",
+				NInoCompressed(ni) ? "compressed" :
+				"encrypted");
+		err = -EOPNOTSUPP;
+		goto bad_out;
+	}
+	if (a->non_resident)
+		goto do_non_resident_truncate;
+	BUG_ON(NInoNonResident(ni));
+	/* Resize the attribute record to best fit the new attribute size. */
+	if (new_size < vol->mft_record_size &&
+			!ntfs_resident_attr_value_resize(m, a, new_size)) {
+		/* The resize succeeded! */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		write_lock_irqsave(&ni->size_lock, flags);
+		/* Update the sizes in the ntfs inode and all is done. */
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		/*
+		 * Note ntfs_resident_attr_value_resize() has already done any
+		 * necessary data clearing in the attribute record.  When the
+		 * file is being shrunk vmtruncate() will already have cleared
+		 * the top part of the last partial page, i.e. since this is
+		 * the resident case this is the page with index 0.  However,
+		 * when the file is being expanded, the page cache page data
+		 * between the old data_size, i.e. old_size, and the new_size
+		 * has not been zeroed.  Fortunately, we do not need to zero it
+		 * either since on one hand it will either already be zero due
+		 * to both readpage and writepage clearing partial page data
+		 * beyond i_size in which case there is nothing to do or in the
+		 * case of the file being mmap()ped at the same time, POSIX
+		 * specifies that the behaviour is unspecified thus we do not
+		 * have to do anything.  This means that in our implementation
+		 * in the rare case that the file is mmap()ped and a write
+		 * occurred into the mmap()ped region just beyond the file size
+		 * and writepage has not yet been called to write out the page
+		 * (which would clear the area beyond the file size) and we now
+		 * extend the file size to incorporate this dirty region
+		 * outside the file size, a write of the page would result in
+		 * this data being written to disk instead of being cleared.
+		 * Given both POSIX and the Linux mmap(2) man page specify that
+		 * this corner case is undefined, we choose to leave it like
+		 * that as this is much simpler for us as we cannot lock the
+		 * relevant page now since we are holding too many ntfs locks
+		 * which would result in a lock reversal deadlock.
+		 */
+		ni->initialized_size = new_size;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		goto unm_done;
+	}
+	/* If the above resize failed, this must be an attribute extension. */
+	BUG_ON(size_change < 0);
+	/*
+	 * We have to drop all the locks so we can call
+	 * ntfs_attr_make_non_resident().  This could be optimised by try-
+	 * locking the first page cache page and only if that fails dropping
+	 * the locks, locking the page, and redoing all the locking and
+	 * lookups.  While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path as it only ever can happen
+	 * once for any given file.
+	 */
 	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-	NInoClearTruncateFailed(ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err != -ENOMEM) {
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	/*
+	 * Not enough space in the mft record, try to make the attribute
+	 * non-resident and if successful restart the truncation process.
+	 */
+	err = ntfs_attr_make_non_resident(ni, old_size);
+	if (likely(!err))
+		goto retry_truncate;
+	/*
+	 * Could not make non-resident.  If this is due to this not being
+	 * permitted for this attribute type or there not being enough space,
+	 * try to make other attributes non-resident.  Otherwise fail.
+	 */
+	if (unlikely(err != -EPERM && err != -ENOSPC)) {
+		ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
+				"type 0x%x, because the conversion from "
+				"resident to non-resident attribute failed "
+				"with error code %i.", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		goto conv_err_out;
+	}
+	/* TODO: Not implemented from here, abort. */
+	if (err == -ENOSPC)
+		ntfs_error(vol->sb, "Not enough space in the mft record/on "
+				"disk for the non-resident attribute value.  "
+				"This case is not implemented yet.");
+	else /* if (err == -EPERM) */
+		ntfs_error(vol->sb, "This attribute type may not be "
+				"non-resident.  This case is not implemented "
+				"yet.");
+	err = -EOPNOTSUPP;
+	goto conv_err_out;
+#if 0
+	// TODO: Attempt to make other attributes non-resident.
+	if (!err)
+		goto do_resident_extend;
+	/*
+	 * Both the attribute list attribute and the standard information
+	 * attribute must remain in the base inode.  Thus, if this is one of
+	 * these attributes, we have to try to move other attributes out into
+	 * extent mft records instead.
+	 */
+	if (ni->type == AT_ATTRIBUTE_LIST ||
+			ni->type == AT_STANDARD_INFORMATION) {
+		// TODO: Attempt to move other attributes into extent mft
+		// records.
+		err = -EOPNOTSUPP;
+		if (!err)
+			goto do_resident_extend;
+		goto err_out;
+	}
+	// TODO: Attempt to move this attribute to an extent mft record, but
+	// only if it is not already the only attribute in an mft record in
+	// which case there would be nothing to gain.
+	err = -EOPNOTSUPP;
+	if (!err)
+		goto do_resident_extend;
+	/* There is nothing we can do to make enough space. )-: */
+	goto err_out;
+#endif
+do_non_resident_truncate:
+	BUG_ON(!NInoNonResident(ni));
+	if (alloc_change < 0) {
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		if (highest_vcn > 0 &&
+				old_alloc_size >> vol->cluster_size_bits >
+				highest_vcn + 1) {
+			/*
+			 * This attribute has multiple extents.  Not yet
+			 * supported.
+			 */
+			ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
+					"attribute type 0x%x, because the "
+					"attribute is highly fragmented (it "
+					"consists of multiple extents) and "
+					"this case is not implemented yet.",
+					vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type));
+			err = -EOPNOTSUPP;
+			goto bad_out;
+		}
+	}
+	/*
+	 * If the size is shrinking, need to reduce the initialized_size and
+	 * the data_size before reducing the allocation.
+	 */
+	if (size_change < 0) {
+		/*
+		 * Make the valid size smaller (i_size is already up-to-date).
+		 */
+		write_lock_irqsave(&ni->size_lock, flags);
+		if (new_size < ni->initialized_size) {
+			ni->initialized_size = new_size;
+			a->data.non_resident.initialized_size =
+					cpu_to_sle64(new_size);
+		}
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		/* If the allocated size is not changing, we are done. */
+		if (!alloc_change)
+			goto unm_done;
+		/*
+		 * If the size is shrinking it makes no sense for the
+		 * allocation to be growing.
+		 */
+		BUG_ON(alloc_change > 0);
+	} else /* if (size_change >= 0) */ {
+		/*
+		 * The file size is growing or staying the same but the
+		 * allocation can be shrinking, growing or staying the same.
+		 */
+		if (alloc_change > 0) {
+			/*
+			 * We need to extend the allocation and possibly update
+			 * the data size.  If we are updating the data size,
+			 * since we are not touching the initialized_size we do
+			 * not need to worry about the actual data on disk.
+			 * And as far as the page cache is concerned, there
+			 * will be no pages beyond the old data size and any
+			 * partial region in the last page between the old and
+			 * new data size (or the end of the page if the new
+			 * data size is outside the page) does not need to be
+			 * modified as explained above for the resident
+			 * attribute truncate case.  To do this, we simply drop
+			 * the locks we hold and leave all the work to our
+			 * friendly helper ntfs_attr_extend_allocation().
+			 */
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(base_ni);
+			up_write(&ni->runlist.lock);
+			err = ntfs_attr_extend_allocation(ni, new_size,
+					size_change > 0 ? new_size : -1, -1);
+			/*
+			 * ntfs_attr_extend_allocation() will have done error
+			 * output already.
+			 */
+			goto done;
+		}
+		if (!alloc_change)
+			goto alloc_done;
+	}
+	/* alloc_change < 0 */
+	/* Free the clusters. */
+	nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
+			vol->cluster_size_bits, -1, ctx);
+	m = ctx->mrec;
+	a = ctx->attr;
+	if (unlikely(nr_freed < 0)) {
+		ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
+				"%lli).  Unmount and run chkdsk to recover "
+				"the lost cluster(s).", (long long)nr_freed);
 		NVolSetErrors(vol);
-		make_bad_inode(vi);
+		nr_freed = 0;
+	}
+	/* Truncate the runlist. */
+	err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
+			new_alloc_size >> vol->cluster_size_bits);
+	/*
+	 * If the runlist truncation failed and/or the search context is no
+	 * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array thus we mark the inode bad so that no access to
+	 * the freed clusters can happen.
+	 */
+	if (unlikely(err || IS_ERR(m))) {
+		ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
+				IS_ERR(m) ?
+				"restore attribute search context" :
+				"truncate attribute runlist",
+				IS_ERR(m) ? PTR_ERR(m) : err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Get the size for the shrunk mapping pairs array for the runlist. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
+	if (unlikely(mp_size <= 0)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because determining the "
+				"size for the mapping pairs failed with error "
+				"code %i.%s", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), mp_size, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/*
+	 * Shrink the attribute record for the new mapping pairs array.  Note,
+	 * this cannot fail since we are making the attribute smaller thus by
+	 * definition there is enough space to do so.
+	 */
+	attr_len = le32_to_cpu(a->length);
+	err = ntfs_attr_record_resize(m, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	BUG_ON(err);
+	/*
+	 * Generate the mapping pairs array directly into the attribute record.
+	 */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, ni->runlist.rl, 0, -1, NULL);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because building the "
+				"mapping pairs failed with error code %i.%s",
+				vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+				err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Update the allocated/compressed size as well as the highest vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+			vol->cluster_size_bits) - 1);
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_alloc_size;
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		if (nr_freed) {
+			ni->itype.compressed.size -= nr_freed <<
+					vol->cluster_size_bits;
+			BUG_ON(ni->itype.compressed.size < 0);
+			a->data.non_resident.compressed_size = cpu_to_sle64(
+					ni->itype.compressed.size);
+			vi->i_blocks = ni->itype.compressed.size >> 9;
+		}
+	} else
+		vi->i_blocks = new_alloc_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * We have shrunk the allocation.  If this is a shrinking truncate we
+	 * have already dealt with the initialized_size and the data_size above
+	 * and we are done.  If the truncate is only changing the allocation
+	 * and not the data_size, we are also done.  If this is an extending
+	 * truncate, need to extend the data_size now which is ensured by the
+	 * fact that @size_change is positive.
+	 */
+alloc_done:
+	/*
+	 * If the size is growing, need to update it now.  If it is shrinking,
+	 * we have already updated it above (before the allocation change).
+	 */
+	if (size_change > 0)
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+	/* Ensure the modified mft record is written out. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+unm_done:
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+done:
+	/* Update the mtime and ctime on the base inode. */
+	/* normally ->truncate shouldn't update ctime or mtime,
+	 * but ntfs did before so it got a copy & paste version
+	 * of file_update_time.  one day someone should fix this
+	 * for real.
+	 */
+	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
+		struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb);
+		int sync_it = 0;
+
+		if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+		    !timespec_equal(&VFS_I(base_ni)->i_ctime, &now))
+			sync_it = 1;
+		VFS_I(base_ni)->i_mtime = now;
+		VFS_I(base_ni)->i_ctime = now;
+
+		if (sync_it)
+			mark_inode_dirty_sync(VFS_I(base_ni));
+	}
+
+	if (likely(!err)) {
+		NInoClearTruncateFailed(ni);
+		ntfs_debug("Done.");
 	}
+	return err;
+old_bad_out:
+	old_size = -1;
+bad_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP)
+		NVolSetErrors(vol);
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else if (old_size >= 0)
+		i_size_write(vi, old_size);
+err_out:
 	if (ctx)
 		ntfs_attr_put_search_ctx(ctx);
 	if (m)
-		unmap_mft_record(ni);
-	NInoSetTruncateFailed(ni);
+		unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+out:
+	ntfs_debug("Failed.  Returning error code %i.", err);
 	return err;
+conv_err_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP)
+		NVolSetErrors(vol);
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else
+		i_size_write(vi, old_size);
+	goto out;
 }
 
 /**
@@ -2382,9 +2865,11 @@ err_out:
  *
  * See ntfs_truncate() description above for details.
  */
+#ifdef NTFS_RW
 void ntfs_truncate_vfs(struct inode *vi) {
 	ntfs_truncate(vi);
 }
+#endif
 
 /**
  * ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2398,11 +2883,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
  * We also abort all changes of user, group, and mode as we do not implement
  * the NTFS ACLs yet.
  *
- * Called with ->i_sem held.  For the ATTR_SIZE (i.e. ->truncate) case, also
- * called with ->i_alloc_sem held for writing.
- *
- * Basically this is a copy of generic notify_change() and inode_setattr()
- * functionality, except we intercept and abort changes in i_size.
+ * Called with ->i_mutex held.
  */
 int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -2412,8 +2893,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	err = inode_change_ok(vi, attr);
 	if (err)
-		return err;
-
+		goto out;
 	/* We do not support NTFS ACLs yet. */
 	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
 		ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
@@ -2421,14 +2901,24 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 		err = -EOPNOTSUPP;
 		goto out;
 	}
-
 	if (ia_valid & ATTR_SIZE) {
 		if (attr->ia_size != i_size_read(vi)) {
-			ntfs_warning(vi->i_sb, "Changes in inode size are not "
-					"supported yet, ignoring.");
-			err = -EOPNOTSUPP;
-			// TODO: Implement...
-			// err = vmtruncate(vi, attr->ia_size);
+			ntfs_inode *ni = NTFS_I(vi);
+			/*
+			 * FIXME: For now we do not support resizing of
+			 * compressed or encrypted files yet.
+			 */
+			if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+				ntfs_warning(vi->i_sb, "Changes in inode size "
+						"are not supported yet for "
+						"%s files, ignoring.",
+						NInoCompressed(ni) ?
+						"compressed" : "encrypted");
+				err = -EOPNOTSUPP;
+			} else {
+				truncate_setsize(vi, attr->ia_size);
+				ntfs_truncate_vfs(vi);
+			}
 			if (err || ia_valid == ATTR_SIZE)
 				goto out;
 		} else {
@@ -2436,16 +2926,18 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
 			 * We skipped the truncate but must still update
 			 * timestamps.
 			 */
-			ia_valid |= ATTR_MTIME|ATTR_CTIME;
+			ia_valid |= ATTR_MTIME | ATTR_CTIME;
 		}
 	}
-
 	if (ia_valid & ATTR_ATIME)
-		vi->i_atime = attr->ia_atime;
+		vi->i_atime = timespec_trunc(attr->ia_atime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_MTIME)
-		vi->i_mtime = attr->ia_mtime;
+		vi->i_mtime = timespec_trunc(attr->ia_mtime,
+				vi->i_sb->s_time_gran);
 	if (ia_valid & ATTR_CTIME)
-		vi->i_ctime = attr->ia_ctime;
+		vi->i_ctime = timespec_trunc(attr->ia_ctime,
+				vi->i_sb->s_time_gran);
 	mark_inode_dirty(vi);
 out:
 	return err;
@@ -2469,7 +2961,7 @@ out:
  *
  * Return 0 on success and -errno on error.
  */
-int ntfs_write_inode(struct inode *vi, int sync)
+int __ntfs_write_inode(struct inode *vi, int sync)
 {
 	sle64 nt;
 	ntfs_inode *ni = NTFS_I(vi);
@@ -2477,7 +2969,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	MFT_RECORD *m;
 	STANDARD_INFORMATION *si;
 	int err = 0;
-	BOOL modified = FALSE;
+	bool modified = false;
 
 	ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
 			vi->i_ino);
@@ -2515,29 +3007,29 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	nt = utc2ntfs(vi->i_mtime);
 	if (si->last_data_change_time != nt) {
 		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
+				"new = 0x%llx", vi->i_ino, (long long)
 				sle64_to_cpu(si->last_data_change_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(nt));
 		si->last_data_change_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	nt = utc2ntfs(vi->i_ctime);
 	if (si->last_mft_change_time != nt) {
 		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
+				"new = 0x%llx", vi->i_ino, (long long)
 				sle64_to_cpu(si->last_mft_change_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(nt));
 		si->last_mft_change_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	nt = utc2ntfs(vi->i_atime);
 	if (si->last_access_time != nt) {
 		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino,
-				sle64_to_cpu(si->last_access_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(si->last_access_time),
+				(long long)sle64_to_cpu(nt));
 		si->last_access_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	/*
 	 * If we just modified the standard information attribute we need to
@@ -2555,15 +3047,18 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	 * record will be cleaned and written out to disk below, i.e. before
 	 * this function returns.
 	 */
-	if (modified && !NInoTestSetDirty(ctx->ntfs_ino))
-		mark_ntfs_record_dirty(ctx->ntfs_ino->page,
-				ctx->ntfs_ino->page_ofs);
+	if (modified) {
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		if (!NInoTestSetDirty(ctx->ntfs_ino))
+			mark_ntfs_record_dirty(ctx->ntfs_ino->page,
+					ctx->ntfs_ino->page_ofs);
+	}
 	ntfs_attr_put_search_ctx(ctx);
 	/* Now the access times are updated, write the base mft record. */
 	if (NInoDirty(ni))
 		err = write_mft_record(ni, m, sync);
 	/* Write all attached extent mft records. */
-	down(&ni->extent_lock);
+	mutex_lock(&ni->extent_lock);
 	if (ni->nr_extents > 0) {
 		ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
 		int i;
@@ -2590,7 +3085,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 			}
 		}
 	}
-	up(&ni->extent_lock);
+	mutex_unlock(&ni->extent_lock);
 	unmap_mft_record(ni);
 	if (unlikely(err))
 		goto err_out;
@@ -2605,9 +3100,7 @@ err_out:
 				"retries later.");
 		mark_inode_dirty(vi);
 	} else {
-		ntfs_error(vi->i_sb, "Failed (error code %i):  Marking inode "
-				"as bad.  You should run chkdsk.", -err);
-		make_bad_inode(vi);
+		ntfs_error(vi->i_sb, "Failed (error %i):  Run chkdsk.", -err);
 		NVolSetErrors(ni->vol);
 	}
 	return err;
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 99580455f2e..76b6cfb579d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -2,7 +2,7 @@
  * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
  *	     the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -24,12 +24,13 @@
 #ifndef _LINUX_NTFS_INODE_H
 #define _LINUX_NTFS_INODE_H
 
-#include <linux/mm.h>
+#include <linux/atomic.h>
+
 #include <linux/fs.h>
-#include <linux/seq_file.h>
 #include <linux/list.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
 
 #include "layout.h"
 #include "volume.h"
@@ -44,6 +45,7 @@ typedef struct _ntfs_inode ntfs_inode;
  * fields already provided in the VFS inode.
  */
 struct _ntfs_inode {
+	rwlock_t size_lock;	/* Lock serializing access to inode sizes. */
 	s64 initialized_size;	/* Copy from the attribute record. */
 	s64 allocated_size;	/* Copy from the attribute record. */
 	unsigned long state;	/* NTFS specific flags describing this inode.
@@ -80,7 +82,7 @@ struct _ntfs_inode {
 	 * The following fields are only valid for real inodes and extent
 	 * inodes.
 	 */
-	struct semaphore mrec_lock; /* Lock for serializing access to the
+	struct mutex mrec_lock;	/* Lock for serializing access to the
 				   mft record belonging to this inode. */
 	struct page *page;	/* The page containing the mft record of the
 				   inode. This should only be touched by the
@@ -99,8 +101,6 @@ struct _ntfs_inode {
 	runlist attr_list_rl;	/* Run list for the attribute list value. */
 	union {
 		struct { /* It is a directory, $MFT, or an index inode. */
-			struct inode *bmp_ino;	/* Attribute inode for the
-						   index $BITMAP. */
 			u32 block_size;		/* Size of an index block. */
 			u32 vcn_size;		/* Size of a vcn in this
 						   index. */
@@ -109,7 +109,7 @@ struct _ntfs_inode {
 			u8 block_size_bits; 	/* Log2 of the above. */
 			u8 vcn_size_bits;	/* Log2 of the above. */
 		} index;
-		struct { /* It is a compressed file or an attribute inode. */
+		struct { /* It is a compressed/sparse file/attribute inode. */
 			s64 size;		/* Copy of compressed_size from
 						   $DATA. */
 			u32 block_size;		/* Size of a compression block
@@ -118,7 +118,7 @@ struct _ntfs_inode {
 			u8 block_clusters;	/* Number of clusters per cb. */
 		} compressed;
 	} itype;
-	struct semaphore extent_lock;	/* Lock for accessing/modifying the
+	struct mutex extent_lock;	/* Lock for accessing/modifying the
 					   below . */
 	s32 nr_extents;	/* For a base mft record, the number of attached extent
 			   inodes (0 if none), for extent records and for fake
@@ -165,6 +165,7 @@ typedef enum {
 	NI_Sparse,		/* 1: Unnamed data attr is sparse (f).
 				   1: Create sparse files by default (d).
 				   1: Attribute is sparse (a). */
+	NI_SparseDisabled,	/* 1: May not create sparse regions. */
 	NI_TruncateFailed,	/* 1: Last ntfs_truncate() call failed. */
 } ntfs_inode_state_bits;
 
@@ -217,6 +218,7 @@ NINO_FNS(IndexAllocPresent)
 NINO_FNS(Compressed)
 NINO_FNS(Encrypted)
 NINO_FNS(Sparse)
+NINO_FNS(SparseDisabled)
 NINO_FNS(TruncateFailed)
 
 /*
@@ -277,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
 
 extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
 extern void ntfs_destroy_big_inode(struct inode *inode);
-extern void ntfs_clear_big_inode(struct inode *vi);
+extern void ntfs_evict_big_inode(struct inode *vi);
 
 extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
 
@@ -296,9 +298,7 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni);
 
 extern int ntfs_read_inode_mount(struct inode *vi);
 
-extern void ntfs_put_inode(struct inode *vi);
-
-extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt);
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
 
 #ifdef NTFS_RW
 
@@ -307,15 +307,19 @@ extern void ntfs_truncate_vfs(struct inode *vi);
 
 extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
 
-extern int ntfs_write_inode(struct inode *vi, int sync);
+extern int __ntfs_write_inode(struct inode *vi, int sync);
 
 static inline void ntfs_commit_inode(struct inode *vi)
 {
 	if (!is_bad_inode(vi))
-		ntfs_write_inode(vi, 1);
+		__ntfs_write_inode(vi, 1);
 	return;
 }
 
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
 #endif /* NTFS_RW */
 
 #endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 47b33899992..809c0e6d8e0 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -2,7 +2,7 @@
  * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
  *	      project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -31,19 +31,8 @@
 
 #include "types.h"
 
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x)	__constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x)	__constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x)	__constant_le64_to_cpu(x)
-
-#define const_cpu_to_le16(x)	__constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x)	__constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x)	__constant_cpu_to_le64(x)
-
 /* The NTFS oem_id "NTFS    " */
-#define magicNTFS	const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
 
 /*
  * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
  */
 enum {
 	/* Found in $MFT/$DATA. */
-	magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
 
 	/* Found in $LogFile/$DATA. */
-	magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x424b4843), /* Modified by chkdsk. */
+	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
-	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
 						       transfer was detected. */
 	/*
 	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
 	 * thus not initialized.  Page must be initialized before using it.
 	 */
-	magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 
 typedef le32 NTFS_RECORD_TYPE;
@@ -142,13 +131,13 @@ typedef le32 NTFS_RECORD_TYPE;
  * operator! (-8
  */
 
-static inline BOOL __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
+static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
 {
 	return (x == r);
 }
 #define ntfs_is_magic(x, m)	__ntfs_is_magic(x, magic_##m)
 
-static inline BOOL __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
+static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
 {
 	return (*p == r);
 }
@@ -258,8 +247,8 @@ typedef enum {
  * information about the mft record in which they are present.
  */
 enum {
-	MFT_RECORD_IN_USE	= const_cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
+	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 
 typedef le16 MFT_RECORD_FLAGS;
@@ -297,7 +286,7 @@ typedef le16 MFT_RECORD_FLAGS;
  * fragmented. Volume free space includes the empty part of the mft zone and
  * when the volume's free 88% are used up, the mft zone is shrunk by a factor
  * of 2, thus making more space available for more files/data. This process is
- * repeated everytime there is no more free space except for the mft zone until
+ * repeated every time there is no more free space except for the mft zone until
  * there really is no more free space.
  */
 
@@ -308,10 +297,8 @@ typedef le16 MFT_RECORD_FLAGS;
  * The _LE versions are to be applied on little endian MFT_REFs.
  * Note: The _LE versions will return a CPU endian formatted value!
  */
-typedef enum {
-	MFT_REF_MASK_CPU	= 0x0000ffffffffffffULL,
-	MFT_REF_MASK_LE		= const_cpu_to_le64(0x0000ffffffffffffULL),
-} MFT_REF_CONSTS;
+#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -325,7 +312,7 @@ typedef le64 leMFT_REF;
 #define MREF_LE(x)	((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
 #define MSEQNO_LE(x)	((u16)((le64_to_cpu(x) >> 48) & 0xffff))
 
-#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? 1 : 0)
+#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? true : false)
 #define ERR_MREF(x)	((u64)((s64)(x)))
 #define MREF_ERR(x)	((int)((s64)(x)))
 
@@ -479,25 +466,25 @@ typedef struct {
  * a revealing choice of symbol I do not know what is... (-;
  */
 enum {
-	AT_UNUSED			= const_cpu_to_le32(         0),
-	AT_STANDARD_INFORMATION		= const_cpu_to_le32(      0x10),
-	AT_ATTRIBUTE_LIST		= const_cpu_to_le32(      0x20),
-	AT_FILE_NAME			= const_cpu_to_le32(      0x30),
-	AT_OBJECT_ID			= const_cpu_to_le32(      0x40),
-	AT_SECURITY_DESCRIPTOR		= const_cpu_to_le32(      0x50),
-	AT_VOLUME_NAME			= const_cpu_to_le32(      0x60),
-	AT_VOLUME_INFORMATION		= const_cpu_to_le32(      0x70),
-	AT_DATA				= const_cpu_to_le32(      0x80),
-	AT_INDEX_ROOT			= const_cpu_to_le32(      0x90),
-	AT_INDEX_ALLOCATION		= const_cpu_to_le32(      0xa0),
-	AT_BITMAP			= const_cpu_to_le32(      0xb0),
-	AT_REPARSE_POINT		= const_cpu_to_le32(      0xc0),
-	AT_EA_INFORMATION		= const_cpu_to_le32(      0xd0),
-	AT_EA				= const_cpu_to_le32(      0xe0),
-	AT_PROPERTY_SET			= const_cpu_to_le32(      0xf0),
-	AT_LOGGED_UTILITY_STREAM	= const_cpu_to_le32(     0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE	= const_cpu_to_le32(    0x1000),
-	AT_END				= const_cpu_to_le32(0xffffffff)
+	AT_UNUSED			= cpu_to_le32(         0),
+	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
+	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
+	AT_FILE_NAME			= cpu_to_le32(      0x30),
+	AT_OBJECT_ID			= cpu_to_le32(      0x40),
+	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
+	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
+	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
+	AT_DATA				= cpu_to_le32(      0x80),
+	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
+	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
+	AT_BITMAP			= cpu_to_le32(      0xb0),
+	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
+	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
+	AT_EA				= cpu_to_le32(      0xe0),
+	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
+	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
+	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
+	AT_END				= cpu_to_le32(0xffffffff)
 };
 
 typedef le32 ATTR_TYPE;
@@ -541,32 +528,50 @@ typedef le32 ATTR_TYPE;
  *	equal then the second le32 values would be compared, etc.
  */
 enum {
-	COLLATION_BINARY		= const_cpu_to_le32(0x00),
-	COLLATION_FILE_NAME		= const_cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING	= const_cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG		= const_cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID		= const_cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH	= const_cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS		= const_cpu_to_le32(0x13)
+	COLLATION_BINARY		= cpu_to_le32(0x00),
+	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
+	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
+	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
+	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
+	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
+	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
 };
 
 typedef le32 COLLATION_RULE;
 
 /*
  * The flags (32-bit) describing attribute properties in the attribute
- * definition structure.  FIXME: This information is from Regis's information
- * and, according to him, it is not certain and probably incomplete.
- * The INDEXABLE flag is fairly certainly correct as only the file name
- * attribute has this flag set and this is the only attribute indexed in NT4.
+ * definition structure.  FIXME: This information is based on Regis's
+ * information and, according to him, it is not certain and probably
+ * incomplete.  The INDEXABLE flag is fairly certainly correct as only the file
+ * name attribute has this flag set and this is the only attribute indexed in
+ * NT4.
  */
 enum {
-	INDEXABLE	    = const_cpu_to_le32(0x02), /* Attribute can be
-							  indexed. */
-	NEED_TO_REGENERATE  = const_cpu_to_le32(0x40), /* Need to regenerate
-							  during regeneration
-							  phase. */
-	CAN_BE_NON_RESIDENT = const_cpu_to_le32(0x80), /* Attribute can be
-							  non-resident. */
+	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
+					indexed. */
+	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
+					can be present multiple times in the
+					mft records of an inode. */
+	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
+					must contain at least one non-zero
+					byte. */
+	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
+					indexed and the attribute value must be
+					unique for the attribute type in all of
+					the mft records of an inode. */
+	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
+					named and the name must be unique for
+					the attribute type in all of the mft
+					records of an inode. */
+	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
+					resident. */
+	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
+					modifications to this attribute,
+					regardless of whether it is resident or
+					non-resident.  Without this, only log
+					modifications if the attribute is
+					resident. */
 };
 
 typedef le32 ATTR_DEF_FLAGS;
@@ -598,12 +603,12 @@ typedef struct {
  * Attribute flags (16-bit).
  */
 enum {
-	ATTR_IS_COMPRESSED    = const_cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
+	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
 							      mask.  Also, first
 							      illegal value. */
-	ATTR_IS_ENCRYPTED     = const_cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE	      = const_cpu_to_le16(0x8000),
+	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
+	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 
 typedef le16 ATTR_FLAGS;
@@ -749,10 +754,11 @@ typedef struct {
 				record header aligned to 8-byte boundary. */
 /* 34*/			u8 compression_unit; /* The compression unit expressed
 				as the log to the base 2 of the number of
-				clusters in a compression unit. 0 means not
-				compressed. (This effectively limits the
+				clusters in a compression unit.  0 means not
+				compressed.  (This effectively limits the
 				compression unit size to be a power of two
-				clusters.) WinNT4 only uses a value of 4. */
+				clusters.)  WinNT4 only uses a value of 4.
+				Sparse files have this set to 0 on XPSP2. */
 /* 35*/			u8 reserved[5];		/* Align to 8-byte boundary. */
 /* The sizes below are only used when lowest_vcn is zero, as otherwise it would
    be difficult to keep them up-to-date.*/
@@ -772,10 +778,10 @@ typedef struct {
 				data_size. */
 /* sizeof(uncompressed attr) = 64*/
 /* 64*/			sle64 compressed_size;	/* Byte size of the attribute
-				value after compression. Only present when
-				compressed. Always is a multiple of the
-				cluster size. Represents the actual amount of
-				disk space being used on the disk. */
+				value after compression.  Only present when
+				compressed or sparse.  Always is a multiple of
+				the cluster size.  Represents the actual amount
+				of disk space being used on the disk. */
 /* sizeof(compressed attr) = 72*/
 		} __attribute__ ((__packed__)) non_resident;
 	} __attribute__ ((__packed__)) data;
@@ -784,57 +790,60 @@ typedef struct {
 typedef ATTR_RECORD ATTR_REC;
 
 /*
- * File attribute flags (32-bit).
+ * File attribute flags (32-bit) appearing in the file_attributes fields of the
+ * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
+ * attributes of MFT_RECORDs and directory index entries.
+ *
+ * All of the below flags appear in the directory index entries but only some
+ * appear in the STANDARD_INFORMATION attribute whilst only some others appear
+ * in the FILENAME_ATTR attribute of MFT_RECORDs.  Unless otherwise stated the
+ * flags appear in all of the above.
  */
 enum {
-	/*
-	 * The following flags are only present in the STANDARD_INFORMATION
-	 * attribute (in the field file_attributes).
-	 */
-	FILE_ATTR_READONLY		= const_cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN		= const_cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM		= const_cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT.	= const_cpu_to_le32(0x00000008), */
+	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
+	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
+	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
+	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
 
-	FILE_ATTR_DIRECTORY		= const_cpu_to_le32(0x00000010),
+	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
 	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
 	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE		= const_cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE		= const_cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL		= const_cpu_to_le32(0x00000080),
+	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
+	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
+	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
 
-	FILE_ATTR_TEMPORARY		= const_cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE		= const_cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT		= const_cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED		= const_cpu_to_le32(0x00000800),
+	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
+	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
+	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
+	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
 
-	FILE_ATTR_OFFLINE		= const_cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED	= const_cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED		= const_cpu_to_le32(0x00004000),
+	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
+	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
+	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
 
-	FILE_ATTR_VALID_FLAGS		= const_cpu_to_le32(0x00007fb7),
+	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
 	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
 	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
 	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS	= const_cpu_to_le32(0x000031a7),
+	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
 	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
-	   is used to to obtain all flags that are valid for setting. */
-
+	   is used to obtain all flags that are valid for setting. */
 	/*
-	 * The following flags are only present in the FILE_NAME attribute (in
-	 * the field file_attributes).
+	 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
+	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
+	 * attribute of an mft record.
 	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= const_cpu_to_le32(0x10000000),
+	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= const_cpu_to_le32(0x20000000),
+	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
 	   index, quota index, one of the security indexes or the encrypting
-	   file system related indexes). */
+	   filesystem related indexes). */
 };
 
 typedef le32 FILE_ATTR_FLAGS;
@@ -870,7 +879,7 @@ typedef struct {
 					   Windows this is only updated when
 					   accessed if some time delta has
 					   passed since the last update. Also,
-					   last access times updates can be
+					   last access time updates can be
 					   disabled altogether for speed. */
 /* 32*/	FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
 /* 36*/	union {
@@ -917,20 +926,12 @@ typedef struct {
 		/* 56*/	le64 quota_charged;	/* Byte size of the charge to
 				the quota for all streams of the file. Note: Is
 				zero if quotas are disabled. */
-		/* 64*/	le64 usn;		/* Last update sequence number
-				of the file. This is a direct index into the
-				change (aka usn) journal file. It is zero if
-				the usn journal is disabled.
-				NOTE: To disable the journal need to delete
-				the journal file itself and to then walk the
-				whole mft and set all Usn entries in all mft
-				records to zero! (This can take a while!)
-				The journal is FILE_Extend/$UsnJrnl. Win2k
-				will recreate the journal and initiate
-				logging if necessary when mounting the
-				partition. This, in contrast to disabling the
-				journal is a very fast process, so the user
-				won't even notice it. */
+		/* 64*/	leUSN usn;		/* Last update sequence number
+				of the file.  This is a direct index into the
+				transaction log file ($UsnJrnl).  It is zero if
+				the usn journal is disabled or this file has
+				not been subject to logging yet.  See usnjrnl.h
+				for details. */
 		} __attribute__ ((__packed__)) v3;
 	/* sizeof() = 72 bytes (NTFS 3.x) */
 	} __attribute__ ((__packed__)) ver;
@@ -1012,10 +1013,17 @@ enum {
 	FILE_NAME_POSIX		= 0x00,
 	/* This is the largest namespace. It is case sensitive and allows all
 	   Unicode characters except for: '\0' and '/'.  Beware that in
-	   WinNT/2k files which eg have the same name except for their case
-	   will not be distinguished by the standard utilities and thus a "del
-	   filename" will delete both "filename" and "fileName" without
-	   warning. */
+	   WinNT/2k/2003 by default files which eg have the same name except
+	   for their case will not be distinguished by the standard utilities
+	   and thus a "del filename" will delete both "filename" and "fileName"
+	   without warning.  However if for example Services For Unix (SFU) are
+	   installed and the case sensitive option was enabled at installation
+	   time, then you can create/access/delete such files.
+	   Note that even SFU places restrictions on the filenames beyond the
+	   '\0' and '/' and in particular the following set of characters is
+	   not allowed: '"', '/', '<', '>', '\'.  All other characters,
+	   including the ones no allowed in WIN32 namespace are allowed.
+	   Tested with SFU 3.5 (this is now free) running on Windows XP. */
 	FILE_NAME_WIN32		= 0x01,
 	/* The standard WinNT/2k NTFS long filenames. Case insensitive.  All
 	   Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
@@ -1055,11 +1063,22 @@ typedef struct {
 					   modified. */
 /* 20*/	sle64 last_access_time;		/* Time this mft record was last
 					   accessed. */
-/* 28*/	sle64 allocated_size;		/* Byte size of allocated space for the
-					   data attribute. NOTE: Is a multiple
-					   of the cluster size. */
-/* 30*/	sle64 data_size;		/* Byte size of actual data in data
-					   attribute. */
+/* 28*/	sle64 allocated_size;		/* Byte size of on-disk allocated space
+					   for the unnamed data attribute.  So
+					   for normal $DATA, this is the
+					   allocated_size from the unnamed
+					   $DATA attribute and for compressed
+					   and/or sparse $DATA, this is the
+					   compressed_size from the unnamed
+					   $DATA attribute.  For a directory or
+					   other inode without an unnamed $DATA
+					   attribute, this is always 0.  NOTE:
+					   This is a multiple of the cluster
+					   size. */
+/* 30*/	sle64 data_size;		/* Byte size of actual data in unnamed
+					   data attribute.  For a directory or
+					   other inode without an unnamed $DATA
+					   attribute, this is always 0. */
 /* 38*/	FILE_ATTR_FLAGS file_attributes;	/* Flags describing the file. */
 /* 3c*/	union {
 	/* 3c*/	struct {
@@ -1416,42 +1435,42 @@ enum {
 	/* Specific rights for files and directories are as follows: */
 
 	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA			= const_cpu_to_le32(0x00000001),
+	FILE_READ_DATA			= cpu_to_le32(0x00000001),
 	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY		= const_cpu_to_le32(0x00000001),
+	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
 
 	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA			= const_cpu_to_le32(0x00000002),
+	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
 	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE			= const_cpu_to_le32(0x00000002),
+	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
 
 	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA		= const_cpu_to_le32(0x00000004),
+	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
 	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY		= const_cpu_to_le32(0x00000004),
+	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
 
 	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA			= const_cpu_to_le32(0x00000008),
+	FILE_READ_EA			= cpu_to_le32(0x00000008),
 
 	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA			= const_cpu_to_le32(0x00000010),
+	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
 
 	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE			= const_cpu_to_le32(0x00000020),
+	FILE_EXECUTE			= cpu_to_le32(0x00000020),
 	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE			= const_cpu_to_le32(0x00000020),
+	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
 
 	/*
 	 * Right to delete a directory and all the files it contains (its
 	 * children), even if the files are read-only. (DIRECTORY)
 	 */
-	FILE_DELETE_CHILD		= const_cpu_to_le32(0x00000040),
+	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
 
 	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES		= const_cpu_to_le32(0x00000080),
+	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
 
 	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES		= const_cpu_to_le32(0x00000100),
+	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
 
 	/*
 	 * The standard rights (bits 16 to 23).  These are independent of the
@@ -1459,27 +1478,27 @@ enum {
 	 */
 
 	/* Right to delete the object. */
-	DELETE				= const_cpu_to_le32(0x00010000),
+	DELETE				= cpu_to_le32(0x00010000),
 
 	/*
 	 * Right to read the information in the object's security descriptor,
 	 * not including the information in the SACL, i.e. right to read the
 	 * security descriptor and owner.
 	 */
-	READ_CONTROL			= const_cpu_to_le32(0x00020000),
+	READ_CONTROL			= cpu_to_le32(0x00020000),
 
 	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC			= const_cpu_to_le32(0x00040000),
+	WRITE_DAC			= cpu_to_le32(0x00040000),
 
 	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER			= const_cpu_to_le32(0x00080000),
+	WRITE_OWNER			= cpu_to_le32(0x00080000),
 
 	/*
 	 * Right to use the object for synchronization.  Enables a process to
 	 * wait until the object is in the signalled state.  Some object types
 	 * do not support this access right.
 	 */
-	SYNCHRONIZE			= const_cpu_to_le32(0x00100000),
+	SYNCHRONIZE			= cpu_to_le32(0x00100000),
 
 	/*
 	 * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1487,25 +1506,25 @@ enum {
 	 */
 
 	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE		= const_cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE		= const_cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
+	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
 
 	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED	= const_cpu_to_le32(0x000f0000),
+	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
 
 	/*
 	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
 	 * SYNCHRONIZE access.
 	 */
-	STANDARD_RIGHTS_ALL		= const_cpu_to_le32(0x001f0000),
+	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
 
 	/*
 	 * The access system ACL and maximum allowed access types (bits 24 to
 	 * 25, bits 26 to 27 are reserved).
 	 */
-	ACCESS_SYSTEM_SECURITY		= const_cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED			= const_cpu_to_le32(0x02000000),
+	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
+	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
 
 	/*
 	 * The generic rights (bits 28 to 31).  These map onto the standard and
@@ -1513,10 +1532,10 @@ enum {
 	 */
 
 	/* Read, write, and execute access. */
-	GENERIC_ALL			= const_cpu_to_le32(0x10000000),
+	GENERIC_ALL			= cpu_to_le32(0x10000000),
 
 	/* Execute access. */
-	GENERIC_EXECUTE			= const_cpu_to_le32(0x20000000),
+	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
 
 	/*
 	 * Write access.  For files, this maps onto:
@@ -1525,7 +1544,7 @@ enum {
 	 * For directories, the mapping has the same numerical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_WRITE			= const_cpu_to_le32(0x40000000),
+	GENERIC_WRITE			= cpu_to_le32(0x40000000),
 
 	/*
 	 * Read access.  For files, this maps onto:
@@ -1534,7 +1553,7 @@ enum {
 	 * For directories, the mapping has the same numberical value.  See
 	 * above for the descriptions of the rights granted.
 	 */
-	GENERIC_READ			= const_cpu_to_le32(0x80000000),
+	GENERIC_READ			= cpu_to_le32(0x80000000),
 };
 
 typedef le32 ACCESS_MASK;
@@ -1574,8 +1593,8 @@ typedef struct {
  * The object ACE flags (32-bit).
  */
 enum {
-	ACE_OBJECT_TYPE_PRESENT			= const_cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT	= const_cpu_to_le32(2),
+	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
+	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
 };
 
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1638,13 +1657,13 @@ typedef enum {
  *	pointed to by the Owner field was provided by a defaulting mechanism
  *	rather than explicitly provided by the original provider of the
  *	security descriptor.  This may affect the treatment of the SID with
- *	respect to inheritence of an owner.
+ *	respect to inheritance of an owner.
  *
  * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
  *	the Group field was provided by a defaulting mechanism rather than
  *	explicitly provided by the original provider of the security
  *	descriptor.  This may affect the treatment of the SID with respect to
- *	inheritence of a primary group.
+ *	inheritance of a primary group.
  *
  * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
  *	descriptor contains a discretionary ACL.  If this flag is set and the
@@ -1655,7 +1674,7 @@ typedef enum {
  *	pointed to by the Dacl field was provided by a defaulting mechanism
  *	rather than explicitly provided by the original provider of the
  *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritence of an ACL.  This flag is ignored if the
+ *	respect to inheritance of an ACL.  This flag is ignored if the
  *	DaclPresent flag is not set.
  *
  * SE_SACL_PRESENT - This boolean flag, when set,  indicates that the security
@@ -1667,7 +1686,7 @@ typedef enum {
  *	pointed to by the Sacl field was provided by a defaulting mechanism
  *	rather than explicitly provided by the original provider of the
  *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritence of an ACL.  This flag is ignored if the
+ *	respect to inheritance of an ACL.  This flag is ignored if the
  *	SaclPresent flag is not set.
  *
  * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
@@ -1676,23 +1695,23 @@ typedef enum {
  *	expressed as offsets from the beginning of the security descriptor.
  */
 enum {
-	SE_OWNER_DEFAULTED		= const_cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED		= const_cpu_to_le16(0x0002),
-	SE_DACL_PRESENT			= const_cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED		= const_cpu_to_le16(0x0008),
-
-	SE_SACL_PRESENT			= const_cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED		= const_cpu_to_le16(0x0020),
-
-	SE_DACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ	= const_cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED		= const_cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED		= const_cpu_to_le16(0x0800),
-
-	SE_DACL_PROTECTED		= const_cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED		= const_cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID		= const_cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE		= const_cpu_to_le16(0x8000)
+	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
+	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
+	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
+	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
+
+	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
+	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
+
+	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
+	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
+	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
+	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
+
+	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
+	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
+	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
+	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1880,20 +1899,21 @@ typedef struct {
  * Possible flags for the volume (16-bit).
  */
 enum {
-	VOLUME_IS_DIRTY			= const_cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE		= const_cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT		= const_cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4		= const_cpu_to_le16(0x0008),
+	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
+	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
+	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
+	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
 
-	VOLUME_DELETE_USN_UNDERWAY	= const_cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID		= const_cpu_to_le16(0x0020),
+	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
+	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
 
-	VOLUME_MODIFIED_BY_CHKDSK	= const_cpu_to_le16(0x8000),
+	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
+	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK		= const_cpu_to_le16(0x803f),
+	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= const_cpu_to_le16(0x8037),
+	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
@@ -1988,14 +2008,14 @@ typedef struct {
  *
  * When a directory is small enough to fit inside the index root then this
  * is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two aditional attributes
+ * large to fit in the index root, on the other hand, two additional attributes
  * are present: an index allocation attribute, containing sub-nodes of the B+
  * directory tree (see below), and a bitmap attribute, describing which virtual
  * cluster numbers (vcns) in the index allocation attribute are in use by an
  * index block.
  *
  * NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * dircetories do not contain entries for themselves, though.
+ * directories do not contain entries for themselves, though.
  */
 typedef struct {
 	ATTR_TYPE type;			/* Type of the indexed attribute. Is
@@ -2078,26 +2098,26 @@ typedef struct {
  * The user quota flags.  Names explain meaning.
  */
 enum {
-	QUOTA_FLAG_DEFAULT_LIMITS	= const_cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED	= const_cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED		= const_cpu_to_le32(0x00000004),
+	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
+	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
+	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
 
-	QUOTA_FLAG_USER_MASK		= const_cpu_to_le32(0x00000007),
+	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
 	/* This is a bit mask for the user quota flags. */
 
 	/*
 	 * These flags are only present in the quota defaults index entry, i.e.
 	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
 	 */
-	QUOTA_FLAG_TRACKING_ENABLED	= const_cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED	= const_cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED	= const_cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD	= const_cpu_to_le32(0x00000080),
-
-	QUOTA_FLAG_LOG_LIMIT		= const_cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE		= const_cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT		= const_cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES	= const_cpu_to_le32(0x00000800),
+	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
+	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
+	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
+	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
+
+	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
+	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
+	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
+	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
 };
 
 typedef le32 QUOTA_FLAGS;
@@ -2141,9 +2161,9 @@ typedef struct {
  * Predefined owner_id values (32-bit).
  */
 enum {
-	QUOTA_INVALID_ID	= const_cpu_to_le32(0x00000000),
-	QUOTA_DEFAULTS_ID	= const_cpu_to_le32(0x00000001),
-	QUOTA_FIRST_USER_ID	= const_cpu_to_le32(0x00000100),
+	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
+	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
+	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
 };
 
 /*
@@ -2158,14 +2178,14 @@ typedef enum {
  * Index entry flags (16-bit).
  */
 enum {
-	INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
 			sub-node, i.e. a reference to an index block in form of
 			a virtual cluster number (see below). */
-	INDEX_ENTRY_END  = const_cpu_to_le16(2), /* This signifies the last
+	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
 			entry in an index block.  The index entry does not
 			represent a file but it can point to a sub-node. */
 
-	INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
 			enum bit width to 16-bit. */
 } __attribute__ ((__packed__));
 
@@ -2263,7 +2283,7 @@ typedef struct {
 	//		   the key_length is zero, then the vcn immediately
 	//		   follows the INDEX_ENTRY_HEADER. Regardless of
 	//		   key_length, the address of the 8-byte boundary
-	//		   alligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
+	//		   aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
 	//		   (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
 	//		   where sizeof(VCN) can be hardcoded as 8 if wanted. */
 } __attribute__ ((__packed__)) INDEX_ENTRY;
@@ -2303,26 +2323,26 @@ typedef struct {
  * These are the predefined reparse point tags:
  */
 enum {
-	IO_REPARSE_TAG_IS_ALIAS		= const_cpu_to_le32(0x20000000),
-	IO_REPARSE_TAG_IS_HIGH_LATENCY	= const_cpu_to_le32(0x40000000),
-	IO_REPARSE_TAG_IS_MICROSOFT	= const_cpu_to_le32(0x80000000),
+	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
+	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
+	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
 
-	IO_REPARSE_TAG_RESERVED_ZERO	= const_cpu_to_le32(0x00000000),
-	IO_REPARSE_TAG_RESERVED_ONE	= const_cpu_to_le32(0x00000001),
-	IO_REPARSE_TAG_RESERVED_RANGE	= const_cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
+	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
+	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
 
-	IO_REPARSE_TAG_NSS		= const_cpu_to_le32(0x68000005),
-	IO_REPARSE_TAG_NSS_RECOVER	= const_cpu_to_le32(0x68000006),
-	IO_REPARSE_TAG_SIS		= const_cpu_to_le32(0x68000007),
-	IO_REPARSE_TAG_DFS		= const_cpu_to_le32(0x68000008),
+	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
+	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
+	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
+	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
 
-	IO_REPARSE_TAG_MOUNT_POINT	= const_cpu_to_le32(0x88000003),
+	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
 
-	IO_REPARSE_TAG_HSM		= const_cpu_to_le32(0xa8000004),
+	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
 
-	IO_REPARSE_TAG_SYMBOLIC_LINK	= const_cpu_to_le32(0xe8000000),
+	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
 
-	IO_REPARSE_TAG_VALID_VALUES	= const_cpu_to_le32(0xe000ffff),
+	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
 };
 
 /*
@@ -2358,7 +2378,9 @@ typedef struct {
  * Extended attribute flags (8-bit).
  */
 enum {
-	NEED_EA	= 0x80
+	NEED_EA	= 0x80		/* If set the file to which the EA belongs
+				   cannot be interpreted without understanding
+				   the associates extended attributes. */
 } __attribute__ ((__packed__));
 
 typedef u8 EA_FLAGS;
@@ -2366,20 +2388,20 @@ typedef u8 EA_FLAGS;
 /*
  * Attribute: Extended attribute (EA) (0xe0).
  *
- * NOTE: Always non-resident. (Is this true?)
+ * NOTE: Can be resident or non-resident.
  *
  * Like the attribute list and the index buffer list, the EA attribute value is
  * a sequence of EA_ATTR variable length records.
- *
- * FIXME: It appears weird that the EA name is not unicode. Is it true?
  */
 typedef struct {
 	le32 next_entry_offset;	/* Offset to the next EA_ATTR. */
 	EA_FLAGS flags;		/* Flags describing the EA. */
-	u8 ea_name_length;	/* Length of the name of the EA in bytes. */
+	u8 ea_name_length;	/* Length of the name of the EA in bytes
+				   excluding the '\0' byte terminator. */
 	le16 ea_value_length;	/* Byte size of the EA's value. */
-	u8 ea_name[0];		/* Name of the EA. */
-	u8 ea_value[0];		/* The value of the EA. Immediately follows
+	u8 ea_name[0];		/* Name of the EA.  Note this is ASCII, not
+				   Unicode and it is zero terminated. */
+	u8 ea_value[0];		/* The value of the EA.  Immediately follows
 				   the name. */
 } __attribute__ ((__packed__)) EA_ATTR;
 
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 23fd911078b..1711b710b64 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -1,7 +1,7 @@
 /*
  * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -54,13 +54,15 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
 	int ret = 0;
 
 	ntfs_debug("Entering.");
+	if (!rl)
+		return 0;
 	for (; rl->length; rl++) {
 		int err;
 
 		if (rl->lcn < 0)
 			continue;
 		err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
-		if (unlikely(err && (!ret || ret == ENOMEM) && ret != err))
+		if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
 			ret = err;
 	}
 	ntfs_debug("Done.");
@@ -74,6 +76,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  * @count:	number of clusters to allocate
  * @start_lcn:	starting lcn at which to allocate the clusters (or -1 if none)
  * @zone:	zone from which to allocate the clusters
+ * @is_extension:	if 'true', this is an attribute extension
  *
  * Allocate @count clusters preferably starting at cluster @start_lcn or at the
  * current allocator position if @start_lcn is -1, on the mounted ntfs volume
@@ -84,6 +87,13 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  * @start_vcn specifies the vcn of the first allocated cluster.  This makes
  * merging the resulting runlist with the old runlist easier.
  *
+ * If @is_extension is 'true', the caller is allocating clusters to extend an
+ * attribute and if it is 'false', the caller is allocating clusters to fill a
+ * hole in an attribute.  Practically the difference is that if @is_extension
+ * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
+ * @is_extension is 'false' the runlist will be terminated with
+ * LCN_RL_NOT_MAPPED.
+ *
  * You need to check the return value with IS_ERR().  If this is false, the
  * function was successful and the return value is a runlist describing the
  * allocated cluster(s).  If IS_ERR() is true, the function failed and
@@ -135,11 +145,13 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  */
 runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 		const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone)
+		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+		const bool is_extension)
 {
 	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
 	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
 	s64 clusters;
+	loff_t i_size;
 	struct inode *lcnbmp_vi;
 	runlist_element *rl = NULL;
 	struct address_space *mapping;
@@ -162,17 +174,9 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 	BUG_ON(zone < FIRST_ZONE);
 	BUG_ON(zone > LAST_ZONE);
 
-	/* Return empty runlist if @count == 0 */
-	// FIXME: Do we want to just return NULL instead? (AIA)
-	if (!count) {
-		rl = ntfs_malloc_nofs(PAGE_SIZE);
-		if (!rl)
-			return ERR_PTR(-ENOMEM);
-		rl[0].vcn = start_vcn;
-		rl[0].lcn = LCN_RL_NOT_MAPPED;
-		rl[0].length = 0;
-		return rl;
-	}
+	/* Return NULL if @count is zero. */
+	if (!count)
+		return NULL;
 	/* Take the lcnbmp lock for writing. */
 	down_write(&vol->lcnbmp_lock);
 	/*
@@ -249,6 +253,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 	clusters = count;
 	rlpos = rlsize = 0;
 	mapping = lcnbmp_vi->i_mapping;
+	i_size = i_size_read(lcnbmp_vi);
 	while (1) {
 		ntfs_debug("Start of outer while loop: done_zones 0x%x, "
 				"search_zone %i, pass %i, zone_start 0x%llx, "
@@ -263,7 +268,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 		last_read_pos = bmp_pos >> 3;
 		ntfs_debug("last_read_pos 0x%llx.",
 				(unsigned long long)last_read_pos);
-		if (last_read_pos > lcnbmp_vi->i_size) {
+		if (last_read_pos > i_size) {
 			ntfs_debug("End of attribute reached.  "
 					"Skipping to zone_pass_done.");
 			goto zone_pass_done;
@@ -287,11 +292,11 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 		buf_size = last_read_pos & ~PAGE_CACHE_MASK;
 		buf = page_address(page) + buf_size;
 		buf_size = PAGE_CACHE_SIZE - buf_size;
-		if (unlikely(last_read_pos + buf_size > lcnbmp_vi->i_size))
-			buf_size = lcnbmp_vi->i_size - last_read_pos;
+		if (unlikely(last_read_pos + buf_size > i_size))
+			buf_size = i_size - last_read_pos;
 		buf_size <<= 3;
 		lcn = bmp_pos & 7;
-		bmp_pos &= ~7;
+		bmp_pos &= ~(LCN)7;
 		ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
 				"bmp_pos 0x%llx, need_writeback %i.", buf_size,
 				(unsigned long long)lcn,
@@ -309,12 +314,12 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 					(unsigned int)*byte);
 			/* Skip full bytes. */
 			if (*byte == 0xff) {
-				lcn = (lcn + 8) & ~7;
+				lcn = (lcn + 8) & ~(LCN)7;
 				ntfs_debug("Continuing while loop 1.");
 				continue;
 			}
 			bit = 1 << (lcn & 7);
-			ntfs_debug("bit %i.", bit);
+			ntfs_debug("bit 0x%x.", bit);
 			/* If the bit is already set, go onto the next one. */
 			if (*byte & bit) {
 				lcn++;
@@ -691,7 +696,7 @@ switch_to_data1_zone:		search_zone = 2;
 		if (zone == MFT_ZONE || mft_zone_size <= 0) {
 			ntfs_debug("No free clusters left, going to out.");
 			/* Really no more space left on device. */
-			err = ENOSPC;
+			err = -ENOSPC;
 			goto out;
 		} /* zone == DATA_ZONE && mft_zone_size > 0 */
 		ntfs_debug("Shrinking mft zone.");
@@ -733,7 +738,7 @@ out:
 	/* Add runlist terminator element. */
 	if (likely(rl)) {
 		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
-		rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
 		rl[rlpos].length = 0;
 	}
 	if (likely(page && !IS_ERR(page))) {
@@ -755,13 +760,13 @@ out:
 	if (rl) {
 		int err2;
 
-		if (err == ENOSPC)
+		if (err == -ENOSPC)
 			ntfs_debug("Not enough space to complete allocation, "
-					"err ENOSPC, first free lcn 0x%llx, "
+					"err -ENOSPC, first free lcn 0x%llx, "
 					"could allocate up to 0x%llx "
 					"clusters.",
 					(unsigned long long)rl[0].lcn,
-					(unsigned long long)count - clusters);
+					(unsigned long long)(count - clusters));
 		/* Deallocate all allocated clusters. */
 		ntfs_debug("Attempting rollback...");
 		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
@@ -773,62 +778,88 @@ out:
 		}
 		/* Free the runlist. */
 		ntfs_free(rl);
-	} else if (err == ENOSPC)
-		ntfs_debug("No space left at all, err = ENOSPC, "
-				"first free lcn = 0x%llx.",
-				(unsigned long long)vol->data1_zone_pos);
+	} else if (err == -ENOSPC)
+		ntfs_debug("No space left at all, err = -ENOSPC, first free "
+				"lcn = 0x%llx.",
+				(long long)vol->data1_zone_pos);
 	up_write(&vol->lcnbmp_lock);
 	return ERR_PTR(err);
 }
 
 /**
  * __ntfs_cluster_free - free clusters on an ntfs volume
- * @vi:		vfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
- * @is_rollback:	if TRUE this is a rollback operation
+ * @ctx:	active attribute search context if present or NULL if not
+ * @is_rollback:	true if this is a rollback operation
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @vi.
+ * described by the vfs inode @ni.
  *
  * If @count is -1, all clusters from @start_vcn to the end of the runlist are
  * deallocated.  Thus, to completely free all clusters in a runlist, use
  * @start_vcn = 0 and @count = -1.
  *
- * @is_rollback should always be FALSE, it is for internal use to rollback
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when __ntfs_cluster_free() encounters unmapped
+ * runlist fragments and allows their mapping.  If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
+ * perform the necessary mapping and unmapping.
+ *
+ * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning.  Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry.  However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
+ * you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
  * errors.  You probably want to use ntfs_cluster_free() instead.
  *
- * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
- * has to deal with it later.
+ * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
+ * remove from the runlist or mark sparse the freed runs later.
  *
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
  */
-s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
-		const BOOL is_rollback)
+s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
+		ntfs_attr_search_ctx *ctx, const bool is_rollback)
 {
 	s64 delta, to_free, total_freed, real_freed;
-	ntfs_inode *ni;
 	ntfs_volume *vol;
 	struct inode *lcnbmp_vi;
 	runlist_element *rl;
 	int err;
 
-	BUG_ON(!vi);
+	BUG_ON(!ni);
 	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
-			"0x%llx.%s", vi->i_ino, (unsigned long long)start_vcn,
+			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
 			(unsigned long long)count,
 			is_rollback ? " (rollback)" : "");
-	ni = NTFS_I(vi);
 	vol = ni->vol;
 	lcnbmp_vi = vol->lcnbmp_ino;
 	BUG_ON(!lcnbmp_vi);
@@ -846,8 +877,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 	total_freed = real_freed = 0;
 
-	/* This returns with ni->runlist locked for reading on success. */
-	rl = ntfs_find_vcn(ni, start_vcn, FALSE);
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -861,7 +891,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 			ntfs_error(vol->sb, "First runlist element has "
 					"invalid lcn, aborting.");
 		err = -EIO;
-		goto unl_err_out;
+		goto err_out;
 	}
 	/* Find the starting cluster inside the run that needs freeing. */
 	delta = start_vcn - rl->vcn;
@@ -879,7 +909,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 			if (!is_rollback)
 				ntfs_error(vol->sb, "Failed to clear first run "
 						"(error %i), aborting.", err);
-			goto unl_err_out;
+			goto err_out;
 		}
 		/* We have freed @to_free real clusters. */
 		real_freed = to_free;
@@ -899,30 +929,15 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 		if (unlikely(rl->lcn < LCN_HOLE)) {
 			VCN vcn;
 
-			/*
-			 * Attempt to map runlist, dropping runlist lock for
-			 * the duration.
-			 */
+			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			up_read(&ni->runlist.lock);
-			err = ntfs_map_runlist(ni, vcn);
-			if (err) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to map "
-							"runlist fragment.");
-				if (err == -EINVAL || err == -ENOENT)
-					err = -EIO;
-				goto err_out;
-			}
-			/*
-			 * This returns with ni->runlist locked for reading on
-			 * success.
-			 */
-			rl = ntfs_find_vcn(ni, vcn, FALSE);
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to find "
+					ntfs_error(vol->sb, "Failed to map "
+							"runlist fragment or "
+							"failed to find "
 							"subsequent runlist "
 							"element.");
 				goto err_out;
@@ -935,7 +950,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 							(unsigned long long)
 							rl->lcn);
 				err = -EIO;
-				goto unl_err_out;
+				goto err_out;
 			}
 		}
 		/* The number of clusters in this run that need freeing. */
@@ -951,7 +966,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 				if (!is_rollback)
 					ntfs_error(vol->sb, "Failed to clear "
 							"subsequent run.");
-				goto unl_err_out;
+				goto err_out;
 			}
 			/* We have freed @to_free real clusters. */
 			real_freed += to_free;
@@ -963,7 +978,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 		/* Update the total done clusters. */
 		total_freed += to_free;
 	}
-	up_read(&ni->runlist.lock);
 	if (likely(!is_rollback))
 		up_write(&vol->lcnbmp_lock);
 
@@ -972,8 +986,6 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 	/* We are done.  Return the number of actually freed clusters. */
 	ntfs_debug("Done.");
 	return real_freed;
-unl_err_out:
-	up_read(&ni->runlist.lock);
 err_out:
 	if (is_rollback)
 		return err;
@@ -987,7 +999,7 @@ err_out:
 	 * If rollback fails, set the volume errors flag, emit an error
 	 * message, and return the error code.
 	 */
-	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, TRUE);
+	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
 	if (delta < 0) {
 		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
 				"inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 4cac1c024af..2adb0431694 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -2,7 +2,7 @@
  * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
  *		Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -27,7 +27,9 @@
 
 #include <linux/fs.h>
 
+#include "attrib.h"
 #include "types.h"
+#include "inode.h"
 #include "runlist.h"
 #include "volume.h"
 
@@ -40,43 +42,72 @@ typedef enum {
 
 extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
 		const VCN start_vcn, const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone);
+		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+		const bool is_extension);
 
-extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count, const BOOL is_rollback);
+extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
 
 /**
  * ntfs_cluster_free - free clusters on an ntfs volume
- * @vi:		vfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
+ * @ctx:	active attribute search context if present or NULL if not
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @vi.
+ * described by the ntfs inode @ni.
  *
  * If @count is -1, all clusters from @start_vcn to the end of the runlist are
  * deallocated.  Thus, to completely free all clusters in a runlist, use
  * @start_vcn = 0 and @count = -1.
  *
- * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller
- * has to deal with it later.
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record.  This is needed when ntfs_cluster_free() encounters unmapped runlist
+ * fragments and allows their mapping.  If you do not have the mft record
+ * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
+ * the necessary mapping and unmapping.
+ *
+ * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning.  Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry.  However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
+ * you will probably want to do:
+ *	m = ctx->mrec;
+ *	a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
+ * from the runlist or mark sparse the freed runs later.
  *
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be unlocked on entry and is
- *	      unlocked on return.
- *	    - This function takes the runlist lock of @vi for reading and
- *	      sometimes for writing and sometimes modifies the runlist.
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ *	    is no longer valid, i.e. you need to either call
+ *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ *	    why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
+ *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ *	      entry and it will be left unmapped on return.
+ *	    - If @ctx is not NULL, the base mft record must be mapped on entry
+ *	      and it will be left mapped on return.
  */
-static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count)
+static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count, ntfs_attr_search_ctx *ctx)
 {
-	return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
+	return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
 }
 
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
@@ -93,8 +124,10 @@ extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  *
  * Return 0 on success and -errno on error.
  *
- * Locking: This function takes the volume lcn bitmap lock for writing and
- *	    modifies the bitmap contents.
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
+ *	      modifies the bitmap contents.
+ *	    - The caller must have locked the runlist @rl for reading or
+ *	      writing.
  */
 static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
 		const runlist_element *rl)
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 5e280abafab..c71de292c5a 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -1,7 +1,7 @@
 /*
  * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2002-2004 Anton Altaparmakov
+ * Copyright (c) 2002-2007 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/log2.h>
 
 #include "attrib.h"
 #include "aops.h"
@@ -41,17 +42,18 @@
  * @rp:		restart page header to check
  * @pos:	position in @vi at which the restart page header resides
  *
- * Check the restart page header @rp for consistency and return TRUE if it is
- * consistent and FALSE otherwise.
+ * Check the restart page header @rp for consistency and return 'true' if it is
+ * consistent and 'false' otherwise.
  *
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
  */
-static BOOL ntfs_check_restart_page_header(struct inode *vi,
+static bool ntfs_check_restart_page_header(struct inode *vi,
 		RESTART_PAGE_HEADER *rp, s64 pos)
 {
 	u32 logfile_system_page_size, logfile_log_page_size;
-	u16 usa_count, usa_ofs, usa_end, ra_ofs;
+	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
+	bool have_usa = true;
 
 	ntfs_debug("Entering.");
 	/*
@@ -64,9 +66,9 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 			logfile_log_page_size < NTFS_BLOCK_SIZE ||
 			logfile_system_page_size &
 			(logfile_system_page_size - 1) ||
-			logfile_log_page_size & (logfile_log_page_size - 1)) {
+			!is_power_of_2(logfile_log_page_size)) {
 		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * We must be either at !pos (1st restart page) or at pos = system page
@@ -75,7 +77,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	if (pos && pos != logfile_system_page_size) {
 		ntfs_error(vi->i_sb, "Found restart area in incorrect "
 				"position in $LogFile.");
-		return FALSE;
+		return false;
 	}
 	/* We only know how to handle version 1.1. */
 	if (sle16_to_cpu(rp->major_ver) != 1 ||
@@ -84,14 +86,22 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 				"supported.  (This driver supports version "
 				"1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
 				(int)sle16_to_cpu(rp->minor_ver));
-		return FALSE;
+		return false;
+	}
+	/*
+	 * If chkdsk has been run the restart page may not be protected by an
+	 * update sequence array.
+	 */
+	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
+		have_usa = false;
+		goto skip_usa_checks;
 	}
 	/* Verify the size of the update sequence array. */
 	usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
 	if (usa_count != le16_to_cpu(rp->usa_count)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent update sequence array count.");
-		return FALSE;
+		return false;
 	}
 	/* Verify the position of the update sequence array. */
 	usa_ofs = le16_to_cpu(rp->usa_ofs);
@@ -100,8 +110,9 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 			usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent update sequence array offset.");
-		return FALSE;
+		return false;
 	}
+skip_usa_checks:
 	/*
 	 * Verify the position of the restart area.  It must be:
 	 *	- aligned to 8-byte boundary,
@@ -109,11 +120,12 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	 *	- within the system page size.
 	 */
 	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	if (ra_ofs & 7 || ra_ofs < usa_end ||
+	if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
+			ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
 			ra_ofs > logfile_system_page_size) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent restart area offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
@@ -121,11 +133,11 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	 */
 	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-				"chkdsk but a chkdsk LSN is specified.");
-		return FALSE;
+				"by chkdsk but a chkdsk LSN is specified.");
+		return false;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -134,7 +146,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
  * @rp:		restart page whose restart area to check
  *
  * Check the restart area of the restart page @rp for consistency and return
- * TRUE if it is consistent and FALSE otherwise.
+ * 'true' if it is consistent and 'false' otherwise.
  *
  * This function assumes that the restart page header has already been
  * consistency checked.
@@ -142,7 +154,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
  */
-static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
+static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 {
 	u64 file_size;
 	RESTART_AREA *ra;
@@ -161,7 +173,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent file offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Now that we can access ra->client_array_offset, make sure everything
@@ -175,7 +187,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent client array offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * The restart area must end within the system page size both when
@@ -192,7 +204,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 				"of the system page size specified by the "
 				"restart page header and/or the specified "
 				"restart area length is inconsistent.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * The ra->client_free_list and ra->client_in_use_list must be either
@@ -207,7 +219,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			le16_to_cpu(ra->log_clients))) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"overflowing client free and/or in use lists.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Check ra->seq_number_bits against ra->file_size for consistency.
@@ -222,24 +234,24 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent sequence number bits.");
-		return FALSE;
+		return false;
 	}
 	/* The log record header length must be a multiple of 8. */
 	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
 			le16_to_cpu(ra->log_record_header_length)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent log record header length.");
-		return FALSE;
+		return false;
 	}
 	/* Dito for the log page data offset. */
 	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
 			le16_to_cpu(ra->log_page_data_offset)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent log page data offset.");
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -248,7 +260,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
  * @rp:		restart page whose log client array to check
  *
  * Check the log client array of the restart page @rp for consistency and
- * return TRUE if it is consistent and FALSE otherwise.
+ * return 'true' if it is consistent and 'false' otherwise.
  *
  * This function assumes that the restart page header and the restart area have
  * already been consistency checked.
@@ -257,13 +269,13 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
  * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
  * restart page and the page must be multi sector transfer deprotected.
  */
-static BOOL ntfs_check_log_client_array(struct inode *vi,
+static bool ntfs_check_log_client_array(struct inode *vi,
 		RESTART_PAGE_HEADER *rp)
 {
 	RESTART_AREA *ra;
 	LOG_CLIENT_RECORD *ca, *cr;
 	u16 nr_clients, idx;
-	BOOL in_free_list, idx_is_first;
+	bool in_free_list, idx_is_first;
 
 	ntfs_debug("Entering.");
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
@@ -279,9 +291,9 @@ static BOOL ntfs_check_log_client_array(struct inode *vi,
 	 */
 	nr_clients = le16_to_cpu(ra->log_clients);
 	idx = le16_to_cpu(ra->client_free_list);
-	in_free_list = TRUE;
+	in_free_list = true;
 check_list:
-	for (idx_is_first = TRUE; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
+	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
 			idx = le16_to_cpu(cr->next_client)) {
 		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
 			goto err_out;
@@ -291,20 +303,20 @@ check_list:
 		if (idx_is_first) {
 			if (cr->prev_client != LOGFILE_NO_CLIENT)
 				goto err_out;
-			idx_is_first = FALSE;
+			idx_is_first = false;
 		}
 	}
 	/* Switch to and check the in use list if we just did the free list. */
 	if (in_free_list) {
-		in_free_list = FALSE;
+		in_free_list = false;
 		idx = le16_to_cpu(ra->client_in_use_list);
 		goto check_list;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
 	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
-	return FALSE;
+	return false;
 }
 
 /**
@@ -312,10 +324,12 @@ err_out:
  * @vi:		$LogFile inode to which the restart page belongs
  * @rp:		restart page to check
  * @pos:	position in @vi at which the restart page resides
- * @wrp:	copy of the multi sector transfer deprotected restart page
+ * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn:	[OUT] set to the current logfile lsn on success
  *
- * Check the restart page @rp for consistency and return TRUE if it is
- * consistent and FALSE otherwise.
+ * Check the restart page @rp for consistency and return 0 if it is consistent
+ * and -errno otherwise.  The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
  *
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
@@ -323,25 +337,33 @@ err_out:
  * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
  * copy of the complete multi sector transfer deprotected page.  On failure,
  * *@wrp is undefined.
+ *
+ * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
+ * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ *	-EINVAL	- The restart page is inconsistent.
+ *	-ENOMEM	- Not enough memory to load the restart page.
+ *	-EIO	- Failed to reading from $LogFile.
  */
-static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp)
+static int ntfs_check_and_load_restart_page(struct inode *vi,
+		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+		LSN *lsn)
 {
 	RESTART_AREA *ra;
 	RESTART_PAGE_HEADER *trp;
-	int size;
-	BOOL ret;
+	int size, err;
 
 	ntfs_debug("Entering.");
 	/* Check the restart page header for consistency. */
 	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	/* Check the restart area for consistency. */
 	if (!ntfs_check_restart_area(vi, rp)) {
 		/* Error output already done inside the function. */
-		return FALSE;
+		return -EINVAL;
 	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
@@ -352,7 +374,7 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 	if (!trp) {
 		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
 				"restart page buffer.");
-		return FALSE;
+		return -ENOMEM;
 	}
 	/*
 	 * Read the whole of the restart page into the buffer.  If it fits
@@ -379,6 +401,9 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 			if (IS_ERR(page)) {
 				ntfs_error(vi->i_sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
+				err = PTR_ERR(page);
+				if (err != -EIO && err != -ENOMEM)
+					err = -EIO;
 				goto err_out;
 			}
 			size = min_t(int, to_read, PAGE_CACHE_SIZE);
@@ -389,32 +414,64 @@ static BOOL ntfs_check_and_load_restart_page(struct inode *vi,
 			idx++;
 		} while (to_read > 0);
 	}
-	/* Perform the multi sector transfer deprotection on the buffer. */
-	if (post_read_mst_fixup((NTFS_RECORD*)trp,
+	/*
+	 * Perform the multi sector transfer deprotection on the buffer if the
+	 * restart page is protected.
+	 */
+	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
+			&& post_read_mst_fixup((NTFS_RECORD*)trp,
 			le32_to_cpu(rp->system_page_size))) {
-		ntfs_error(vi->i_sb, "Multi sector transfer error detected in "
-				"$LogFile restart page.");
-		goto err_out;
+		/*
+		 * A multi sector tranfer error was detected.  We only need to
+		 * abort if the restart page contents exceed the multi sector
+		 * transfer fixup of the first sector.
+		 */
+		if (le16_to_cpu(rp->restart_area_offset) +
+				le16_to_cpu(ra->restart_area_length) >
+				NTFS_BLOCK_SIZE - sizeof(u16)) {
+			ntfs_error(vi->i_sb, "Multi sector transfer error "
+					"detected in $LogFile restart page.");
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	/*
+	 * If the restart page is modified by chkdsk or there are no active
+	 * logfile clients, the logfile is consistent.  Otherwise, need to
+	 * check the log client records for consistency, too.
+	 */
+	err = 0;
+	if (ntfs_is_rstr_record(rp->magic) &&
+			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+		if (!ntfs_check_log_client_array(vi, trp)) {
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	if (lsn) {
+		if (ntfs_is_rstr_record(rp->magic))
+			*lsn = sle64_to_cpu(ra->current_lsn);
+		else /* if (ntfs_is_chkd_record(rp->magic)) */
+			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
 	}
-	/* Check the log client records for consistency. */
-	ret = ntfs_check_log_client_array(vi, trp);
-	if (ret && wrp)
-		*wrp = trp;
-	else
-		ntfs_free(trp);
 	ntfs_debug("Done.");
-	return ret;
+	if (wrp)
+		*wrp = trp;
+	else {
 err_out:
-	ntfs_free(trp);
-	return FALSE;
+		ntfs_free(trp);
+	}
+	return err;
 }
 
 /**
- * ntfs_ckeck_logfile - check in the journal if the volume is consistent
+ * ntfs_check_logfile - check the journal for consistency
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		[OUT] on success this is a copy of the current restart page
  *
- * Check the $LogFile journal for consistency and return TRUE if it is
- * consistent and FALSE if not.
+ * Check the $LogFile journal for consistency and return 'true' if it is
+ * consistent and 'false' if not.  On success, the current restart page is
+ * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
  *
  * At present we only check the two restart pages and ignore the log record
  * pages.
@@ -424,26 +481,25 @@ err_out:
  * if the $LogFile was created on a system with a different page size to ours
  * yet and mst deprotection would fail if our page size is smaller.
  */
-BOOL ntfs_check_logfile(struct inode *log_vi)
+bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 {
-	s64 size, pos, rstr1_pos, rstr2_pos;
+	s64 size, pos;
+	LSN rstr1_lsn, rstr2_lsn;
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
 	struct address_space *mapping = log_vi->i_mapping;
 	struct page *page = NULL;
 	u8 *kaddr = NULL;
 	RESTART_PAGE_HEADER *rstr1_ph = NULL;
 	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, log_page_mask, ofs;
-	BOOL logfile_is_empty = TRUE;
-	BOOL rstr1_found = FALSE;
-	BOOL rstr2_found = FALSE;
+	int log_page_size, log_page_mask, err;
+	bool logfile_is_empty = true;
 	u8 log_page_bits;
 
 	ntfs_debug("Entering.");
 	/* An empty $LogFile must have been clean before it got emptied. */
 	if (NVolLogFileEmpty(vol))
 		goto is_empty;
-	size = log_vi->i_size;
+	size = i_size_read(log_vi);
 	/* Make sure the file doesn't exceed the maximum allowed size. */
 	if (size > MaxLogFileSize)
 		size = MaxLogFileSize;
@@ -460,11 +516,11 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 		log_page_size = PAGE_CACHE_SIZE;
 	log_page_mask = log_page_size - 1;
 	/*
-	 * Use generic_ffs() instead of ffs() to enable the compiler to
+	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
 	 * optimize log_page_size and log_page_bits into constants.
 	 */
-	log_page_bits = generic_ffs(log_page_size) - 1;
-	size &= ~(log_page_size - 1);
+	log_page_bits = ntfs_ffs(log_page_size) - 1;
+	size &= ~(s64)(log_page_size - 1);
 	/*
 	 * Ensure the log file is big enough to store at least the two restart
 	 * pages and the minimum number of log record pages.
@@ -472,7 +528,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
 			log_page_bits < MinLogRecordPages) {
 		ntfs_error(vol->sb, "$LogFile is too small.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Read through the file looking for a restart page.  Since the restart
@@ -491,7 +547,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Error mapping $LogFile "
 						"page (index %lu).", idx);
-				return FALSE;
+				goto err_out;
 			}
 		}
 		kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
@@ -501,7 +557,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 		 * means we are done.
 		 */
 		if (!ntfs_is_empty_recordp((le32*)kaddr))
-			logfile_is_empty = FALSE;
+			logfile_is_empty = false;
 		else if (!logfile_is_empty)
 			break;
 		/*
@@ -510,145 +566,135 @@ BOOL ntfs_check_logfile(struct inode *log_vi)
 		 */
 		if (ntfs_is_rcrd_recordp((le32*)kaddr))
 			break;
-		/*
-		 * A modified by chkdsk restart page means we cannot handle
-		 * this log file.
-		 */
-		if (ntfs_is_chkd_recordp((le32*)kaddr)) {
-			ntfs_error(vol->sb, "$LogFile has been modified by "
-					"chkdsk.  Mount this volume in "
-					"Windows.");
-			goto err_out;
-		}
-		/* If not a restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr)) {
-			/* Skip to the minimum page size for the next one. */
+		/* If not a (modified by chkdsk) restart page, continue. */
+		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
+				!ntfs_is_chkd_recordp((le32*)kaddr)) {
 			if (!pos)
 				pos = NTFS_BLOCK_SIZE >> 1;
 			continue;
 		}
-		/* We now know we have a restart page. */
-		if (!pos) {
-			rstr1_found = TRUE;
-			rstr1_pos = pos;
-		} else {
-			if (rstr2_found) {
-				ntfs_error(vol->sb, "Found more than two "
-						"restart pages in $LogFile.");
-				goto err_out;
-			}
-			rstr2_found = TRUE;
-			rstr2_pos = pos;
-		}
 		/*
-		 * Check the restart page for consistency and get a copy of the
-		 * complete multi sector transfer deprotected restart page.
+		 * Check the (modified by chkdsk) restart page for consistency
+		 * and get a copy of the complete multi sector transfer
+		 * deprotected restart page.
 		 */
-		if (!ntfs_check_and_load_restart_page(log_vi,
+		err = ntfs_check_and_load_restart_page(log_vi,
 				(RESTART_PAGE_HEADER*)kaddr, pos,
-				!pos ? &rstr1_ph : &rstr2_ph)) {
-			/* Error output already done inside the function. */
-			goto err_out;
+				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
+				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
+		if (!err) {
+			/*
+			 * If we have now found the first (modified by chkdsk)
+			 * restart page, continue looking for the second one.
+			 */
+			if (!pos) {
+				pos = NTFS_BLOCK_SIZE >> 1;
+				continue;
+			}
+			/*
+			 * We have now found the second (modified by chkdsk)
+			 * restart page, so we can stop looking.
+			 */
+			break;
 		}
 		/*
-		 * We have a valid restart page.  The next one must be after
-		 * a whole system page size as specified by the valid restart
-		 * page.
+		 * Error output already done inside the function.  Note, we do
+		 * not abort if the restart page was invalid as we might still
+		 * find a valid one further in the file.
 		 */
+		if (err != -EINVAL) {
+			ntfs_unmap_page(page);
+			goto err_out;
+		}
+		/* Continue looking. */
 		if (!pos)
-			pos = le32_to_cpu(rstr1_ph->system_page_size) >> 1;
+			pos = NTFS_BLOCK_SIZE >> 1;
 	}
-	if (page) {
+	if (page)
 		ntfs_unmap_page(page);
-		page = NULL;
-	}
 	if (logfile_is_empty) {
 		NVolSetLogFileEmpty(vol);
 is_empty:
 		ntfs_debug("Done.  ($LogFile is empty.)");
-		return TRUE;
+		return true;
 	}
-	if (!rstr1_found || !rstr2_found) {
-		ntfs_error(vol->sb, "Did not find two restart pages in "
-				"$LogFile.");
-		goto err_out;
+	if (!rstr1_ph) {
+		BUG_ON(rstr2_ph);
+		ntfs_error(vol->sb, "Did not find any restart pages in "
+				"$LogFile and it was not empty.");
+		return false;
+	}
+	/* If both restart pages were found, use the more recent one. */
+	if (rstr2_ph) {
+		/*
+		 * If the second restart area is more recent, switch to it.
+		 * Otherwise just throw it away.
+		 */
+		if (rstr2_lsn > rstr1_lsn) {
+			ntfs_debug("Using second restart page as it is more "
+					"recent.");
+			ntfs_free(rstr1_ph);
+			rstr1_ph = rstr2_ph;
+			/* rstr1_lsn = rstr2_lsn; */
+		} else {
+			ntfs_debug("Using first restart page as it is more "
+					"recent.");
+			ntfs_free(rstr2_ph);
+		}
+		rstr2_ph = NULL;
 	}
-	/*
-	 * The two restart areas must be identical except for the update
-	 * sequence number.
-	 */
-	ofs = le16_to_cpu(rstr1_ph->usa_ofs);
-	if (memcmp(rstr1_ph, rstr2_ph, ofs) || (ofs += sizeof(u16),
-			memcmp((u8*)rstr1_ph + ofs, (u8*)rstr2_ph + ofs,
-			le32_to_cpu(rstr1_ph->system_page_size) - ofs))) {
-		ntfs_error(vol->sb, "The two restart pages in $LogFile do not "
-				"match.");
-		goto err_out;
-	}
-	ntfs_free(rstr1_ph);
-	ntfs_free(rstr2_ph);
 	/* All consistency checks passed. */
+	if (rp)
+		*rp = rstr1_ph;
+	else
+		ntfs_free(rstr1_ph);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
-	if (page)
-		ntfs_unmap_page(page);
 	if (rstr1_ph)
 		ntfs_free(rstr1_ph);
-	if (rstr2_ph)
-		ntfs_free(rstr2_ph);
-	return FALSE;
+	return false;
 }
 
 /**
  * ntfs_is_logfile_clean - check in the journal if the volume is clean
  * @log_vi:	struct inode of loaded journal $LogFile to check
+ * @rp:		copy of the current restart page
  *
- * Analyze the $LogFile journal and return TRUE if it indicates the volume was
- * shutdown cleanly and FALSE if not.
+ * Analyze the $LogFile journal and return 'true' if it indicates the volume was
+ * shutdown cleanly and 'false' if not.
  *
  * At present we only look at the two restart pages and ignore the log record
  * pages.  This is a little bit crude in that there will be a very small number
  * of cases where we think that a volume is dirty when in fact it is clean.
  * This should only affect volumes that have not been shutdown cleanly but did
  * not have any pending, non-check-pointed i/o, i.e. they were completely idle
- * at least for the five seconds preceeding the unclean shutdown.
+ * at least for the five seconds preceding the unclean shutdown.
  *
  * This function assumes that the $LogFile journal has already been consistency
  * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
  * is empty this function requires that NVolLogFileEmpty() is true otherwise an
  * empty volume will be reported as dirty.
  */
-BOOL ntfs_is_logfile_clean(struct inode *log_vi)
+bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 {
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct page *page;
-	RESTART_PAGE_HEADER *rp;
 	RESTART_AREA *ra;
 
 	ntfs_debug("Entering.");
 	/* An empty $LogFile must have been clean before it got emptied. */
 	if (NVolLogFileEmpty(vol)) {
 		ntfs_debug("Done.  ($LogFile is empty.)");
-		return TRUE;
+		return true;
 	}
-	/*
-	 * Read the first restart page.  It will be possibly incomplete and
-	 * will not be multi sector transfer deprotected but we only need the
-	 * first NTFS_BLOCK_SIZE bytes so it does not matter.
-	 */
-	page = ntfs_map_page(log_vi->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Error mapping $LogFile page (index 0).");
-		return FALSE;
-	}
-	rp = (RESTART_PAGE_HEADER*)page_address(page);
-	if (!ntfs_is_rstr_record(rp->magic)) {
-		ntfs_error(vol->sb, "No restart page found at offset zero in "
-				"$LogFile.  This is probably a bug in that "
-				"the $LogFile should have been consistency "
-				"checked before calling this function.");
-		goto err_out;
+	BUG_ON(!rp);
+	if (!ntfs_is_rstr_record(rp->magic) &&
+			!ntfs_is_chkd_record(rp->magic)) {
+		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
+				"probably a bug in that the $LogFile should "
+				"have been consistency checked before calling "
+				"this function.");
+		return false;
 	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
@@ -659,47 +705,159 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi)
 	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
 			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
 		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		goto err_out;
+		return false;
 	}
-	ntfs_unmap_page(page);
 	/* $LogFile indicates a clean shutdown. */
 	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
-	return TRUE;
-err_out:
-	ntfs_unmap_page(page);
-	return FALSE;
+	return true;
 }
 
 /**
  * ntfs_empty_logfile - empty the contents of the $LogFile journal
  * @log_vi:	struct inode of loaded journal $LogFile to empty
  *
- * Empty the contents of the $LogFile journal @log_vi and return TRUE on
- * success and FALSE on error.
+ * Empty the contents of the $LogFile journal @log_vi and return 'true' on
+ * success and 'false' on error.
  *
  * This function assumes that the $LogFile journal has already been consistency
  * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
  * has been used to ensure that the $LogFile is clean.
  */
-BOOL ntfs_empty_logfile(struct inode *log_vi)
+bool ntfs_empty_logfile(struct inode *log_vi)
 {
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+	VCN vcn, end_vcn;
+	ntfs_inode *log_ni = NTFS_I(log_vi);
+	ntfs_volume *vol = log_ni->vol;
+	struct super_block *sb = vol->sb;
+	runlist_element *rl;
+	unsigned long flags;
+	unsigned block_size, block_size_bits;
+	int err;
+	bool should_wait = true;
 
 	ntfs_debug("Entering.");
-	if (!NVolLogFileEmpty(vol)) {
-		int err;
-		
-		err = ntfs_attr_set(NTFS_I(log_vi), 0, log_vi->i_size, 0xff);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to fill $LogFile with "
-					"0xff bytes (error code %i).", err);
-			return FALSE;
-		}
-		/* Set the flag so we do not have to do it again on remount. */
-		NVolSetLogFileEmpty(vol);
+	if (NVolLogFileEmpty(vol)) {
+		ntfs_debug("Done.");
+		return true;
 	}
+	/*
+	 * We cannot use ntfs_attr_set() because we may be still in the middle
+	 * of a mount operation.  Thus we do the emptying by hand by first
+	 * zapping the page cache pages for the $LogFile/$DATA attribute and
+	 * then emptying each of the buffers in each of the clusters specified
+	 * by the runlist by hand.
+	 */
+	block_size = sb->s_blocksize;
+	block_size_bits = sb->s_blocksize_bits;
+	vcn = 0;
+	read_lock_irqsave(&log_ni->size_lock, flags);
+	end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
+			vol->cluster_size_bits;
+	read_unlock_irqrestore(&log_ni->size_lock, flags);
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	down_write(&log_ni->runlist.lock);
+	rl = log_ni->runlist.rl;
+	if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
+map_vcn:
+		err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
+		if (err) {
+			ntfs_error(sb, "Failed to map runlist fragment (error "
+					"%d).", -err);
+			goto err;
+		}
+		rl = log_ni->runlist.rl;
+		BUG_ON(!rl || vcn < rl->vcn || !rl->length);
+	}
+	/* Seek to the runlist element containing @vcn. */
+	while (rl->length && vcn >= rl[1].vcn)
+		rl++;
+	do {
+		LCN lcn;
+		sector_t block, end_block;
+		s64 len;
+
+		/*
+		 * If this run is not mapped map it now and start again as the
+		 * runlist will have been updated.
+		 */
+		lcn = rl->lcn;
+		if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
+			vcn = rl->vcn;
+			goto map_vcn;
+		}
+		/* If this run is not valid abort with an error. */
+		if (unlikely(!rl->length || lcn < LCN_HOLE))
+			goto rl_err;
+		/* Skip holes. */
+		if (lcn == LCN_HOLE)
+			continue;
+		block = lcn << vol->cluster_size_bits >> block_size_bits;
+		len = rl->length;
+		if (rl[1].vcn > end_vcn)
+			len = end_vcn - rl->vcn;
+		end_block = (lcn + len) << vol->cluster_size_bits >>
+				block_size_bits;
+		/* Iterate over the blocks in the run and empty them. */
+		do {
+			struct buffer_head *bh;
+
+			/* Obtain the buffer, possibly not uptodate. */
+			bh = sb_getblk(sb, block);
+			BUG_ON(!bh);
+			/* Setup buffer i/o submission. */
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_write_sync;
+			get_bh(bh);
+			/* Set the entire contents of the buffer to 0xff. */
+			memset(bh->b_data, -1, block_size);
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+			if (buffer_dirty(bh))
+				clear_buffer_dirty(bh);
+			/*
+			 * Submit the buffer and wait for i/o to complete but
+			 * only for the first buffer so we do not miss really
+			 * serious i/o errors.  Once the first buffer has
+			 * completed ignore errors afterwards as we can assume
+			 * that if one buffer worked all of them will work.
+			 */
+			submit_bh(WRITE, bh);
+			if (should_wait) {
+				should_wait = false;
+				wait_on_buffer(bh);
+				if (unlikely(!buffer_uptodate(bh)))
+					goto io_err;
+			}
+			brelse(bh);
+		} while (++block < end_block);
+	} while ((++rl)->vcn < end_vcn);
+	up_write(&log_ni->runlist.lock);
+	/*
+	 * Zap the pages again just in case any got instantiated whilst we were
+	 * emptying the blocks by hand.  FIXME: We may not have completed
+	 * writing to all the buffer heads yet so this may happen too early.
+	 * We really should use a kernel thread to do the emptying
+	 * asynchronously and then we can also set the volume dirty and output
+	 * an error message if emptying should fail.
+	 */
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	/* Set the flag so we do not have to do it again on remount. */
+	NVolSetLogFileEmpty(vol);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
+io_err:
+	ntfs_error(sb, "Failed to write buffer.  Unmount and run chkdsk.");
+	goto dirty_err;
+rl_err:
+	ntfs_error(sb, "Runlist is corrupt.  Unmount and run chkdsk.");
+dirty_err:
+	NVolSetErrors(vol);
+	err = -EIO;
+err:
+	up_write(&log_ni->runlist.lock);
+	ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
+			-err);
+	return false;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 4ee4378de06..aa2b6ac3f0a 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -2,7 +2,7 @@
  * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
  *	       the Linux-NTFS project.
  *
- * Copyright (c) 2000-2004 Anton Altaparmakov
+ * Copyright (c) 2000-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -104,7 +104,7 @@ typedef struct {
  * in this particular client array.  Also inside the client records themselves,
  * this means that there are no client records preceding or following this one.
  */
-#define LOGFILE_NO_CLIENT	const_cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
 #define LOGFILE_NO_CLIENT_CPU	0xffff
 
 /*
@@ -112,8 +112,8 @@ typedef struct {
  * information about the log file in which they are present.
  */
 enum {
-	RESTART_VOLUME_IS_CLEAN	= const_cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= 0xffff, /* gcc: Force enum bit width to 16. */
+	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
+	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
 } __attribute__ ((__packed__));
 
 typedef le16 RESTART_AREA_FLAGS;
@@ -222,7 +222,7 @@ typedef struct {
 /* 24*/	sle64 file_size;	/* Usable byte size of the log file.  If the
 				   restart_area_offset + the offset of the
 				   file_size are > 510 then corruption has
-				   occured.  This is the very first check when
+				   occurred.  This is the very first check when
 				   starting with the restart_area as if it
 				   fails it means that some of the above values
 				   will be corrupted by the multi sector
@@ -296,11 +296,13 @@ typedef struct {
 /* sizeof() = 160 (0xa0) bytes */
 } __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
 
-extern BOOL ntfs_check_logfile(struct inode *log_vi);
+extern bool ntfs_check_logfile(struct inode *log_vi,
+		RESTART_PAGE_HEADER **rp);
 
-extern BOOL ntfs_is_logfile_clean(struct inode *log_vi);
+extern bool ntfs_is_logfile_clean(struct inode *log_vi,
+		const RESTART_PAGE_HEADER *rp);
 
-extern BOOL ntfs_empty_logfile(struct inode *log_vi);
+extern bool ntfs_empty_logfile(struct inode *log_vi);
 
 #endif /* NTFS_RW */
 
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index fac5944df6d..a44b14cbcee 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -1,7 +1,7 @@
 /*
  * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -27,31 +27,65 @@
 #include <linux/highmem.h>
 
 /**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size	number of bytes to allocate
+ * __ntfs_malloc - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ * @gfp_mask:	extra flags for the allocator
+ *
+ * Internal function.  You probably want ntfs_malloc_nofs()...
  *
  * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
  * returns a pointer to the allocated memory.
  *
  * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
  */
-static inline void *ntfs_malloc_nofs(unsigned long size)
+static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
 {
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
 		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, GFP_NOFS);
-		/* return (void *)__get_free_page(GFP_NOFS | __GFP_HIGHMEM); */
+		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
+		/* return (void *)__get_free_page(gfp_mask); */
 	}
-	if (likely(size >> PAGE_SHIFT < num_physpages))
-		return __vmalloc(size, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
+	if (likely((size >> PAGE_SHIFT) < totalram_pages))
+		return __vmalloc(size, gfp_mask, PAGE_KERNEL);
 	return NULL;
 }
 
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size:	number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
+
 static inline void ntfs_free(void *addr)
 {
-	if (likely(((unsigned long)addr < VMALLOC_START) ||
-			((unsigned long)addr >= VMALLOC_END ))) {
+	if (!is_vmalloc_addr(addr)) {
 		kfree(addr);
 		/* free_page((unsigned long)addr); */
 		return;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index dfa85ac2f8b..3014a36a255 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "attrib.h"
@@ -45,10 +46,12 @@
  */
 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 {
+	loff_t i_size;
 	ntfs_volume *vol = ni->vol;
 	struct inode *mft_vi = vol->mft_ino;
 	struct page *page;
-	unsigned long index, ofs, end_index;
+	unsigned long index, end_index;
+	unsigned ofs;
 
 	BUG_ON(ni->page);
 	/*
@@ -57,18 +60,20 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 	 * overflowing the unsigned long, but I don't think we would ever get
 	 * here if the volume was that big...
 	 */
-	index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
+			PAGE_CACHE_SHIFT;
 	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 
+	i_size = i_size_read(mft_vi);
 	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+	end_index = i_size >> PAGE_CACHE_SHIFT;
 
 	/* If the wanted index is out of bounds the mft record doesn't exist. */
 	if (unlikely(index >= end_index)) {
-		if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
-				ofs + vol->mft_record_size) {
+		if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+				vol->mft_record_size) {
 			page = ERR_PTR(-ENOENT);
-			ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, "
+			ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
 					"which is beyond the end of the mft.  "
 					"This is probably a bug in the ntfs "
 					"driver.", ni->mft_no);
@@ -89,6 +94,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 				"Run chkdsk.", ni->mft_no);
 		ntfs_unmap_page(page);
 		page = ERR_PTR(-EIO);
+		NVolSetErrors(vol);
 	}
 err_out:
 	ni->page = NULL;
@@ -100,8 +106,8 @@ err_out:
  * map_mft_record - map, pin and lock an mft record
  * @ni:		ntfs inode whose MFT record to map
  *
- * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
- * for the semaphore if it was already locked by someone else.
+ * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
+ * for the mutex if it was already locked by someone else.
  *
  * The page of the record is mapped using map_mft_record_page() before being
  * returned to the caller.
@@ -131,9 +137,9 @@ err_out:
  * So that code will end up having to own the mrec_lock of all mft
  * records/inodes present in the page before I/O can proceed. In that case we
  * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
- * accessing anything without owning the mrec_lock semaphore. But we do need
- * to use them because of the read_cache_page() invocation and the code becomes
- * so much simpler this way that it is well worth it.
+ * accessing anything without owning the mrec_lock mutex.  But we do need to
+ * use them because of the read_cache_page() invocation and the code becomes so
+ * much simpler this way that it is well worth it.
  *
  * The mft record is now ours and we return a pointer to it. You need to check
  * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
@@ -156,13 +162,13 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni)
 	atomic_inc(&ni->count);
 
 	/* Serialize access to this mft record. */
-	down(&ni->mrec_lock);
+	mutex_lock(&ni->mrec_lock);
 
 	m = map_mft_record_page(ni);
 	if (likely(!IS_ERR(m)))
 		return m;
 
-	up(&ni->mrec_lock);
+	mutex_unlock(&ni->mrec_lock);
 	atomic_dec(&ni->count);
 	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
 	return m;
@@ -213,7 +219,7 @@ void unmap_mft_record(ntfs_inode *ni)
 	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
 
 	unmap_mft_record_page(ni);
-	up(&ni->mrec_lock);
+	mutex_unlock(&ni->mrec_lock);
 	atomic_dec(&ni->count);
 	/*
 	 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
@@ -246,7 +252,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 	int i;
 	unsigned long mft_no = MREF(mref);
 	u16 seq_no = MSEQNO(mref);
-	BOOL destroy_ni = FALSE;
+	bool destroy_ni = false;
 
 	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
 			mft_no, base_ni->mft_no);
@@ -257,7 +263,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 	 * in which case just return it. If not found, add it to the base
 	 * inode before returning it.
 	 */
-	down(&base_ni->extent_lock);
+	mutex_lock(&base_ni->extent_lock);
 	if (base_ni->nr_extents > 0) {
 		extent_nis = base_ni->ext.extent_ntfs_inos;
 		for (i = 0; i < base_ni->nr_extents; i++) {
@@ -270,7 +276,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 		}
 	}
 	if (likely(ni != NULL)) {
-		up(&base_ni->extent_lock);
+		mutex_unlock(&base_ni->extent_lock);
 		atomic_dec(&base_ni->count);
 		/* We found the record; just have to map and return it. */
 		m = map_mft_record(ni);
@@ -285,7 +291,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 			}
 			unmap_mft_record(ni);
 			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-					"reference! Corrupt file system. "
+					"reference! Corrupt filesystem. "
 					"Run chkdsk.");
 			return ERR_PTR(-EIO);
 		}
@@ -297,7 +303,7 @@ map_err_out:
 	/* Record wasn't there. Get a new ntfs inode and initialize it. */
 	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
 	if (unlikely(!ni)) {
-		up(&base_ni->extent_lock);
+		mutex_unlock(&base_ni->extent_lock);
 		atomic_dec(&base_ni->count);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -308,7 +314,7 @@ map_err_out:
 	/* Now map the record. */
 	m = map_mft_record(ni);
 	if (IS_ERR(m)) {
-		up(&base_ni->extent_lock);
+		mutex_unlock(&base_ni->extent_lock);
 		atomic_dec(&base_ni->count);
 		ntfs_clear_extent_inode(ni);
 		goto map_err_out;
@@ -316,8 +322,8 @@ map_err_out:
 	/* Verify the sequence number if it is present. */
 	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
 		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-				"reference! Corrupt file system. Run chkdsk.");
-		destroy_ni = TRUE;
+				"reference! Corrupt filesystem. Run chkdsk.");
+		destroy_ni = true;
 		m = ERR_PTR(-EIO);
 		goto unm_err_out;
 	}
@@ -326,11 +332,11 @@ map_err_out:
 		ntfs_inode **tmp;
 		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
 
-		tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
+		tmp = kmalloc(new_size, GFP_NOFS);
 		if (unlikely(!tmp)) {
 			ntfs_error(base_ni->vol->sb, "Failed to allocate "
 					"internal buffer.");
-			destroy_ni = TRUE;
+			destroy_ni = true;
 			m = ERR_PTR(-ENOMEM);
 			goto unm_err_out;
 		}
@@ -343,14 +349,14 @@ map_err_out:
 		base_ni->ext.extent_ntfs_inos = tmp;
 	}
 	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
-	up(&base_ni->extent_lock);
+	mutex_unlock(&base_ni->extent_lock);
 	atomic_dec(&base_ni->count);
 	ntfs_debug("Done 2.");
 	*ntfs_ino = ni;
 	return m;
 unm_err_out:
 	unmap_mft_record(ni);
-	up(&base_ni->extent_lock);
+	mutex_unlock(&base_ni->extent_lock);
 	atomic_dec(&base_ni->count);
 	/*
 	 * If the extent inode was not attached to the base inode we need to
@@ -379,13 +385,12 @@ unm_err_out:
  * it is dirty in the inode meta data rather than the data page cache of the
  * inode, and thus there are no data pages that need writing out.  Therefore, a
  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
  */
 void __mark_mft_record_dirty(ntfs_inode *ni)
 {
@@ -395,12 +400,12 @@ void __mark_mft_record_dirty(ntfs_inode *ni)
 	BUG_ON(NInoAttr(ni));
 	mark_ntfs_record_dirty(ni->page, ni->page_ofs);
 	/* Determine the base vfs inode and mark it dirty, too. */
-	down(&ni->extent_lock);
+	mutex_lock(&ni->extent_lock);
 	if (likely(ni->nr_extents >= 0))
 		base_ni = ni;
 	else
 		base_ni = ni->ext.base_ntfs_ino;
-	up(&ni->extent_lock);
+	mutex_unlock(&ni->extent_lock);
 	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 }
 
@@ -469,7 +474,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	runlist_element *rl;
 	unsigned int block_start, block_end, m_start, m_end, page_ofs;
 	int i_bhs, nr_bhs, err = 0;
-	unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
 
 	ntfs_debug("Entering for inode 0x%lx.", mft_no);
 	BUG_ON(!max_bhs);
@@ -509,7 +514,6 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 		} while (bh);
 		tail->b_this_page = head;
 		attach_page_buffers(page, head);
-		BUG_ON(!page_has_buffers(page));
 	}
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
@@ -531,6 +535,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 			LCN lcn;
 			unsigned int vcn_ofs;
 
+			bh->b_bdev = vol->sb->s_bdev;
 			/* Obtain the vcn and offset of the current block. */
 			vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
 					(block_start - m_start);
@@ -581,7 +586,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 			struct buffer_head *tbh = bhs[i_bhs];
 
-			if (unlikely(test_set_buffer_locked(tbh)))
+			if (!trylock_buffer(tbh))
 				BUG();
 			BUG_ON(!buffer_uptodate(tbh));
 			clear_buffer_dirty(tbh);
@@ -646,10 +651,7 @@ err_out:
  * fs/ntfs/aops.c::mark_ntfs_record_dirty().
  *
  * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.  The caller should call make_bad_inode() on
- * the base inode to ensure no more access happens to this inode.  We do not do
- * it here as the caller may want to finish writing other extent mft records
- * first to minimize on-disk metadata inconsistencies.
+ * record dirty and return -errno.
  *
  * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
  * However, if the mft record has a counterpart in the mft mirror and @sync is
@@ -668,8 +670,8 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 {
 	ntfs_volume *vol = ni->vol;
 	struct page *page = ni->page;
-	unsigned char blocksize_bits = vol->mft_ino->i_blkbits;
-	unsigned int blocksize = 1 << blocksize_bits;
+	unsigned int blocksize = vol->sb->s_blocksize;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
 	int max_bhs = vol->mft_record_size / blocksize;
 	struct buffer_head *bhs[max_bhs];
 	struct buffer_head *bh, *head;
@@ -689,7 +691,6 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	 */
 	if (!NInoTestClearDirty(ni))
 		goto done;
-	BUG_ON(!page_has_buffers(page));
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 	rl = NULL;
@@ -723,6 +724,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 			LCN lcn;
 			unsigned int vcn_ofs;
 
+			bh->b_bdev = vol->sb->s_bdev;
 			/* Obtain the vcn and offset of the current block. */
 			vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
 					(block_start - m_start);
@@ -777,7 +779,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 		struct buffer_head *tbh = bhs[i_bhs];
 
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			BUG();
 		BUG_ON(!buffer_uptodate(tbh));
 		clear_buffer_dirty(tbh);
@@ -855,7 +857,7 @@ err_out:
  * caller is responsible for unlocking the ntfs inode and unpinning the base
  * vfs inode.
  *
- * Return TRUE if the mft record may be written out and FALSE if not.
+ * Return 'true' if the mft record may be written out and 'false' if not.
  *
  * The caller has locked the page and cleared the uptodate flag on it which
  * means that we can safely write out any dirty mft records that do not have
@@ -866,7 +868,7 @@ err_out:
  * Here is a description of the tests we perform:
  *
  * If the inode is found in icache we know the mft record must be a base mft
- * record.  If it is dirty, we do not write it and return FALSE as the vfs
+ * record.  If it is dirty, we do not write it and return 'false' as the vfs
  * inode write paths will result in the access times being updated which would
  * cause the base mft record to be redirtied and written out again.  (We know
  * the access time update will modify the base mft record because Windows
@@ -875,11 +877,11 @@ err_out:
  *
  * If the inode is in icache and not dirty, we attempt to lock the mft record
  * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return FALSE.
+ * record and we return 'false'.
  *
  * If we manage to obtain the lock we have exclusive access to the mft record,
  * which also allows us safe writeout of the mft record.  We then set
- * @locked_ni to the locked ntfs inode and return TRUE.
+ * @locked_ni to the locked ntfs inode and return 'true'.
  *
  * Note we cannot just lock the mft record and sleep while waiting for the lock
  * because this would deadlock due to lock reversal (normally the mft record is
@@ -889,24 +891,24 @@ err_out:
  * If the inode is not in icache we need to perform further checks.
  *
  * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return TRUE.
+ * safely write it and return 'true'.
  *
  * We now know the mft record is an extent mft record.  We check if the inode
  * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is.  If it is not, we can safely write it and return TRUE.
+ * it if it is.  If it is not, we can safely write it and return 'true'.
  *
  * We now have the base inode for the extent mft record.  We check if it has an
  * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return TRUE.
+ * the extent mft record and we return 'true'.
  *
  * The ntfs inode for the extent mft record is attached to the base inode so we
  * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return FALSE.
+ * taken, it is not safe to write the extent mft record and we return 'false'.
  *
  * If we manage to obtain the lock we have exclusive access to the extent mft
  * record, which also allows us safe writeout of the extent mft record.  We
  * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return TRUE.
+ * the now locked ntfs inode and return 'true'.
  *
  * Note, the reason for actually writing dirty mft records here and not just
  * relying on the vfs inode dirty code paths is that we can have mft records
@@ -920,7 +922,7 @@ err_out:
  * appear if the mft record is reused for a new inode before it got written
  * out.
  */
-BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		const MFT_RECORD *m, ntfs_inode **locked_ni)
 {
 	struct super_block *sb = vol->sb;
@@ -946,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	na.name_len = 0;
 	na.type = AT_UNUSED;
 	/*
-	 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
-	 * we deadlock because the inode is already locked by the kernel
-	 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
-	 * until the inode is unlocked before returning it and it never gets
-	 * unlocked because ntfs_should_write_mft_record() never returns.  )-:
-	 * Fortunately, we have inode 0 pinned in icache for the duration of
-	 * the mount so we can access it directly.
+	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
+	 * we get here for it rather often.
 	 */
 	if (!mft_no) {
 		/* Balance the below iput(). */
 		vi = igrab(mft_vi);
 		BUG_ON(vi != mft_vi);
-	} else
-		vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
+	} else {
+		/*
+		 * Have to use ilookup5_nowait() since ilookup5() waits for the
+		 * inode lock which causes ntfs to deadlock when a concurrent
+		 * inode write via the inode dirty code paths and the page
+		 * dirty code path of the inode dirty code path when writing
+		 * $MFT occurs.
+		 */
+		vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+	}
 	if (vi) {
 		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
 		/* The inode is in icache. */
@@ -972,16 +977,16 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 					mft_no);
 			atomic_dec(&ni->count);
 			iput(vi);
-			return FALSE;
+			return false;
 		}
 		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
 		/* The inode is not dirty, try to take the mft record lock. */
-		if (unlikely(down_trylock(&ni->mrec_lock))) {
+		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
 			ntfs_debug("Mft record 0x%lx is already locked, do "
 					"not write it.", mft_no);
 			atomic_dec(&ni->count);
 			iput(vi);
-			return FALSE;
+			return false;
 		}
 		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
 				mft_no);
@@ -990,7 +995,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		 * return the locked ntfs inode.
 		 */
 		*locked_ni = ni;
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
 	/* The inode is not in icache. */
@@ -998,13 +1003,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	if (!ntfs_is_mft_record(m->magic)) {
 		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
 				mft_no);
-		return TRUE;
+		return true;
 	}
 	/* Write the mft record if it is a base inode. */
 	if (!m->base_mft_record) {
 		ntfs_debug("Mft record 0x%lx is a base record, write it.",
 				mft_no);
-		return TRUE;
+		return true;
 	}
 	/*
 	 * This is an extent mft record.  Check if the inode corresponding to
@@ -1014,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	na.mft_no = MREF_LE(m->base_mft_record);
 	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
 			"inode 0x%lx in icache.", mft_no, na.mft_no);
-	vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
+	if (!na.mft_no) {
+		/* Balance the below iput(). */
+		vi = igrab(mft_vi);
+		BUG_ON(vi != mft_vi);
+	} else
+		vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+				&na);
 	if (!vi) {
 		/*
 		 * The base inode is not in icache, write this extent mft
@@ -1022,7 +1033,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		 */
 		ntfs_debug("Base inode 0x%lx is not in icache, write the "
 				"extent record.", na.mft_no);
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
 	/*
@@ -1030,17 +1041,17 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	 * corresponding to this extent mft record attached.
 	 */
 	ni = NTFS_I(vi);
-	down(&ni->extent_lock);
+	mutex_lock(&ni->extent_lock);
 	if (ni->nr_extents <= 0) {
 		/*
 		 * The base inode has no attached extent inodes, write this
 		 * extent mft record.
 		 */
-		up(&ni->extent_lock);
+		mutex_unlock(&ni->extent_lock);
 		iput(vi);
 		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
 				"write the extent record.", na.mft_no);
-		return TRUE;
+		return true;
 	}
 	/* Iterate over the attached extent inodes. */
 	extent_nis = ni->ext.extent_ntfs_inos;
@@ -1059,28 +1070,28 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	 * extent mft record.
 	 */
 	if (!eni) {
-		up(&ni->extent_lock);
+		mutex_unlock(&ni->extent_lock);
 		iput(vi);
 		ntfs_debug("Extent inode 0x%lx is not attached to its base "
 				"inode 0x%lx, write the extent record.",
 				mft_no, na.mft_no);
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
 			mft_no, na.mft_no);
 	/* Take a reference to the extent ntfs inode. */
 	atomic_inc(&eni->count);
-	up(&ni->extent_lock);
+	mutex_unlock(&ni->extent_lock);
 	/*
 	 * Found the extent inode coresponding to this extent mft record.
 	 * Try to take the mft record lock.
 	 */
-	if (unlikely(down_trylock(&eni->mrec_lock))) {
+	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
 		atomic_dec(&eni->count);
 		iput(vi);
 		ntfs_debug("Extent mft record 0x%lx is already locked, do "
 				"not write it.", mft_no);
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
 			mft_no);
@@ -1092,7 +1103,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	 * the locked extent ntfs inode.
 	 */
 	*locked_ni = eni;
-	return TRUE;
+	return true;
 }
 
 static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
@@ -1121,6 +1132,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 		ntfs_inode *base_ni)
 {
 	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+	unsigned long flags;
 	struct address_space *mftbmp_mapping;
 	u8 *buf, *byte;
 	struct page *page;
@@ -1134,9 +1146,13 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 	 * Set the end of the pass making sure we do not overflow the mft
 	 * bitmap.
 	 */
+	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
 	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
 			vol->mft_record_size_bits;
+	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
+	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
 	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
 	if (pass_end > ll)
 		pass_end = ll;
 	pass = 1;
@@ -1175,7 +1191,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 		if (size) {
 			page = ntfs_map_page(mftbmp_mapping,
 					ofs >> PAGE_CACHE_SHIFT);
-			if (unlikely(IS_ERR(page))) {
+			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Failed to read mft "
 						"bitmap, aborting.");
 				return PTR_ERR(page);
@@ -1263,6 +1279,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 {
 	LCN lcn;
 	s64 ll;
+	unsigned long flags;
 	struct page *page;
 	ntfs_inode *mft_ni, *mftbmp_ni;
 	runlist_element *rl, *rl2 = NULL;
@@ -1284,17 +1301,20 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	/*
 	 * Determine the last lcn of the mft bitmap.  The allocated size of the
 	 * mft bitmap cannot be zero so we are ok to do this.
-	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
-	rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >>
-			vol->cluster_size_bits, TRUE);
+	down_write(&mftbmp_ni->runlist.lock);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ll = mftbmp_ni->allocated_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
+			(ll - 1) >> vol->cluster_size_bits, NULL);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mftbmp_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft bitmap attribute.");
-		if (!IS_ERR(rl)) {
-			up_write(&mftbmp_ni->runlist.lock);
+		if (!IS_ERR(rl))
 			ret = -EIO;
-		} else
+		else
 			ret = PTR_ERR(rl);
 		return ret;
 	}
@@ -1333,7 +1353,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		up_write(&vol->lcnbmp_lock);
 		ntfs_unmap_page(page);
 		/* Allocate a cluster from the DATA_ZONE. */
-		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE);
+		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
+				true);
 		if (IS_ERR(rl2)) {
 			up_write(&mftbmp_ni->runlist.lock);
 			ntfs_error(vol->sb, "Failed to allocate a cluster for "
@@ -1346,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 			ntfs_error(vol->sb, "Failed to merge runlists for mft "
 					"bitmap.");
 			if (ntfs_cluster_free_from_rl(vol, rl2)) {
-				ntfs_error(vol->sb, "Failed to dealocate "
+				ntfs_error(vol->sb, "Failed to deallocate "
 						"allocated cluster.%s", es);
 				NVolSetErrors(vol);
 			}
@@ -1396,7 +1417,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	BUG_ON(ll < rl2->vcn);
 	BUG_ON(ll >= rl2->vcn + rl2->length);
 	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
 	if (unlikely(mp_size <= 0)) {
 		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
 				"mft bitmap attribute extent.");
@@ -1418,8 +1439,10 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		// TODO: Deal with this by moving this extent to a new mft
 		// record or by starting a new extent in a new mft record or by
 		// moving other attributes out of this mft record.
+		// Note: It will need to be a special mft record and if none of
+		// those are available it gets rather complicated...
 		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accomodate extended mft bitmap attribute "
+				"accommodate extended mft bitmap attribute "
 				"extent.  Cannot handle this yet.");
 		ret = -EOPNOTSUPP;
 		goto undo_alloc;
@@ -1428,7 +1451,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	/* Generate the mapping pairs array directly into the attr record. */
 	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, NULL);
+			mp_size, rl2, ll, -1, NULL);
 	if (unlikely(ret)) {
 		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
 				"mft bitmap attribute.");
@@ -1458,9 +1481,11 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		}
 		a = ctx->attr;
 	}
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	mftbmp_ni->allocated_size += vol->cluster_size;
 	a->data.non_resident.allocated_size =
 			cpu_to_sle64(mftbmp_ni->allocated_size);
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1476,7 +1501,9 @@ restore_undo_alloc:
 			0, ctx)) {
 		ntfs_error(vol->sb, "Failed to find last attribute extent of "
 				"mft bitmap attribute.%s", es);
+		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 		mftbmp_ni->allocated_size += vol->cluster_size;
+		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 		ntfs_attr_put_search_ctx(ctx);
 		unmap_mft_record(mft_ni);
 		up_write(&mftbmp_ni->runlist.lock);
@@ -1512,7 +1539,7 @@ undo_alloc:
 				a->data.non_resident.mapping_pairs_offset),
 				old_alen - le16_to_cpu(
 				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, NULL)) {
+				rl2, ll, -1, NULL)) {
 			ntfs_error(vol->sb, "Failed to restore mapping pairs "
 					"array.%s", es);
 			NVolSetErrors(vol);
@@ -1550,6 +1577,7 @@ undo_alloc:
 static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 {
 	s64 old_data_size, old_initialized_size;
+	unsigned long flags;
 	struct inode *mftbmp_vi;
 	ntfs_inode *mft_ni, *mftbmp_ni;
 	ntfs_attr_search_ctx *ctx;
@@ -1583,7 +1611,8 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 		goto put_err_out;
 	}
 	a = ctx->attr;
-	old_data_size = mftbmp_vi->i_size;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = i_size_read(mftbmp_vi);
 	old_initialized_size = mftbmp_ni->initialized_size;
 	/*
 	 * We can simply update the initialized_size before filling the space
@@ -1593,11 +1622,12 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 	mftbmp_ni->initialized_size += 8;
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) {
-		mftbmp_vi->i_size = mftbmp_ni->initialized_size;
+	if (mftbmp_ni->initialized_size > old_data_size) {
+		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
 		a->data.non_resident.data_size =
-				cpu_to_sle64(mftbmp_vi->i_size);
+				cpu_to_sle64(mftbmp_ni->initialized_size);
 	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1636,22 +1666,28 @@ unm_err_out:
 		goto err_out;
 	}
 	a = ctx->attr;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	mftbmp_ni->initialized_size = old_initialized_size;
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(old_initialized_size);
-	if (mftbmp_vi->i_size != old_data_size) {
-		mftbmp_vi->i_size = old_data_size;
+	if (i_size_read(mftbmp_vi) != old_data_size) {
+		i_size_write(mftbmp_vi, old_data_size);
 		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
 	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(mft_ni);
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
 			"data_size 0x%llx, initialized_size 0x%llx.",
 			(long long)mftbmp_ni->allocated_size,
-			(long long)mftbmp_vi->i_size,
+			(long long)i_size_read(mftbmp_vi),
 			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 err_out:
 	return ret;
 }
@@ -1679,7 +1715,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 {
 	LCN lcn;
 	VCN old_last_vcn;
-	s64 min_nr, nr, ll = 0;
+	s64 min_nr, nr, ll;
+	unsigned long flags;
 	ntfs_inode *mft_ni;
 	runlist_element *rl, *rl2;
 	ntfs_attr_search_ctx *ctx = NULL;
@@ -1687,7 +1724,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	ATTR_RECORD *a = NULL;
 	int ret, mp_size;
 	u32 old_alen = 0;
-	BOOL mp_rebuilt = FALSE;
+	bool mp_rebuilt = false;
 
 	ntfs_debug("Extending mft data allocation.");
 	mft_ni = NTFS_I(vol->mft_ino);
@@ -1695,23 +1732,25 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	 * Determine the preferred allocation location, i.e. the last lcn of
 	 * the mft data attribute.  The allocated size of the mft data
 	 * attribute cannot be zero so we are ok to do this.
-	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
-	rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >>
-			vol->cluster_size_bits, TRUE);
+	down_write(&mft_ni->runlist.lock);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	rl = ntfs_attr_find_vcn_nolock(mft_ni,
+			(ll - 1) >> vol->cluster_size_bits, NULL);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mft_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft data attribute.");
-		if (!IS_ERR(rl)) {
-			up_write(&mft_ni->runlist.lock);
+		if (!IS_ERR(rl))
 			ret = -EIO;
-		} else
+		else
 			ret = PTR_ERR(rl);
 		return ret;
 	}
 	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
-			(long long)lcn);
+	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
 	/* Minimum allocation is one mft record worth of clusters. */
 	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
 	if (!min_nr)
@@ -1721,12 +1760,13 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	if (!nr)
 		nr = min_nr;
 	/* Ensure we do not go above 2^32-1 mft records. */
-	if (unlikely((mft_ni->allocated_size +
-			(nr << vol->cluster_size_bits)) >>
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
 			vol->mft_record_size_bits >= (1ll << 32))) {
 		nr = min_nr;
-		if (unlikely((mft_ni->allocated_size +
-				(nr << vol->cluster_size_bits)) >>
+		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
 				vol->mft_record_size_bits >= (1ll << 32))) {
 			ntfs_warning(vol->sb, "Cannot allocate mft record "
 					"because the maximum number of inodes "
@@ -1739,7 +1779,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 			nr > min_nr ? "default" : "minimal", (long long)nr);
 	old_last_vcn = rl[1].vcn;
 	do {
-		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE);
+		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
+				true);
 		if (likely(!IS_ERR(rl2)))
 			break;
 		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
@@ -1764,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		ntfs_error(vol->sb, "Failed to merge runlists for mft data "
 				"attribute.");
 		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to dealocate clusters "
+			ntfs_error(vol->sb, "Failed to deallocate clusters "
 					"from the mft data attribute.%s", es);
 			NVolSetErrors(vol);
 		}
@@ -1772,7 +1813,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		return PTR_ERR(rl);
 	}
 	mft_ni->runlist.rl = rl;
-	ntfs_debug("Allocated %lli clusters.", nr);
+	ntfs_debug("Allocated %lli clusters.", (long long)nr);
 	/* Find the last run in the new runlist. */
 	for (; rl[1].length; rl++)
 		;
@@ -1808,7 +1849,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	BUG_ON(ll < rl2->vcn);
 	BUG_ON(ll >= rl2->vcn + rl2->length);
 	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
 	if (unlikely(mp_size <= 0)) {
 		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
 				"mft data attribute extent.");
@@ -1832,18 +1873,22 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		// moving other attributes out of this mft record.
 		// Note: Use the special reserved mft records and ensure that
 		// this extent is not required to find the mft record in
-		// question.
+		// question.  If no free special records left we would need to
+		// move an existing record away, insert ours in its place, and
+		// then place the moved record into the newly allocated space
+		// and we would then need to update all references to this mft
+		// record appropriately.  This is rather complicated...
 		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accomodate extended mft data attribute "
+				"accommodate extended mft data attribute "
 				"extent.  Cannot handle this yet.");
 		ret = -EOPNOTSUPP;
 		goto undo_alloc;
 	}
-	mp_rebuilt = TRUE;
+	mp_rebuilt = true;
 	/* Generate the mapping pairs array directly into the attr record. */
 	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, NULL);
+			mp_size, rl2, ll, -1, NULL);
 	if (unlikely(ret)) {
 		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
 				"mft data attribute.");
@@ -1875,9 +1920,11 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		}
 		a = ctx->attr;
 	}
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	mft_ni->allocated_size += nr << vol->cluster_size_bits;
 	a->data.non_resident.allocated_size =
 			cpu_to_sle64(mft_ni->allocated_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1892,7 +1939,9 @@ restore_undo_alloc:
 			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
 		ntfs_error(vol->sb, "Failed to find last attribute extent of "
 				"mft data attribute.%s", es);
+		write_lock_irqsave(&mft_ni->size_lock, flags);
 		mft_ni->allocated_size += nr << vol->cluster_size_bits;
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
 		ntfs_attr_put_search_ctx(ctx);
 		unmap_mft_record(mft_ni);
 		up_write(&mft_ni->runlist.lock);
@@ -1903,25 +1952,26 @@ restore_undo_alloc:
 		NVolSetErrors(vol);
 		return ret;
 	}
-	a = ctx->attr;
-	a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
+	ctx->attr->data.non_resident.highest_vcn =
+			cpu_to_sle64(old_last_vcn - 1);
 undo_alloc:
-	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
 		ntfs_error(vol->sb, "Failed to free clusters from mft data "
 				"attribute.%s", es);
 		NVolSetErrors(vol);
 	}
+	a = ctx->attr;
 	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
 		ntfs_error(vol->sb, "Failed to truncate mft data attribute "
 				"runlist.%s", es);
 		NVolSetErrors(vol);
 	}
-	if (mp_rebuilt) {
+	if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
 		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
 				a->data.non_resident.mapping_pairs_offset),
 				old_alen - le16_to_cpu(
 				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, NULL)) {
+				rl2, ll, -1, NULL)) {
 			ntfs_error(vol->sb, "Failed to restore mapping pairs "
 					"array.%s", es);
 			NVolSetErrors(vol);
@@ -1933,6 +1983,10 @@ undo_alloc:
 		}
 		flush_dcache_mft_record_page(ctx->ntfs_ino);
 		mark_mft_record_dirty(ctx->ntfs_ino);
+	} else if (IS_ERR(ctx->mrec)) {
+		ntfs_error(vol->sb, "Failed to restore attribute search "
+				"context.%s", es);
+		NVolSetErrors(vol);
 	}
 	if (ctx)
 		ntfs_attr_put_search_ctx(ctx);
@@ -1991,7 +2045,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
 				"reports this as corruption, please email "
 				"linux-ntfs-dev@lists.sourceforge.net stating "
 				"that you saw this message and that the "
-				"modified file system created was corrupt.  "
+				"modified filesystem created was corrupt.  "
 				"Thank you.");
 	}
 	/* Set the update sequence number to 1. */
@@ -2036,6 +2090,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
  */
 static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 {
+	loff_t i_size;
 	struct inode *mft_vi = vol->mft_ino;
 	struct page *page;
 	MFT_RECORD *m;
@@ -2051,10 +2106,11 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 	index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
 	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+	i_size = i_size_read(mft_vi);
+	end_index = i_size >> PAGE_CACHE_SHIFT;
 	if (unlikely(index >= end_index)) {
 		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(mft_vi->i_size & ~PAGE_CACHE_MASK))) {
+				(i_size & ~PAGE_CACHE_MASK))) {
 			ntfs_error(vol->sb, "Tried to format non-existing mft "
 					"record 0x%llx.", (long long)mft_no);
 			return -ENOENT;
@@ -2062,7 +2118,7 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 	}
 	/* Read, map, and pin the page containing the mft record. */
 	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (unlikely(IS_ERR(page))) {
+	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to map page containing mft record "
 				"to format 0x%llx.", (long long)mft_no);
 		return PTR_ERR(page);
@@ -2188,6 +2244,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 		ntfs_inode *base_ni, MFT_RECORD **mrec)
 {
 	s64 ll, bit, old_data_initialized, old_data_size;
+	unsigned long flags;
 	struct inode *vi;
 	struct page *page;
 	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
@@ -2198,7 +2255,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	unsigned int ofs;
 	int err;
 	le16 seq_no, usn;
-	BOOL record_formatted = FALSE;
+	bool record_formatted = false;
 
 	if (base_ni) {
 		ntfs_debug("Entering (allocating an extent mft record for "
@@ -2237,9 +2294,13 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	 * the first 24 mft records as they are special and whilst they may not
 	 * be in use, we do not allocate from them.
 	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
-	if (mftbmp_ni->initialized_size << 3 > ll &&
-			mftbmp_ni->initialized_size > 3) {
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_initialized = mftbmp_ni->initialized_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
 		bit = ll;
 		if (bit < 24)
 			bit = 24;
@@ -2254,15 +2315,18 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	 * mft record that we can allocate.
 	 * Note: The smallest mft record we allocate is mft record 24.
 	 */
-	bit = mftbmp_ni->initialized_size << 3;
+	bit = old_data_initialized << 3;
 	if (unlikely(bit >= (1ll << 32)))
 		goto max_err_out;
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = mftbmp_ni->allocated_size;
 	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
 			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)vol->mftbmp_ino->i_size,
-			(long long)mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) {
+			(long long)old_data_size,
+			(long long)i_size_read(vol->mftbmp_ino),
+			(long long)old_data_initialized);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized + 8 > old_data_size) {
 		/* Need to extend bitmap by one more cluster. */
 		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
 		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
@@ -2270,12 +2334,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 			up_write(&vol->mftbmp_lock);
 			goto err_out;
 		}
+#ifdef DEBUG
+		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 		ntfs_debug("Status of mftbmp after allocation extension: "
 				"allocated_size 0x%llx, data_size 0x%llx, "
 				"initialized_size 0x%llx.",
 				(long long)mftbmp_ni->allocated_size,
-				(long long)vol->mftbmp_ino->i_size,
+				(long long)i_size_read(vol->mftbmp_ino),
 				(long long)mftbmp_ni->initialized_size);
+		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 	}
 	/*
 	 * We now have sufficient allocated space, extend the initialized_size
@@ -2287,12 +2355,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 		up_write(&vol->mftbmp_lock);
 		goto err_out;
 	}
-	ntfs_debug("Status of mftbmp after initialized extention: "
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ntfs_debug("Status of mftbmp after initialized extension: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
 			(long long)mftbmp_ni->allocated_size,
-			(long long)vol->mftbmp_ino->i_size,
+			(long long)i_size_read(vol->mftbmp_ino),
 			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
 found_free_rec:
 	/* @bit is the found free mft record, allocate it in the mft bitmap. */
@@ -2314,7 +2386,10 @@ have_alloc_rec:
 	 * parallel allocation could allocate the same mft record as this one.
 	 */
 	ll = (bit + 1) << vol->mft_record_size_bits;
-	if (ll <= mft_ni->initialized_size) {
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	old_data_initialized = mft_ni->initialized_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (ll <= old_data_initialized) {
 		ntfs_debug("Allocated mft record already initialized.");
 		goto mft_rec_already_initialized;
 	}
@@ -2325,26 +2400,30 @@ have_alloc_rec:
 	 * actually traversed more than once when a freshly formatted volume is
 	 * first written to so it optimizes away nicely in the common case.
 	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ntfs_debug("Status of mft data before extension: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
 			(long long)mft_ni->allocated_size,
-			(long long)vol->mft_ino->i_size,
+			(long long)i_size_read(vol->mft_ino),
 			(long long)mft_ni->initialized_size);
 	while (ll > mft_ni->allocated_size) {
+		read_unlock_irqrestore(&mft_ni->size_lock, flags);
 		err = ntfs_mft_data_extend_allocation_nolock(vol);
 		if (unlikely(err)) {
 			ntfs_error(vol->sb, "Failed to extend mft data "
 					"allocation.");
 			goto undo_mftbmp_alloc_nolock;
 		}
+		read_lock_irqsave(&mft_ni->size_lock, flags);
 		ntfs_debug("Status of mft data after allocation extension: "
 				"allocated_size 0x%llx, data_size 0x%llx, "
 				"initialized_size 0x%llx.",
 				(long long)mft_ni->allocated_size,
-				(long long)vol->mft_ino->i_size,
+				(long long)i_size_read(vol->mft_ino),
 				(long long)mft_ni->initialized_size);
 	}
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/*
 	 * Extend mft data initialized size (and data size of course) to reach
 	 * the allocated mft record, formatting the mft records allong the way.
@@ -2352,6 +2431,7 @@ have_alloc_rec:
 	 * needed by ntfs_mft_record_format().  We will update the attribute
 	 * record itself in one fell swoop later on.
 	 */
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	old_data_initialized = mft_ni->initialized_size;
 	old_data_size = vol->mft_ino->i_size;
 	while (ll > mft_ni->initialized_size) {
@@ -2360,8 +2440,9 @@ have_alloc_rec:
 		new_initialized_size = mft_ni->initialized_size +
 				vol->mft_record_size;
 		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
-		if (new_initialized_size > vol->mft_ino->i_size)
-			vol->mft_ino->i_size = new_initialized_size;
+		if (new_initialized_size > i_size_read(vol->mft_ino))
+			i_size_write(vol->mft_ino, new_initialized_size);
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
 		ntfs_debug("Initializing mft record 0x%llx.",
 				(long long)mft_no);
 		err = ntfs_mft_record_format(vol, mft_no);
@@ -2369,9 +2450,11 @@ have_alloc_rec:
 			ntfs_error(vol->sb, "Failed to format mft record.");
 			goto undo_data_init;
 		}
+		write_lock_irqsave(&mft_ni->size_lock, flags);
 		mft_ni->initialized_size = new_initialized_size;
 	}
-	record_formatted = TRUE;
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
+	record_formatted = true;
 	/* Update the mft data attribute record to reflect the new sizes. */
 	m = map_mft_record(mft_ni);
 	if (IS_ERR(m)) {
@@ -2396,22 +2479,27 @@ have_alloc_rec:
 		goto undo_data_init;
 	}
 	a = ctx->attr;
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(mft_ni->initialized_size);
-	a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size);
+	a->data.non_resident.data_size =
+			cpu_to_sle64(i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(mft_ni);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ntfs_debug("Status of mft data after mft record initialization: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
 			(long long)mft_ni->allocated_size,
-			(long long)vol->mft_ino->i_size,
+			(long long)i_size_read(vol->mft_ino),
 			(long long)mft_ni->initialized_size);
-	BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size);
-	BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size);
+	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
+	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 mft_rec_already_initialized:
 	/*
 	 * We can finally drop the mft bitmap lock as the mft data attribute
@@ -2431,7 +2519,7 @@ mft_rec_already_initialized:
 	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 	/* Read, map, and pin the page containing the mft record. */
 	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
-	if (unlikely(IS_ERR(page))) {
+	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to map page containing allocated "
 				"mft record 0x%llx.", (long long)bit);
 		err = PTR_ERR(page);
@@ -2488,6 +2576,8 @@ mft_rec_already_initialized:
 	flush_dcache_page(page);
 	SetPageUptodate(page);
 	if (base_ni) {
+		MFT_RECORD *m_tmp;
+
 		/*
 		 * Setup the base mft record in the extent mft record.  This
 		 * completes initialization of the allocated extent mft record
@@ -2500,11 +2590,11 @@ mft_rec_already_initialized:
 		 * attach it to the base inode @base_ni and map, pin, and lock
 		 * its, i.e. the allocated, mft record.
 		 */
-		m = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m)) {
+		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
+		if (IS_ERR(m_tmp)) {
 			ntfs_error(vol->sb, "Failed to map allocated extent "
 					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m);
+			err = PTR_ERR(m_tmp);
 			/* Set the mft record itself not in use. */
 			m->flags &= cpu_to_le16(
 					~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2515,6 +2605,7 @@ mft_rec_already_initialized:
 			ntfs_unmap_page(page);
 			goto undo_mftbmp_alloc;
 		}
+		BUG_ON(m != m_tmp);
 		/*
 		 * Make sure the allocated mft record is written out to disk.
 		 * No need to set the inode dirty because the caller is going
@@ -2550,11 +2641,6 @@ mft_rec_already_initialized:
 		}
 		vi->i_ino = bit;
 		/*
-		 * This is the optimal IO size (for stat), not the fs block
-		 * size.
-		 */
-		vi->i_blksize = PAGE_CACHE_SIZE;
-		/*
 		 * This is for checking whether an inode has changed w.r.t. a
 		 * file so that the file can be updated if necessary (compare
 		 * with f_version).
@@ -2582,7 +2668,7 @@ mft_rec_already_initialized:
 			ni->name_len = 4;
 
 			ni->itype.index.block_size = 4096;
-			ni->itype.index.block_size_bits = generic_ffs(4096) - 1;
+			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
 			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
 			if (vol->cluster_size <= ni->itype.index.block_size) {
 				ni->itype.index.vcn_size = vol->cluster_size;
@@ -2621,7 +2707,7 @@ mft_rec_already_initialized:
 		 * have its page mapped and it is very easy to do.
 		 */
 		atomic_inc(&ni->count);
-		down(&ni->mrec_lock);
+		mutex_lock(&ni->mrec_lock);
 		ni->page = page;
 		ni->page_ofs = ofs;
 		/*
@@ -2652,8 +2738,10 @@ mft_rec_already_initialized:
 	*mrec = m;
 	return ni;
 undo_data_init:
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	mft_ni->initialized_size = old_data_initialized;
-	vol->mft_ino->i_size = old_data_size;
+	i_size_write(vol->mft_ino, old_data_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
 	goto undo_mftbmp_alloc_nolock;
 undo_mftbmp_alloc:
 	down_write(&vol->mftbmp_lock);
@@ -2706,22 +2794,22 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
 	BUG_ON(NInoAttr(ni));
 	BUG_ON(ni->nr_extents != -1);
 
-	down(&ni->extent_lock);
+	mutex_lock(&ni->extent_lock);
 	base_ni = ni->ext.base_ntfs_ino;
-	up(&ni->extent_lock);
+	mutex_unlock(&ni->extent_lock);
 
 	BUG_ON(base_ni->nr_extents <= 0);
 
 	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
 			mft_no, base_ni->mft_no);
 
-	down(&base_ni->extent_lock);
+	mutex_lock(&base_ni->extent_lock);
 
 	/* Make sure we are holding the only reference to the extent inode. */
 	if (atomic_read(&ni->count) > 2) {
 		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
 				"not freeing.", base_ni->mft_no);
-		up(&base_ni->extent_lock);
+		mutex_unlock(&base_ni->extent_lock);
 		return -EBUSY;
 	}
 
@@ -2739,7 +2827,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
 		break;
 	}
 
-	up(&base_ni->extent_lock);
+	mutex_unlock(&base_ni->extent_lock);
 
 	if (unlikely(err)) {
 		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
@@ -2754,7 +2842,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
 	 */
 
 	/* Mark the mft record as not in use. */
-	m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+	m->flags &= ~MFT_RECORD_IN_USE;
 
 	/* Increment the sequence number, skipping zero, if it is not zero. */
 	old_seq_no = m->sequence_number;
@@ -2798,16 +2886,16 @@ rollback_error:
 	return 0;
 rollback:
 	/* Rollback what we did... */
-	down(&base_ni->extent_lock);
+	mutex_lock(&base_ni->extent_lock);
 	extent_nis = base_ni->ext.extent_ntfs_inos;
 	if (!(base_ni->nr_extents & 3)) {
 		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
 
-		extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS);
+		extent_nis = kmalloc(new_size, GFP_NOFS);
 		if (unlikely(!extent_nis)) {
 			ntfs_error(vol->sb, "Failed to allocate internal "
 					"buffer during rollback.%s", es);
-			up(&base_ni->extent_lock);
+			mutex_unlock(&base_ni->extent_lock);
 			NVolSetErrors(vol);
 			goto rollback_error;
 		}
@@ -2822,7 +2910,7 @@ rollback:
 	m->flags |= MFT_RECORD_IN_USE;
 	m->sequence_number = old_seq_no;
 	extent_nis[base_ni->nr_extents++] = ni;
-	up(&base_ni->extent_lock);
+	mutex_unlock(&base_ni->extent_lock);
 	mark_mft_record_dirty(ni);
 	return err;
 }
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index 407de2cef1d..b52bf87b99d 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -97,10 +97,7 @@ extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
  * uptodate.
  *
  * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.  The caller should call make_bad_inode() on
- * the base inode to ensure no more access happens to this inode.  We do not do
- * it here as the caller may want to finish writing other extent mft records
- * first to minimize on-disk metadata inconsistencies.
+ * record dirty and return -errno.
  */
 static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
 {
@@ -114,7 +111,7 @@ static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	return err;
 }
 
-extern BOOL ntfs_may_write_mft_record(ntfs_volume *vol,
+extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
 		const unsigned long mft_no, const MFT_RECORD *m,
 		ntfs_inode **locked_ni);
 
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 7c7e13b43b2..436f36037e0 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -2,7 +2,7 @@
  * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
  *	     project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -21,7 +21,9 @@
  */
 
 #include <linux/dcache.h>
+#include <linux/exportfs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "debug.h"
@@ -96,10 +98,10 @@
  *    name. We then convert the name to the current NLS code page, and proceed
  *    searching for a dentry with this name, etc, as in case 2), above.
  *
- * Locking: Caller must hold i_sem on the directory.
+ * Locking: Caller must hold i_mutex on the directory.
  */
 static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
-		struct nameidata *nd)
+		unsigned int flags)
 {
 	ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
 	struct inode *dent_inode;
@@ -115,7 +117,9 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 	uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
 			&uname);
 	if (uname_len < 0) {
-		ntfs_error(vol->sb, "Failed to convert name to Unicode.");
+		if (uname_len != -ENAMETOOLONG)
+			ntfs_error(vol->sb, "Failed to convert name to "
+					"Unicode.");
 		return ERR_PTR(uname_len);
 	}
 	mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
@@ -153,12 +157,11 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 			ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
 					"error code %li.", dent_ino,
 					PTR_ERR(dent_inode));
-		if (name)
-			kfree(name);
+		kfree(name);
 		/* Return the error code. */
 		return (struct dentry *)dent_inode;
 	}
-	/* It is guaranteed that name is no longer allocated at this point. */
+	/* It is guaranteed that @name is no longer allocated at this point. */
 	if (MREF_ERR(mref) == -ENOENT) {
 		ntfs_debug("Entry was not found, adding negative dentry.");
 		/* The dcache will handle negative entries. */
@@ -169,11 +172,9 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 	ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
 			"code %i.", -MREF_ERR(mref));
 	return ERR_PTR(MREF_ERR(mref));
-
 	// TODO: Consider moving this lot to a separate function! (AIA)
 handle_name:
    {
-	struct dentry *real_dent, *new_dent;
 	MFT_RECORD *m;
 	ntfs_attr_search_ctx *ctx;
 	ntfs_inode *ni = NTFS_I(dent_inode);
@@ -254,93 +255,9 @@ handle_name:
 	}
 	nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
 
-	/*
-	 * Note: No need for dent->d_lock lock as i_sem is held on the
-	 * parent inode.
-	 */
-
-	/* Does a dentry matching the nls_name exist already? */
-	real_dent = d_lookup(dent->d_parent, &nls_name);
-	/* If not, create it now. */
-	if (!real_dent) {
-		real_dent = d_alloc(dent->d_parent, &nls_name);
-		kfree(nls_name.name);
-		if (!real_dent) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		new_dent = d_splice_alias(dent_inode, real_dent);
-		if (new_dent)
-			dput(real_dent);
-		else
-			new_dent = real_dent;
-		ntfs_debug("Done.  (Created new dentry.)");
-		return new_dent;
-	}
+	dent = d_add_ci(dent, dent_inode, &nls_name);
 	kfree(nls_name.name);
-	/* Matching dentry exists, check if it is negative. */
-	if (real_dent->d_inode) {
-		if (unlikely(real_dent->d_inode != dent_inode)) {
-			/* This can happen because bad inodes are unhashed. */
-			BUG_ON(!is_bad_inode(dent_inode));
-			BUG_ON(!is_bad_inode(real_dent->d_inode));
-		}
-		/*
-		 * Already have the inode and the dentry attached, decrement
-		 * the reference count to balance the ntfs_iget() we did
-		 * earlier on.  We found the dentry using d_lookup() so it
-		 * cannot be disconnected and thus we do not need to worry
-		 * about any NFS/disconnectedness issues here.
-		 */
-		iput(dent_inode);
-		ntfs_debug("Done.  (Already had inode and dentry.)");
-		return real_dent;
-	}
-	/*
-	 * Negative dentry: instantiate it unless the inode is a directory and
-	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
-	 * in which case d_move() that in place of the found dentry.
-	 */
-	if (!S_ISDIR(dent_inode->i_mode)) {
-		/* Not a directory; everything is easy. */
-		d_instantiate(real_dent, dent_inode);
-		ntfs_debug("Done.  (Already had negative file dentry.)");
-		return real_dent;
-	}
-	spin_lock(&dcache_lock);
-	if (list_empty(&dent_inode->i_dentry)) {
-		/*
-		 * Directory without a 'disconnected' dentry; we need to do
-		 * d_instantiate() by hand because it takes dcache_lock which
-		 * we already hold.
-		 */
-		list_add(&real_dent->d_alias, &dent_inode->i_dentry);
-		real_dent->d_inode = dent_inode;
-		spin_unlock(&dcache_lock);
-		security_d_instantiate(real_dent, dent_inode);
-		ntfs_debug("Done.  (Already had negative directory dentry.)");
-		return real_dent;
-	}
-	/*
-	 * Directory with a 'disconnected' dentry; get a reference to the
-	 * 'disconnected' dentry.
-	 */
-	new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
-			d_alias);
-	dget_locked(new_dent);
-	spin_unlock(&dcache_lock);
-	/* Do security vodoo. */
-	security_d_instantiate(real_dent, dent_inode);
-	/* Move new_dent in place of real_dent. */
-	d_move(new_dent, real_dent);
-	/* Balance the ntfs_iget() we did above. */
-	iput(dent_inode);
-	/* Throw away real_dent. */
-	dput(real_dent);
-	/* Use new_dent as the actual dentry. */
-	ntfs_debug("Done.  (Already had negative, disconnected directory "
-			"dentry.)");
-	return new_dent;
+	return dent;
 
 eio_err_out:
 	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
@@ -359,7 +276,7 @@ err_out:
 /**
  * Inode operations for directories.
  */
-struct inode_operations ntfs_dir_inode_ops = {
+const struct inode_operations ntfs_dir_inode_ops = {
 	.lookup	= ntfs_lookup,	/* VFS: Lookup directory. */
 };
 
@@ -375,12 +292,12 @@ struct inode_operations ntfs_dir_inode_ops = {
  * The code is based on the ext3 ->get_parent() implementation found in
  * fs/ext3/namei.c::ext3_get_parent().
  *
- * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down.
+ * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down.
  *
  * Return the dentry of the parent directory on success or the error code on
  * error (IS_ERR() is true).
  */
-struct dentry *ntfs_get_parent(struct dentry *child_dent)
+static struct dentry *ntfs_get_parent(struct dentry *child_dent)
 {
 	struct inode *vi = child_dent->d_inode;
 	ntfs_inode *ni = NTFS_I(vi);
@@ -388,8 +305,6 @@ struct dentry *ntfs_get_parent(struct dentry *child_dent)
 	ntfs_attr_search_ctx *ctx;
 	ATTR_RECORD *attr;
 	FILE_NAME_ATTR *fn;
-	struct inode *parent_vi;
-	struct dentry *parent_dent;
 	unsigned long parent_ino;
 	int err;
 
@@ -429,70 +344,62 @@ try_next:
 	/* Release the search context and the mft record of the child. */
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(ni);
-	/* Get the inode of the parent directory. */
-	parent_vi = ntfs_iget(vi->i_sb, parent_ino);
-	if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) {
-		if (!IS_ERR(parent_vi))
-			iput(parent_vi);
-		ntfs_error(vi->i_sb, "Failed to get parent directory inode "
-				"0x%lx of child inode 0x%lx.", parent_ino,
-				vi->i_ino);
-		return ERR_PTR(-EACCES);
-	}
-	/* Finally get a dentry for the parent directory and return it. */
-	parent_dent = d_alloc_anon(parent_vi);
-	if (unlikely(!parent_dent)) {
-		iput(parent_vi);
-		return ERR_PTR(-ENOMEM);
+
+	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
+}
+
+static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	inode = ntfs_iget(sb, ino);
+	if (!IS_ERR(inode)) {
+		if (is_bad_inode(inode) || inode->i_generation != generation) {
+			iput(inode);
+			inode = ERR_PTR(-ESTALE);
+		}
 	}
-	ntfs_debug("Done for inode 0x%lx.", vi->i_ino);
-	return parent_dent;
+
+	return inode;
+}
+
+static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ntfs_nfs_get_inode);
+}
+
+static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+		int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ntfs_nfs_get_inode);
 }
 
 /**
- * ntfs_get_dentry - find a dentry for the inode from a file handle sub-fragment
- * @sb:		super block identifying the mounted ntfs volume
- * @fh:		the file handle sub-fragment
+ * Export operations allowing NFS exporting of mounted NTFS partitions.
  *
- * Find a dentry for the inode given a file handle sub-fragment.  This function
- * is called from fs/exportfs/expfs.c::find_exported_dentry() which in turn is
- * called from the default ->decode_fh() which is export_decode_fh() in the
- * same file.  The code is closely based on the default ->get_dentry() helper
- * fs/exportfs/expfs.c::get_object().
+ * We use the default ->encode_fh() for now.  Note that they
+ * use 32 bits to store the inode number which is an unsigned long so on 64-bit
+ * architectures is usually 64 bits so it would all fail horribly on huge
+ * volumes.  I guess we need to define our own encode and decode fh functions
+ * that store 64-bit inode numbers at some point but for now we will ignore the
+ * problem...
  *
- * The @fh contains two 32-bit unsigned values, the first one is the inode
- * number and the second one is the inode generation.
+ * We also use the default ->get_name() helper (used by ->decode_fh() via
+ * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
+ * independent.
  *
- * Return the dentry on success or the error code on error (IS_ERR() is true).
+ * The default ->get_parent() just returns -EACCES so we have to provide our
+ * own and the default ->get_dentry() is incompatible with NTFS due to not
+ * allowing the inode number 0 which is used in NTFS for the system file $MFT
+ * and due to using iget() whereas NTFS needs ntfs_iget().
  */
-struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh)
-{
-	struct inode *vi;
-	struct dentry *dent;
-	unsigned long ino = ((u32 *)fh)[0];
-	u32 gen = ((u32 *)fh)[1];
-
-	ntfs_debug("Entering for inode 0x%lx, generation 0x%x.", ino, gen);
-	vi = ntfs_iget(sb, ino);
-	if (IS_ERR(vi)) {
-		ntfs_error(sb, "Failed to get inode 0x%lx.", ino);
-		return (struct dentry *)vi;
-	}
-	if (unlikely(is_bad_inode(vi) || vi->i_generation != gen)) {
-		/* We didn't find the right inode. */
-		ntfs_error(sb, "Inode 0x%lx, bad count: %d %d or version 0x%x "
-				"0x%x.", vi->i_ino, vi->i_nlink,
-				atomic_read(&vi->i_count), vi->i_generation,
-				gen);
-		iput(vi);
-		return ERR_PTR(-ESTALE);
-	}
-	/* Now find a dentry.  If possible, get a well-connected one. */
-	dent = d_alloc_anon(vi);
-	if (unlikely(!dent)) {
-		iput(vi);
-		return ERR_PTR(-ENOMEM);
-	}
-	ntfs_debug("Done for inode 0x%lx, generation 0x%x.", ino, gen);
-	return dent;
-}
+const struct export_operations ntfs_export_ops = {
+	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
+						   directory. */
+	.fh_to_dentry	= ntfs_fh_to_dentry,
+	.fh_to_parent	= ntfs_fh_to_parent,
+};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index 720ffb71bab..d6a340bf80f 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -2,7 +2,7 @@
  * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS
  *	    project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (C) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -31,6 +31,7 @@
 #include <linux/fs.h>
 #include <linux/nls.h>
 #include <linux/smp.h>
+#include <linux/pagemap.h>
 
 #include "types.h"
 #include "volume.h"
@@ -41,29 +42,34 @@ typedef enum {
 	NTFS_BLOCK_SIZE_BITS	= 9,
 	NTFS_SB_MAGIC		= 0x5346544e,	/* 'NTFS' */
 	NTFS_MAX_NAME_LEN	= 255,
+	NTFS_MAX_ATTR_NAME_LEN	= 255,
+	NTFS_MAX_CLUSTER_SIZE	= 64 * 1024,	/* 64kiB */
+	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE,
 } NTFS_CONSTANTS;
 
 /* Global variables. */
 
 /* Slab caches (from super.c). */
-extern kmem_cache_t *ntfs_name_cache;
-extern kmem_cache_t *ntfs_inode_cache;
-extern kmem_cache_t *ntfs_big_inode_cache;
-extern kmem_cache_t *ntfs_attr_ctx_cache;
-extern kmem_cache_t *ntfs_index_ctx_cache;
+extern struct kmem_cache *ntfs_name_cache;
+extern struct kmem_cache *ntfs_inode_cache;
+extern struct kmem_cache *ntfs_big_inode_cache;
+extern struct kmem_cache *ntfs_attr_ctx_cache;
+extern struct kmem_cache *ntfs_index_ctx_cache;
 
 /* The various operations structs defined throughout the driver files. */
-extern struct address_space_operations ntfs_aops;
-extern struct address_space_operations ntfs_mst_aops;
+extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_mst_aops;
 
-extern struct  file_operations ntfs_file_ops;
-extern struct inode_operations ntfs_file_inode_ops;
+extern const struct  file_operations ntfs_file_ops;
+extern const struct inode_operations ntfs_file_inode_ops;
 
-extern struct  file_operations ntfs_dir_ops;
-extern struct inode_operations ntfs_dir_inode_ops;
+extern const struct  file_operations ntfs_dir_ops;
+extern const struct inode_operations ntfs_dir_inode_ops;
 
-extern struct  file_operations ntfs_empty_file_ops;
-extern struct inode_operations ntfs_empty_inode_ops;
+extern const struct  file_operations ntfs_empty_file_ops;
+extern const struct inode_operations ntfs_empty_inode_ops;
+
+extern const struct export_operations ntfs_export_ops;
 
 /**
  * NTFS_SB - return the ntfs volume given a vfs super block
@@ -85,7 +91,7 @@ extern void free_compression_buffers(void);
 
 /* From fs/ntfs/super.c */
 #define default_upcase_len 0x10000
-extern struct semaphore ntfs_lock;
+extern struct mutex ntfs_lock;
 
 typedef struct {
 	int val;
@@ -99,7 +105,7 @@ extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
 extern void post_write_mst_fixup(NTFS_RECORD *b);
 
 /* From fs/ntfs/unistr.c */
-extern BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
 		const ntfschar *s2, size_t s2_len,
 		const IGNORE_CASE_BOOL ic,
 		const ntfschar *upcase, const u32 upcase_size);
@@ -126,4 +132,33 @@ extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
 /* From fs/ntfs/upcase.c */
 extern ntfschar *generate_default_upcase(void);
 
+static inline int ntfs_ffs(int x)
+{
+	int r = 1;
+
+	if (!x)
+		return 0;
+	if (!(x & 0xffff)) {
+		x >>= 16;
+		r += 16;
+	}
+	if (!(x & 0xff)) {
+		x >>= 8;
+		r += 8;
+	}
+	if (!(x & 0xf)) {
+		x >>= 4;
+		r += 4;
+	}
+	if (!(x & 3)) {
+		x >>= 2;
+		r += 2;
+	}
+	if (!(x & 1)) {
+		x >>= 1;
+		r += 1;
+	}
+	return r;
+}
+
 #endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index 833df2a4e9f..d80e3315cab 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -31,10 +31,10 @@
  * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
  * @vol:	ntfs volume on which to mark the quotas out of date
  *
- * Mark the quotas out of date on the ntfs volume @vol and return TRUE on
- * success and FALSE on error.
+ * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
+ * success and 'false' on error.
  */
-BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
+bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 {
 	ntfs_index_context *ictx;
 	QUOTA_CONTROL_ENTRY *qce;
@@ -46,9 +46,9 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 		goto done;
 	if (!vol->quota_ino || !vol->quota_q_ino) {
 		ntfs_error(vol->sb, "Quota inodes are not open.");
-		return FALSE;
+		return false;
 	}
-	down(&vol->quota_q_ino->i_sem);
+	mutex_lock(&vol->quota_q_ino->i_mutex);
 	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
 	if (!ictx) {
 		ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 	ntfs_index_entry_mark_dirty(ictx);
 set_done:
 	ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
 	/*
 	 * We set the flag so we do not try to mark the quotas out of date
 	 * again on remount.
@@ -106,12 +106,12 @@ set_done:
 	NVolSetQuotaOutOfDate(vol);
 done:
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
 	if (ictx)
 		ntfs_index_ctx_put(ictx);
-	up(&vol->quota_q_ino->i_sem);
-	return FALSE;
+	mutex_unlock(&vol->quota_q_ino->i_mutex);
+	return false;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
index 40e4763aa22..4cbe5594c0b 100644
--- a/fs/ntfs/quota.h
+++ b/fs/ntfs/quota.h
@@ -28,7 +28,7 @@
 #include "types.h"
 #include "volume.h"
 
-extern BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
+extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
 
 #endif /* NTFS_RW */
 
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 8438fb1da21..eac7d6788a1 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1,8 +1,8 @@
 /**
  * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002-2005 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -35,7 +35,7 @@ static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
 		int size)
 {
 	if (likely((dst != src) && (size > 0)))
-		memmove(base + dst, base + src, size * sizeof (*base));
+		memmove(base + dst, base + src, size * sizeof(*base));
 }
 
 /**
@@ -59,7 +59,7 @@ static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
  *
  * As the runlists grow, more memory will be required.  To prevent the
  * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns and entire page of memory.
+ * memory, this function returns an entire page of memory.
  *
  * It is up to the caller to serialize access to the runlist @rl.
  *
@@ -95,6 +95,51 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
 }
 
 /**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl:		original runlist
+ * @old_size:	number of runlist elements in the original runlist @rl
+ * @new_size:	number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required.  To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed.  It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B.  If the new allocation doesn't require a different number of pages in
+ *       memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ *	-ENOMEM	- Not enough memory to allocate runlist array.
+ *	-EINVAL	- Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+		int old_size, int new_size)
+{
+	runlist_element *new_rl;
+
+	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+	if (old_size == new_size)
+		return rl;
+
+	new_rl = ntfs_malloc_nofs_nofail(new_size);
+	BUG_ON(!new_rl);
+
+	if (likely(rl != NULL)) {
+		if (unlikely(old_size > new_size))
+			old_size = new_size;
+		memcpy(new_rl, rl, old_size);
+		ntfs_free(rl);
+	}
+	return new_rl;
+}
+
+/**
  * ntfs_are_rl_mergeable - test if two runlists can be joined together
  * @dst:	original runlist
  * @src:	new runlist to test for mergeability with @dst
@@ -104,23 +149,30 @@ static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
  *
  * It is up to the caller to serialize access to the runlists @dst and @src.
  *
- * Return: TRUE   Success, the runlists can be merged.
- *	   FALSE  Failure, the runlists cannot be merged.
+ * Return: true   Success, the runlists can be merged.
+ *	   false  Failure, the runlists cannot be merged.
  */
-static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
+static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
 		runlist_element *src)
 {
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	if ((dst->lcn < 0) || (src->lcn < 0))     /* Are we merging holes? */
-		return FALSE;
-	if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */
-		return FALSE;
-	if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */
-		return FALSE;
-
-	return TRUE;
+	/* We can merge unmapped regions even if they are misaligned. */
+	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
+		return true;
+	/* If the runs are misaligned, we cannot merge them. */
+	if ((dst->vcn + dst->length) != src->vcn)
+		return false;
+	/* If both runs are non-sparse and contiguous, we can merge them. */
+	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
+			((dst->lcn + dst->length) == src->lcn))
+		return true;
+	/* If we are merging two holes, we can merge them. */
+	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
+		return true;
+	/* Cannot merge. */
+	return false;
 }
 
 /**
@@ -166,14 +218,15 @@ static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
 static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL right;
-	int magic;
+	bool right = false;	/* Right end of @src needs merging. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
 	/* First, check if the right hand end needs merging. */
-	right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
 
 	/* Space required: @dst size + @src size, less one if we merged. */
 	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
@@ -188,18 +241,19 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 	if (right)
 		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
 
-	magic = loc + ssize;
+	/* First run after the @src runs that have been inserted. */
+	marker = loc + ssize + 1;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
+	ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
 	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
 
 	/* Adjust the size of the preceding hole. */
 	dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
 
 	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[magic + 1].lcn == LCN_ENOENT)
-		dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
+	if (dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 
 	return dst;
 }
@@ -231,18 +285,17 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL left = FALSE;
-	BOOL disc = FALSE;	/* Discontinuity */
-	BOOL hole = FALSE;	/* Following a hole */
-	int magic;
+	bool left = false;	/* Left end of @src needs merging. */
+	bool disc = false;	/* Discontinuity between @dst and @src. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	/* disc => Discontinuity between the end of @dst and the start of @src.
-	 *	   This means we might need to insert a hole.
-	 * hole => @dst ends with a hole or an unmapped region which we can
-	 *	   extend to match the discontinuity. */
+	/*
+	 * disc => Discontinuity between the end of @dst and the start of @src.
+	 *	   This means we might need to insert a "not mapped" run.
+	 */
 	if (loc == 0)
 		disc = (src[0].vcn > 0);
 	else {
@@ -255,58 +308,49 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 			merged_length += src->length;
 
 		disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
-		if (disc)
-			hole = (dst[loc - 1].lcn == LCN_HOLE);
 	}
-
-	/* Space required: @dst size + @src size, less one if we merged, plus
-	 * one if there was a discontinuity, less one for a trailing hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
+	/*
+	 * Space required: @dst size + @src size, less one if we merged, plus
+	 * one if there was a discontinuity.
+	 */
+	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
 	if (IS_ERR(dst))
 		return dst;
 	/*
 	 * We are guaranteed to succeed from here so can start modifying the
 	 * original runlist.
 	 */
-
 	if (left)
 		__ntfs_rl_merge(dst + loc - 1, src);
-
-	magic = loc + ssize - left + disc - hole;
+	/*
+	 * First run after the @src runs that have been inserted.
+	 * Nominally,  @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.  And if @disc, then @dst and @src do
+	 * not meet and we need an extra run to fill the gap.
+	 */
+	marker = loc + ssize - left + disc;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
+	ntfs_rl_mm(dst, marker, loc, dsize - loc);
+	ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
 
-	/* Adjust the VCN of the last run ... */
-	if (dst[magic].lcn <= LCN_HOLE)
-		dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+	/* Adjust the VCN of the first run after the insertion... */
+	dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 	/* ... and the length. */
-	if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
-		dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
+	if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
+		dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
 
-	/* Writing beyond the end of the file and there's a discontinuity. */
+	/* Writing beyond the end of the file and there is a discontinuity. */
 	if (disc) {
-		if (hole)
-			dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
-		else {
-			if (loc > 0) {
-				dst[loc].vcn = dst[loc - 1].vcn +
-						dst[loc - 1].length;
-				dst[loc].length = dst[loc + 1].vcn -
-						dst[loc].vcn;
-			} else {
-				dst[loc].vcn = 0;
-				dst[loc].length = dst[loc + 1].vcn;
-			}
-			dst[loc].lcn = LCN_RL_NOT_MAPPED;
+		if (loc > 0) {
+			dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
+			dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+		} else {
+			dst[loc].vcn = 0;
+			dst[loc].length = dst[loc + 1].vcn;
 		}
-
-		magic += hole;
-
-		if (dst[magic].lcn == LCN_ENOENT)
-			dst[magic].vcn = dst[magic - 1].vcn +
-					dst[magic - 1].length;
+		dst[loc].lcn = LCN_RL_NOT_MAPPED;
 	}
 	return dst;
 }
@@ -337,42 +381,65 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL left = FALSE;
-	BOOL right;
-	int magic;
+	signed delta;
+	bool left = false;	/* Left end of @src needs merging. */
+	bool right = false;	/* Right end of @src needs merging. */
+	int tail;		/* Start of tail of @dst. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	/* First, merge the left and right ends, if necessary. */
-	right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+	/* First, see if the left and right ends need merging. */
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
 	if (loc > 0)
 		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
-	/* Allocate some space. We'll need less if the left, right, or both
-	 * ends were merged. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
-	if (IS_ERR(dst))
-		return dst;
+	/*
+	 * Allocate some space.  We will need less if the left, right, or both
+	 * ends get merged.  The -1 accounts for the run being replaced.
+	 */
+	delta = ssize - 1 - left - right;
+	if (delta > 0) {
+		dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
+		if (IS_ERR(dst))
+			return dst;
+	}
 	/*
 	 * We are guaranteed to succeed from here so can start modifying the
 	 * original runlists.
 	 */
+
+	/* First, merge the left and right ends, if necessary. */
 	if (right)
 		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
 	if (left)
 		__ntfs_rl_merge(dst + loc - 1, src);
-
-	/* FIXME: What does this mean? (AIA) */
-	magic = loc + ssize - left;
+	/*
+	 * Offset of the tail of @dst.  This needs to be moved out of the way
+	 * to make space for the runs to be copied from @src, i.e. the first
+	 * run of the tail of @dst.
+	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
+	 * replaced run.  However, if @right, then one of @dst's runs is
+	 * already merged into @src.
+	 */
+	tail = loc + right + 1;
+	/*
+	 * First run after the @src runs that have been inserted, i.e. where
+	 * the tail of @dst needs to be moved to.
+	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.
+	 */
+	marker = loc + ssize - left;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
+	ntfs_rl_mm(dst, marker, tail, dsize - tail);
 	ntfs_rl_mc(dst, loc, src, left, ssize - left);
 
-	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[magic].lcn == LCN_ENOENT)
-		dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+	/* We may have changed the length of the file, so fix the end marker. */
+	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 	return dst;
 }
 
@@ -494,6 +561,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			/* Scan to the end of the source runlist. */
 			for (dend = 0; likely(drl[dend].length); dend++)
 				;
+			dend++;
 			drl = ntfs_rl_realloc(drl, dend, dend + 1);
 			if (IS_ERR(drl))
 				return drl;
@@ -552,8 +620,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 		;
 
 	{
-	BOOL start;
-	BOOL finish;
+	bool start;
+	bool finish;
 	int ds = dend + 1;		/* Number of elements in drl & srl */
 	int ss = sfinal - sstart + 1;
 
@@ -563,11 +631,11 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
 		  (srl[send - 1].vcn + srl[send - 1].length)));
 
-	/* Or we'll lose an end marker */
-	if (start && finish && (drl[dins].length == 0))
+	/* Or we will lose an end marker. */
+	if (finish && !drl[dins].length)
 		ss++;
 	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
-		finish = FALSE;
+		finish = false;
 #if 0
 	ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
 	ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
@@ -618,11 +686,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
 				/* Add an unmapped runlist element. */
 				if (!slots) {
-					/* FIXME/TODO: We need to have the
-					 * extra memory already! (AIA) */
-					drl = ntfs_rl_realloc(drl, ds, ds + 2);
-					if (!drl)
-						goto critical_error;
+					drl = ntfs_rl_realloc_nofail(drl, ds,
+							ds + 2);
 					slots = 2;
 				}
 				ds++;
@@ -637,13 +702,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 			drl[ds].length = marker_vcn - drl[ds].vcn;
 			/* Finally add the ENOENT terminator. */
 			ds++;
-			if (!slots) {
-				/* FIXME/TODO: We need to have the extra
-				 * memory already! (AIA) */
-				drl = ntfs_rl_realloc(drl, ds, ds + 1);
-				if (!drl)
-					goto critical_error;
-			}
+			if (!slots)
+				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
 			drl[ds].vcn = marker_vcn;
 			drl[ds].lcn = LCN_ENOENT;
 			drl[ds].length = (s64)0;
@@ -656,11 +716,6 @@ finished:
 	ntfs_debug("Merged runlist:");
 	ntfs_debug_dump_runlist(drl);
 	return drl;
-
-critical_error:
-	/* Critical error! We cannot afford to fail here. */
-	ntfs_error(NULL, "Critical error! Not enough memory.");
-	panic("NTFS: Cannot continue.");
 }
 
 /**
@@ -724,6 +779,9 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
 		ntfs_error(vol->sb, "Corrupt attribute.");
 		return ERR_PTR(-EIO);
 	}
+	/* If the mapping pairs array is valid but empty, nothing to do. */
+	if (!vcn && !*buf)
+		return old_rl;
 	/* Current position in runlist array. */
 	rlpos = 0;
 	/* Allocate first page and set current runlist size to one page. */
@@ -855,30 +913,42 @@ mpa_err:
 	if (!attr->data.non_resident.lowest_vcn) {
 		VCN max_cluster;
 
-		max_cluster = (sle64_to_cpu(
+		max_cluster = ((sle64_to_cpu(
 				attr->data.non_resident.allocated_size) +
 				vol->cluster_size - 1) >>
-				vol->cluster_size_bits;
+				vol->cluster_size_bits) - 1;
 		/*
-		 * If there is a difference between the highest_vcn and the
-		 * highest cluster, the runlist is either corrupt or, more
-		 * likely, there are more extents following this one.
+		 * A highest_vcn of zero means this is a single extent
+		 * attribute so simply terminate the runlist with LCN_ENOENT).
 		 */
-		if (deltaxcn < --max_cluster) {
-			ntfs_debug("More extents to follow; deltaxcn = 0x%llx, "
-					"max_cluster = 0x%llx",
-					(unsigned long long)deltaxcn,
-					(unsigned long long)max_cluster);
-			rl[rlpos].vcn = vcn;
-			vcn += rl[rlpos].length = max_cluster - deltaxcn;
-			rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-			rlpos++;
-		} else if (unlikely(deltaxcn > max_cluster)) {
-			ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = "
-					"0x%llx, max_cluster = 0x%llx",
-					(unsigned long long)deltaxcn,
-					(unsigned long long)max_cluster);
-			goto mpa_err;
+		if (deltaxcn) {
+			/*
+			 * If there is a difference between the highest_vcn and
+			 * the highest cluster, the runlist is either corrupt
+			 * or, more likely, there are more extents following
+			 * this one.
+			 */
+			if (deltaxcn < max_cluster) {
+				ntfs_debug("More extents to follow; deltaxcn "
+						"= 0x%llx, max_cluster = "
+						"0x%llx",
+						(unsigned long long)deltaxcn,
+						(unsigned long long)
+						max_cluster);
+				rl[rlpos].vcn = vcn;
+				vcn += rl[rlpos].length = max_cluster -
+						deltaxcn;
+				rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+				rlpos++;
+			} else if (unlikely(deltaxcn > max_cluster)) {
+				ntfs_error(vol->sb, "Corrupt attribute.  "
+						"deltaxcn = 0x%llx, "
+						"max_cluster = 0x%llx",
+						(unsigned long long)deltaxcn,
+						(unsigned long long)
+						max_cluster);
+				goto mpa_err;
+			}
 		}
 		rl[rlpos].lcn = LCN_ENOENT;
 	} else /* Not the base extent. There may be more extents to follow. */
@@ -918,17 +988,18 @@ err_out:
  *
  * It is up to the caller to serialize access to the runlist @rl.
  *
- * Since lcns must be >= 0, we use negative return values with special meaning:
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
  *
- * Return value			Meaning / Description
+ * Return code		Meaning / Description
  * ==================================================
- *  -1 = LCN_HOLE		Hole / not allocated on disk.
- *  -2 = LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
- *				inserted into the runlist yet.
- *  -3 = LCN_ENOENT		There is no such vcn in the attribute.
+ *  LCN_HOLE		Hole / not allocated on disk.
+ *  LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
+ *			inserted into the runlist yet.
+ *  LCN_ENOENT		There is no such vcn in the attribute.
  *
  * Locking: - The caller must have locked the runlist (for reading or writing).
- *	    - This function does not touch the lock.
+ *	    - This function does not touch the lock, nor does it modify the
+ *	      runlist.
  */
 LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
 {
@@ -964,6 +1035,39 @@ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
 	return LCN_ENOENT;
 }
 
+#ifdef NTFS_RW
+
+/**
+ * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
+ * @rl:		runlist to search
+ * @vcn:	vcn to find
+ *
+ * Find the virtual cluster number @vcn in the runlist @rl and return the
+ * address of the runlist element containing the @vcn on success.
+ *
+ * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
+ * the runlist.
+ *
+ * Locking: The runlist must be locked on entry.
+ */
+runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
+{
+	BUG_ON(vcn < 0);
+	if (unlikely(!rl || vcn < rl[0].vcn))
+		return NULL;
+	while (likely(rl->length)) {
+		if (unlikely(vcn < rl[1].vcn)) {
+			if (likely(rl->lcn >= LCN_HOLE))
+				return rl;
+			return NULL;
+		}
+		rl++;
+	}
+	if (likely(rl->lcn == LCN_ENOENT))
+		return rl;
+	return NULL;
+}
+
 /**
  * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
  * @n:		number for which to get the number of bytes for
@@ -999,10 +1103,17 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n)
  * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
  * @vol:	ntfs volume (needed for the ntfs version)
  * @rl:		locked runlist to determine the size of the mapping pairs of
- * @start_vcn:	vcn at which to start the mapping pairs array
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
  *
  * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn.
+ * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
+ * finishing with vcn @last_vcn.
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the size of the
+ * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
+ * and finishing at the end of the runlist is determined.
+ *
  * This for example allows us to allocate a buffer of the right size when
  * building the mapping pairs array.
  *
@@ -1018,34 +1129,50 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n)
  *	    remains locked throughout, and is left locked upon return.
  */
 int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN start_vcn)
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn)
 {
 	LCN prev_lcn;
 	int rls;
+	bool the_end = false;
 
-	BUG_ON(start_vcn < 0);
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
 	if (!rl) {
-		BUG_ON(start_vcn);
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
 		return 1;
 	}
-	/* Skip to runlist element containing @start_vcn. */
-	while (rl->length && start_vcn >= rl[1].vcn)
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
 		rl++;
-	if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
 		return -EINVAL;
 	prev_lcn = 0;
 	/* Always need the termining zero byte. */
 	rls = 1;
 	/* Do the first partial run if present. */
-	if (start_vcn > rl->vcn) {
-		s64 delta;
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
 
 		/* We know rl->length != 0 already. */
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
-		delta = start_vcn - rl->vcn;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		delta = first_vcn - rl->vcn;
 		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(rl->length - delta);
+		rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
 		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
@@ -1053,9 +1180,9 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		 * Note: this assumes that on NTFS 1.2-, holes are stored with
 		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			prev_lcn = rl->lcn;
-			if (rl->lcn >= 0)
+			if (likely(rl->lcn >= 0))
 				prev_lcn += delta;
 			/* Change in lcn. */
 			rls += ntfs_get_nr_significant_bytes(prev_lcn);
@@ -1064,11 +1191,23 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		rl++;
 	}
 	/* Do the full runs. */
-	for (; rl->length; rl++) {
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
 		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(rl->length);
+		rls += 1 + ntfs_get_nr_significant_bytes(length);
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
 		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
@@ -1076,7 +1215,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		 * Note: this assumes that on NTFS 1.2-, holes are stored with
 		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			/* Change in lcn. */
 			rls += ntfs_get_nr_significant_bytes(rl->lcn -
 					prev_lcn);
@@ -1104,7 +1243,7 @@ err_out:
  * write.
  *
  * This is used when building the mapping pairs array of a runlist to compress
- * a given logical cluster number (lcn) or a specific run length to the minumum
+ * a given logical cluster number (lcn) or a specific run length to the minimum
  * size possible.
  *
  * Return the number of bytes written on success.  On error, i.e. the
@@ -1119,7 +1258,7 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
 
 	i = 0;
 	do {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		*dst++ = l & 0xffll;
 		l >>= 8;
@@ -1128,12 +1267,12 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
 	j = (n >> 8 * (i - 1)) & 0xff;
 	/* If the sign bit is wrong, we need an extra byte. */
 	if (n < 0 && j >= 0) {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		i++;
 		*dst = (s8)-1;
 	} else if (n > 0 && j < 0) {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		i++;
 		*dst = (s8)0;
@@ -1149,13 +1288,18 @@ err_out:
  * @dst:	destination buffer to which to write the mapping pairs array
  * @dst_len:	size of destination buffer @dst in bytes
  * @rl:		locked runlist for which to build the mapping pairs array
- * @start_vcn:	vcn at which to start the mapping pairs array
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
  * @stop_vcn:	first vcn outside destination buffer on success or -ENOSPC
  *
  * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @start_vcn and save the array in @dst.  @dst_len is the size of @dst in
- * bytes and it should be at least equal to the value obtained by calling
- * ntfs_get_size_for_mapping_pairs().
+ * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
+ * @dst_len is the size of @dst in bytes and it should be at least equal to the
+ * value obtained by calling ntfs_get_size_for_mapping_pairs().
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
+ * array corresponding to the runlist starting at vcn @first_vcn and finishing
+ * at the end of the runlist is created.
  *
  * If @rl is NULL, just write a single terminator byte to @dst.
  *
@@ -1164,7 +1308,7 @@ err_out:
  * been filled with all the mapping pairs that will fit, thus it can be treated
  * as partial success, in that a new attribute extent needs to be created or
  * the next extent has to be used and the mapping pairs build has to be
- * continued with @start_vcn set to *@stop_vcn.
+ * continued with @first_vcn set to *@stop_vcn.
  *
  * Return 0 on success and -errno on error.  The following error codes are
  * defined:
@@ -1178,27 +1322,32 @@ err_out:
  */
 int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		const int dst_len, const runlist_element *rl,
-		const VCN start_vcn, VCN *const stop_vcn)
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
 {
 	LCN prev_lcn;
 	s8 *dst_max, *dst_next;
 	int err = -ENOSPC;
+	bool the_end = false;
 	s8 len_len, lcn_len;
 
-	BUG_ON(start_vcn < 0);
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
 	BUG_ON(dst_len < 1);
 	if (!rl) {
-		BUG_ON(start_vcn);
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
 		if (stop_vcn)
 			*stop_vcn = 0;
 		/* Terminator byte. */
 		*dst = 0;
 		return 0;
 	}
-	/* Skip to runlist element containing @start_vcn. */
-	while (rl->length && start_vcn >= rl[1].vcn)
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
 		rl++;
-	if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
 		return -EINVAL;
 	/*
 	 * @dst_max is used for bounds checking in
@@ -1207,17 +1356,27 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 	dst_max = dst + dst_len - 1;
 	prev_lcn = 0;
 	/* Do the first partial run if present. */
-	if (start_vcn > rl->vcn) {
-		s64 delta;
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
 
 		/* We know rl->length != 0 already. */
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
-		delta = start_vcn - rl->vcn;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
+		delta = first_vcn - rl->vcn;
 		/* Write length. */
 		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				rl->length - delta);
-		if (len_len < 0)
+				length - delta);
+		if (unlikely(len_len < 0))
 			goto size_err;
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
@@ -1228,19 +1387,19 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		 * case on NT4. - We assume that we just need to write the lcn
 		 * change until someone tells us otherwise... (AIA)
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			prev_lcn = rl->lcn;
-			if (rl->lcn >= 0)
+			if (likely(rl->lcn >= 0))
 				prev_lcn += delta;
 			/* Write change in lcn. */
 			lcn_len = ntfs_write_significant_bytes(dst + 1 +
 					len_len, dst_max, prev_lcn);
-			if (lcn_len < 0)
+			if (unlikely(lcn_len < 0))
 				goto size_err;
 		} else
 			lcn_len = 0;
 		dst_next = dst + len_len + lcn_len + 1;
-		if (dst_next > dst_max)
+		if (unlikely(dst_next > dst_max))
 			goto size_err;
 		/* Update header byte. */
 		*dst = lcn_len << 4 | len_len;
@@ -1250,13 +1409,25 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		rl++;
 	}
 	/* Do the full runs. */
-	for (; rl->length; rl++) {
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = true;
+		}
 		/* Write length. */
 		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				rl->length);
-		if (len_len < 0)
+				length);
+		if (unlikely(len_len < 0))
 			goto size_err;
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
@@ -1267,17 +1438,17 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		 * case on NT4. - We assume that we just need to write the lcn
 		 * change until someone tells us otherwise... (AIA)
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			/* Write change in lcn. */
 			lcn_len = ntfs_write_significant_bytes(dst + 1 +
 					len_len, dst_max, rl->lcn - prev_lcn);
-			if (lcn_len < 0)
+			if (unlikely(lcn_len < 0))
 				goto size_err;
 			prev_lcn = rl->lcn;
 		} else
 			lcn_len = 0;
 		dst_next = dst + len_len + lcn_len + 1;
-		if (dst_next > dst_max)
+		if (unlikely(dst_next > dst_max))
 			goto size_err;
 		/* Update header byte. */
 		*dst = lcn_len << 4 | len_len;
@@ -1303,6 +1474,7 @@ err_out:
 
 /**
  * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol:	ntfs volume (needed for error output)
  * @runlist:	runlist to truncate
  * @new_length:	the new length of the runlist in VCNs
  *
@@ -1310,12 +1482,16 @@ err_out:
  * holding the runlist elements to a length of @new_length VCNs.
  *
  * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.
+ * @new_length and above are discarded.  As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
  *
  * If @new_length lies beyond the runlist, a sparse runlist element is added to
  * the end of the runlist @runlist or if the last runlist element is a sparse
  * one already, this is extended.
  *
+ * Note, no checking is done for unmapped runlist elements.  It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
  * Return 0 on success and -errno on error.
  *
  * Locking: The caller must hold @runlist->lock for writing.
@@ -1330,6 +1506,13 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	BUG_ON(!runlist);
 	BUG_ON(new_length < 0);
 	rl = runlist->rl;
+	if (!new_length) {
+		ntfs_debug("Freeing runlist.");
+		runlist->rl = NULL;
+		if (rl)
+			ntfs_free(rl);
+		return 0;
+	}
 	if (unlikely(!rl)) {
 		/*
 		 * Create a runlist consisting of a sparse runlist element of
@@ -1358,7 +1541,7 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	 */
 	if (rl->length) {
 		runlist_element *trl;
-		BOOL is_end;
+		bool is_end;
 
 		ntfs_debug("Shrinking runlist.");
 		/* Determine the runlist size. */
@@ -1372,11 +1555,11 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 		 * If a run was partially truncated, make the following runlist
 		 * element a terminator.
 		 */
-		is_end = FALSE;
+		is_end = false;
 		if (rl->length) {
 			rl++;
 			if (!rl->length)
-				is_end = TRUE;
+				is_end = true;
 			rl->vcn = new_length;
 			rl->length = 0;
 		}
@@ -1436,3 +1619,289 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	ntfs_debug("Done.");
 	return 0;
 }
+
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol:	ntfs volume (needed for error output)
+ * @runlist:	runlist to punch a hole into
+ * @start:	starting VCN of the hole to be created
+ * @length:	size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length)
+{
+	const VCN end = start + length;
+	s64 delta;
+	runlist_element *rl, *rl_end, *rl_real_end, *trl;
+	int old_size;
+	bool lcn_fixup = false;
+
+	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+			(long long)start, (long long)length);
+	BUG_ON(!runlist);
+	BUG_ON(start < 0);
+	BUG_ON(length < 0);
+	BUG_ON(end < 0);
+	rl = runlist->rl;
+	if (unlikely(!rl)) {
+		if (likely(!start && !length))
+			return 0;
+		return -ENOENT;
+	}
+	/* Find @start in the runlist. */
+	while (likely(rl->length && start >= rl[1].vcn))
+		rl++;
+	rl_end = rl;
+	/* Find @end in the runlist. */
+	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+		/* Verify there are no unmapped or error elements. */
+		if (unlikely(rl_end->lcn < LCN_HOLE))
+			return -EINVAL;
+		rl_end++;
+	}
+	/* Check the last element. */
+	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+		return -EINVAL;
+	/* This covers @start being out of bounds, too. */
+	if (!rl_end->length && end > rl_end->vcn)
+		return -ENOENT;
+	if (!length)
+		return 0;
+	if (!rl->length)
+		return -ENOENT;
+	rl_real_end = rl_end;
+	/* Determine the runlist size. */
+	while (likely(rl_real_end->length))
+		rl_real_end++;
+	old_size = rl_real_end - runlist->rl + 1;
+	/* If @start is in a hole simply extend the hole. */
+	if (rl->lcn == LCN_HOLE) {
+		/*
+		 * If both @start and @end are in the same sparse run, we are
+		 * done.
+		 */
+		if (end <= rl[1].vcn) {
+			ntfs_debug("Done (requested hole is already sparse).");
+			return 0;
+		}
+extend_hole:
+		/* Extend the hole. */
+		rl->length = end - rl->vcn;
+		/* If @end is in a hole, merge it with the current one. */
+		if (rl_end->lcn == LCN_HOLE) {
+			rl_end++;
+			rl->length = rl_end->vcn - rl->vcn;
+		}
+		/* We have done the hole.  Now deal with the remaining tail. */
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Adjust the beginning of the tail if necessary. */
+		if (end > rl->vcn) {
+			delta = end - rl->vcn;
+			rl->vcn = end;
+			rl->length -= delta;
+			/* Only adjust the lcn if it is real. */
+			if (rl->lcn >= 0)
+				rl->lcn += delta;
+		}
+shrink_allocation:
+		/* Reallocate memory if the allocation changed. */
+		if (rl < rl_end) {
+			rl = ntfs_rl_realloc(runlist->rl, old_size,
+					old_size - (rl_end - rl));
+			if (IS_ERR(rl))
+				ntfs_warning(vol->sb, "Failed to shrink "
+						"runlist buffer.  This just "
+						"wastes a bit of memory "
+						"temporarily so we ignore it "
+						"and return success.");
+			else
+				runlist->rl = rl;
+		}
+		ntfs_debug("Done (extend hole).");
+		return 0;
+	}
+	/*
+	 * If @start is at the beginning of a run things are easier as there is
+	 * no need to split the first run.
+	 */
+	if (start == rl->vcn) {
+		/*
+		 * @start is at the beginning of a run.
+		 *
+		 * If the previous run is sparse, extend its hole.
+		 *
+		 * If @end is not in the same run, switch the run to be sparse
+		 * and extend the newly created hole.
+		 *
+		 * Thus both of these cases reduce the problem to the above
+		 * case of "@start is in a hole".
+		 */
+		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+			rl--;
+			goto extend_hole;
+		}
+		if (end >= rl[1].vcn) {
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		/*
+		 * The final case is when @end is in the same run as @start.
+		 * For this need to split the run into two.  One run for the
+		 * sparse region between the beginning of the old run, i.e.
+		 * @start, and @end and one for the remaining non-sparse
+		 * region, i.e. between @end and the end of the old run.
+		 */
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+split_end:
+		/* Shift all the runs up by one. */
+		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+		/* Finally, setup the two split runs. */
+		rl->lcn = LCN_HOLE;
+		rl->length = length;
+		rl++;
+		rl->vcn += length;
+		/* Only adjust the lcn if it is real. */
+		if (rl->lcn >= 0 || lcn_fixup)
+			rl->lcn += length;
+		rl->length -= length;
+		ntfs_debug("Done (split one).");
+		return 0;
+	}
+	/*
+	 * @start is neither in a hole nor at the beginning of a run.
+	 *
+	 * If @end is in a hole, things are easier as simply truncating the run
+	 * @start is in to end at @start - 1, deleting all runs after that up
+	 * to @end, and finally extending the beginning of the run @end is in
+	 * to be @start is all that is needed.
+	 */
+	if (rl_end->lcn == LCN_HOLE) {
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/* Cut out all runlist elements up to @end. */
+		if (rl < rl_end)
+			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+					sizeof(*rl));
+		/* Extend the beginning of the run @end is in to be @start. */
+		rl->vcn = start;
+		rl->length = rl[1].vcn - start;
+		goto shrink_allocation;
+	}
+	/* 
+	 * If @end is not in a hole there are still two cases to distinguish.
+	 * Either @end is or is not in the same run as @start.
+	 *
+	 * The second case is easier as it can be reduced to an already solved
+	 * problem by truncating the run @start is in to end at @start - 1.
+	 * Then, if @end is in the next run need to split the run into a sparse
+	 * run followed by a non-sparse run (already covered above) and if @end
+	 * is not in the next run switching it to be sparse, again reduces the
+	 * problem to the already covered case of "@start is in a hole".
+	 */
+	if (end >= rl[1].vcn) {
+		/*
+		 * If @end is not in the next run, reduce the problem to the
+		 * case of "@start is in a hole".
+		 */
+		if (rl[1].length && end >= rl[2].vcn) {
+			/* Truncate the run containing @start. */
+			rl->length = start - rl->vcn;
+			rl++;
+			rl->vcn = start;
+			rl->lcn = LCN_HOLE;
+			goto extend_hole;
+		}
+		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+		if (IS_ERR(trl))
+			goto enomem_out;
+		old_size++;
+		if (runlist->rl != trl) {
+			rl = trl + (rl - runlist->rl);
+			rl_end = trl + (rl_end - runlist->rl);
+			rl_real_end = trl + (rl_real_end - runlist->rl);
+			runlist->rl = trl;
+		}
+		/* Truncate the run containing @start. */
+		rl->length = start - rl->vcn;
+		rl++;
+		/*
+		 * @end is in the next run, reduce the problem to the case
+		 * where "@start is at the beginning of a run and @end is in
+		 * the same run as @start".
+		 */
+		delta = rl->vcn - start;
+		rl->vcn = start;
+		if (rl->lcn >= 0) {
+			rl->lcn -= delta;
+			/* Need this in case the lcn just became negative. */
+			lcn_fixup = true;
+		}
+		rl->length += delta;
+		goto split_end;
+	}
+	/*
+	 * The first case from above, i.e. @end is in the same run as @start.
+	 * We need to split the run into three.  One run for the non-sparse
+	 * region between the beginning of the old run and @start, one for the
+	 * sparse region between @start and @end, and one for the remaining
+	 * non-sparse region, i.e. between @end and the end of the old run.
+	 */
+	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+	if (IS_ERR(trl))
+		goto enomem_out;
+	old_size += 2;
+	if (runlist->rl != trl) {
+		rl = trl + (rl - runlist->rl);
+		rl_end = trl + (rl_end - runlist->rl);
+		rl_real_end = trl + (rl_real_end - runlist->rl);
+		runlist->rl = trl;
+	}
+	/* Shift all the runs up by two. */
+	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+	/* Finally, setup the three split runs. */
+	rl->length = start - rl->vcn;
+	rl++;
+	rl->vcn = start;
+	rl->lcn = LCN_HOLE;
+	rl->length = length;
+	rl++;
+	delta = end - rl->vcn;
+	rl->vcn = end;
+	rl->lcn += delta;
+	rl->length -= delta;
+	ntfs_debug("Done (split both).");
+	return 0;
+enomem_out:
+	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+	return -ENOMEM;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index 7107fde59df..47728fbb610 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -2,7 +2,7 @@
  * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
  *	       Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -66,6 +66,8 @@ typedef enum {
 	LCN_HOLE		= -1,	/* Keep this as highest value or die! */
 	LCN_RL_NOT_MAPPED	= -2,
 	LCN_ENOENT		= -3,
+	LCN_ENOMEM		= -4,
+	LCN_EIO			= -5,
 } LCN_SPECIAL_VALUES;
 
 extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
@@ -76,14 +78,25 @@ extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
 
 extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
 
+#ifdef NTFS_RW
+
+extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
+		const VCN vcn);
+
 extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN start_vcn);
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn);
 
 extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		const int dst_len, const runlist_element *rl,
-		const VCN start_vcn, VCN *const stop_vcn);
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
 
 extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
 		runlist *const runlist, const s64 new_length);
 
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+		const VCN start, const s64 length);
+
+#endif /* NTFS_RW */
+
 #endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 212a3d0f207..6c3296e546c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
 /*
  * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2001,2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -19,34 +19,39 @@
  * distribution in the file COPYING); if not, write to the Free Software
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/stddef.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_hardsect_size(). */
+#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
-#include <linux/smp_lock.h>
+#include <linux/bitmap.h>
 
 #include "sysctl.h"
 #include "logfile.h"
 #include "quota.h"
+#include "usnjrnl.h"
 #include "dir.h"
 #include "debug.h"
 #include "index.h"
+#include "inode.h"
 #include "aops.h"
+#include "layout.h"
 #include "malloc.h"
 #include "ntfs.h"
 
-/* Number of mounted file systems which have compression enabled. */
+/* Number of mounted filesystems which have compression enabled. */
 static unsigned long ntfs_nr_compression_users;
 
 /* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase = NULL;
-static unsigned long ntfs_nr_upcase_users = 0;
+static ntfschar *default_upcase;
+static unsigned long ntfs_nr_upcase_users;
 
 /* Error constants/strings used in inode.c::ntfs_show_options(). */
 typedef enum {
@@ -71,18 +76,18 @@ const option_t on_errors_arr[] = {
  *
  * Copied from old ntfs driver (which copied from vfat driver).
  */
-static int simple_getbool(char *s, BOOL *setval)
+static int simple_getbool(char *s, bool *setval)
 {
 	if (s) {
 		if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
-			*setval = TRUE;
+			*setval = true;
 		else if (!strcmp(s, "0") || !strcmp(s, "no") ||
 							!strcmp(s, "false"))
-			*setval = FALSE;
+			*setval = false;
 		else
 			return 0;
 	} else
-		*setval = TRUE;
+		*setval = true;
 	return 1;
 }
 
@@ -93,16 +98,16 @@ static int simple_getbool(char *s, BOOL *setval)
  *
  * Parse the recognized options in @opt for the ntfs volume described by @vol.
  */
-static BOOL parse_options(ntfs_volume *vol, char *opt)
+static bool parse_options(ntfs_volume *vol, char *opt)
 {
 	char *p, *v, *ov;
 	static char *utf8 = "utf8";
 	int errors = 0, sloppy = 0;
-	uid_t uid = (uid_t)-1;
-	gid_t gid = (gid_t)-1;
-	mode_t fmask = (mode_t)-1, dmask = (mode_t)-1;
+	kuid_t uid = INVALID_UID;
+	kgid_t gid = INVALID_GID;
+	umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
 	int mft_zone_multiplier = -1, on_errors = -1;
-	int show_sys_files = -1, case_sensitive = -1;
+	int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
 	struct nls_table *nls_map = NULL, *old_nls;
 
 	/* I am lazy... (-8 */
@@ -124,9 +129,41 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 		if (*v)							\
 			goto needs_val;					\
 	}
+#define NTFS_GETOPT_UID(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		uid_t uid_value;					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		uid_value = simple_strtoul(ov = v, &v, 0);		\
+		if (*v)							\
+			goto needs_val;					\
+		variable = make_kuid(current_user_ns(), uid_value);	\
+		if (!uid_valid(variable))				\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_GID(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		gid_t gid_value;					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		gid_value = simple_strtoul(ov = v, &v, 0);		\
+		if (*v)							\
+			goto needs_val;					\
+		variable = make_kgid(current_user_ns(), gid_value);	\
+		if (!gid_valid(variable))				\
+			goto needs_val;					\
+	}
+#define NTFS_GETOPT_OCTAL(option, variable)				\
+	if (!strcmp(p, option)) {					\
+		if (!v || !*v)						\
+			goto needs_arg;					\
+		variable = simple_strtoul(ov = v, &v, 8);		\
+		if (*v)							\
+			goto needs_val;					\
+	}
 #define NTFS_GETOPT_BOOL(option, variable)				\
 	if (!strcmp(p, option)) {					\
-		BOOL val;						\
+		bool val;						\
 		if (!simple_getbool(v, &val))				\
 			goto needs_bool;				\
 		variable = val;						\
@@ -153,15 +190,16 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 	while ((p = strsep(&opt, ","))) {
 		if ((v = strchr(p, '=')))
 			*v++ = 0;
-		NTFS_GETOPT("uid", uid)
-		else NTFS_GETOPT("gid", gid)
-		else NTFS_GETOPT("umask", fmask = dmask)
-		else NTFS_GETOPT("fmask", fmask)
-		else NTFS_GETOPT("dmask", dmask)
+		NTFS_GETOPT_UID("uid", uid)
+		else NTFS_GETOPT_GID("gid", gid)
+		else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
+		else NTFS_GETOPT_OCTAL("fmask", fmask)
+		else NTFS_GETOPT_OCTAL("dmask", dmask)
 		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
-		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE)
+		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
 		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
 		else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
+		else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
 		else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
 				on_errors_arr)
 		else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
@@ -182,25 +220,24 @@ use_utf8:
 				if (!old_nls) {
 					ntfs_error(vol->sb, "NLS character set "
 							"%s not found.", v);
-					return FALSE;
+					return false;
 				}
 				ntfs_error(vol->sb, "NLS character set %s not "
 						"found. Using previous one %s.",
 						v, old_nls->charset);
 				nls_map = old_nls;
 			} else /* nls_map */ {
-				if (old_nls)
-					unload_nls(old_nls);
+				unload_nls(old_nls);
 			}
 		} else if (!strcmp(p, "utf8")) {
-			BOOL val = FALSE;
+			bool val = false;
 			ntfs_warning(vol->sb, "Option utf8 is no longer "
 				   "supported, using option nls=utf8. Please "
 				   "use option nls=utf8 in the future and "
 				   "make sure utf8 is compiled either as a "
 				   "module or into the kernel.");
 			if (!v || !*v)
-				val = TRUE;
+				val = true;
 			else if (!simple_getbool(v, &val))
 				goto needs_bool;
 			if (val) {
@@ -219,7 +256,7 @@ use_utf8:
 	}
 no_mount_options:
 	if (errors && !sloppy)
-		return FALSE;
+		return false;
 	if (sloppy)
 		ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
 				"unrecognized mount option(s) and continuing.");
@@ -228,14 +265,14 @@ no_mount_options:
 		if (!on_errors) {
 			ntfs_error(vol->sb, "Invalid errors option argument "
 					"or bug in options parser.");
-			return FALSE;
+			return false;
 		}
 	}
 	if (nls_map) {
 		if (vol->nls_map && vol->nls_map != nls_map) {
 			ntfs_error(vol->sb, "Cannot change NLS character set "
 					"on remount.");
-			return FALSE;
+			return false;
 		} /* else (!vol->nls_map) */
 		ntfs_debug("Using NLS character set %s.", nls_map->charset);
 		vol->nls_map = nls_map;
@@ -245,7 +282,7 @@ no_mount_options:
 			if (!vol->nls_map) {
 				ntfs_error(vol->sb, "Failed to load default "
 						"NLS character set.");
-				return FALSE;
+				return false;
 			}
 			ntfs_debug("Using default NLS character set (%s).",
 					vol->nls_map->charset);
@@ -256,7 +293,7 @@ no_mount_options:
 				mft_zone_multiplier) {
 			ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
 					"on remount.");
-			return FALSE;
+			return false;
 		}
 		if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
 			ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
@@ -271,13 +308,13 @@ no_mount_options:
 		vol->on_errors = on_errors;
 	if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
 		vol->on_errors |= ON_ERRORS_CONTINUE;
-	if (uid != (uid_t)-1)
+	if (uid_valid(uid))
 		vol->uid = uid;
-	if (gid != (gid_t)-1)
+	if (gid_valid(gid))
 		vol->gid = gid;
-	if (fmask != (mode_t)-1)
+	if (fmask != (umode_t)-1)
 		vol->fmask = fmask;
-	if (dmask != (mode_t)-1)
+	if (dmask != (umode_t)-1)
 		vol->dmask = dmask;
 	if (show_sys_files != -1) {
 		if (show_sys_files)
@@ -291,16 +328,31 @@ no_mount_options:
 		else
 			NVolClearCaseSensitive(vol);
 	}
-	return TRUE;
+	if (disable_sparse != -1) {
+		if (disable_sparse)
+			NVolClearSparseEnabled(vol);
+		else {
+			if (!NVolSparseEnabled(vol) &&
+					vol->major_ver && vol->major_ver < 3)
+				ntfs_warning(vol->sb, "Not enabling sparse "
+						"support due to NTFS volume "
+						"version %i.%i (need at least "
+						"version 3.0).", vol->major_ver,
+						vol->minor_ver);
+			else
+				NVolSetSparseEnabled(vol);
+		}
+	}
+	return true;
 needs_arg:
 	ntfs_error(vol->sb, "The %s option requires an argument.", p);
-	return FALSE;
+	return false;
 needs_bool:
 	ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
-	return FALSE;
+	return false;
 needs_val:
 	ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
-	return FALSE;
+	return false;
 }
 
 #ifdef NTFS_RW
@@ -416,9 +468,12 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering with remount options string: %s", opt);
+
+	sync_filesystem(sb);
+
 #ifndef NTFS_RW
-	/* For read-only compiled driver, enforce all read-only flags. */
-	*flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+	/* For read-only compiled driver, enforce read-only flag. */
+	*flags |= MS_RDONLY;
 #else /* NTFS_RW */
 	/*
 	 * For the read-write compiled driver, if we are remounting read-write,
@@ -430,7 +485,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	 * the volume on boot and updates them.
 	 *
 	 * When remounting read-only, mark the volume clean if no volume errors
-	 * have occured.
+	 * have occurred.
 	 */
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		static const char *es = ".  Cannot remount read-write.";
@@ -445,9 +500,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
 			return -EROFS;
 		}
+		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			ntfs_error(sb, "Volume has been modified by chkdsk "
+					"and is read-only%s", es);
+			return -EROFS;
+		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-			ntfs_error(sb, "Volume has unsupported flags set and "
-					"is read-only%s", es);
+			ntfs_error(sb, "Volume has unsupported flags set "
+					"(0x%x) and is read-only%s",
+					(unsigned)le16_to_cpu(vol->vol_flags),
+					es);
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
@@ -480,6 +542,12 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			NVolSetErrors(vol);
 			return -EROFS;
 		}
+		if (!ntfs_stamp_usnjrnl(vol)) {
+			ntfs_error(sb, "Failed to stamp transation log "
+					"($UsnJrnl)%s", es);
+			NVolSetErrors(vol);
+			return -EROFS;
+		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		/* Remounting read-only. */
 		if (!NVolErrors(vol)) {
@@ -495,6 +563,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	if (!parse_options(vol, opt))
 		return -EINVAL;
+
 	ntfs_debug("Done.");
 	return 0;
 }
@@ -503,29 +572,32 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
  * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
  * @sb:		Super block of the device to which @b belongs.
  * @b:		Boot sector of device @sb to check.
- * @silent:	If TRUE, all output will be silenced.
+ * @silent:	If 'true', all output will be silenced.
  *
  * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns TRUE if it is valid and FALSE if not.
+ * sector. Returns 'true' if it is valid and 'false' if not.
  *
  * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is TRUE.
+ * is 'true'.
  */
-static BOOL is_boot_sector_ntfs(const struct super_block *sb,
-		const NTFS_BOOT_SECTOR *b, const BOOL silent)
+static bool is_boot_sector_ntfs(const struct super_block *sb,
+		const NTFS_BOOT_SECTOR *b, const bool silent)
 {
 	/*
 	 * Check that checksum == sum of u32 values from b to the checksum
-	 * field. If checksum is zero, no checking is done.
+	 * field.  If checksum is zero, no checking is done.  We will work when
+	 * the checksum test fails, since some utilities update the boot sector
+	 * ignoring the checksum which leaves the checksum out-of-date.  We
+	 * report a warning if this is the case.
 	 */
-	if ((void*)b < (void*)&b->checksum && b->checksum) {
+	if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
 		le32 *u;
 		u32 i;
 
 		for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
 			i += le32_to_cpup(u);
 		if (le32_to_cpu(b->checksum) != i)
-			goto not_ntfs;
+			ntfs_warning(sb, "Invalid boot sector checksum.");
 	}
 	/* Check OEMidentifier is "NTFS    " */
 	if (b->oem_id != magicNTFS)
@@ -541,9 +613,9 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb,
 	default:
 		goto not_ntfs;
 	}
-	/* Check the cluster size is not above 65536 bytes. */
+	/* Check the cluster size is not above the maximum (64kiB). */
 	if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
-			b->bpb.sectors_per_cluster > 0x10000)
+			b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
 		goto not_ntfs;
 	/* Check reserved/unused fields are really zero. */
 	if (le16_to_cpu(b->bpb.reserved_sectors) ||
@@ -577,9 +649,9 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb,
 	 */
 	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
 		ntfs_warning(sb, "Invalid end of sector marker.");
-	return TRUE;
+	return true;
 not_ntfs:
-	return FALSE;
+	return false;
 }
 
 /**
@@ -606,7 +678,7 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
 {
 	const char *read_err_str = "Unable to read %s boot sector.";
 	struct buffer_head *bh_primary, *bh_backup;
-	long nr_blocks = NTFS_SB(sb)->nr_blocks;
+	sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
 
 	/* Try to read primary boot sector. */
 	if ((bh_primary = sb_bread(sb, 0))) {
@@ -653,13 +725,18 @@ hotfix_primary_boot_sector:
 		/*
 		 * If we managed to read sector zero and the volume is not
 		 * read-only, copy the found, valid backup boot sector to the
-		 * primary boot sector.
+		 * primary boot sector.  Note we only copy the actual boot
+		 * sector structure, not the actual whole device sector as that
+		 * may be bigger and would potentially damage the $Boot system
+		 * file (FIXME: Would be nice to know if the backup boot sector
+		 * on a large sector device contains the whole boot loader or
+		 * just the first 512 bytes).
 		 */
 		if (!(sb->s_flags & MS_RDONLY)) {
 			ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
 					"boot sector from backup copy.");
 			memcpy(bh_primary->b_data, bh_backup->b_data,
-					sb->s_blocksize);
+					NTFS_BLOCK_SIZE);
 			mark_buffer_dirty(bh_primary);
 			sync_dirty_buffer(bh_primary);
 			if (buffer_uptodate(bh_primary)) {
@@ -684,9 +761,9 @@ hotfix_primary_boot_sector:
  * @b:		boot sector to parse
  *
  * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol.  Return TRUE on success and FALSE on error.
+ * the ntfs super block @vol.  Return 'true' on success and 'false' on error.
  */
-static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
+static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 {
 	unsigned int sectors_per_cluster_bits, nr_hidden_sects;
 	int clusters_per_mft_record, clusters_per_index_record;
@@ -698,9 +775,13 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 			vol->sector_size);
 	ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
 			vol->sector_size_bits);
-	if (vol->sector_size != vol->sb->s_blocksize)
-		ntfs_warning(vol->sb, "The boot sector indicates a sector size "
-				"different from the device sector size.");
+	if (vol->sector_size < vol->sb->s_blocksize) {
+		ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
+				"device block size (%lu).  This is not "
+				"supported.  Sorry.", vol->sector_size,
+				vol->sb->s_blocksize);
+		return false;
+	}
 	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
 	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
 	ntfs_debug("sectors_per_cluster_bits = 0x%x",
@@ -713,17 +794,12 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
 			vol->cluster_size);
 	ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
-	ntfs_debug("vol->cluster_size_bits = %i (0x%x)",
-			vol->cluster_size_bits, vol->cluster_size_bits);
-	if (vol->sector_size > vol->cluster_size) {
-		ntfs_error(vol->sb, "Sector sizes above the cluster size are "
-				"not supported.  Sorry.");
-		return FALSE;
-	}
-	if (vol->sb->s_blocksize > vol->cluster_size) {
-		ntfs_error(vol->sb, "Cluster sizes smaller than the device "
-				"sector size are not supported.  Sorry.");
-		return FALSE;
+	ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
+	if (vol->cluster_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->cluster_size, vol->sector_size);
+		return false;
 	}
 	clusters_per_mft_record = b->clusters_per_mft_record;
 	ntfs_debug("clusters_per_mft_record = %i (0x%x)",
@@ -751,12 +827,19 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	 * we store $MFT/$DATA, the table of mft records in the page cache.
 	 */
 	if (vol->mft_record_size > PAGE_CACHE_SIZE) {
-		ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the "
-				"page cache size on your system %lu (0x%lx).  "
+		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
+				"PAGE_CACHE_SIZE on your system (%lu).  "
 				"This is not supported.  Sorry.",
-				vol->mft_record_size, vol->mft_record_size,
-				PAGE_CACHE_SIZE, PAGE_CACHE_SIZE);
-		return FALSE;
+				vol->mft_record_size, PAGE_CACHE_SIZE);
+		return false;
+	}
+	/* We cannot support mft record sizes below the sector size. */
+	if (vol->mft_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->mft_record_size,
+				vol->sector_size);
+		return false;
 	}
 	clusters_per_index_record = b->clusters_per_index_record;
 	ntfs_debug("clusters_per_index_record = %i (0x%x)",
@@ -781,6 +864,14 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
 			vol->index_record_size_bits,
 			vol->index_record_size_bits);
+	/* We cannot support index record sizes below the sector size. */
+	if (vol->index_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Index record size (%i) is smaller than "
+				"the sector size (%i).  This is not "
+				"supported.  Sorry.", vol->index_record_size,
+				vol->sector_size);
+		return false;
+	}
 	/*
 	 * Get the size of the volume in clusters and check for 64-bit-ness.
 	 * Windows currently only uses 32 bits to save the clusters so we do
@@ -789,7 +880,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
 	if ((u64)ll >= 1ULL << 32) {
 		ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
-		return FALSE;
+		return false;
 	}
 	vol->nr_clusters = ll;
 	ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
@@ -805,21 +896,24 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 					"Maximum supported is 2TiB.  Sorry.",
 					(unsigned long long)ll >> (40 -
 					vol->cluster_size_bits));
-			return FALSE;
+			return false;
 		}
 	}
 	ll = sle64_to_cpu(b->mft_lcn);
 	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFT LCN is beyond end of volume.  Weird.");
-		return FALSE;
+		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
+				"volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
+		return false;
 	}
 	vol->mft_lcn = ll;
 	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
 	ll = sle64_to_cpu(b->mftmirr_lcn);
 	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume.  "
-				"Weird.");
-		return FALSE;
+		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
+				"of volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
+		return false;
 	}
 	vol->mftmirr_lcn = ll;
 	ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
@@ -842,7 +936,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	vol->serial_no = le64_to_cpu(b->volume_serial_number);
 	ntfs_debug("vol->serial_no = 0x%llx",
 			(unsigned long long)vol->serial_no);
-	return TRUE;
+	return true;
 }
 
 /**
@@ -935,9 +1029,9 @@ static void ntfs_setup_allocators(ntfs_volume *vol)
  * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
  * @vol:	ntfs super block describing device whose mft mirror to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
+static bool load_and_init_mft_mirror(ntfs_volume *vol)
 {
 	struct inode *tmp_ino;
 	ntfs_inode *tmp_ni;
@@ -949,14 +1043,15 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		/* Caller will display error message. */
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Re-initialize some specifics about $MFTMirr's inode as
 	 * ntfs_read_inode() will have set up the default ones.
 	 */
 	/* Set uid and gid to root. */
-	tmp_ino->i_uid = tmp_ino->i_gid = 0;
+	tmp_ino->i_uid = GLOBAL_ROOT_UID;
+	tmp_ino->i_gid = GLOBAL_ROOT_GID;
 	/* Regular file.  No access for anyone. */
 	tmp_ino->i_mode = S_IFREG;
 	/* No VFS initiated operations allowed for $MFTMirr. */
@@ -967,6 +1062,7 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
 	tmp_ni = NTFS_I(tmp_ino);
 	/* The $MFTMirr, like the $MFT is multi sector transfer protected. */
 	NInoSetMstProtected(tmp_ni);
+	NInoSetSparseDisabled(tmp_ni);
 	/*
 	 * Set up our little cheat allowing us to reuse the async read io
 	 * completion handler for directories.
@@ -975,27 +1071,27 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
 	tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
 	vol->mftmirr_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * check_mft_mirror - compare contents of the mft mirror with the mft
  * @vol:	ntfs super block describing device whose mft mirror to check
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  *
  * Note, this function also results in the mft mirror runlist being completely
  * mapped into memory.  The mft mirror write code requires this and will BUG()
  * should it find an unmapped runlist element.
  */
-static BOOL check_mft_mirror(ntfs_volume *vol)
+static bool check_mft_mirror(ntfs_volume *vol)
 {
-	unsigned long index;
 	struct super_block *sb = vol->sb;
 	ntfs_inode *mirr_ni;
 	struct page *mft_page, *mirr_page;
 	u8 *kmft, *kmirr;
 	runlist_element *rl, rl2[2];
+	pgoff_t index;
 	int mrecs_per_page, i;
 
 	ntfs_debug("Entering.");
@@ -1020,7 +1116,7 @@ static BOOL check_mft_mirror(ntfs_volume *vol)
 					index);
 			if (IS_ERR(mft_page)) {
 				ntfs_error(sb, "Failed to read $MFT.");
-				return FALSE;
+				return false;
 			}
 			kmft = page_address(mft_page);
 			/* Get the $MFTMirr page. */
@@ -1033,26 +1129,38 @@ static BOOL check_mft_mirror(ntfs_volume *vol)
 			kmirr = page_address(mirr_page);
 			++index;
 		}
-		/* Make sure the record is ok. */
-		if (ntfs_is_baad_recordp((le32*)kmft)) {
-			ntfs_error(sb, "Incomplete multi sector transfer "
-					"detected in mft record %i.", i);
+		/* Do not check the record if it is not in use. */
+		if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
+			/* Make sure the record is ok. */
+			if (ntfs_is_baad_recordp((le32*)kmft)) {
+				ntfs_error(sb, "Incomplete multi sector "
+						"transfer detected in mft "
+						"record %i.", i);
 mm_unmap_out:
-			ntfs_unmap_page(mirr_page);
+				ntfs_unmap_page(mirr_page);
 mft_unmap_out:
-			ntfs_unmap_page(mft_page);
-			return FALSE;
+				ntfs_unmap_page(mft_page);
+				return false;
+			}
 		}
-		if (ntfs_is_baad_recordp((le32*)kmirr)) {
-			ntfs_error(sb, "Incomplete multi sector transfer "
-					"detected in mft mirror record %i.", i);
-			goto mm_unmap_out;
+		/* Do not check the mirror record if it is not in use. */
+		if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
+			if (ntfs_is_baad_recordp((le32*)kmirr)) {
+				ntfs_error(sb, "Incomplete multi sector "
+						"transfer detected in mft "
+						"mirror record %i.", i);
+				goto mm_unmap_out;
+			}
 		}
 		/* Get the amount of data in the current record. */
 		bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
-		if (!bytes || bytes > vol->mft_record_size) {
+		if (bytes < sizeof(MFT_RECORD_OLD) ||
+				bytes > vol->mft_record_size ||
+				ntfs_is_baad_recordp((le32*)kmft)) {
 			bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
-			if (!bytes || bytes > vol->mft_record_size)
+			if (bytes < sizeof(MFT_RECORD_OLD) ||
+					bytes > vol->mft_record_size ||
+					ntfs_is_baad_recordp((le32*)kmirr))
 				bytes = vol->mft_record_size;
 		}
 		/* Compare the two records. */
@@ -1091,21 +1199,22 @@ mft_unmap_out:
 			ntfs_error(sb, "$MFTMirr location mismatch.  "
 					"Run chkdsk.");
 			up_read(&mirr_ni->runlist.lock);
-			return FALSE;
+			return false;
 		}
 	} while (rl2[i++].length);
 	up_read(&mirr_ni->runlist.lock);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * load_and_check_logfile - load and check the logfile inode for a volume
  * @vol:	ntfs super block describing device whose logfile to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_check_logfile(ntfs_volume *vol)
+static bool load_and_check_logfile(ntfs_volume *vol,
+		RESTART_PAGE_HEADER **rp)
 {
 	struct inode *tmp_ino;
 
@@ -1115,46 +1224,163 @@ static BOOL load_and_check_logfile(ntfs_volume *vol)
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		/* Caller will display error message. */
-		return FALSE;
+		return false;
 	}
-	if (!ntfs_check_logfile(tmp_ino)) {
+	if (!ntfs_check_logfile(tmp_ino, rp)) {
 		iput(tmp_ino);
 		/* ntfs_check_logfile() will have displayed error output. */
-		return FALSE;
+		return false;
 	}
+	NInoSetSparseDisabled(NTFS_I(tmp_ino));
 	vol->logfile_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
+}
+
+#define NTFS_HIBERFIL_HEADER_SIZE	4096
+
+/**
+ * check_windows_hibernation_status - check if Windows is suspended on a volume
+ * @vol:	ntfs super block of device to check
+ *
+ * Check if Windows is hibernated on the ntfs volume @vol.  This is done by
+ * looking for the file hiberfil.sys in the root directory of the volume.  If
+ * the file is not present Windows is definitely not suspended.
+ *
+ * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
+ * definitely suspended (this volume is not the system volume).  Caveat:  on a
+ * system with many volumes it is possible that the < 4kiB check is bogus but
+ * for now this should do fine.
+ *
+ * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
+ * hiberfil header (which is the first 4kiB).  If this begins with "hibr",
+ * Windows is definitely suspended.  If it is completely full of zeroes,
+ * Windows is definitely not hibernated.  Any other case is treated as if
+ * Windows is suspended.  This caters for the above mentioned caveat of a
+ * system with many volumes where no "hibr" magic would be present and there is
+ * no zero header.
+ *
+ * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
+ * hibernated on the volume, and -errno on error.
+ */
+static int check_windows_hibernation_status(ntfs_volume *vol)
+{
+	MFT_REF mref;
+	struct inode *vi;
+	struct page *page;
+	u32 *kaddr, *kend;
+	ntfs_name *name = NULL;
+	int ret = 1;
+	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+			cpu_to_le16('i'), cpu_to_le16('b'),
+			cpu_to_le16('e'), cpu_to_le16('r'),
+			cpu_to_le16('f'), cpu_to_le16('i'),
+			cpu_to_le16('l'), cpu_to_le16('.'),
+			cpu_to_le16('s'), cpu_to_le16('y'),
+			cpu_to_le16('s'), 0 };
+
+	ntfs_debug("Entering.");
+	/*
+	 * Find the inode number for the hibernation file by looking up the
+	 * filename hiberfil.sys in the root directory.
+	 */
+	mutex_lock(&vol->root_ino->i_mutex);
+	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
+			&name);
+	mutex_unlock(&vol->root_ino->i_mutex);
+	if (IS_ERR_MREF(mref)) {
+		ret = MREF_ERR(mref);
+		/* If the file does not exist, Windows is not hibernated. */
+		if (ret == -ENOENT) {
+			ntfs_debug("hiberfil.sys not present.  Windows is not "
+					"hibernated on the volume.");
+			return 0;
+		}
+		/* A real error occurred. */
+		ntfs_error(vol->sb, "Failed to find inode number for "
+				"hiberfil.sys.");
+		return ret;
+	}
+	/* We do not care for the type of match that was found. */
+	kfree(name);
+	/* Get the inode. */
+	vi = ntfs_iget(vol->sb, MREF(mref));
+	if (IS_ERR(vi) || is_bad_inode(vi)) {
+		if (!IS_ERR(vi))
+			iput(vi);
+		ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
+		return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
+	}
+	if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
+		ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx).  "
+				"Windows is hibernated on the volume.  This "
+				"is not the system volume.", i_size_read(vi));
+		goto iput_out;
+	}
+	page = ntfs_map_page(vi->i_mapping, 0);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
+		ret = PTR_ERR(page);
+		goto iput_out;
+	}
+	kaddr = (u32*)page_address(page);
+	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
+		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
+				"hibernated on the volume.  This is the "
+				"system volume.");
+		goto unm_iput_out;
+	}
+	kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
+	do {
+		if (unlikely(*kaddr)) {
+			ntfs_debug("hiberfil.sys is larger than 4kiB "
+					"(0x%llx), does not contain the "
+					"\"hibr\" magic, and does not have a "
+					"zero header.  Windows is hibernated "
+					"on the volume.  This is not the "
+					"system volume.", i_size_read(vi));
+			goto unm_iput_out;
+		}
+	} while (++kaddr < kend);
+	ntfs_debug("hiberfil.sys contains a zero header.  Windows is not "
+			"hibernated on the volume.  This is the system "
+			"volume.");
+	ret = 0;
+unm_iput_out:
+	ntfs_unmap_page(page);
+iput_out:
+	iput(vi);
+	return ret;
 }
 
 /**
  * load_and_init_quota - load and setup the quota file for a volume if present
  * @vol:	ntfs super block describing device whose quota file to load
  *
- * Return TRUE on success or FALSE on error.  If $Quota is not present, we
+ * Return 'true' on success or 'false' on error.  If $Quota is not present, we
  * leave vol->quota_ino as NULL and return success.
  */
-static BOOL load_and_init_quota(ntfs_volume *vol)
+static bool load_and_init_quota(ntfs_volume *vol)
 {
 	MFT_REF mref;
 	struct inode *tmp_ino;
 	ntfs_name *name = NULL;
-	static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
-			const_cpu_to_le16('o'), const_cpu_to_le16('t'),
-			const_cpu_to_le16('a'), 0 };
-	static ntfschar Q[3] = { const_cpu_to_le16('$'),
-			const_cpu_to_le16('Q'), 0 };
+	static const ntfschar Quota[7] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), cpu_to_le16('u'),
+			cpu_to_le16('o'), cpu_to_le16('t'),
+			cpu_to_le16('a'), 0 };
+	static ntfschar Q[3] = { cpu_to_le16('$'),
+			cpu_to_le16('Q'), 0 };
 
 	ntfs_debug("Entering.");
 	/*
 	 * Find the inode number for the quota file by looking up the filename
 	 * $Quota in the extended system files directory $Extend.
 	 */
-	down(&vol->extend_ino->i_sem);
+	mutex_lock(&vol->extend_ino->i_mutex);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
 			&name);
-	up(&vol->extend_ino->i_sem);
+	mutex_unlock(&vol->extend_ino->i_mutex);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, quotas are disabled and have
@@ -1168,47 +1394,208 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
 			 * not enabled.
 			 */
 			NVolSetQuotaOutOfDate(vol);
-			return TRUE;
+			return true;
 		}
-		/* A real error occured. */
+		/* A real error occurred. */
 		ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
-		return FALSE;
+		return false;
 	}
 	/* We do not care for the type of match that was found. */
-	if (name)
-		kfree(name);
+	kfree(name);
 	/* Get the inode. */
 	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
 	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		ntfs_error(vol->sb, "Failed to load $Quota.");
-		return FALSE;
+		return false;
 	}
 	vol->quota_ino = tmp_ino;
 	/* Get the $Q index allocation attribute. */
 	tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
 	if (IS_ERR(tmp_ino)) {
 		ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
-		return FALSE;
+		return false;
 	}
 	vol->quota_q_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
+}
+
+/**
+ * load_and_init_usnjrnl - load and setup the transaction log if present
+ * @vol:	ntfs super block describing device whose usnjrnl file to load
+ *
+ * Return 'true' on success or 'false' on error.
+ *
+ * If $UsnJrnl is not present or in the process of being disabled, we set
+ * NVolUsnJrnlStamped() and return success.
+ *
+ * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
+ * i.e. transaction logging has only just been enabled or the journal has been
+ * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
+ * and return success.
+ */
+static bool load_and_init_usnjrnl(ntfs_volume *vol)
+{
+	MFT_REF mref;
+	struct inode *tmp_ino;
+	ntfs_inode *tmp_ni;
+	struct page *page;
+	ntfs_name *name = NULL;
+	USN_HEADER *uh;
+	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+			cpu_to_le16('U'), cpu_to_le16('s'),
+			cpu_to_le16('n'), cpu_to_le16('J'),
+			cpu_to_le16('r'), cpu_to_le16('n'),
+			cpu_to_le16('l'), 0 };
+	static ntfschar Max[5] = { cpu_to_le16('$'),
+			cpu_to_le16('M'), cpu_to_le16('a'),
+			cpu_to_le16('x'), 0 };
+	static ntfschar J[3] = { cpu_to_le16('$'),
+			cpu_to_le16('J'), 0 };
+
+	ntfs_debug("Entering.");
+	/*
+	 * Find the inode number for the transaction log file by looking up the
+	 * filename $UsnJrnl in the extended system files directory $Extend.
+	 */
+	mutex_lock(&vol->extend_ino->i_mutex);
+	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
+			&name);
+	mutex_unlock(&vol->extend_ino->i_mutex);
+	if (IS_ERR_MREF(mref)) {
+		/*
+		 * If the file does not exist, transaction logging is disabled,
+		 * just return success.
+		 */
+		if (MREF_ERR(mref) == -ENOENT) {
+			ntfs_debug("$UsnJrnl not present.  Volume does not "
+					"have transaction logging enabled.");
+not_enabled:
+			/*
+			 * No need to try to stamp the transaction log if
+			 * transaction logging is not enabled.
+			 */
+			NVolSetUsnJrnlStamped(vol);
+			return true;
+		}
+		/* A real error occurred. */
+		ntfs_error(vol->sb, "Failed to find inode number for "
+				"$UsnJrnl.");
+		return false;
+	}
+	/* We do not care for the type of match that was found. */
+	kfree(name);
+	/* Get the inode. */
+	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
+	if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) {
+		if (!IS_ERR(tmp_ino))
+			iput(tmp_ino);
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
+		return false;
+	}
+	vol->usnjrnl_ino = tmp_ino;
+	/*
+	 * If the transaction log is in the process of being deleted, we can
+	 * ignore it.
+	 */
+	if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
+		ntfs_debug("$UsnJrnl in the process of being disabled.  "
+				"Volume does not have transaction logging "
+				"enabled.");
+		goto not_enabled;
+	}
+	/* Get the $DATA/$Max attribute. */
+	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
+	if (IS_ERR(tmp_ino)) {
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
+				"attribute.");
+		return false;
+	}
+	vol->usnjrnl_max_ino = tmp_ino;
+	if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
+		ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
+				"attribute (size is 0x%llx but should be at "
+				"least 0x%zx bytes).", i_size_read(tmp_ino),
+				sizeof(USN_HEADER));
+		return false;
+	}
+	/* Get the $DATA/$J attribute. */
+	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
+	if (IS_ERR(tmp_ino)) {
+		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
+				"attribute.");
+		return false;
+	}
+	vol->usnjrnl_j_ino = tmp_ino;
+	/* Verify $J is non-resident and sparse. */
+	tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
+	if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
+		ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
+				"and/or not sparse.");
+		return false;
+	}
+	/* Read the USN_HEADER from $DATA/$Max. */
+	page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+	if (IS_ERR(page)) {
+		ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
+				"attribute.");
+		return false;
+	}
+	uh = (USN_HEADER*)page_address(page);
+	/* Sanity check the $Max. */
+	if (unlikely(sle64_to_cpu(uh->allocation_delta) >
+			sle64_to_cpu(uh->maximum_size))) {
+		ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
+				"maximum size (0x%llx).  $UsnJrnl is corrupt.",
+				(long long)sle64_to_cpu(uh->allocation_delta),
+				(long long)sle64_to_cpu(uh->maximum_size));
+		ntfs_unmap_page(page);
+		return false;
+	}
+	/*
+	 * If the transaction log has been stamped and nothing has been written
+	 * to it since, we do not need to stamp it.
+	 */
+	if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
+			i_size_read(vol->usnjrnl_j_ino))) {
+		if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
+				i_size_read(vol->usnjrnl_j_ino))) {
+			ntfs_unmap_page(page);
+			ntfs_debug("$UsnJrnl is enabled but nothing has been "
+					"logged since it was last stamped.  "
+					"Treating this as if the volume does "
+					"not have transaction logging "
+					"enabled.");
+			goto not_enabled;
+		}
+		ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
+				"which is out of bounds (0x%llx).  $UsnJrnl "
+				"is corrupt.",
+				(long long)sle64_to_cpu(uh->lowest_valid_usn),
+				i_size_read(vol->usnjrnl_j_ino));
+		ntfs_unmap_page(page);
+		return false;
+	}
+	ntfs_unmap_page(page);
+	ntfs_debug("Done.");
+	return true;
 }
 
 /**
  * load_and_init_attrdef - load the attribute definitions table for a volume
  * @vol:	ntfs super block describing device whose attrdef to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_attrdef(ntfs_volume *vol)
+static bool load_and_init_attrdef(ntfs_volume *vol)
 {
+	loff_t i_size;
 	struct super_block *sb = vol->sb;
 	struct inode *ino;
 	struct page *page;
-	unsigned long index, max_index;
+	pgoff_t index, max_index;
 	unsigned int size;
 
 	ntfs_debug("Entering.");
@@ -1219,14 +1606,16 @@ static BOOL load_and_init_attrdef(ntfs_volume *vol)
 			iput(ino);
 		goto failed;
 	}
+	NInoSetSparseDisabled(NTFS_I(ino));
 	/* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
-	if (!ino->i_size || ino->i_size > 0x7fffffff)
+	i_size = i_size_read(ino);
+	if (i_size <= 0 || i_size > 0x7fffffff)
 		goto iput_failed;
-	vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(ino->i_size);
+	vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
 	if (!vol->attrdef)
 		goto iput_failed;
 	index = 0;
-	max_index = ino->i_size >> PAGE_CACHE_SHIFT;
+	max_index = i_size >> PAGE_CACHE_SHIFT;
 	size = PAGE_CACHE_SIZE;
 	while (index < max_index) {
 		/* Read the attrdef table and copy it into the linear buffer. */
@@ -1239,14 +1628,14 @@ read_partial_attrdef_page:
 		ntfs_unmap_page(page);
 	};
 	if (size == PAGE_CACHE_SIZE) {
-		size = ino->i_size & ~PAGE_CACHE_MASK;
+		size = i_size & ~PAGE_CACHE_MASK;
 		if (size)
 			goto read_partial_attrdef_page;
 	}
-	vol->attrdef_size = ino->i_size;
-	ntfs_debug("Read %llu bytes from $AttrDef.", ino->i_size);
+	vol->attrdef_size = i_size;
+	ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
 	iput(ino);
-	return TRUE;
+	return true;
 free_iput_failed:
 	ntfs_free(vol->attrdef);
 	vol->attrdef = NULL;
@@ -1254,7 +1643,7 @@ iput_failed:
 	iput(ino);
 failed:
 	ntfs_error(sb, "Failed to initialize attribute definition table.");
-	return FALSE;
+	return false;
 }
 
 #endif /* NTFS_RW */
@@ -1263,14 +1652,15 @@ failed:
  * load_and_init_upcase - load the upcase table for an ntfs volume
  * @vol:	ntfs super block describing device whose upcase to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_upcase(ntfs_volume *vol)
+static bool load_and_init_upcase(ntfs_volume *vol)
 {
+	loff_t i_size;
 	struct super_block *sb = vol->sb;
 	struct inode *ino;
 	struct page *page;
-	unsigned long index, max_index;
+	pgoff_t index, max_index;
 	unsigned int size;
 	int i, max;
 
@@ -1286,14 +1676,15 @@ static BOOL load_and_init_upcase(ntfs_volume *vol)
 	 * The upcase size must not be above 64k Unicode characters, must not
 	 * be zero and must be a multiple of sizeof(ntfschar).
 	 */
-	if (!ino->i_size || ino->i_size & (sizeof(ntfschar) - 1) ||
-			ino->i_size > 64ULL * 1024 * sizeof(ntfschar))
+	i_size = i_size_read(ino);
+	if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
+			i_size > 64ULL * 1024 * sizeof(ntfschar))
 		goto iput_upcase_failed;
-	vol->upcase = (ntfschar*)ntfs_malloc_nofs(ino->i_size);
+	vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
 	if (!vol->upcase)
 		goto iput_upcase_failed;
 	index = 0;
-	max_index = ino->i_size >> PAGE_CACHE_SHIFT;
+	max_index = i_size >> PAGE_CACHE_SHIFT;
 	size = PAGE_CACHE_SIZE;
 	while (index < max_index) {
 		/* Read the upcase table and copy it into the linear buffer. */
@@ -1306,20 +1697,20 @@ read_partial_upcase_page:
 		ntfs_unmap_page(page);
 	};
 	if (size == PAGE_CACHE_SIZE) {
-		size = ino->i_size & ~PAGE_CACHE_MASK;
+		size = i_size & ~PAGE_CACHE_MASK;
 		if (size)
 			goto read_partial_upcase_page;
 	}
-	vol->upcase_len = ino->i_size >> UCHAR_T_SIZE_BITS;
+	vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
 	ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
-			ino->i_size, 64 * 1024 * sizeof(ntfschar));
+			i_size, 64 * 1024 * sizeof(ntfschar));
 	iput(ino);
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (!default_upcase) {
 		ntfs_debug("Using volume specified $UpCase since default is "
 				"not present.");
-		up(&ntfs_lock);
-		return TRUE;
+		mutex_unlock(&ntfs_lock);
+		return true;
 	}
 	max = default_upcase_len;
 	if (max > vol->upcase_len)
@@ -1332,35 +1723,43 @@ read_partial_upcase_page:
 		vol->upcase = default_upcase;
 		vol->upcase_len = max;
 		ntfs_nr_upcase_users++;
-		up(&ntfs_lock);
+		mutex_unlock(&ntfs_lock);
 		ntfs_debug("Volume specified $UpCase matches default. Using "
 				"default.");
-		return TRUE;
+		return true;
 	}
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	ntfs_debug("Using volume specified $UpCase since it does not match "
 			"the default.");
-	return TRUE;
+	return true;
 iput_upcase_failed:
 	iput(ino);
 	ntfs_free(vol->upcase);
 	vol->upcase = NULL;
 upcase_failed:
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (default_upcase) {
 		vol->upcase = default_upcase;
 		vol->upcase_len = default_upcase_len;
 		ntfs_nr_upcase_users++;
-		up(&ntfs_lock);
+		mutex_unlock(&ntfs_lock);
 		ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
 				"default.");
-		return TRUE;
+		return true;
 	}
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	ntfs_error(sb, "Failed to initialize upcase table.");
-	return FALSE;
+	return false;
 }
 
+/*
+ * The lcn and mft bitmap inodes are NTFS-internal inodes with
+ * their own special locking rules:
+ */
+static struct lock_class_key
+	lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
+	mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
+
 /**
  * load_system_files - open the system files using normal functions
  * @vol:	ntfs super block describing device whose system files to load
@@ -1368,14 +1767,18 @@ upcase_failed:
  * Open the system files with normal access functions and complete setting up
  * the ntfs super block @vol.
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_system_files(ntfs_volume *vol)
+static bool load_system_files(ntfs_volume *vol)
 {
 	struct super_block *sb = vol->sb;
 	MFT_RECORD *m;
 	VOLUME_INFORMATION *vi;
 	ntfs_attr_search_ctx *ctx;
+#ifdef NTFS_RW
+	RESTART_PAGE_HEADER *rp;
+	int err;
+#endif /* NTFS_RW */
 
 	ntfs_debug("Entering.");
 #ifdef NTFS_RW
@@ -1396,7 +1799,7 @@ static BOOL load_system_files(ntfs_volume *vol)
 						es3);
 				goto iput_mirr_err_out;
 			}
-			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+			sb->s_flags |= MS_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s",
 					!vol->mftmirr_ino ? es1 : es2, es3);
 		} else
@@ -1413,6 +1816,10 @@ static BOOL load_system_files(ntfs_volume *vol)
 		ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
 		goto iput_mirr_err_out;
 	}
+	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
+			   &mftbmp_runlist_lock_key);
+	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
+			   &mftbmp_mrec_lock_key);
 	/* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
 	if (!load_and_init_upcase(vol))
 		goto iput_mftbmp_err_out;
@@ -1435,7 +1842,13 @@ static BOOL load_system_files(ntfs_volume *vol)
 			iput(vol->lcnbmp_ino);
 		goto bitmap_failed;
 	}
-	if ((vol->nr_clusters + 7) >> 3 > vol->lcnbmp_ino->i_size) {
+	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
+			   &lcnbmp_runlist_lock_key);
+	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
+			   &lcnbmp_mrec_lock_key);
+
+	NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
+	if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
 		iput(vol->lcnbmp_ino);
 bitmap_failed:
 		ntfs_error(sb, "Failed to load $Bitmap.");
@@ -1484,17 +1897,36 @@ get_ctx_vol_failed:
 	vol->minor_ver = vi->minor_ver;
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(NTFS_I(vol->vol_ino));
-	printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver,
+	pr_info("volume version %i.%i.\n", vol->major_ver,
 			vol->minor_ver);
+	if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
+		ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
+				"volume version %i.%i (need at least version "
+				"3.0).", vol->major_ver, vol->minor_ver);
+		NVolClearSparseEnabled(vol);
+	}
 #ifdef NTFS_RW
 	/* Make sure that no unsupported volume flags are set. */
 	if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
 		static const char *es1a = "Volume is dirty";
-		static const char *es1b = "Volume has unsupported flags set";
-		static const char *es2 = ".  Run chkdsk and mount in Windows.";
-		const char *es1;
-		
-		es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b;
+		static const char *es1b = "Volume has been modified by chkdsk";
+		static const char *es1c = "Volume has unsupported flags set";
+		static const char *es2a = ".  Run chkdsk and mount in Windows.";
+		static const char *es2b = ".  Mount in Windows.";
+		const char *es1, *es2;
+
+		es2 = es2a;
+		if (vol->vol_flags & VOLUME_IS_DIRTY)
+			es1 = es1a;
+		else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			es1 = es1b;
+			es2 = es2b;
+		} else {
+			es1 = es1c;
+			ntfs_warning(sb, "Unsupported volume flags 0x%x "
+					"encountered.",
+					(unsigned)le16_to_cpu(vol->vol_flags));
+		}
 		/* If a read-write mount, convert it to a read-only mount. */
 		if (!(sb->s_flags & MS_RDONLY)) {
 			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
@@ -1505,7 +1937,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_vol_err_out;
 			}
-			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+			sb->s_flags |= MS_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -1519,8 +1951,9 @@ get_ctx_vol_failed:
 	 * Get the inode for the logfile, check it and determine if the volume
 	 * was shutdown cleanly.
 	 */
-	if (!load_and_check_logfile(vol) ||
-			!ntfs_is_logfile_clean(vol->logfile_ino)) {
+	rp = NULL;
+	if (!load_and_check_logfile(vol, &rp) ||
+			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
 		static const char *es1a = "Failed to load $LogFile";
 		static const char *es1b = "$LogFile is not clean";
 		static const char *es2 = ".  Mount in Windows.";
@@ -1535,9 +1968,58 @@ get_ctx_vol_failed:
 						"continue nor on_errors="
 						"remount-ro was specified%s",
 						es1, es2);
+				if (vol->logfile_ino) {
+					BUG_ON(!rp);
+					ntfs_free(rp);
+				}
 				goto iput_logfile_err_out;
 			}
-			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	ntfs_free(rp);
+#endif /* NTFS_RW */
+	/* Get the root directory inode so we can do path lookups. */
+	vol->root_ino = ntfs_iget(sb, FILE_root);
+	if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
+		if (!IS_ERR(vol->root_ino))
+			iput(vol->root_ino);
+		ntfs_error(sb, "Failed to load root directory.");
+		goto iput_logfile_err_out;
+	}
+#ifdef NTFS_RW
+	/*
+	 * Check if Windows is suspended to disk on the target volume.  If it
+	 * is hibernated, we must not write *anything* to the disk so set
+	 * NVolErrors() without setting the dirty volume flag and mount
+	 * read-only.  This will prevent read-write remounting and it will also
+	 * prevent all writes.
+	 */
+	err = check_windows_hibernation_status(vol);
+	if (unlikely(err)) {
+		static const char *es1a = "Failed to determine if Windows is "
+				"hibernated";
+		static const char *es1b = "Windows is hibernated";
+		static const char *es2 = ".  Run chkdsk.";
+		const char *es1;
+
+		es1 = err < 0 ? es1a : es1b;
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_root_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -1558,10 +2040,10 @@ get_ctx_vol_failed:
 			ntfs_error(sb, "%s and neither on_errors=continue nor "
 					"on_errors=remount-ro was specified%s",
 					es1, es2);
-			goto iput_logfile_err_out;
+			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		/*
 		 * Do not set NVolErrors() because ntfs_remount() might manage
 		 * to set the dirty flag in which case all would be well.
@@ -1585,10 +2067,10 @@ get_ctx_vol_failed:
 			ntfs_error(sb, "%s and neither on_errors=continue nor "
 					"on_errors=remount-ro was specified%s",
 					es1, es2);
-			goto iput_logfile_err_out;
+			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif
@@ -1604,24 +2086,16 @@ get_ctx_vol_failed:
 			ntfs_error(sb, "%s and neither on_errors=continue nor "
 					"on_errors=remount-ro was specified%s",
 					es1, es2);
-			goto iput_logfile_err_out;
+			goto iput_root_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
-	/* Get the root directory inode. */
-	vol->root_ino = ntfs_iget(sb, FILE_root);
-	if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
-		if (!IS_ERR(vol->root_ino))
-			iput(vol->root_ino);
-		ntfs_error(sb, "Failed to load root directory.");
-		goto iput_logfile_err_out;
-	}
 	/* If on NTFS versions before 3.0, we are done. */
-	if (vol->major_ver < 3)
-		return TRUE;
+	if (unlikely(vol->major_ver < 3))
+		return true;
 	/* NTFS 3.0+ specific initialization. */
 	/* Get the security descriptors inode. */
 	vol->secure_ino = ntfs_iget(sb, FILE_Secure);
@@ -1631,7 +2105,7 @@ get_ctx_vol_failed:
 		ntfs_error(sb, "Failed to load $Secure.");
 		goto iput_root_err_out;
 	}
-	// FIXME: Initialize security.
+	// TODO: Initialize security.
 	/* Get the extended system files' directory inode. */
 	vol->extend_ino = ntfs_iget(sb, FILE_Extend);
 	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
@@ -1656,7 +2130,7 @@ get_ctx_vol_failed:
 						es1, es2);
 				goto iput_quota_err_out;
 			}
-			sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+			sb->s_flags |= MS_RDONLY;
 			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
 		} else
 			ntfs_warning(sb, "%s.  Will not be able to remount "
@@ -1679,13 +2153,63 @@ get_ctx_vol_failed:
 			goto iput_quota_err_out;
 		}
 		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+		sb->s_flags |= MS_RDONLY;
+		NVolSetErrors(vol);
+	}
+	/*
+	 * Find the transaction log file ($UsnJrnl), load it if present, check
+	 * it, and set it up.
+	 */
+	if (!load_and_init_usnjrnl(vol)) {
+		static const char *es1 = "Failed to load $UsnJrnl";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* If a read-write mount, convert it to a read-only mount. */
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+					ON_ERRORS_CONTINUE))) {
+				ntfs_error(sb, "%s and neither on_errors="
+						"continue nor on_errors="
+						"remount-ro was specified%s",
+						es1, es2);
+				goto iput_usnjrnl_err_out;
+			}
+			sb->s_flags |= MS_RDONLY;
+			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		} else
+			ntfs_warning(sb, "%s.  Will not be able to remount "
+					"read-write%s", es1, es2);
+		/* This will prevent a read-write remount. */
+		NVolSetErrors(vol);
+	}
+	/* If (still) a read-write mount, stamp the transaction log. */
+	if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) {
+		static const char *es1 = "Failed to stamp transaction log "
+				"($UsnJrnl)";
+		static const char *es2 = ".  Run chkdsk.";
+
+		/* Convert to a read-only mount. */
+		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+				ON_ERRORS_CONTINUE))) {
+			ntfs_error(sb, "%s and neither on_errors=continue nor "
+					"on_errors=remount-ro was specified%s",
+					es1, es2);
+			goto iput_usnjrnl_err_out;
+		}
+		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
+		sb->s_flags |= MS_RDONLY;
 		NVolSetErrors(vol);
 	}
-	// TODO: Delete or checkpoint the $UsnJrnl if it exists.
 #endif /* NTFS_RW */
-	return TRUE;
+	return true;
 #ifdef NTFS_RW
+iput_usnjrnl_err_out:
+	if (vol->usnjrnl_j_ino)
+		iput(vol->usnjrnl_j_ino);
+	if (vol->usnjrnl_max_ino)
+		iput(vol->usnjrnl_max_ino);
+	if (vol->usnjrnl_ino)
+		iput(vol->usnjrnl_ino);
 iput_quota_err_out:
 	if (vol->quota_q_ino)
 		iput(vol->quota_q_ino);
@@ -1716,12 +2240,12 @@ iput_attrdef_err_out:
 iput_upcase_err_out:
 #endif /* NTFS_RW */
 	vol->upcase_len = 0;
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (vol->upcase == default_upcase) {
 		ntfs_nr_upcase_users--;
 		vol->upcase = NULL;
 	}
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	if (vol->upcase) {
 		ntfs_free(vol->upcase);
 		vol->upcase = NULL;
@@ -1733,7 +2257,7 @@ iput_mirr_err_out:
 	if (vol->mftmirr_ino)
 		iput(vol->mftmirr_ino);
 #endif /* NTFS_RW */
-	return FALSE;
+	return false;
 }
 
 /**
@@ -1750,6 +2274,7 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering.");
+
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -1759,6 +2284,12 @@ static void ntfs_put_super(struct super_block *sb)
 
 	/* NTFS 3.0+ specific. */
 	if (vol->major_ver >= 3) {
+		if (vol->usnjrnl_j_ino)
+			ntfs_commit_inode(vol->usnjrnl_j_ino);
+		if (vol->usnjrnl_max_ino)
+			ntfs_commit_inode(vol->usnjrnl_max_ino);
+		if (vol->usnjrnl_ino)
+			ntfs_commit_inode(vol->usnjrnl_ino);
 		if (vol->quota_q_ino)
 			ntfs_commit_inode(vol->quota_q_ino);
 		if (vol->quota_ino)
@@ -1787,7 +2318,7 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_commit_inode(vol->mft_ino);
 
 	/*
-	 * If a read-write mount and no volume errors have occured, mark the
+	 * If a read-write mount and no volume errors have occurred, mark the
 	 * volume clean.  Also, re-commit all affected inodes.
 	 */
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -1814,6 +2345,18 @@ static void ntfs_put_super(struct super_block *sb)
 	/* NTFS 3.0+ specific clean up. */
 	if (vol->major_ver >= 3) {
 #ifdef NTFS_RW
+		if (vol->usnjrnl_j_ino) {
+			iput(vol->usnjrnl_j_ino);
+			vol->usnjrnl_j_ino = NULL;
+		}
+		if (vol->usnjrnl_max_ino) {
+			iput(vol->usnjrnl_max_ino);
+			vol->usnjrnl_max_ino = NULL;
+		}
+		if (vol->usnjrnl_ino) {
+			iput(vol->usnjrnl_ino);
+			vol->usnjrnl_ino = NULL;
+		}
 		if (vol->quota_q_ino) {
 			iput(vol->quota_q_ino);
 			vol->quota_q_ino = NULL;
@@ -1859,39 +2402,12 @@ static void ntfs_put_super(struct super_block *sb)
 		vol->mftmirr_ino = NULL;
 	}
 	/*
-	 * If any dirty inodes are left, throw away all mft data page cache
-	 * pages to allow a clean umount.  This should never happen any more
-	 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.  If it does,
-	 * happen anyway, we want to know...
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (!list_empty(&sb->s_dirty)) {
-		const char *s1, *s2;
-
-		down(&vol->mft_ino->i_sem);
-		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		up(&vol->mft_ino->i_sem);
-		write_inode_now(vol->mft_ino, 1);
-		if (!list_empty(&sb->s_dirty)) {
-			static const char *_s1 = "inodes";
-			static const char *_s2 = "";
-			s1 = _s1;
-			s2 = _s2;
-		} else {
-			static const char *_s1 = "mft pages";
-			static const char *_s2 = "They have been thrown "
-					"away.  ";
-			s1 = _s1;
-			s2 = _s2;
-		}
-		ntfs_error(sb, "Dirty %s found at umount time.  %sYou should "
-				"run chkdsk.  Please email "
-				"linux-ntfs-dev@lists.sourceforge.net and say "
-				"that you saw this message.  Thank you.", s1,
-				s2);
-	}
 #endif /* NTFS_RW */
 
 	iput(vol->mft_ino);
@@ -1908,7 +2424,7 @@ static void ntfs_put_super(struct super_block *sb)
 	 * Destroy the global default upcase table if necessary.  Also decrease
 	 * the number of upcase users if we are a user.
 	 */
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (vol->upcase == default_upcase) {
 		ntfs_nr_upcase_users--;
 		vol->upcase = NULL;
@@ -1919,18 +2435,16 @@ static void ntfs_put_super(struct super_block *sb)
 	}
 	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
 		free_compression_buffers();
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	if (vol->upcase) {
 		ntfs_free(vol->upcase);
 		vol->upcase = NULL;
 	}
-	if (vol->nls_map) {
-		unload_nls(vol->nls_map);
-		vol->nls_map = NULL;
-	}
+
+	unload_nls(vol->nls_map);
+
 	sb->s_fs_info = NULL;
 	kfree(vol);
-	return;
 }
 
 /**
@@ -1955,12 +2469,9 @@ static void ntfs_put_super(struct super_block *sb)
 static s64 get_nr_free_clusters(ntfs_volume *vol)
 {
 	s64 nr_free = vol->nr_clusters;
-	u32 *kaddr;
 	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
-	unsigned long index, max_index;
-	unsigned int max_size;
+	pgoff_t index, max_index;
 
 	ntfs_debug("Entering.");
 	/* Serialize accesses to the cluster bitmap. */
@@ -1972,45 +2483,35 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	 */
 	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
 			PAGE_CACHE_SHIFT;
-	/* Use multiples of 4 bytes. */
-	max_size = PAGE_CACHE_SIZE >> 2;
-	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%x.",
-			max_index, max_size);
-	for (index = 0UL; index < max_index; index++) {
-		unsigned int i;
+	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
+			max_index, PAGE_CACHE_SIZE / 4);
+	for (index = 0; index < max_index; index++) {
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			page_cache_release(page);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < max_size; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
-		kunmap_atomic(kaddr, KM_USER0);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+		kunmap_atomic(kaddr);
 		page_cache_release(page);
 	}
 	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
@@ -2021,7 +2522,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	if (vol->nr_clusters & 63)
 		nr_free += 64 - (vol->nr_clusters & 63);
 	up_read(&vol->lcnbmp_lock);
-	/* If errors occured we may well have gone below zero, fix this. */
+	/* If errors occurred we may well have gone below zero, fix this. */
 	if (nr_free < 0)
 		nr_free = 0;
 	ntfs_debug("Exiting.");
@@ -2031,6 +2532,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 /**
  * __get_nr_free_mft_records - return the number of free inodes on a volume
  * @vol:	ntfs volume for which to obtain free inode count
+ * @nr_free:	number of mft records in filesystem
+ * @max_index:	maximum number of pages containing set bits
  *
  * Calculate the number of free mft records (inodes) on the mounted NTFS
  * volume @vol. We actually calculate the number of mft records in use instead
@@ -2043,71 +2546,48 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
  *
  * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
  */
-static unsigned long __get_nr_free_mft_records(ntfs_volume *vol)
+static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
+		s64 nr_free, const pgoff_t max_index)
 {
-	s64 nr_free;
-	u32 *kaddr;
 	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
-	unsigned long index, max_index;
-	unsigned int max_size;
+	pgoff_t index;
 
 	ntfs_debug("Entering.");
-	/* Number of mft records in file system (at this point in time). */
-	nr_free = vol->mft_ino->i_size >> vol->mft_record_size_bits;
-	/*
-	 * Convert the maximum number of set bits into bytes rounded up, then
-	 * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
-	 * have one full and one partial page max_index = 2.
-	 */
-	max_index = ((((NTFS_I(vol->mft_ino)->initialized_size >>
-			vol->mft_record_size_bits) + 7) >> 3) +
-			PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	/* Use multiples of 4 bytes. */
-	max_size = PAGE_CACHE_SIZE >> 2;
+	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
 	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
-			"0x%x.", max_index, max_size);
-	for (index = 0UL; index < max_index; index++) {
-		unsigned int i;
+			"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
+	for (index = 0; index < max_index; index++) {
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
-			page_cache_release(page);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < max_size; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
-		kunmap_atomic(kaddr, KM_USER0);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+		kunmap_atomic(kaddr);
 		page_cache_release(page);
 	}
 	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
 			index - 1);
-	/* If errors occured we may well have gone below zero, fix this. */
+	/* If errors occurred we may well have gone below zero, fix this. */
 	if (nr_free < 0)
 		nr_free = 0;
 	ntfs_debug("Exiting.");
@@ -2116,10 +2596,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol)
 
 /**
  * ntfs_statfs - return information about mounted NTFS volume
- * @sb:		super block of mounted volume
+ * @dentry:	dentry from mounted volume
  * @sfs:	statfs structure in which to return the information
  *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
  * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
  * called). We interpret the values to be correct of the moment in time at
  * which we are called. Most values are variable otherwise and this isn't just
@@ -2132,10 +2612,14 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol)
  *
  * Return 0 on success or -errno on error.
  */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
-	ntfs_volume *vol = NTFS_SB(sb);
+	struct super_block *sb = dentry->d_sb;
 	s64 size;
+	ntfs_volume *vol = NTFS_SB(sb);
+	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
+	pgoff_t max_index;
+	unsigned long flags;
 
 	ntfs_debug("Entering.");
 	/* Type of filesystem. */
@@ -2143,13 +2627,13 @@ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
 	/* Optimal transfer block size. */
 	sfs->f_bsize  = PAGE_CACHE_SIZE;
 	/*
-	 * Total data blocks in file system in units of f_bsize and since
+	 * Total data blocks in filesystem in units of f_bsize and since
 	 * inodes are also stored in data blocs ($MFT is a file) this is just
 	 * the total clusters.
 	 */
 	sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
 				PAGE_CACHE_SHIFT;
-	/* Free data blocks in file system in units of f_bsize. */
+	/* Free data blocks in filesystem in units of f_bsize. */
 	size	      = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
 				PAGE_CACHE_SHIFT;
 	if (size < 0LL)
@@ -2158,17 +2642,27 @@ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
 	sfs->f_bavail = sfs->f_bfree = size;
 	/* Serialize accesses to the inode bitmap. */
 	down_read(&vol->mftbmp_lock);
-	/* Number of inodes in file system (at this point in time). */
-	sfs->f_files = vol->mft_ino->i_size >> vol->mft_record_size_bits;
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
+	/*
+	 * Convert the maximum number of set bits into bytes rounded up, then
+	 * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
+	 * have one full and one partial page max_index = 2.
+	 */
+	max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
+			+ 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	/* Number of inodes in filesystem (at this point in time). */
+	sfs->f_files = size;
 	/* Free inodes in fs (based on current total count). */
-	sfs->f_ffree = __get_nr_free_mft_records(vol);
+	sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
 	up_read(&vol->mftbmp_lock);
 	/*
 	 * File system id. This is extremely *nix flavour dependent and even
 	 * within Linux itself all fs do their own thing. I interpret this to
 	 * mean a unique id associated with the mounted fs and not the id
-	 * associated with the file system driver, the latter is already given
-	 * by the file system type in sfs->f_type. Thus we use the 64-bit
+	 * associated with the filesystem driver, the latter is already given
+	 * by the filesystem type in sfs->f_type. Thus we use the 64-bit
 	 * volume serial number splitting it into two 32-bit parts. We enter
 	 * the least significant 32-bits in f_fsid[0] and the most significant
 	 * 32-bits in f_fsid[1].
@@ -2180,92 +2674,45 @@ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
 	return 0;
 }
 
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
 /**
  * The complete super operations.
  */
-static struct super_operations ntfs_sops = {
+static const struct super_operations ntfs_sops = {
 	.alloc_inode	= ntfs_alloc_big_inode,	  /* VFS: Allocate new inode. */
 	.destroy_inode	= ntfs_destroy_big_inode, /* VFS: Deallocate inode. */
-	.put_inode	= ntfs_put_inode,	  /* VFS: Called just before
-						     the inode reference count
-						     is decreased. */
 #ifdef NTFS_RW
-	//.dirty_inode	= NULL,			/* VFS: Called from
-	//					   __mark_inode_dirty(). */
 	.write_inode	= ntfs_write_inode,	/* VFS: Write dirty inode to
 						   disk. */
-	//.drop_inode	= NULL,			/* VFS: Called just after the
-	//					   inode reference count has
-	//					   been decreased to zero.
-	//					   NOTE: The inode lock is
-	//					   held. See fs/inode.c::
-	//					   generic_drop_inode(). */
-	//.delete_inode	= NULL,			/* VFS: Delete inode from disk.
-	//					   Called when i_count becomes
-	//					   0 and i_nlink is also 0. */
-	//.write_super	= NULL,			/* Flush dirty super block to
-	//					   disk. */
-	//.sync_fs	= NULL,			/* ? */
-	//.write_super_lockfs	= NULL,		/* ? */
-	//.unlockfs	= NULL,			/* ? */
 #endif /* NTFS_RW */
 	.put_super	= ntfs_put_super,	/* Syscall: umount. */
 	.statfs		= ntfs_statfs,		/* Syscall: statfs */
 	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.clear_inode	= ntfs_clear_big_inode,	/* VFS: Called when an inode is
+	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
 						   removed from memory. */
-	//.umount_begin	= NULL,			/* Forced umount. */
 	.show_options	= ntfs_show_options,	/* Show mount options in
 						   proc. */
 };
 
-
 /**
- * Declarations for NTFS specific export operations (fs/ntfs/namei.c).
- */
-extern struct dentry *ntfs_get_parent(struct dentry *child_dent);
-extern struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh);
-
-/**
- * Export operations allowing NFS exporting of mounted NTFS partitions.
- *
- * We use the default ->decode_fh() and ->encode_fh() for now.  Note that they
- * use 32 bits to store the inode number which is an unsigned long so on 64-bit
- * architectures is usually 64 bits so it would all fail horribly on huge
- * volumes.  I guess we need to define our own encode and decode fh functions
- * that store 64-bit inode numbers at some point but for now we will ignore the
- * problem...
- *
- * We also use the default ->get_name() helper (used by ->decode_fh() via
- * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
- * independent.
- *
- * The default ->get_parent() just returns -EACCES so we have to provide our
- * own and the default ->get_dentry() is incompatible with NTFS due to not
- * allowing the inode number 0 which is used in NTFS for the system file $MFT
- * and due to using iget() whereas NTFS needs ntfs_iget().
- */
-static struct export_operations ntfs_export_ops = {
-	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
-						   directory. */
-	.get_dentry	= ntfs_get_dentry,	/* Find a dentry for the inode
-						   given a file handle
-						   sub-fragment. */
-};
-
-/**
- * ntfs_fill_super - mount an ntfs files system
- * @sb:		super block of ntfs file system to mount
+ * ntfs_fill_super - mount an ntfs filesystem
+ * @sb:		super block of ntfs filesystem to mount
  * @opt:	string containing the mount options
  * @silent:	silence error output
  *
  * ntfs_fill_super() is called by the VFS to mount the device described by @sb
- * with the mount otions in @data with the NTFS file system.
+ * with the mount otions in @data with the NTFS filesystem.
  *
  * If @silent is true, remain silent even if errors are detected. This is used
- * during bootup, when the kernel tries to mount the root file system with all
- * registered file systems one after the other until one succeeds. This implies
- * that all file systems except the correct one will quite correctly and
+ * during bootup, when the kernel tries to mount the root filesystem with all
+ * registered filesystems one after the other until one succeeds. This implies
+ * that all filesystems except the correct one will quite correctly and
  * expectedly return an error, but nobody wants to see error messages when in
  * fact this is what is supposed to happen.
  *
@@ -2276,11 +2723,22 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	ntfs_volume *vol;
 	struct buffer_head *bh;
 	struct inode *tmp_ino;
-	int result;
+	int blocksize, result;
 
+	/*
+	 * We do a pretty difficult piece of bootstrap by reading the
+	 * MFT (and other metadata) from disk into memory. We'll only
+	 * release this metadata during umount, so the locking patterns
+	 * observed during bootstrap do not count. So turn off the
+	 * observation of locking patterns (strictly for this context
+	 * only) while mounting NTFS. [The validator is still active
+	 * otherwise, even for this context: it will for example record
+	 * lock class registrations.]
+	 */
+	lockdep_off();
 	ntfs_debug("Entering.");
 #ifndef NTFS_RW
-	sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME;
+	sb->s_flags |= MS_RDONLY;
 #endif /* ! NTFS_RW */
 	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
 	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
@@ -2289,99 +2747,110 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		if (!silent)
 			ntfs_error(sb, "Allocation of NTFS volume structure "
 					"failed. Aborting mount...");
+		lockdep_on();
 		return -ENOMEM;
 	}
 	/* Initialize ntfs_volume structure. */
-	memset(vol, 0, sizeof(ntfs_volume));
-	vol->sb = sb;
-	vol->upcase = NULL;
-	vol->attrdef = NULL;
-	vol->mft_ino = NULL;
-	vol->mftbmp_ino = NULL;
+	*vol = (ntfs_volume) {
+		.sb = sb,
+		/*
+		 * Default is group and other don't have any access to files or
+		 * directories while owner has full access. Further, files by
+		 * default are not executable but directories are of course
+		 * browseable.
+		 */
+		.fmask = 0177,
+		.dmask = 0077,
+	};
 	init_rwsem(&vol->mftbmp_lock);
-#ifdef NTFS_RW
-	vol->mftmirr_ino = NULL;
-	vol->logfile_ino = NULL;
-#endif /* NTFS_RW */
-	vol->lcnbmp_ino = NULL;
 	init_rwsem(&vol->lcnbmp_lock);
-	vol->vol_ino = NULL;
-	vol->root_ino = NULL;
-	vol->secure_ino = NULL;
-	vol->extend_ino = NULL;
-#ifdef NTFS_RW
-	vol->quota_ino = NULL;
-	vol->quota_q_ino = NULL;
-#endif /* NTFS_RW */
-	vol->nls_map = NULL;
 
-	/*
-	 * Default is group and other don't have any access to files or
-	 * directories while owner has full access. Further, files by default
-	 * are not executable but directories are of course browseable.
-	 */
-	vol->fmask = 0177;
-	vol->dmask = 0077;
-
-	unlock_kernel();
+	/* By default, enable sparse support. */
+	NVolSetSparseEnabled(vol);
 
 	/* Important to get the mount options dealt with now. */
 	if (!parse_options(vol, (char*)opt))
 		goto err_out_now;
 
+	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+		if (!silent)
+			ntfs_error(sb, "Device has unsupported sector size "
+					"(%i).  The maximum supported sector "
+					"size on this architecture is %lu "
+					"bytes.",
+					bdev_logical_block_size(sb->s_bdev),
+					PAGE_CACHE_SIZE);
+		goto err_out_now;
+	}
 	/*
-	 * TODO: Fail safety check. In the future we should really be able to
-	 * cope with this being the case, but for now just bail out.
+	 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
+	 * sector size, whichever is bigger.
 	 */
-	if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) {
+	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
+	if (blocksize < NTFS_BLOCK_SIZE) {
 		if (!silent)
-			ntfs_error(sb, "Device has unsupported hardsect_size.");
+			ntfs_error(sb, "Unable to set device block size.");
 		goto err_out_now;
 	}
-
-	/* Setup the device access block size to NTFS_BLOCK_SIZE. */
-	if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) {
+	BUG_ON(blocksize != sb->s_blocksize);
+	ntfs_debug("Set device block size to %i bytes (block size bits %i).",
+			blocksize, sb->s_blocksize_bits);
+	/* Determine the size of the device in units of block_size bytes. */
+	if (!i_size_read(sb->s_bdev->bd_inode)) {
 		if (!silent)
-			ntfs_error(sb, "Unable to set block size.");
+			ntfs_error(sb, "Unable to determine device size.");
 		goto err_out_now;
 	}
-
-	/* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */
-	vol->nr_blocks = sb->s_bdev->bd_inode->i_size >> NTFS_BLOCK_SIZE_BITS;
-
+	vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
+			sb->s_blocksize_bits;
 	/* Read the boot sector and return unlocked buffer head to it. */
 	if (!(bh = read_ntfs_boot_sector(sb, silent))) {
 		if (!silent)
 			ntfs_error(sb, "Not an NTFS volume.");
 		goto err_out_now;
 	}
-
 	/*
-	 * Extract the data from the boot sector and setup the ntfs super block
+	 * Extract the data from the boot sector and setup the ntfs volume
 	 * using it.
 	 */
 	result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
-
-	/* Initialize the cluster and mft allocators. */
-	ntfs_setup_allocators(vol);
-
 	brelse(bh);
-
 	if (!result) {
 		if (!silent)
 			ntfs_error(sb, "Unsupported NTFS filesystem.");
 		goto err_out_now;
 	}
-
 	/*
-	 * TODO: When we start coping with sector sizes different from
-	 * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the
-	 * device (probably to NTFS_BLOCK_SIZE).
+	 * If the boot sector indicates a sector size bigger than the current
+	 * device block size, switch the device block size to the sector size.
+	 * TODO: It may be possible to support this case even when the set
+	 * below fails, we would just be breaking up the i/o for each sector
+	 * into multiple blocks for i/o purposes but otherwise it should just
+	 * work.  However it is safer to leave disabled until someone hits this
+	 * error message and then we can get them to try it without the setting
+	 * so we know for sure that it works.
 	 */
-
+	if (vol->sector_size > blocksize) {
+		blocksize = sb_set_blocksize(sb, vol->sector_size);
+		if (blocksize != vol->sector_size) {
+			if (!silent)
+				ntfs_error(sb, "Unable to set device block "
+						"size to sector size (%i).",
+						vol->sector_size);
+			goto err_out_now;
+		}
+		BUG_ON(blocksize != sb->s_blocksize);
+		vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
+				sb->s_blocksize_bits;
+		ntfs_debug("Changed device block size to %i bytes (block size "
+				"bits %i) to match volume sector size.",
+				blocksize, sb->s_blocksize_bits);
+	}
+	/* Initialize the cluster and mft allocators. */
+	ntfs_setup_allocators(vol);
 	/* Setup remaining fields in the super block. */
 	sb->s_magic = NTFS_SB_MAGIC;
-
 	/*
 	 * Ntfs allows 63 bits for the file size, i.e. correct would be:
 	 *	sb->s_maxbytes = ~0ULL >> 1;
@@ -2391,9 +2860,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	 * without overflowing the index or to 2^63 - 1, whichever is smaller.
 	 */
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-
+	/* Ntfs measures time in 100ns intervals. */
 	sb->s_time_gran = 100;
-
 	/*
 	 * Now load the metadata required for the page cache and our address
 	 * space operations to function. We do this by setting up a specialised
@@ -2415,7 +2883,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 			ntfs_error(sb, "Failed to load essential metadata.");
 		goto iput_tmp_ino_err_out_now;
 	}
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	/*
 	 * The current mount is a compression user if the cluster size is
 	 * less than or equal 4kiB.
@@ -2426,7 +2894,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 			ntfs_error(NULL, "Failed to allocate buffers "
 					"for compression engine.");
 			ntfs_nr_compression_users--;
-			up(&ntfs_lock);
+			mutex_unlock(&ntfs_lock);
 			goto iput_tmp_ino_err_out_now;
 		}
 	}
@@ -2438,7 +2906,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	if (!default_upcase)
 		default_upcase = generate_default_upcase();
 	ntfs_nr_upcase_users++;
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	/*
 	 * From now on, ignore @silent parameter. If we fail below this line,
 	 * it will be due to a corrupt fs or a system error, so we report it.
@@ -2451,19 +2919,20 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		ntfs_error(sb, "Failed to load system files.");
 		goto unl_upcase_iput_tmp_ino_err_out_now;
 	}
-	if ((sb->s_root = d_alloc_root(vol->root_ino))) {
-		/* We increment i_count simulating an ntfs_iget(). */
-		atomic_inc(&vol->root_ino->i_count);
+
+	/* We grab a reference, simulating an ntfs_iget(). */
+	ihold(vol->root_ino);
+	if ((sb->s_root = d_make_root(vol->root_ino))) {
 		ntfs_debug("Exiting, status successful.");
 		/* Release the default upcase if it has no users. */
-		down(&ntfs_lock);
+		mutex_lock(&ntfs_lock);
 		if (!--ntfs_nr_upcase_users && default_upcase) {
 			ntfs_free(default_upcase);
 			default_upcase = NULL;
 		}
-		up(&ntfs_lock);
+		mutex_unlock(&ntfs_lock);
 		sb->s_export_op = &ntfs_export_ops;
-		lock_kernel();
+		lockdep_on();
 		return 0;
 	}
 	ntfs_error(sb, "Failed to allocate root directory.");
@@ -2476,6 +2945,18 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	/* NTFS 3.0+ specific clean up. */
 	if (vol->major_ver >= 3) {
 #ifdef NTFS_RW
+		if (vol->usnjrnl_j_ino) {
+			iput(vol->usnjrnl_j_ino);
+			vol->usnjrnl_j_ino = NULL;
+		}
+		if (vol->usnjrnl_max_ino) {
+			iput(vol->usnjrnl_max_ino);
+			vol->usnjrnl_max_ino = NULL;
+		}
+		if (vol->usnjrnl_ino) {
+			iput(vol->usnjrnl_ino);
+			vol->usnjrnl_ino = NULL;
+		}
 		if (vol->quota_q_ino) {
 			iput(vol->quota_q_ino);
 			vol->quota_q_ino = NULL;
@@ -2517,12 +2998,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		vol->attrdef = NULL;
 	}
 	vol->upcase_len = 0;
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (vol->upcase == default_upcase) {
 		ntfs_nr_upcase_users--;
 		vol->upcase = NULL;
 	}
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 	if (vol->upcase) {
 		ntfs_free(vol->upcase);
 		vol->upcase = NULL;
@@ -2537,40 +3018,25 @@ unl_upcase_iput_tmp_ino_err_out_now:
 	 * Decrease the number of upcase users and destroy the global default
 	 * upcase table if necessary.
 	 */
-	down(&ntfs_lock);
+	mutex_lock(&ntfs_lock);
 	if (!--ntfs_nr_upcase_users && default_upcase) {
 		ntfs_free(default_upcase);
 		default_upcase = NULL;
 	}
 	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
 		free_compression_buffers();
-	up(&ntfs_lock);
+	mutex_unlock(&ntfs_lock);
 iput_tmp_ino_err_out_now:
 	iput(tmp_ino);
 	if (vol->mft_ino && vol->mft_ino != tmp_ino)
 		iput(vol->mft_ino);
 	vol->mft_ino = NULL;
-	/*
-	 * This is needed to get ntfs_clear_extent_inode() called for each
-	 * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
-	 * leak resources and B) a subsequent mount fails automatically due to
-	 * ntfs_iget() never calling down into our ntfs_read_locked_inode()
-	 * method again... FIXME: Do we need to do this twice now because of
-	 * attribute inodes? I think not, so leave as is for now... (AIA)
-	 */
-	if (invalidate_inodes(sb)) {
-		ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
-				"driver bug.");
-		/* Copied from fs/super.c. I just love this message. (-; */
-		printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
-				"seconds.  Have a nice day...\n");
-	}
 	/* Errors at this stage are irrelevant. */
 err_out_now:
-	lock_kernel();
 	sb->s_fs_info = NULL;
 	kfree(vol);
 	ntfs_debug("Failed, returning -EINVAL.");
+	lockdep_on();
 	return -EINVAL;
 }
 
@@ -2579,46 +3045,44 @@ err_out_now:
  * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
  * (255) Unicode characters + a terminating NULL Unicode character.
  */
-kmem_cache_t *ntfs_name_cache;
+struct kmem_cache *ntfs_name_cache;
 
-/* Slab caches for efficient allocation/deallocation of of inodes. */
-kmem_cache_t *ntfs_inode_cache;
-kmem_cache_t *ntfs_big_inode_cache;
+/* Slab caches for efficient allocation/deallocation of inodes. */
+struct kmem_cache *ntfs_inode_cache;
+struct kmem_cache *ntfs_big_inode_cache;
 
 /* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
-		unsigned long flags)
+static void ntfs_big_inode_init_once(void *foo)
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(VFS_I(ni));
+	inode_init_once(VFS_I(ni));
 }
 
 /*
  * Slab caches to optimize allocations and deallocations of attribute search
  * contexts and index contexts, respectively.
  */
-kmem_cache_t *ntfs_attr_ctx_cache;
-kmem_cache_t *ntfs_index_ctx_cache;
+struct kmem_cache *ntfs_attr_ctx_cache;
+struct kmem_cache *ntfs_index_ctx_cache;
 
-/* Driver wide semaphore. */
-DECLARE_MUTEX(ntfs_lock);
+/* Driver wide mutex. */
+DEFINE_MUTEX(ntfs_lock);
 
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
 }
 
 static struct file_system_type ntfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ntfs",
-	.get_sb		= ntfs_get_sb,
+	.mount		= ntfs_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ntfs");
 
 /* Stable names for the slab caches. */
 static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
@@ -2632,7 +3096,7 @@ static int __init init_ntfs_fs(void)
 	int err = 0;
 
 	/* This may be ugly but it results in pretty output so who cares. (-8 */
-	printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/"
+	pr_info("driver " NTFS_VERSION " [Flags: R/"
 #ifdef NTFS_RW
 			"W"
 #else
@@ -2650,53 +3114,49 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
 			sizeof(ntfs_index_context), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
 	if (!ntfs_index_ctx_cache) {
-		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
-				ntfs_index_ctx_cache_name);
+		pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
 		goto ictx_err_out;
 	}
 	ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
 			sizeof(ntfs_attr_search_ctx), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */, NULL /* dtor */);
+			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
 	if (!ntfs_attr_ctx_cache) {
-		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
-				ntfs_attr_ctx_cache_name);
+		pr_crit("NTFS: Failed to create %s!\n",
+			ntfs_attr_ctx_cache_name);
 		goto actx_err_out;
 	}
 
 	ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
 			(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
-			SLAB_HWCACHE_ALIGN, NULL, NULL);
+			SLAB_HWCACHE_ALIGN, NULL);
 	if (!ntfs_name_cache) {
-		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
-				ntfs_name_cache_name);
+		pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
 		goto name_err_out;
 	}
 
 	ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
 			sizeof(ntfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT, NULL, NULL);
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
 	if (!ntfs_inode_cache) {
-		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
-				ntfs_inode_cache_name);
+		pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
 		goto inode_err_out;
 	}
 
 	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
 			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
-			ntfs_big_inode_init_once, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			ntfs_big_inode_init_once);
 	if (!ntfs_big_inode_cache) {
-		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
-				ntfs_big_inode_cache_name);
+		pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
 		goto big_inode_err_out;
 	}
 
 	/* Register the ntfs sysctls. */
 	err = ntfs_sysctl(1);
 	if (err) {
-		printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n");
+		pr_crit("Failed to register NTFS sysctls!\n");
 		goto sysctl_err_out;
 	}
 
@@ -2705,8 +3165,10 @@ static int __init init_ntfs_fs(void)
 		ntfs_debug("NTFS driver registered successfully.");
 		return 0; /* Success! */
 	}
-	printk(KERN_CRIT "NTFS: Failed to register NTFS file system driver!\n");
+	pr_crit("Failed to register NTFS filesystem driver!\n");
 
+	/* Unregister the ntfs sysctls. */
+	ntfs_sysctl(0);
 sysctl_err_out:
 	kmem_cache_destroy(ntfs_big_inode_cache);
 big_inode_err_out:
@@ -2719,8 +3181,7 @@ actx_err_out:
 	kmem_cache_destroy(ntfs_index_ctx_cache);
 ictx_err_out:
 	if (!err) {
-		printk(KERN_CRIT "NTFS: Aborting NTFS file system driver "
-				"registration...\n");
+		pr_crit("Aborting NTFS filesystem driver registration...\n");
 		err = -ENOMEM;
 	}
 	return err;
@@ -2728,42 +3189,30 @@ ictx_err_out:
 
 static void __exit exit_ntfs_fs(void)
 {
-	int err = 0;
-
 	ntfs_debug("Unregistering NTFS driver.");
 
 	unregister_filesystem(&ntfs_fs_type);
 
-	if (kmem_cache_destroy(ntfs_big_inode_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_big_inode_cache_name);
-	if (kmem_cache_destroy(ntfs_inode_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_inode_cache_name);
-	if (kmem_cache_destroy(ntfs_name_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_name_cache_name);
-	if (kmem_cache_destroy(ntfs_attr_ctx_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_attr_ctx_cache_name);
-	if (kmem_cache_destroy(ntfs_index_ctx_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_index_ctx_cache_name);
-	if (err)
-		printk(KERN_CRIT "NTFS: This causes memory to leak! There is "
-				"probably a BUG in the driver! Please report "
-				"you saw this message to "
-				"linux-ntfs-dev@lists.sourceforge.net\n");
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+	kmem_cache_destroy(ntfs_big_inode_cache);
+	kmem_cache_destroy(ntfs_inode_cache);
+	kmem_cache_destroy(ntfs_name_cache);
+	kmem_cache_destroy(ntfs_attr_ctx_cache);
+	kmem_cache_destroy(ntfs_index_ctx_cache);
 	/* Unregister the ntfs sysctls. */
 	ntfs_sysctl(0);
 }
 
-MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2004 Anton Altaparmakov");
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
 MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
 #endif
 
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 75067e4f303..a503156ec15 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -1,9 +1,9 @@
 /*
  * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
  *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von L�wis, R�gis Duchesne
+ *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
  *
- * Copyright (c) 2002-2004 Anton Altaparmakov
+ * Copyright (c) 2002-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -33,24 +33,30 @@
 #include "sysctl.h"
 #include "debug.h"
 
-#define FS_NTFS	1
-
 /* Definition of the ntfs sysctl. */
-static ctl_table ntfs_sysctls[] = {
-	{ FS_NTFS, "ntfs-debug",		/* Binary and text IDs. */
-	  &debug_msgs,sizeof(debug_msgs),	/* Data pointer and size. */
-	  0644,	NULL, &proc_dointvec },		/* Mode, child, proc handler. */
-	{ 0 }
+static struct ctl_table ntfs_sysctls[] = {
+	{
+		.procname	= "ntfs-debug",
+		.data		= &debug_msgs,		/* Data pointer and size. */
+		.maxlen		= sizeof(debug_msgs),
+		.mode		= 0644,			/* Mode, proc handler. */
+		.proc_handler	= proc_dointvec
+	},
+	{}
 };
 
 /* Define the parent directory /proc/sys/fs. */
-static ctl_table sysctls_root[] = {
-	{ CTL_FS, "fs", NULL, 0, 0555, ntfs_sysctls },
-	{ 0 }
+static struct ctl_table sysctls_root[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= ntfs_sysctls
+	},
+	{}
 };
 
 /* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table = NULL;
+static struct ctl_table_header *sysctls_root_table;
 
 /**
  * ntfs_sysctl - add or remove the debug sysctl
@@ -62,17 +68,9 @@ int ntfs_sysctl(int add)
 {
 	if (add) {
 		BUG_ON(sysctls_root_table);
-		sysctls_root_table = register_sysctl_table(sysctls_root, 0);
+		sysctls_root_table = register_sysctl_table(sysctls_root);
 		if (!sysctls_root_table)
 			return -ENOMEM;
-#ifdef CONFIG_PROC_FS
-		/*
-		 * If the proc file system is in use and we are a module, need
-		 * to set the owner of our proc entry to our module. In the
-		 * non-modular case, THIS_MODULE is NULL, so this is ok.
-		 */
-		ntfs_sysctls[0].de->owner = THIS_MODULE;
-#endif
 	} else {
 		BUG_ON(!sysctls_root_table);
 		unregister_sysctl_table(sysctls_root_table);
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
index df749cc0aac..d4f8ce920d9 100644
--- a/fs/ntfs/sysctl.h
+++ b/fs/ntfs/sysctl.h
@@ -1,7 +1,7 @@
 /*
  * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
  *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von L�wis, R�gis Duchesne
+ *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
  *
  * Copyright (c) 2002-2004 Anton Altaparmakov
  *
@@ -24,9 +24,8 @@
 #ifndef _LINUX_NTFS_SYSCTL_H
 #define _LINUX_NTFS_SYSCTL_H
 
-#include <linux/config.h>
 
-#if (DEBUG && CONFIG_SYSCTL)
+#if defined(DEBUG) && defined(CONFIG_SYSCTL)
 
 extern int ntfs_sysctl(int add);
 
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
index a09a51dabe4..01233989d5d 100644
--- a/fs/ntfs/time.h
+++ b/fs/ntfs/time.h
@@ -1,7 +1,7 @@
 /*
  * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -87,7 +87,7 @@ static inline struct timespec ntfs2utc(const sle64 time)
 	struct timespec ts;
 
 	/* Subtract the NTFS time offset. */
-	s64 t = sle64_to_cpu(time) - NTFS_TIME_OFFSET;
+	u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
 	/*
 	 * Convert the time to 1-second intervals and the remainder to
 	 * 1-nano-second intervals.
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
index 08a55aa53d4..8c8053b6698 100644
--- a/fs/ntfs/types.h
+++ b/fs/ntfs/types.h
@@ -2,7 +2,7 @@
  * types.h - Defines for NTFS Linux kernel driver specific types.
  *	     Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -53,10 +53,13 @@ typedef sle64 leLCN;
 typedef s64 LSN;
 typedef sle64 leLSN;
 
-typedef enum {
-	FALSE = 0,
-	TRUE = 1
-} BOOL;
+/*
+ * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
+ * We define our own type USN, to allow for type checking and better code
+ * readability.
+ */
+typedef s64 USN;
+typedef sle64 leUSN;
 
 typedef enum {
 	CASE_SENSITIVE = 0,
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index 560b0ea255b..005ca4b0f13 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -1,7 +1,7 @@
 /*
  * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -19,6 +19,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/slab.h>
+
 #include "types.h"
 #include "debug.h"
 #include "ntfs.h"
@@ -59,16 +61,16 @@ static const u8 legal_ansi_char_array[0x40] = {
  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
  * @upcase_size:	length in Unicode characters of @upcase (if present)
  *
- * Compare the names @s1 and @s2 and return TRUE (1) if the names are
- * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
+ * Compare the names @s1 and @s2 and return 'true' (1) if the names are
+ * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
  * the @upcase table is used to performa a case insensitive comparison.
  */
-BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
 		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
 		const ntfschar *upcase, const u32 upcase_size)
 {
 	if (s1_len != s2_len)
-		return FALSE;
+		return false;
 	if (ic == CASE_SENSITIVE)
 		return !ntfs_ucsncmp(s1, s2, s1_len);
 	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
@@ -242,7 +244,7 @@ int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
  * map dictates, into a little endian, 2-byte Unicode string.
  *
  * This function allocates the string and the caller is responsible for
- * calling kmem_cache_free(ntfs_name_cache, @outs); when finished with it.
+ * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
  *
  * On success the function returns the number of Unicode characters written to
  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
@@ -262,37 +264,48 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
 	wchar_t wc;
 	int i, o, wc_len;
 
-	/* We don't trust outside sources. */
-	if (ins) {
-		ucs = (ntfschar*)kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS);
-		if (ucs) {
+	/* We do not trust outside sources. */
+	if (likely(ins)) {
+		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
+		if (likely(ucs)) {
 			for (i = o = 0; i < ins_len; i += wc_len) {
 				wc_len = nls->char2uni(ins + i, ins_len - i,
 						&wc);
-				if (wc_len >= 0) {
-					if (wc) {
+				if (likely(wc_len >= 0 &&
+						o < NTFS_MAX_NAME_LEN)) {
+					if (likely(wc)) {
 						ucs[o++] = cpu_to_le16(wc);
 						continue;
-					} /* else (!wc) */
+					} /* else if (!wc) */
 					break;
-				} /* else (wc_len < 0) */
-				goto conversion_err;
+				} /* else if (wc_len < 0 ||
+						o >= NTFS_MAX_NAME_LEN) */
+				goto name_err;
 			}
 			ucs[o] = 0;
 			*outs = ucs;
 			return o;
-		} /* else (!ucs) */
-		ntfs_error(vol->sb, "Failed to allocate name from "
-				"ntfs_name_cache!");
+		} /* else if (!ucs) */
+		ntfs_error(vol->sb, "Failed to allocate buffer for converted "
+				"name from ntfs_name_cache.");
 		return -ENOMEM;
-	} /* else (!ins) */
-	ntfs_error(NULL, "Received NULL pointer.");
+	} /* else if (!ins) */
+	ntfs_error(vol->sb, "Received NULL pointer.");
 	return -EINVAL;
-conversion_err:
-	ntfs_error(vol->sb, "Name using character set %s contains characters "
-			"that cannot be converted to Unicode.", nls->charset);
+name_err:
 	kmem_cache_free(ntfs_name_cache, ucs);
-	return -EILSEQ;
+	if (wc_len < 0) {
+		ntfs_error(vol->sb, "Name using character set %s contains "
+				"characters that cannot be converted to "
+				"Unicode.", nls->charset);
+		i = -EILSEQ;
+	} else /* if (o >= NTFS_MAX_NAME_LEN) */ {
+		ntfs_error(vol->sb, "Name is too long (maximum length for a "
+				"name on NTFS is %d Unicode characters.",
+				NTFS_MAX_NAME_LEN);
+		i = -ENAMETOOLONG;
+	}
+	return i;
 }
 
 /**
@@ -337,7 +350,7 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
 		}
 		if (!ns) {
 			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
-			ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS);
+			ns = kmalloc(ns_len + 1, GFP_NOFS);
 			if (!ns)
 				goto mem_err_out;
 		}
@@ -352,7 +365,7 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
 			else if (wc == -ENAMETOOLONG && ns != *outs) {
 				unsigned char *tc;
 				/* Grow in multiples of 64 bytes. */
-				tc = (unsigned char*)kmalloc((ns_len + 64) &
+				tc = kmalloc((ns_len + 64) &
 						~63, GFP_NOFS);
 				if (tc) {
 					memcpy(tc, ns, ns_len);
@@ -372,7 +385,8 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
 	return -EINVAL;
 conversion_err:
 	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-			"converted to character set %s.", nls->charset);
+			"converted to character set %s.  You might want to "
+			"try to use the mount option nls=utf8.", nls->charset);
 	if (ns != *outs)
 		kfree(ns);
 	if (wc != -ENAMETOOLONG)
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 879cdf1d5bd..e2f72ca9803 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -3,10 +3,7 @@
  *	      Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov.
- * Modified for kernel inclusion 10 September 2001 by Anton Altparmakov.
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -75,15 +72,15 @@ ntfschar *generate_default_upcase(void)
 	if (!uc)
 		return uc;
 	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+	/* Generate the little endian Unicode upcase table used by ntfs. */
 	for (i = 0; i < default_upcase_len; i++)
 		uc[i] = cpu_to_le16(i);
 	for (r = 0; uc_run_table[r][0]; r++)
 		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
-			uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) +
-					uc_run_table[r][2]));
+			le16_add_cpu(&uc[i], uc_run_table[r][2]);
 	for (r = 0; uc_dup_table[r][0]; r++)
 		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
-			uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
+			le16_add_cpu(&uc[i + 1], -1);
 	for (r = 0; uc_word_table[r][0]; r++)
 		uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
 	return uc;
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
new file mode 100644
index 00000000000..b2bc0d55b03
--- /dev/null
+++ b/fs/ntfs/usnjrnl.c
@@ -0,0 +1,84 @@
+/*
+ * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling.  Part of the
+ *	       Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+
+#include "aops.h"
+#include "debug.h"
+#include "endian.h"
+#include "time.h"
+#include "types.h"
+#include "usnjrnl.h"
+#include "volume.h"
+
+/**
+ * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
+ * @vol:	ntfs volume on which to stamp the transaction log
+ *
+ * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
+ * 'true' on success and 'false' on error.
+ *
+ * This function assumes that the transaction log has already been loaded and
+ * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
+ */
+bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
+{
+	ntfs_debug("Entering.");
+	if (likely(!NVolUsnJrnlStamped(vol))) {
+		sle64 stamp;
+		struct page *page;
+		USN_HEADER *uh;
+
+		page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+		if (IS_ERR(page)) {
+			ntfs_error(vol->sb, "Failed to read from "
+					"$UsnJrnl/$DATA/$Max attribute.");
+			return false;
+		}
+		uh = (USN_HEADER*)page_address(page);
+		stamp = get_current_ntfs_time();
+		ntfs_debug("Stamping transaction log ($UsnJrnl): old "
+				"journal_id 0x%llx, old lowest_valid_usn "
+				"0x%llx, new journal_id 0x%llx, new "
+				"lowest_valid_usn 0x%llx.",
+				(long long)sle64_to_cpu(uh->journal_id),
+				(long long)sle64_to_cpu(uh->lowest_valid_usn),
+				(long long)sle64_to_cpu(stamp),
+				i_size_read(vol->usnjrnl_j_ino));
+		uh->lowest_valid_usn =
+				cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
+		uh->journal_id = stamp;
+		flush_dcache_page(page);
+		set_page_dirty(page);
+		ntfs_unmap_page(page);
+		/* Set the flag so we do not have to do it again on remount. */
+		NVolSetUsnJrnlStamped(vol);
+	}
+	ntfs_debug("Done.");
+	return true;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
new file mode 100644
index 00000000000..00d8e6bd7c3
--- /dev/null
+++ b/fs/ntfs/usnjrnl.h
@@ -0,0 +1,205 @@
+/*
+ * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
+ *	       Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _LINUX_NTFS_USNJRNL_H
+#define _LINUX_NTFS_USNJRNL_H
+
+#ifdef NTFS_RW
+
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+#include "volume.h"
+
+/*
+ * Transaction log ($UsnJrnl) organization:
+ *
+ * The transaction log records whenever a file is modified in any way.  So for
+ * example it will record that file "blah" was written to at a particular time
+ * but not what was written.  If will record that a file was deleted or
+ * created, that a file was truncated, etc.  See below for all the reason
+ * codes used.
+ *
+ * The transaction log is in the $Extend directory which is in the root
+ * directory of each volume.  If it is not present it means transaction
+ * logging is disabled.  If it is present it means transaction logging is
+ * either enabled or in the process of being disabled in which case we can
+ * ignore it as it will go away as soon as Windows gets its hands on it.
+ *
+ * To determine whether the transaction logging is enabled or in the process
+ * of being disabled, need to check the volume flags in the
+ * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
+ * in the root directory and has a fixed mft record number, see layout.h).
+ * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
+ * is in the process of being disabled and if this flag is clear it means the
+ * transaction log is enabled.
+ *
+ * The transaction log consists of two parts; the $DATA/$Max attribute as well
+ * as the $DATA/$J attribute.  $Max is a header describing the transaction
+ * log whilst $J is the transaction log data itself as a sequence of variable
+ * sized USN_RECORDs (see below for all the structures).
+ *
+ * We do not care about transaction logging at this point in time but we still
+ * need to let windows know that the transaction log is out of date.  To do
+ * this we need to stamp the transaction log.  This involves setting the
+ * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
+ * for the next added USN_RECORD to the $DATA/$J attribute as well as
+ * generating a new journal_id in $DATA/$Max.
+ *
+ * The journal_id is as of the current version (2.0) of the transaction log
+ * simply the 64-bit timestamp of when the journal was either created or last
+ * stamped.
+ *
+ * To determine the next usn there are two ways.  The first is to parse
+ * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
+ * to its usn (which is the byte offset in the $DATA/$J attribute).  The
+ * second is simply to take the data size of the attribute.  Since the usns
+ * are simply byte offsets into $DATA/$J, this is exactly the next usn.  For
+ * obvious reasons we use the second method as it is much simpler and faster.
+ *
+ * As an aside, note that to actually disable the transaction log, one would
+ * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
+ * through all the mft records on the volume and set the usn field in their
+ * $STANDARD_INFORMATION attribute to zero.  Once that is done, one would need
+ * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
+ * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
+ *
+ * Note that if a volume is unmounted whilst the transaction log is being
+ * disabled, the process will continue the next time the volume is mounted.
+ * This is why we can safely mount read-write when we see a transaction log
+ * in the process of being deleted.
+ */
+
+/* Some $UsnJrnl related constants. */
+#define UsnJrnlMajorVer		2
+#define UsnJrnlMinorVer		0
+
+/*
+ * $DATA/$Max attribute.  This is (always?) resident and has a fixed size of
+ * 32 bytes.  It contains the header describing the transaction log.
+ */
+typedef struct {
+/*Ofs*/
+/*   0*/sle64 maximum_size;	/* The maximum on-disk size of the $DATA/$J
+				   attribute. */
+/*   8*/sle64 allocation_delta;	/* Number of bytes by which to increase the
+				   size of the $DATA/$J attribute. */
+/*0x10*/sle64 journal_id;	/* Current id of the transaction log. */
+/*0x18*/leUSN lowest_valid_usn;	/* Lowest valid usn in $DATA/$J for the
+				   current journal_id. */
+/* sizeof() = 32 (0x20) bytes */
+} __attribute__ ((__packed__)) USN_HEADER;
+
+/*
+ * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
+ * file since it was last opened.  I think the names speak for themselves but
+ * if you disagree check out the descriptions in the Linux NTFS project NTFS
+ * documentation: http://www.linux-ntfs.org/
+ */
+enum {
+	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
+	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
+	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
+	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
+	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
+	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
+	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
+	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
+	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
+	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
+	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
+	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
+	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
+	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
+	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
+	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
+	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
+	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
+	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
+	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
+};
+
+typedef le32 USN_REASON_FLAGS;
+
+/*
+ * Source info flags (32-bit).  Information about the source of the change(s)
+ * to the file.  For detailed descriptions of what these mean, see the Linux
+ * NTFS project NTFS documentation:
+ *	http://www.linux-ntfs.org/
+ */
+enum {
+	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
+	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
+	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
+};
+
+typedef le32 USN_SOURCE_INFO_FLAGS;
+
+/*
+ * $DATA/$J attribute.  This is always non-resident, is marked as sparse, and
+ * is of variabled size.  It consists of a sequence of variable size
+ * USN_RECORDS.  The minimum allocated_size is allocation_delta as
+ * specified in $DATA/$Max.  When the maximum_size specified in $DATA/$Max is
+ * exceeded by more than allocation_delta bytes, allocation_delta bytes are
+ * allocated and appended to the $DATA/$J attribute and an equal number of
+ * bytes at the beginning of the attribute are freed and made sparse.  Note the
+ * making sparse only happens at volume checkpoints and hence the actual
+ * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
+ */
+typedef struct {
+/*Ofs*/
+/*   0*/le32 length;		/* Byte size of this record (8-byte
+				   aligned). */
+/*   4*/le16 major_ver;		/* Major version of the transaction log used
+				   for this record. */
+/*   6*/le16 minor_ver;		/* Minor version of the transaction log used
+				   for this record. */
+/*   8*/leMFT_REF mft_reference;/* The mft reference of the file (or
+				   directory) described by this record. */
+/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
+				   directory of the file described by this
+				   record. */
+/*0x18*/leUSN usn;		/* The usn of this record.  Equals the offset
+				   within the $DATA/$J attribute. */
+/*0x20*/sle64 time;		/* Time when this record was created. */
+/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
+/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
+/*0x30*/le32 security_id;	/* File security_id copied from
+				   $STANDARD_INFORMATION. */
+/*0x34*/FILE_ATTR_FLAGS file_attributes;	/* File attributes copied from
+				   $STANDARD_INFORMATION or $FILE_NAME (not
+				   sure which). */
+/*0x38*/le16 file_name_size;	/* Size of the file name in bytes. */
+/*0x3a*/le16 file_name_offset;	/* Offset to the file name in bytes from the
+				   start of this record. */
+/*0x3c*/ntfschar file_name[0];	/* Use when creating only.  When reading use
+				   file_name_offset to determine the location
+				   of the name. */
+/* sizeof() = 60 (0x3c) bytes */
+} __attribute__ ((__packed__)) USN_RECORD;
+
+extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 4b97fa8635a..4f579b02bc7 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -2,7 +2,7 @@
  * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
  *	      of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -25,6 +25,7 @@
 #define _LINUX_NTFS_VOLUME_H
 
 #include <linux/rwsem.h>
+#include <linux/uidgid.h>
 
 #include "types.h"
 #include "layout.h"
@@ -41,20 +42,18 @@ typedef struct {
 	 * structure has stabilized... (AIA)
 	 */
 	/* Device specifics. */
-	struct super_block *sb;		/* Pointer back to the super_block,
-					   so we don't have to get the offset
-					   every time. */
-	LCN nr_blocks;			/* Number of NTFS_BLOCK_SIZE bytes
+	struct super_block *sb;		/* Pointer back to the super_block. */
+	LCN nr_blocks;			/* Number of sb->s_blocksize bytes
 					   sized blocks on the device. */
 	/* Configuration provided by user at mount time. */
 	unsigned long flags;		/* Miscellaneous flags, see below. */
-	uid_t uid;			/* uid that files will be mounted as. */
-	gid_t gid;			/* gid that files will be mounted as. */
-	mode_t fmask;			/* The mask for file permissions. */
-	mode_t dmask;			/* The mask for directory
+	kuid_t uid;			/* uid that files will be mounted as. */
+	kgid_t gid;			/* gid that files will be mounted as. */
+	umode_t fmask;			/* The mask for file permissions. */
+	umode_t dmask;			/* The mask for directory
 					   permissions. */
 	u8 mft_zone_multiplier;		/* Initial mft zone multiplier. */
-	u8 on_errors;			/* What to do on file system errors. */
+	u8 on_errors;			/* What to do on filesystem errors. */
 	/* NTFS bootsector provided information. */
 	u16 sector_size;		/* in bytes */
 	u8 sector_size_bits;		/* log2(sector_size) */
@@ -125,6 +124,10 @@ typedef struct {
 	/* $Quota stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
 	struct inode *quota_ino;	/* The VFS inode of $Quota. */
 	struct inode *quota_q_ino;	/* Attribute inode for $Quota/$Q. */
+	/* $UsnJrnl stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
+	struct inode *usnjrnl_ino;	/* The VFS inode of $UsnJrnl. */
+	struct inode *usnjrnl_max_ino;	/* Attribute inode for $UsnJrnl/$Max. */
+	struct inode *usnjrnl_j_ino;	/* Attribute inode for $UsnJrnl/$J. */
 #endif /* NTFS_RW */
 	struct nls_table *nls_map;
 } ntfs_volume;
@@ -137,17 +140,19 @@ typedef enum {
 	NV_ShowSystemFiles,	/* 1: Return system files in ntfs_readdir(). */
 	NV_CaseSensitive,	/* 1: Treat file names as case sensitive and
 				      create filenames in the POSIX namespace.
-				      Otherwise be case insensitive and create
-				      file names in WIN32 namespace. */
+				      Otherwise be case insensitive but still
+				      create file names in POSIX namespace. */
 	NV_LogFileEmpty,	/* 1: $LogFile journal is empty. */
 	NV_QuotaOutOfDate,	/* 1: $Quota is out of date. */
+	NV_UsnJrnlStamped,	/* 1: $UsnJrnl has been stamped. */
+	NV_SparseEnabled,	/* 1: May create sparse files. */
 } ntfs_volume_flags;
 
 /*
  * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
  * functions.
  */
-#define NVOL_FNS(flag)					\
+#define DEFINE_NVOL_BIT_OPS(flag)					\
 static inline int NVol##flag(ntfs_volume *vol)		\
 {							\
 	return test_bit(NV_##flag, &(vol)->flags);	\
@@ -162,10 +167,12 @@ static inline void NVolClear##flag(ntfs_volume *vol)	\
 }
 
 /* Emit the ntfs volume bitops functions. */
-NVOL_FNS(Errors)
-NVOL_FNS(ShowSystemFiles)
-NVOL_FNS(CaseSensitive)
-NVOL_FNS(LogFileEmpty)
-NVOL_FNS(QuotaOutOfDate)
+DEFINE_NVOL_BIT_OPS(Errors)
+DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
+DEFINE_NVOL_BIT_OPS(CaseSensitive)
+DEFINE_NVOL_BIT_OPS(LogFileEmpty)
+DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
+DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
+DEFINE_NVOL_BIT_OPS(SparseEnabled)
 
 #endif /* _LINUX_NTFS_VOLUME_H */