diff options
Diffstat (limited to 'fs/ext4/file.c')
| -rw-r--r-- | fs/ext4/file.c | 517 | 
1 files changed, 464 insertions, 53 deletions
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5a5c55ddcee..8695f70af1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,7 +23,9 @@  #include <linux/jbd2.h>  #include <linux/mount.h>  #include <linux/path.h> +#include <linux/aio.h>  #include <linux/quotaops.h> +#include <linux/pagevec.h>  #include "ext4.h"  #include "ext4_jbd2.h"  #include "xattr.h" @@ -55,37 +57,145 @@ static int ext4_release_file(struct inode *inode, struct file *filp)  	return 0;  } +static void ext4_unwritten_wait(struct inode *inode) +{ +	wait_queue_head_t *wq = ext4_ioend_wq(inode); + +	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete.  Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block.  If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +{ +	struct super_block *sb = inode->i_sb; +	int blockmask = sb->s_blocksize - 1; + +	if (pos >= i_size_read(inode)) +		return 0; + +	if ((pos | iov_iter_alignment(from)) & blockmask) +		return 1; + +	return 0; +} +  static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, -		unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(iocb->ki_filp); +	struct mutex *aio_mutex = NULL; +	struct blk_plug plug; +	int o_direct = file->f_flags & O_DIRECT; +	int overwrite = 0; +	size_t length = iov_iter_count(from); +	ssize_t ret; +	loff_t pos = iocb->ki_pos; + +	/* +	 * Unaligned direct AIO must be serialized; see comment above +	 * In the case of O_APPEND, assume that we must always serialize +	 */ +	if (o_direct && +	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && +	    !is_sync_kiocb(iocb) && +	    (file->f_flags & O_APPEND || +	     ext4_unaligned_aio(inode, from, pos))) { +		aio_mutex = ext4_aio_mutex(inode); +		mutex_lock(aio_mutex); +		ext4_unwritten_wait(inode); +	} + +	mutex_lock(&inode->i_mutex); +	if (file->f_flags & O_APPEND) +		iocb->ki_pos = pos = i_size_read(inode);  	/*  	 * If we have encountered a bitmap-format file, the size limit  	 * is smaller than s_maxbytes, which is for extent-mapped files.  	 */ -  	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {  		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); -		size_t length = iov_length(iov, nr_segs); -		if ((pos > sbi->s_bitmap_maxbytes || -		    (pos == sbi->s_bitmap_maxbytes && length > 0))) -			return -EFBIG; +		if ((pos > sbi->s_bitmap_maxbytes) || +		    (pos == sbi->s_bitmap_maxbytes && length > 0)) { +			mutex_unlock(&inode->i_mutex); +			ret = -EFBIG; +			goto errout; +		} + +		if (pos + length > sbi->s_bitmap_maxbytes) +			iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); +	} + +	if (o_direct) { +		blk_start_plug(&plug); + +		iocb->private = &overwrite; -		if (pos + length > sbi->s_bitmap_maxbytes) { -			nr_segs = iov_shorten((struct iovec *)iov, nr_segs, -					      sbi->s_bitmap_maxbytes - pos); +		/* check whether we do a DIO overwrite or not */ +		if (ext4_should_dioread_nolock(inode) && !aio_mutex && +		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { +			struct ext4_map_blocks map; +			unsigned int blkbits = inode->i_blkbits; +			int err, len; + +			map.m_lblk = pos >> blkbits; +			map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) +				- map.m_lblk; +			len = map.m_len; + +			err = ext4_map_blocks(NULL, inode, &map, 0); +			/* +			 * 'err==len' means that all of blocks has +			 * been preallocated no matter they are +			 * initialized or not.  For excluding +			 * unwritten extents, we need to check +			 * m_flags.  There are two conditions that +			 * indicate for initialized extents.  1) If we +			 * hit extent cache, EXT4_MAP_MAPPED flag is +			 * returned; 2) If we do a real lookup, +			 * non-flags are returned.  So we should check +			 * these two conditions. +			 */ +			if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) +				overwrite = 1;  		}  	} -	return generic_file_aio_write(iocb, iov, nr_segs, pos); +	ret = __generic_file_write_iter(iocb, from); +	mutex_unlock(&inode->i_mutex); + +	if (ret > 0) { +		ssize_t err; + +		err = generic_write_sync(file, iocb->ki_pos - ret, ret); +		if (err < 0) +			ret = err; +	} +	if (o_direct) +		blk_finish_plug(&plug); + +errout: +	if (aio_mutex) +		mutex_unlock(aio_mutex); +	return ret;  }  static const struct vm_operations_struct ext4_file_vm_ops = {  	.fault		= filemap_fault, +	.map_pages	= filemap_map_pages,  	.page_mkwrite   = ext4_page_mkwrite, +	.remap_pages	= generic_file_remap_pages,  };  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -96,7 +206,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)  		return -ENOEXEC;  	file_accessed(file);  	vma->vm_ops = &ext4_file_vm_ops; -	vma->vm_flags |= VM_CAN_NONLINEAR;  	return 0;  } @@ -122,62 +231,366 @@ static int ext4_file_open(struct inode * inode, struct file * filp)  		path.dentry = mnt->mnt_root;  		cp = d_path(&path, buf, sizeof(buf));  		if (!IS_ERR(cp)) { -			memcpy(sbi->s_es->s_last_mounted, cp, -			       sizeof(sbi->s_es->s_last_mounted)); -			ext4_mark_super_dirty(sb); +			handle_t *handle; +			int err; + +			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); +			if (IS_ERR(handle)) +				return PTR_ERR(handle); +			BUFFER_TRACE(sbi->s_sbh, "get_write_access"); +			err = ext4_journal_get_write_access(handle, sbi->s_sbh); +			if (err) { +				ext4_journal_stop(handle); +				return err; +			} +			strlcpy(sbi->s_es->s_last_mounted, cp, +				sizeof(sbi->s_es->s_last_mounted)); +			ext4_handle_dirty_super(handle, sb); +			ext4_journal_stop(handle);  		}  	} +	/* +	 * Set up the jbd2_inode if we are opening the inode for +	 * writing and the journal is present +	 */ +	if (filp->f_mode & FMODE_WRITE) { +		int ret = ext4_inode_attach_jinode(inode); +		if (ret < 0) +			return ret; +	}  	return dquot_file_open(inode, filp);  }  /* - * ext4_llseek() copied from generic_file_llseek() to handle both - * block-mapped and extent-mapped maxbytes values. This should - * otherwise be identical with generic_file_llseek(). + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function.  When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not.   */ -loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +static int ext4_find_unwritten_pgoff(struct inode *inode, +				     int whence, +				     struct ext4_map_blocks *map, +				     loff_t *offset) +{ +	struct pagevec pvec; +	unsigned int blkbits; +	pgoff_t index; +	pgoff_t end; +	loff_t endoff; +	loff_t startoff; +	loff_t lastoff; +	int found = 0; + +	blkbits = inode->i_sb->s_blocksize_bits; +	startoff = *offset; +	lastoff = startoff; +	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + +	index = startoff >> PAGE_CACHE_SHIFT; +	end = endoff >> PAGE_CACHE_SHIFT; + +	pagevec_init(&pvec, 0); +	do { +		int i, num; +		unsigned long nr_pages; + +		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); +		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, +					  (pgoff_t)num); +		if (nr_pages == 0) { +			if (whence == SEEK_DATA) +				break; + +			BUG_ON(whence != SEEK_HOLE); +			/* +			 * If this is the first time to go into the loop and +			 * offset is not beyond the end offset, it will be a +			 * hole at this offset +			 */ +			if (lastoff == startoff || lastoff < endoff) +				found = 1; +			break; +		} + +		/* +		 * If this is the first time to go into the loop and +		 * offset is smaller than the first page offset, it will be a +		 * hole at this offset. +		 */ +		if (lastoff == startoff && whence == SEEK_HOLE && +		    lastoff < page_offset(pvec.pages[0])) { +			found = 1; +			break; +		} + +		for (i = 0; i < nr_pages; i++) { +			struct page *page = pvec.pages[i]; +			struct buffer_head *bh, *head; + +			/* +			 * If the current offset is not beyond the end of given +			 * range, it will be a hole. +			 */ +			if (lastoff < endoff && whence == SEEK_HOLE && +			    page->index > end) { +				found = 1; +				*offset = lastoff; +				goto out; +			} + +			lock_page(page); + +			if (unlikely(page->mapping != inode->i_mapping)) { +				unlock_page(page); +				continue; +			} + +			if (!page_has_buffers(page)) { +				unlock_page(page); +				continue; +			} + +			if (page_has_buffers(page)) { +				lastoff = page_offset(page); +				bh = head = page_buffers(page); +				do { +					if (buffer_uptodate(bh) || +					    buffer_unwritten(bh)) { +						if (whence == SEEK_DATA) +							found = 1; +					} else { +						if (whence == SEEK_HOLE) +							found = 1; +					} +					if (found) { +						*offset = max_t(loff_t, +							startoff, lastoff); +						unlock_page(page); +						goto out; +					} +					lastoff += bh->b_size; +					bh = bh->b_this_page; +				} while (bh != head); +			} + +			lastoff = page_offset(page) + PAGE_SIZE; +			unlock_page(page); +		} + +		/* +		 * The no. of pages is less than our desired, that would be a +		 * hole in there. +		 */ +		if (nr_pages < num && whence == SEEK_HOLE) { +			found = 1; +			*offset = lastoff; +			break; +		} + +		index = pvec.pages[i - 1]->index + 1; +		pagevec_release(&pvec); +	} while (index <= end); + +out: +	pagevec_release(&pvec); +	return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)  {  	struct inode *inode = file->f_mapping->host; -	loff_t maxbytes; +	struct ext4_map_blocks map; +	struct extent_status es; +	ext4_lblk_t start, last, end; +	loff_t dataoff, isize; +	int blkbits; +	int ret = 0; -	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; -	else -		maxbytes = inode->i_sb->s_maxbytes;  	mutex_lock(&inode->i_mutex); -	switch (origin) { -	case SEEK_END: -		offset += inode->i_size; -		break; -	case SEEK_CUR: -		if (offset == 0) { -			mutex_unlock(&inode->i_mutex); -			return file->f_pos; -		} -		offset += file->f_pos; -		break; -	} -	if (offset < 0 || offset > maxbytes) { +	isize = i_size_read(inode); +	if (offset >= isize) {  		mutex_unlock(&inode->i_mutex); -		return -EINVAL; +		return -ENXIO;  	} -	if (offset != file->f_pos) { -		file->f_pos = offset; -		file->f_version = 0; +	blkbits = inode->i_sb->s_blocksize_bits; +	start = offset >> blkbits; +	last = start; +	end = isize >> blkbits; +	dataoff = offset; + +	do { +		map.m_lblk = last; +		map.m_len = end - last + 1; +		ret = ext4_map_blocks(NULL, inode, &map, 0); +		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { +			if (last != start) +				dataoff = (loff_t)last << blkbits; +			break; +		} + +		/* +		 * If there is a delay extent at this offset, +		 * it will be as a data. +		 */ +		ext4_es_find_delayed_extent_range(inode, last, last, &es); +		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { +			if (last != start) +				dataoff = (loff_t)last << blkbits; +			break; +		} + +		/* +		 * If there is a unwritten extent at this offset, +		 * it will be as a data or a hole according to page +		 * cache that has data or not. +		 */ +		if (map.m_flags & EXT4_MAP_UNWRITTEN) { +			int unwritten; +			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, +							      &map, &dataoff); +			if (unwritten) +				break; +		} + +		last++; +		dataoff = (loff_t)last << blkbits; +	} while (last <= end); + +	mutex_unlock(&inode->i_mutex); + +	if (dataoff > isize) +		return -ENXIO; + +	return vfs_setpos(file, dataoff, maxsize); +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ +	struct inode *inode = file->f_mapping->host; +	struct ext4_map_blocks map; +	struct extent_status es; +	ext4_lblk_t start, last, end; +	loff_t holeoff, isize; +	int blkbits; +	int ret = 0; + +	mutex_lock(&inode->i_mutex); + +	isize = i_size_read(inode); +	if (offset >= isize) { +		mutex_unlock(&inode->i_mutex); +		return -ENXIO;  	} + +	blkbits = inode->i_sb->s_blocksize_bits; +	start = offset >> blkbits; +	last = start; +	end = isize >> blkbits; +	holeoff = offset; + +	do { +		map.m_lblk = last; +		map.m_len = end - last + 1; +		ret = ext4_map_blocks(NULL, inode, &map, 0); +		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { +			last += ret; +			holeoff = (loff_t)last << blkbits; +			continue; +		} + +		/* +		 * If there is a delay extent at this offset, +		 * we will skip this extent. +		 */ +		ext4_es_find_delayed_extent_range(inode, last, last, &es); +		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { +			last = es.es_lblk + es.es_len; +			holeoff = (loff_t)last << blkbits; +			continue; +		} + +		/* +		 * If there is a unwritten extent at this offset, +		 * it will be as a data or a hole according to page +		 * cache that has data or not. +		 */ +		if (map.m_flags & EXT4_MAP_UNWRITTEN) { +			int unwritten; +			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, +							      &map, &holeoff); +			if (!unwritten) { +				last += ret; +				holeoff = (loff_t)last << blkbits; +				continue; +			} +		} + +		/* find a hole */ +		break; +	} while (last <= end); +  	mutex_unlock(&inode->i_mutex); -	return offset; +	if (holeoff > isize) +		holeoff = isize; + +	return vfs_setpos(file, holeoff, maxsize); +} + +/* + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each. + */ +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) +{ +	struct inode *inode = file->f_mapping->host; +	loff_t maxbytes; + +	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) +		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +	else +		maxbytes = inode->i_sb->s_maxbytes; + +	switch (whence) { +	case SEEK_SET: +	case SEEK_CUR: +	case SEEK_END: +		return generic_file_llseek_size(file, offset, whence, +						maxbytes, i_size_read(inode)); +	case SEEK_DATA: +		return ext4_seek_data(file, offset, maxbytes); +	case SEEK_HOLE: +		return ext4_seek_hole(file, offset, maxbytes); +	} + +	return -EINVAL;  }  const struct file_operations ext4_file_operations = {  	.llseek		= ext4_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -	.aio_read	= generic_file_aio_read, -	.aio_write	= ext4_file_write, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= generic_file_read_iter, +	.write_iter	= ext4_file_write_iter,  	.unlocked_ioctl = ext4_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= ext4_compat_ioctl, @@ -187,21 +600,19 @@ const struct file_operations ext4_file_operations = {  	.release	= ext4_release_file,  	.fsync		= ext4_sync_file,  	.splice_read	= generic_file_splice_read, -	.splice_write	= generic_file_splice_write, +	.splice_write	= iter_file_splice_write, +	.fallocate	= ext4_fallocate,  };  const struct inode_operations ext4_file_inode_operations = { -	.truncate	= ext4_truncate,  	.setattr	= ext4_setattr,  	.getattr	= ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR  	.setxattr	= generic_setxattr,  	.getxattr	= generic_getxattr,  	.listxattr	= ext4_listxattr,  	.removexattr	= generic_removexattr, -#endif -	.check_acl	= ext4_check_acl, -	.fallocate	= ext4_fallocate, +	.get_acl	= ext4_get_acl, +	.set_acl	= ext4_set_acl,  	.fiemap		= ext4_fiemap,  };  | 
