diff options
Diffstat (limited to 'fs/read_write.c')
| -rw-r--r-- | fs/read_write.c | 900 | 
1 files changed, 656 insertions, 244 deletions
diff --git a/fs/read_write.c b/fs/read_write.c index 431a0ed610c..009d8542a88 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,59 +9,89 @@  #include <linux/fcntl.h>  #include <linux/file.h>  #include <linux/uio.h> -#include <linux/smp_lock.h> +#include <linux/aio.h>  #include <linux/fsnotify.h>  #include <linux/security.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/syscalls.h>  #include <linux/pagemap.h>  #include <linux/splice.h> -#include "read_write.h" +#include <linux/compat.h> +#include "internal.h"  #include <asm/uaccess.h>  #include <asm/unistd.h> +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, +		unsigned long, loff_t); +typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); +  const struct file_operations generic_ro_fops = {  	.llseek		= generic_file_llseek, -	.read		= do_sync_read, -	.aio_read	= generic_file_aio_read, +	.read		= new_sync_read, +	.read_iter	= generic_file_read_iter,  	.mmap		= generic_file_readonly_mmap,  	.splice_read	= generic_file_splice_read,  };  EXPORT_SYMBOL(generic_ro_fops); -static int -__negative_fpos_check(struct file *file, loff_t pos, size_t count) +static inline int unsigned_offsets(struct file *file)  { -	/* -	 * pos or pos+count is negative here, check overflow. -	 * too big "count" will be caught in rw_verify_area(). -	 */ -	if ((pos < 0) && (pos + count < pos)) -		return -EOVERFLOW; -	if (file->f_mode & FMODE_UNSIGNED_OFFSET) -		return 0; -	return -EINVAL; +	return file->f_mode & FMODE_UNSIGNED_OFFSET;  }  /** - * generic_file_llseek_unlocked - lockless generic llseek implementation + * vfs_setpos - update the file offset for lseek + * @file:	file structure in question + * @offset:	file offset to seek to + * @maxsize:	maximum file size + * + * This is a low-level filesystem helper for updating the file offset to + * the value specified by @offset if the given offset is valid and it is + * not equal to the current file offset. + * + * Return the specified offset on success and -EINVAL on invalid offset. + */ +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) +{ +	if (offset < 0 && !unsigned_offsets(file)) +		return -EINVAL; +	if (offset > maxsize) +		return -EINVAL; + +	if (offset != file->f_pos) { +		file->f_pos = offset; +		file->f_version = 0; +	} +	return offset; +} +EXPORT_SYMBOL(vfs_setpos); + +/** + * generic_file_llseek_size - generic llseek implementation for regular files   * @file:	file structure to seek on   * @offset:	file offset to seek to - * @origin:	type of seek + * @whence:	type of seek + * @size:	max size of this file in file system + * @eof:	offset used for SEEK_END position   * - * Updates the file offset to the value specified by @offset and @origin. - * Locking must be provided by the caller. + * This is a variant of generic_file_llseek that allows passing in a custom + * maximum file size and a custom EOF position, for e.g. hashed directories + * + * Synchronization: + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. + * read/writes behave like SEEK_SET against seeks.   */  loff_t -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) +generic_file_llseek_size(struct file *file, loff_t offset, int whence, +		loff_t maxsize, loff_t eof)  { -	struct inode *inode = file->f_mapping->host; - -	switch (origin) { +	switch (whence) {  	case SEEK_END: -		offset += inode->i_size; +		offset += eof;  		break;  	case SEEK_CUR:  		/* @@ -72,78 +102,110 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)  		 */  		if (offset == 0)  			return file->f_pos; -		offset += file->f_pos; +		/* +		 * f_lock protects against read/modify/write race with other +		 * SEEK_CURs. Note that parallel writes and reads behave +		 * like SEEK_SET. +		 */ +		spin_lock(&file->f_lock); +		offset = vfs_setpos(file, file->f_pos + offset, maxsize); +		spin_unlock(&file->f_lock); +		return offset; +	case SEEK_DATA: +		/* +		 * In the generic case the entire file is data, so as long as +		 * offset isn't at the end of the file then the offset is data. +		 */ +		if (offset >= eof) +			return -ENXIO; +		break; +	case SEEK_HOLE: +		/* +		 * There is a virtual hole at the end of the file, so as long as +		 * offset isn't i_size or larger, return i_size. +		 */ +		if (offset >= eof) +			return -ENXIO; +		offset = eof;  		break;  	} -	if (offset < 0 && __negative_fpos_check(file, offset, 0)) -		return -EINVAL; -	if (offset > inode->i_sb->s_maxbytes) -		return -EINVAL; - -	/* Special lock needed here? */ -	if (offset != file->f_pos) { -		file->f_pos = offset; -		file->f_version = 0; -	} - -	return offset; +	return vfs_setpos(file, offset, maxsize);  } -EXPORT_SYMBOL(generic_file_llseek_unlocked); +EXPORT_SYMBOL(generic_file_llseek_size);  /**   * generic_file_llseek - generic llseek implementation for regular files   * @file:	file structure to seek on   * @offset:	file offset to seek to - * @origin:	type of seek + * @whence:	type of seek   *   * This is a generic implemenation of ->llseek useable for all normal local   * filesystems.  It just updates the file offset to the value specified by - * @offset and @origin under i_mutex. + * @offset and @whence.   */ -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)  { -	loff_t rval; - -	mutex_lock(&file->f_dentry->d_inode->i_mutex); -	rval = generic_file_llseek_unlocked(file, offset, origin); -	mutex_unlock(&file->f_dentry->d_inode->i_mutex); +	struct inode *inode = file->f_mapping->host; -	return rval; +	return generic_file_llseek_size(file, offset, whence, +					inode->i_sb->s_maxbytes, +					i_size_read(inode));  }  EXPORT_SYMBOL(generic_file_llseek);  /** + * fixed_size_llseek - llseek implementation for fixed-sized devices + * @file:	file structure to seek on + * @offset:	file offset to seek to + * @whence:	type of seek + * @size:	size of the file + * + */ +loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) +{ +	switch (whence) { +	case SEEK_SET: case SEEK_CUR: case SEEK_END: +		return generic_file_llseek_size(file, offset, whence, +						size, size); +	default: +		return -EINVAL; +	} +} +EXPORT_SYMBOL(fixed_size_llseek); + +/**   * noop_llseek - No Operation Performed llseek implementation   * @file:	file structure to seek on   * @offset:	file offset to seek to - * @origin:	type of seek + * @whence:	type of seek   *   * This is an implementation of ->llseek useable for the rare special case when   * userspace expects the seek to succeed but the (device) file is actually not   * able to perform the seek. In this case you use noop_llseek() instead of   * falling back to the default implementation of ->llseek.   */ -loff_t noop_llseek(struct file *file, loff_t offset, int origin) +loff_t noop_llseek(struct file *file, loff_t offset, int whence)  {  	return file->f_pos;  }  EXPORT_SYMBOL(noop_llseek); -loff_t no_llseek(struct file *file, loff_t offset, int origin) +loff_t no_llseek(struct file *file, loff_t offset, int whence)  {  	return -ESPIPE;  }  EXPORT_SYMBOL(no_llseek); -loff_t default_llseek(struct file *file, loff_t offset, int origin) +loff_t default_llseek(struct file *file, loff_t offset, int whence)  { +	struct inode *inode = file_inode(file);  	loff_t retval; -	mutex_lock(&file->f_dentry->d_inode->i_mutex); -	switch (origin) { +	mutex_lock(&inode->i_mutex); +	switch (whence) {  		case SEEK_END: -			offset += i_size_read(file->f_path.dentry->d_inode); +			offset += i_size_read(inode);  			break;  		case SEEK_CUR:  			if (offset == 0) { @@ -151,9 +213,33 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)  				goto out;  			}  			offset += file->f_pos; +			break; +		case SEEK_DATA: +			/* +			 * In the generic case the entire file is data, so as +			 * long as offset isn't at the end of the file then the +			 * offset is data. +			 */ +			if (offset >= inode->i_size) { +				retval = -ENXIO; +				goto out; +			} +			break; +		case SEEK_HOLE: +			/* +			 * There is a virtual hole at the end of the file, so +			 * as long as offset isn't i_size or larger, return +			 * i_size. +			 */ +			if (offset >= inode->i_size) { +				retval = -ENXIO; +				goto out; +			} +			offset = inode->i_size; +			break;  	}  	retval = -EINVAL; -	if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { +	if (offset >= 0 || unsigned_offsets(file)) {  		if (offset != file->f_pos) {  			file->f_pos = offset;  			file->f_version = 0; @@ -161,68 +247,79 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)  		retval = offset;  	}  out: -	mutex_unlock(&file->f_dentry->d_inode->i_mutex); +	mutex_unlock(&inode->i_mutex);  	return retval;  }  EXPORT_SYMBOL(default_llseek); -loff_t vfs_llseek(struct file *file, loff_t offset, int origin) +loff_t vfs_llseek(struct file *file, loff_t offset, int whence)  {  	loff_t (*fn)(struct file *, loff_t, int);  	fn = no_llseek;  	if (file->f_mode & FMODE_LSEEK) { -		if (file->f_op && file->f_op->llseek) +		if (file->f_op->llseek)  			fn = file->f_op->llseek;  	} -	return fn(file, offset, origin); +	return fn(file, offset, whence);  }  EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) +static inline struct fd fdget_pos(int fd)  { -	off_t retval; -	struct file * file; -	int fput_needed; +	return __to_fd(__fdget_pos(fd)); +} -	retval = -EBADF; -	file = fget_light(fd, &fput_needed); -	if (!file) -		goto bad; +static inline void fdput_pos(struct fd f) +{ +	if (f.flags & FDPUT_POS_UNLOCK) +		mutex_unlock(&f.file->f_pos_lock); +	fdput(f); +} + +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +{ +	off_t retval; +	struct fd f = fdget_pos(fd); +	if (!f.file) +		return -EBADF;  	retval = -EINVAL; -	if (origin <= SEEK_MAX) { -		loff_t res = vfs_llseek(file, offset, origin); +	if (whence <= SEEK_MAX) { +		loff_t res = vfs_llseek(f.file, offset, whence);  		retval = res;  		if (res != (loff_t)retval)  			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */  	} -	fput_light(file, fput_needed); -bad: +	fdput_pos(f);  	return retval;  } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) +{ +	return sys_lseek(fd, offset, whence); +} +#endif +  #ifdef __ARCH_WANT_SYS_LLSEEK  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,  		unsigned long, offset_low, loff_t __user *, result, -		unsigned int, origin) +		unsigned int, whence)  {  	int retval; -	struct file * file; +	struct fd f = fdget_pos(fd);  	loff_t offset; -	int fput_needed; -	retval = -EBADF; -	file = fget_light(fd, &fput_needed); -	if (!file) -		goto bad; +	if (!f.file) +		return -EBADF;  	retval = -EINVAL; -	if (origin > SEEK_MAX) +	if (whence > SEEK_MAX)  		goto out_putf; -	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low, -			origin); +	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, +			whence);  	retval = (int)offset;  	if (offset >= 0) { @@ -231,31 +328,33 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,  			retval = 0;  	}  out_putf: -	fput_light(file, fput_needed); -bad: +	fdput_pos(f);  	return retval;  }  #endif -  /*   * rw_verify_area doesn't like huge counts. We limit   * them to something that fits in "int" so that others   * won't have to do range checks all the time.   */ -int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) +int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)  {  	struct inode *inode;  	loff_t pos;  	int retval = -EINVAL; -	inode = file->f_path.dentry->d_inode; +	inode = file_inode(file);  	if (unlikely((ssize_t) count < 0))  		return retval;  	pos = *ppos; -	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { -		retval = __negative_fpos_check(file, pos, count); -		if (retval) +	if (unlikely(pos < 0)) { +		if (!unsigned_offsets(file)) +			return retval; +		if (count >= -pos) /* both values are in 0..LLONG_MAX */ +			return -EOVERFLOW; +	} else if (unlikely((loff_t) (pos + count) < 0)) { +		if (!unsigned_offsets(file))  			return retval;  	} @@ -273,41 +372,45 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count  	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;  } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)  { -	set_current_state(TASK_UNINTERRUPTIBLE); -	if (!kiocbIsKicked(iocb)) -		schedule(); -	else -		kiocbClearKicked(iocb); -	__set_current_state(TASK_RUNNING); +	struct iovec iov = { .iov_base = buf, .iov_len = len }; +	struct kiocb kiocb; +	ssize_t ret; + +	init_sync_kiocb(&kiocb, filp); +	kiocb.ki_pos = *ppos; +	kiocb.ki_nbytes = len; + +	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); +	if (-EIOCBQUEUED == ret) +		ret = wait_on_sync_kiocb(&kiocb); +	*ppos = kiocb.ki_pos; +	return ret;  } -ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +EXPORT_SYMBOL(do_sync_read); + +ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)  {  	struct iovec iov = { .iov_base = buf, .iov_len = len };  	struct kiocb kiocb; +	struct iov_iter iter;  	ssize_t ret;  	init_sync_kiocb(&kiocb, filp);  	kiocb.ki_pos = *ppos; -	kiocb.ki_left = len;  	kiocb.ki_nbytes = len; +	iov_iter_init(&iter, READ, &iov, 1, len); -	for (;;) { -		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); -		if (ret != -EIOCBRETRY) -			break; -		wait_on_retry_sync_kiocb(&kiocb); -	} - +	ret = filp->f_op->read_iter(&kiocb, &iter);  	if (-EIOCBQUEUED == ret)  		ret = wait_on_sync_kiocb(&kiocb);  	*ppos = kiocb.ki_pos;  	return ret;  } -EXPORT_SYMBOL(do_sync_read); +EXPORT_SYMBOL(new_sync_read);  ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)  { @@ -315,7 +418,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)  	if (!(file->f_mode & FMODE_READ))  		return -EBADF; -	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) +	if (!(file->f_mode & FMODE_CAN_READ))  		return -EINVAL;  	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))  		return -EFAULT; @@ -325,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)  		count = ret;  		if (file->f_op->read)  			ret = file->f_op->read(file, buf, count, pos); -		else +		else if (file->f_op->aio_read)  			ret = do_sync_read(file, buf, count, pos); +		else +			ret = new_sync_read(file, buf, count, pos);  		if (ret > 0) {  			fsnotify_access(file);  			add_rchar(current, ret); @@ -347,16 +452,9 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof  	init_sync_kiocb(&kiocb, filp);  	kiocb.ki_pos = *ppos; -	kiocb.ki_left = len;  	kiocb.ki_nbytes = len; -	for (;;) { -		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); -		if (ret != -EIOCBRETRY) -			break; -		wait_on_retry_sync_kiocb(&kiocb); -	} - +	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);  	if (-EIOCBQUEUED == ret)  		ret = wait_on_sync_kiocb(&kiocb);  	*ppos = kiocb.ki_pos; @@ -365,13 +463,63 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof  EXPORT_SYMBOL(do_sync_write); +ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ +	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; +	struct kiocb kiocb; +	struct iov_iter iter; +	ssize_t ret; + +	init_sync_kiocb(&kiocb, filp); +	kiocb.ki_pos = *ppos; +	kiocb.ki_nbytes = len; +	iov_iter_init(&iter, WRITE, &iov, 1, len); + +	ret = filp->f_op->write_iter(&kiocb, &iter); +	if (-EIOCBQUEUED == ret) +		ret = wait_on_sync_kiocb(&kiocb); +	*ppos = kiocb.ki_pos; +	return ret; +} + +EXPORT_SYMBOL(new_sync_write); + +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ +	mm_segment_t old_fs; +	const char __user *p; +	ssize_t ret; + +	if (!(file->f_mode & FMODE_CAN_WRITE)) +		return -EINVAL; + +	old_fs = get_fs(); +	set_fs(get_ds()); +	p = (__force const char __user *)buf; +	if (count > MAX_RW_COUNT) +		count =  MAX_RW_COUNT; +	if (file->f_op->write) +		ret = file->f_op->write(file, p, count, pos); +	else if (file->f_op->aio_write) +		ret = do_sync_write(file, p, count, pos); +	else +		ret = new_sync_write(file, p, count, pos); +	set_fs(old_fs); +	if (ret > 0) { +		fsnotify_modify(file); +		add_wchar(current, ret); +	} +	inc_syscw(current); +	return ret; +} +  ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)  {  	ssize_t ret;  	if (!(file->f_mode & FMODE_WRITE))  		return -EBADF; -	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) +	if (!(file->f_mode & FMODE_CAN_WRITE))  		return -EINVAL;  	if (unlikely(!access_ok(VERIFY_READ, buf, count)))  		return -EFAULT; @@ -379,15 +527,19 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_  	ret = rw_verify_area(WRITE, file, pos, count);  	if (ret >= 0) {  		count = ret; +		file_start_write(file);  		if (file->f_op->write)  			ret = file->f_op->write(file, buf, count, pos); -		else +		else if (file->f_op->aio_write)  			ret = do_sync_write(file, buf, count, pos); +		else +			ret = new_sync_write(file, buf, count, pos);  		if (ret > 0) {  			fsnotify_modify(file);  			add_wchar(current, ret);  		}  		inc_syscw(current); +		file_end_write(file);  	}  	return ret; @@ -407,96 +559,75 @@ static inline void file_pos_write(struct file *file, loff_t pos)  SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)  { -	struct file *file; +	struct fd f = fdget_pos(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_read(file, buf, count, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_read(f.file, buf, count, &pos); +		if (ret >= 0) +			file_pos_write(f.file, pos); +		fdput_pos(f);  	} -  	return ret;  }  SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,  		size_t, count)  { -	struct file *file; +	struct fd f = fdget_pos(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_write(file, buf, count, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_write(f.file, buf, count, &pos); +		if (ret >= 0) +			file_pos_write(f.file, pos); +		fdput_pos(f);  	}  	return ret;  } -SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, -			size_t count, loff_t pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, +			size_t, count, loff_t, pos)  { -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PREAD) -			ret = vfs_read(file, buf, count, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PREAD) +			ret = vfs_read(f.file, buf, count, &pos); +		fdput(f);  	}  	return ret;  } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) -{ -	return SYSC_pread64((unsigned int) fd, (char __user *) buf, -			    (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pread64, SyS_pread64); -#endif -SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, -			 size_t count, loff_t pos) +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, +			 size_t, count, loff_t, pos)  { -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PWRITE)   -			ret = vfs_write(file, buf, count, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PWRITE)   +			ret = vfs_write(f.file, buf, count, &pos); +		fdput(f);  	}  	return ret;  } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) -{ -	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, -			     (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); -#endif  /*   * Reduce an iovec's length in-place.  Return the resulting number of segments @@ -519,24 +650,36 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)  }  EXPORT_SYMBOL(iov_shorten); -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, -		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) +static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, +		unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)  {  	struct kiocb kiocb; +	struct iov_iter iter;  	ssize_t ret;  	init_sync_kiocb(&kiocb, filp);  	kiocb.ki_pos = *ppos; -	kiocb.ki_left = len;  	kiocb.ki_nbytes = len; -	for (;;) { -		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); -		if (ret != -EIOCBRETRY) -			break; -		wait_on_retry_sync_kiocb(&kiocb); -	} +	iov_iter_init(&iter, rw, iov, nr_segs, len); +	ret = fn(&kiocb, &iter); +	if (ret == -EIOCBQUEUED) +		ret = wait_on_sync_kiocb(&kiocb); +	*ppos = kiocb.ki_pos; +	return ret; +} + +static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, +		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) +{ +	struct kiocb kiocb; +	ssize_t ret; + +	init_sync_kiocb(&kiocb, filp); +	kiocb.ki_pos = *ppos; +	kiocb.ki_nbytes = len; +	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);  	if (ret == -EIOCBQUEUED)  		ret = wait_on_sync_kiocb(&kiocb);  	*ppos = kiocb.ki_pos; @@ -544,7 +687,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,  }  /* Do it by hand, with file-ops */ -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, +static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,  		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)  {  	struct iovec *vector = iov; @@ -637,7 +780,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,  			ret = -EINVAL;  			goto out;  		} -		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { +		if (type >= 0 +		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {  			ret = -EFAULT;  			goto out;  		} @@ -662,14 +806,10 @@ static ssize_t do_readv_writev(int type, struct file *file,  	ssize_t ret;  	io_fn_t fn;  	iov_fn_t fnv; - -	if (!file->f_op) { -		ret = -EINVAL; -		goto out; -	} +	iter_fn_t iter_fn;  	ret = rw_copy_check_uvector(type, uvector, nr_segs, -			ARRAY_SIZE(iovstack), iovstack, &iov); +				    ARRAY_SIZE(iovstack), iovstack, &iov);  	if (ret <= 0)  		goto out; @@ -682,17 +822,26 @@ static ssize_t do_readv_writev(int type, struct file *file,  	if (type == READ) {  		fn = file->f_op->read;  		fnv = file->f_op->aio_read; +		iter_fn = file->f_op->read_iter;  	} else {  		fn = (io_fn_t)file->f_op->write;  		fnv = file->f_op->aio_write; +		iter_fn = file->f_op->write_iter; +		file_start_write(file);  	} -	if (fnv) +	if (iter_fn) +		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, +						pos, iter_fn); +	else if (fnv)  		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,  						pos, fnv);  	else  		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); +	if (type != READ) +		file_end_write(file); +  out:  	if (iov != iovstack)  		kfree(iov); @@ -710,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,  {  	if (!(file->f_mode & FMODE_READ))  		return -EBADF; -	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) +	if (!(file->f_mode & FMODE_CAN_READ))  		return -EINVAL;  	return do_readv_writev(READ, file, vec, vlen, pos); @@ -723,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,  {  	if (!(file->f_mode & FMODE_WRITE))  		return -EBADF; -	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) +	if (!(file->f_mode & FMODE_CAN_WRITE))  		return -EINVAL;  	return do_readv_writev(WRITE, file, vec, vlen, pos); @@ -734,16 +883,15 @@ EXPORT_SYMBOL(vfs_writev);  SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen)  { -	struct file *file; +	struct fd f = fdget_pos(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_readv(file, vec, vlen, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_readv(f.file, vec, vlen, &pos); +		if (ret >= 0) +			file_pos_write(f.file, pos); +		fdput_pos(f);  	}  	if (ret > 0) @@ -755,16 +903,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,  SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen)  { -	struct file *file; +	struct fd f = fdget_pos(fd);  	ssize_t ret = -EBADF; -	int fput_needed; -	file = fget_light(fd, &fput_needed); -	if (file) { -		loff_t pos = file_pos_read(file); -		ret = vfs_writev(file, vec, vlen, &pos); -		file_pos_write(file, pos); -		fput_light(file, fput_needed); +	if (f.file) { +		loff_t pos = file_pos_read(f.file); +		ret = vfs_writev(f.file, vec, vlen, &pos); +		if (ret >= 0) +			file_pos_write(f.file, pos); +		fdput_pos(f);  	}  	if (ret > 0) @@ -783,19 +930,18 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)  {  	loff_t pos = pos_from_hilo(pos_h, pos_l); -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PREAD) -			ret = vfs_readv(file, vec, vlen, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PREAD) +			ret = vfs_readv(f.file, vec, vlen, &pos); +		fdput(f);  	}  	if (ret > 0) @@ -808,52 +954,269 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,  		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)  {  	loff_t pos = pos_from_hilo(pos_h, pos_l); -	struct file *file; +	struct fd f;  	ssize_t ret = -EBADF; -	int fput_needed;  	if (pos < 0)  		return -EINVAL; -	file = fget_light(fd, &fput_needed); -	if (file) { +	f = fdget(fd); +	if (f.file) {  		ret = -ESPIPE; -		if (file->f_mode & FMODE_PWRITE) -			ret = vfs_writev(file, vec, vlen, &pos); -		fput_light(file, fput_needed); +		if (f.file->f_mode & FMODE_PWRITE) +			ret = vfs_writev(f.file, vec, vlen, &pos); +		fdput(f); +	} + +	if (ret > 0) +		add_wchar(current, ret); +	inc_syscw(current); +	return ret; +} + +#ifdef CONFIG_COMPAT + +static ssize_t compat_do_readv_writev(int type, struct file *file, +			       const struct compat_iovec __user *uvector, +			       unsigned long nr_segs, loff_t *pos) +{ +	compat_ssize_t tot_len; +	struct iovec iovstack[UIO_FASTIOV]; +	struct iovec *iov = iovstack; +	ssize_t ret; +	io_fn_t fn; +	iov_fn_t fnv; +	iter_fn_t iter_fn; + +	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, +					       UIO_FASTIOV, iovstack, &iov); +	if (ret <= 0) +		goto out; + +	tot_len = ret; +	ret = rw_verify_area(type, file, pos, tot_len); +	if (ret < 0) +		goto out; + +	fnv = NULL; +	if (type == READ) { +		fn = file->f_op->read; +		fnv = file->f_op->aio_read; +		iter_fn = file->f_op->read_iter; +	} else { +		fn = (io_fn_t)file->f_op->write; +		fnv = file->f_op->aio_write; +		iter_fn = file->f_op->write_iter; +		file_start_write(file); +	} + +	if (iter_fn) +		ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, +						pos, iter_fn); +	else if (fnv) +		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, +						pos, fnv); +	else +		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + +	if (type != READ) +		file_end_write(file); + +out: +	if (iov != iovstack) +		kfree(iov); +	if ((ret + (type == READ)) > 0) { +		if (type == READ) +			fsnotify_access(file); +		else +			fsnotify_modify(file);  	} +	return ret; +} + +static size_t compat_readv(struct file *file, +			   const struct compat_iovec __user *vec, +			   unsigned long vlen, loff_t *pos) +{ +	ssize_t ret = -EBADF; + +	if (!(file->f_mode & FMODE_READ)) +		goto out; +	ret = -EINVAL; +	if (!(file->f_mode & FMODE_CAN_READ)) +		goto out; + +	ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: +	if (ret > 0) +		add_rchar(current, ret); +	inc_syscr(current); +	return ret; +} + +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, +		const struct compat_iovec __user *,vec, +		compat_ulong_t, vlen) +{ +	struct fd f = fdget_pos(fd); +	ssize_t ret; +	loff_t pos; + +	if (!f.file) +		return -EBADF; +	pos = f.file->f_pos; +	ret = compat_readv(f.file, vec, vlen, &pos); +	if (ret >= 0) +		f.file->f_pos = pos; +	fdput_pos(f); +	return ret; +} + +static long __compat_sys_preadv64(unsigned long fd, +				  const struct compat_iovec __user *vec, +				  unsigned long vlen, loff_t pos) +{ +	struct fd f; +	ssize_t ret; + +	if (pos < 0) +		return -EINVAL; +	f = fdget(fd); +	if (!f.file) +		return -EBADF; +	ret = -ESPIPE; +	if (f.file->f_mode & FMODE_PREAD) +		ret = compat_readv(f.file, vec, vlen, &pos); +	fdput(f); +	return ret; +} + +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, +		const struct compat_iovec __user *,vec, +		unsigned long, vlen, loff_t, pos) +{ +	return __compat_sys_preadv64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, +		const struct compat_iovec __user *,vec, +		compat_ulong_t, vlen, u32, pos_low, u32, pos_high) +{ +	loff_t pos = ((loff_t)pos_high << 32) | pos_low; + +	return __compat_sys_preadv64(fd, vec, vlen, pos); +} + +static size_t compat_writev(struct file *file, +			    const struct compat_iovec __user *vec, +			    unsigned long vlen, loff_t *pos) +{ +	ssize_t ret = -EBADF; + +	if (!(file->f_mode & FMODE_WRITE)) +		goto out; + +	ret = -EINVAL; +	if (!(file->f_mode & FMODE_CAN_WRITE)) +		goto out; + +	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out:  	if (ret > 0)  		add_wchar(current, ret);  	inc_syscw(current);  	return ret;  } +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, +		const struct compat_iovec __user *, vec, +		compat_ulong_t, vlen) +{ +	struct fd f = fdget_pos(fd); +	ssize_t ret; +	loff_t pos; + +	if (!f.file) +		return -EBADF; +	pos = f.file->f_pos; +	ret = compat_writev(f.file, vec, vlen, &pos); +	if (ret >= 0) +		f.file->f_pos = pos; +	fdput_pos(f); +	return ret; +} + +static long __compat_sys_pwritev64(unsigned long fd, +				   const struct compat_iovec __user *vec, +				   unsigned long vlen, loff_t pos) +{ +	struct fd f; +	ssize_t ret; + +	if (pos < 0) +		return -EINVAL; +	f = fdget(fd); +	if (!f.file) +		return -EBADF; +	ret = -ESPIPE; +	if (f.file->f_mode & FMODE_PWRITE) +		ret = compat_writev(f.file, vec, vlen, &pos); +	fdput(f); +	return ret; +} + +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, +		const struct compat_iovec __user *,vec, +		unsigned long, vlen, loff_t, pos) +{ +	return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, +		const struct compat_iovec __user *,vec, +		compat_ulong_t, vlen, u32, pos_low, u32, pos_high) +{ +	loff_t pos = ((loff_t)pos_high << 32) | pos_low; + +	return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif +  static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, -			   size_t count, loff_t max) +		  	   size_t count, loff_t max)  { -	struct file * in_file, * out_file; -	struct inode * in_inode, * out_inode; +	struct fd in, out; +	struct inode *in_inode, *out_inode;  	loff_t pos; +	loff_t out_pos;  	ssize_t retval; -	int fput_needed_in, fput_needed_out, fl; +	int fl;  	/*  	 * Get input file, and verify that it is ok..  	 */  	retval = -EBADF; -	in_file = fget_light(in_fd, &fput_needed_in); -	if (!in_file) +	in = fdget(in_fd); +	if (!in.file)  		goto out; -	if (!(in_file->f_mode & FMODE_READ)) +	if (!(in.file->f_mode & FMODE_READ))  		goto fput_in;  	retval = -ESPIPE; -	if (!ppos) -		ppos = &in_file->f_pos; -	else -		if (!(in_file->f_mode & FMODE_PREAD)) +	if (!ppos) { +		pos = in.file->f_pos; +	} else { +		pos = *ppos; +		if (!(in.file->f_mode & FMODE_PREAD))  			goto fput_in; -	retval = rw_verify_area(READ, in_file, ppos, count); +	} +	retval = rw_verify_area(READ, in.file, &pos, count);  	if (retval < 0)  		goto fput_in;  	count = retval; @@ -862,15 +1225,16 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  	 * Get output file, and verify that it is ok..  	 */  	retval = -EBADF; -	out_file = fget_light(out_fd, &fput_needed_out); -	if (!out_file) +	out = fdget(out_fd); +	if (!out.file)  		goto fput_in; -	if (!(out_file->f_mode & FMODE_WRITE)) +	if (!(out.file->f_mode & FMODE_WRITE))  		goto fput_out;  	retval = -EINVAL; -	in_inode = in_file->f_path.dentry->d_inode; -	out_inode = out_file->f_path.dentry->d_inode; -	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); +	in_inode = file_inode(in.file); +	out_inode = file_inode(out.file); +	out_pos = out.file->f_pos; +	retval = rw_verify_area(WRITE, out.file, &out_pos, count);  	if (retval < 0)  		goto fput_out;  	count = retval; @@ -878,7 +1242,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  	if (!max)  		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); -	pos = *ppos;  	if (unlikely(pos + count > max)) {  		retval = -EOVERFLOW;  		if (pos >= max) @@ -894,25 +1257,34 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,  	 * and the application is arguably buggy if it doesn't expect  	 * EAGAIN on a non-blocking file descriptor.  	 */ -	if (in_file->f_flags & O_NONBLOCK) +	if (in.file->f_flags & O_NONBLOCK)  		fl = SPLICE_F_NONBLOCK;  #endif -	retval = do_splice_direct(in_file, ppos, out_file, count, fl); +	file_start_write(out.file); +	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); +	file_end_write(out.file);  	if (retval > 0) {  		add_rchar(current, retval);  		add_wchar(current, retval); +		fsnotify_access(in.file); +		fsnotify_modify(out.file); +		out.file->f_pos = out_pos; +		if (ppos) +			*ppos = pos; +		else +			in.file->f_pos = pos;  	}  	inc_syscr(current);  	inc_syscw(current); -	if (*ppos > max) +	if (pos > max)  		retval = -EOVERFLOW;  fput_out: -	fput_light(out_file, fput_needed_out); +	fdput(out);  fput_in: -	fput_light(in_file, fput_needed_in); +	fdput(in);  out:  	return retval;  } @@ -952,3 +1324,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si  	return do_sendfile(out_fd, in_fd, NULL, count, 0);  } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, +		compat_off_t __user *, offset, compat_size_t, count) +{ +	loff_t pos; +	off_t off; +	ssize_t ret; + +	if (offset) { +		if (unlikely(get_user(off, offset))) +			return -EFAULT; +		pos = off; +		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); +		if (unlikely(put_user(pos, offset))) +			return -EFAULT; +		return ret; +	} + +	return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, +		compat_loff_t __user *, offset, compat_size_t, count) +{ +	loff_t pos; +	ssize_t ret; + +	if (offset) { +		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) +			return -EFAULT; +		ret = do_sendfile(out_fd, in_fd, &pos, count, 0); +		if (unlikely(put_user(pos, offset))) +			return -EFAULT; +		return ret; +	} + +	return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif  | 
