diff options
Diffstat (limited to 'fs/splice.c')
| -rw-r--r-- | fs/splice.c | 2138 |
1 files changed, 1474 insertions, 664 deletions
diff --git a/fs/splice.c b/fs/splice.c index 0559e7577a0..f5cb9ba8451 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -12,7 +12,7 @@ * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * - * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de> + * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * @@ -20,23 +20,20 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> -#include <linux/pipe_fs_i.h> +#include <linux/splice.h> +#include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscalls.h> - -/* - * Passed to the actors - */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ - unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ -}; +#include <linux/uio.h> +#include <linux/security.h> +#include <linux/gfp.h> +#include <linux/socket.h> +#include <linux/compat.h> +#include <linux/aio.h> +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -44,48 +41,64 @@ struct splice_desc { * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ -static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - struct address_space *mapping = page_mapping(page); + struct address_space *mapping; lock_page(page); - WARN_ON(!PageUptodate(page)); + mapping = page_mapping(page); + if (mapping) { + WARN_ON(!PageUptodate(page)); - /* - * At least for ext2 with nobh option, we need to wait on writeback - * completing on this page, since we'll remove it from the pagecache. - * Otherwise truncate wont wait on the page, allowing the disk - * blocks to be reused by someone else before we actually wrote our - * data to them. fs corruption ensues. - */ - wait_on_page_writeback(page); + /* + * At least for ext2 with nobh option, we need to wait on + * writeback completing on this page, since we'll remove it + * from the pagecache. Otherwise truncate wont wait on the + * page, allowing the disk blocks to be reused by someone else + * before we actually wrote our data to them. fs corruption + * ensues. + */ + wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, mapping_gfp_mask(mapping)); + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; - if (!remove_mapping(mapping, page)) { - unlock_page(page); - return 1; + /* + * If we succeeded in removing the mapping, set LRU flag + * and return good. + */ + if (remove_mapping(mapping, page)) { + buf->flags |= PIPE_BUF_FLAG_LRU; + return 0; + } } - buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; - return 0; + /* + * Raced with truncate or failed to remove page from current + * address space, unlock and return failure. + */ +out_unlock: + unlock_page(page); + return 1; } -static void page_cache_pipe_buf_release(struct pipe_inode_info *info, +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_release(buf->page); - buf->page = NULL; - buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); + buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static void *page_cache_pipe_buf_map(struct file *file, - struct pipe_inode_info *info, - struct pipe_buffer *buf) +/* + * Check whether the contents of buf is OK to access. Since the content + * is a page cache page, IO may be in flight. + */ +static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -111,54 +124,73 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * Page is ok afterall, fall through to mapping. + * Page is ok afterall, we are done. */ unlock_page(page); } - return kmap(page); + return 0; error: unlock_page(page); - return ERR_PTR(err); + return err; } -static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - kunmap(buf->page); -} +const struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .confirm = page_cache_pipe_buf_confirm, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; -static void page_cache_pipe_buf_get(struct pipe_inode_info *info, +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - page_cache_get(buf->page); + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); } -static struct pipe_buf_operations page_cache_pipe_buf_ops = { +static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = page_cache_pipe_buf_map, - .unmap = page_cache_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, - .steal = page_cache_pipe_buf_steal, - .get = page_cache_pipe_buf_get, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, }; -/* - * Pipe output worker. This sets up our pipe format with the page cache - * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + +/** + * splice_to_pipe - fill passed data into a pipe + * @pipe: pipe to fill + * @spd: data to fill + * + * Description: + * @spd contains a map of pages and len/offset tuples, along with + * the struct pipe_buf_operations associated with these pages. This + * function will link that data to the pipe. + * */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long len, - unsigned int offset, unsigned int flags) +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - int ret, do_wakeup, i; + unsigned int spd_pages = spd->nr_pages; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -168,38 +200,34 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; pipe->nrbufs++; - if (pipe->inode) + page_nr++; + ret += buf->len; + + if (pipe->files) do_wakeup = 1; - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) + if (!--spd->nr_pages) break; - if (!len) - break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; } - if (flags & SPLICE_F_NONBLOCK) { + if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -224,111 +252,178 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, pipe->waiting_writers--; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) + wakeup_pipe_readers(pipe); - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd_pages) + spd->spd_release(spd, page_nr++); return ret; } +void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd->pages[i]); +} + +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + unsigned int buffers = ACCESS_ONCE(pipe->buffers); + + spd->nr_pages_max = buffers; + if (buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct splice_pipe_desc *spd) +{ + if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int loff, offset, nr_pages; - struct page *pages[PIPE_BUFFERS]; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; - size_t bytes; - int i, error; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; index = *ppos >> PAGE_CACHE_SHIFT; - loff = offset = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (nr_pages > PIPE_BUFFERS) - nr_pages = PIPE_BUFFERS; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, spd.nr_pages_max); /* - * Initiate read-ahead on this page range. however, don't call into - * read-ahead if this is a non-zero offset (we are likely doing small - * chunk splice and the page is already there) for a single page. + * Lookup the (hopefully) full range of pages we need. */ - if (!offset || nr_pages > 1) - do_page_cache_readahead(mapping, in, index, nr_pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); + index += spd.nr_pages; /* - * Now fill in the holes: + * If find_get_pages_contig() returned fewer pages than we needed, + * readahead/allocate the rest and fill in the holes. */ - error = 0; - bytes = 0; - for (i = 0; i < nr_pages; i++, index++) { - unsigned int this_len; + if (spd.nr_pages < nr_pages) + page_cache_sync_readahead(mapping, &in->f_ra, in, + index, req_pages - spd.nr_pages); - if (!len) - break; - - /* - * this_len is the max we'll use from this page - */ - this_len = min(len, PAGE_CACHE_SIZE - loff); -find_page: + error = 0; + while (spd.nr_pages < nr_pages) { /* - * lookup the page for this index + * Page could be there, find_get_pages_contig() breaks on + * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* - * page didn't exist, allocate one + * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + GFP_KERNEL); if (unlikely(error)) { page_cache_release(page); + if (error == -EEXIST) + continue; break; } - - goto readpage; + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); } + spd.pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. + */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (PageReadahead(page)) + page_cache_async_readahead(mapping, &in->f_ra, in, + page, index, req_pages - page_nr); + /* * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { - /* - * If in nonblock mode then dont block on waiting - * for an in-flight io page - */ - if (flags & SPLICE_F_NONBLOCK) - break; - lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -338,452 +433,680 @@ find_page: goto fill_it; } -readpage: /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ if (error == AOP_TRUNCATED_PAGE) - goto find_page; + error = 0; + break; } + } +fill_it: + /* + * i_size must be checked after PageUptodate. + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + unsigned int plen; /* - * i_size must be checked after ->readpage(). + * max good bytes in this page */ - isize = i_size_read(mapping->host); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) break; - } /* - * if this is the last page, see if we need to shrink - * the length and stop + * force quit after adding this page */ - if (end_index == index) { - loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (bytes + loff > isize) { - page_cache_release(page); - break; - } - /* - * force quit after adding this page - */ - nr_pages = i; - this_len = min(this_len, loff); - } + this_len = min(this_len, plen - loff); + len = this_len; } -fill_it: - pages[i] = page; - bytes += this_len; + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; + spd.nr_pages++; + index++; } - if (i) - return move_to_pipe(pipe, pages, i, bytes, offset, flags); + /* + * Release any pages at the end, if we quit early. 'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(&spd); return error; } /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from + * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * - * Will read pages from given file and fill them into a pipe. + * Description: + * Will read pages from given file and fill them into a pipe. Can be + * used as long as the address_space operations for the source implements + * a readpage() hook. + * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - ssize_t spliced; + loff_t isize, left; int ret; - ret = 0; - spliced = 0; - - while (len) { - ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; - if (ret < 0) - break; - else if (!ret) { - if (spliced) - break; - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - } + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) { *ppos += ret; - len -= ret; - spliced += ret; + file_accessed(in); } - if (spliced) - return spliced; - return ret; } - EXPORT_SYMBOL(generic_file_splice_read); -/* - * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). - */ -static int pipe_to_sendpage(struct pipe_inode_info *info, - struct pipe_buffer *buf, struct splice_desc *sd) -{ - struct file *file = sd->file; - loff_t pos = sd->pos; - unsigned int offset; - ssize_t ret; - void *ptr; - int more; +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; - /* - * Sub-optimal, but we are limited by the pipe ->map. We don't - * need a kmap'ed buffer here, we just want to make sure we - * have the page pinned if the pipe page originates from the - * page cache. - */ - ptr = buf->ops->map(file, info, buf); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} - offset = pos & ~PAGE_CACHE_MASK; - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; - buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); - return -EIO; + return res; } -/* - * This is a little more tricky than the file -> pipe splicing. There are - * basically three cases: - * - * - Destination page already exists in the address space and there - * are users of it. For that case we have no other option that - * copying the data. Tough luck. - * - Destination page already exists in the address space, but there - * are no users of it. Make sure it's uptodate, then drop it. Fall - * through to last case. - * - Destination page does not exist, we can add the pipe page to - * the page cache and avoid the copy. - * - * If asked to move pages to the output file (SPLICE_F_MOVE is set in - * sd->flags), we attempt to migrate pages from the pipe to the output - * file address space page cache. This is possible if no one else has - * the pipe page referenced outside of the pipe and page cache. If - * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create - * a new page in the output file page cache and fill/dirty that. - */ -static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, - struct splice_desc *sd) +ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) { - struct file *file = sd->file; - struct address_space *mapping = file->f_mapping; - gfp_t gfp_mask = mapping_gfp_mask(mapping); - unsigned int offset; - struct page *page; - pgoff_t index; - char *src; - int ret; + mm_segment_t old_fs; + ssize_t res; - /* - * make sure the data in this buffer is uptodate - */ - src = buf->ops->map(file, info, buf); - if (IS_ERR(src)) - return PTR_ERR(src); + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_write(file, (__force const char __user *)buf, count, &pos); + set_fs(old_fs); - index = sd->pos >> PAGE_CACHE_SHIFT; - offset = sd->pos & ~PAGE_CACHE_MASK; + return res; +} +EXPORT_SYMBOL(kernel_write); - /* - * Reuse buf page, if SPLICE_F_MOVE is set. - */ - if (sd->flags & SPLICE_F_MOVE) { - /* - * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. The page - * will also be looked on successful return. - */ - if (buf->ops->steal(info, buf)) - goto find_page; - - page = buf->page; - if (add_to_page_cache(page, mapping, index, gfp_mask)) - goto find_page; - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(page); - } else { -find_page: - page = find_lock_page(mapping, index); - if (!page) { - ret = -ENOMEM; - page = page_cache_alloc_cold(mapping); - if (unlikely(!page)) - goto out_nomem; +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { + vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } - /* - * This will also lock the page - */ - ret = add_to_page_cache_lru(page, mapping, index, - gfp_mask); - if (unlikely(ret)) - goto out; - } + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * We get here with the page locked. If the page is also - * uptodate, we don't need to do more. If it isn't, we - * may need to bring it in if we are not going to overwrite - * the full page. - */ - if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { - ret = mapping->a_ops->readpage(file, page); - if (unlikely(ret)) - goto out; - - lock_page(page); - - if (!PageUptodate(page)) { - /* - * Page got invalidated, repeat. - */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto find_page; - } - ret = -EIO; - goto out; - } - } else - SetPageUptodate(page); - } - } + for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { + struct page *page; - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } else if (ret) - goto out; + page = alloc_page(GFP_USER); + error = -ENOMEM; + if (!page) + goto err; - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { - char *dst = kmap_atomic(page, KM_USER0); + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) page_address(page); + vec[i].iov_len = this_len; + spd.pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } - memcpy(dst + offset, src + buf->offset, sd->len); - flush_dcache_page(page); - kunmap_atomic(dst, KM_USER0); + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) { + error = res; + goto err; } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } else if (ret) - goto out; + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + this_len = min_t(size_t, vec[i].iov_len, res); + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; + if (!this_len) { + __free_page(spd.pages[i]); + spd.pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); -out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) - page_cache_release(page); + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; - unlock_page(page); -out_nomem: - buf->ops->unmap(info, buf); - return ret; -} +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(&spd); + return res; -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); +err: + for (i = 0; i < spd.nr_pages; i++) + __free_page(spd.pages[i]); + + res = error; + goto shrink_ret; +} +EXPORT_SYMBOL(default_file_splice_read); /* - * Pipe input worker. Most of this logic works like a regular pipe, the - * key here is the 'actor' worker passed in that actually moves the data - * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). Return the number of bytes sent. */ -static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags, - splice_actor *actor) +static int pipe_to_sendpage(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct splice_desc *sd) { - int ret, do_wakeup, err; - struct splice_desc sd; + struct file *file = sd->u.file; + loff_t pos = sd->pos; + int more; - ret = 0; - do_wakeup = 0; + if (!likely(file->f_op->sendpage)) + return -EINVAL; - sd.total_len = len; - sd.flags = flags; - sd.file = out; - sd.pos = *ppos; + more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + if (sd->len < sd->total_len && pipe->nrbufs > 1) + more |= MSG_SENDPAGE_NOTLAST; - for (;;) { - if (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - struct pipe_buf_operations *ops = buf->ops; + return file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); +} - sd.len = buf->len; - if (sd.len > sd.total_len) - sd.len = sd.total_len; +static void wakeup_pipe_writers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); +} - err = actor(pipe, buf, &sd); - if (err) { - if (!ret && err != -ENODATA) - ret = err; +/** + * splice_from_pipe_feed - feed available data from a pipe to a file + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function loops over the pipe and calls @actor to do the + * actual moving of a single struct pipe_buffer to the desired + * destination. It returns when there's no more buffers left in + * the pipe or if the requested number of bytes (@sd->total_len) + * have been copied. It returns a positive number (one) if the + * pipe needs to be filled with more data, zero if the required + * number of bytes have been copied and -errno on error. + * + * This, together with splice_from_pipe_{begin,end,next}, may be + * used to implement the functionality of __splice_from_pipe() when + * locking is required around copying the pipe buffers to the + * destination. + */ +static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; - break; - } + while (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + const struct pipe_buf_operations *ops = buf->ops; - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + sd->len = buf->len; + if (sd->len > sd->total_len) + sd->len = sd->total_len; - if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + return ret; + } - sd.pos += sd.len; - sd.total_len -= sd.len; - if (!sd.total_len) - break; + ret = actor(pipe, buf, sd); + if (ret <= 0) + return ret; + + buf->offset += ret; + buf->len -= ret; + + sd->num_spliced += ret; + sd->len -= ret; + sd->pos += ret; + sd->total_len -= ret; + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd->need_wakeup = true; } - if (pipe->nrbufs) - continue; + if (!sd->total_len) + return 0; + } + + return 1; +} + +/** + * splice_from_pipe_next - wait for some data to splice from + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wait for some data and return a positive + * value (one) if pipe buffers are available. It will return zero + * or -errno if no more data needs to be spliced. + */ +static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + while (!pipe->nrbufs) { if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (ret) - break; - } + return 0; - if (flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + if (!pipe->waiting_writers && sd->num_spliced) + return 0; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } + if (sd->flags & SPLICE_F_NONBLOCK) + return -EAGAIN; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - do_wakeup = 0; + if (signal_pending(current)) + return -ERESTARTSYS; + + if (sd->need_wakeup) { + wakeup_pipe_writers(pipe); + sd->need_wakeup = false; } pipe_wait(pipe); } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + return 1; +} - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } +/** + * splice_from_pipe_begin - start splicing from pipe + * @sd: information about the splice operation + * + * Description: + * This function should be called before a loop containing + * splice_from_pipe_next() and splice_from_pipe_feed() to + * initialize the necessary fields of @sd. + */ +static void splice_from_pipe_begin(struct splice_desc *sd) +{ + sd->num_spliced = 0; + sd->need_wakeup = false; +} + +/** + * splice_from_pipe_end - finish splicing from pipe + * @pipe: pipe to splice from + * @sd: information about the splice operation + * + * Description: + * This function will wake up pipe writers if necessary. It should + * be called after a loop containing splice_from_pipe_next() and + * splice_from_pipe_feed(). + */ +static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +{ + if (sd->need_wakeup) + wakeup_pipe_writers(pipe); +} + +/** + * __splice_from_pipe - splice data from a pipe to given actor + * @pipe: pipe to splice from + * @sd: information to @actor + * @actor: handler that splices the data + * + * Description: + * This function does little more than loop over the pipe and call + * @actor to do the actual moving of a single struct pipe_buffer to + * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * pipe_to_user. + * + */ +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, + splice_actor *actor) +{ + int ret; + + splice_from_pipe_begin(sd); + do { + ret = splice_from_pipe_next(pipe, sd); + if (ret > 0) + ret = splice_from_pipe_feed(pipe, sd, actor); + } while (ret > 0); + splice_from_pipe_end(pipe, sd); + + return sd->num_spliced ? sd->num_spliced : ret; +} +EXPORT_SYMBOL(__splice_from_pipe); + +/** + * splice_from_pipe - splice data from a pipe to a file + * @pipe: pipe to splice from + * @out: file to splice to + * @ppos: position in @out + * @len: how many bytes to splice + * @flags: splice modifier flags + * @actor: handler that splices the data + * + * Description: + * See __splice_from_pipe. This function locks the pipe inode, + * otherwise it's identical to __splice_from_pipe(). + * + */ +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + ssize_t ret; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, actor); + pipe_unlock(pipe); return ret; } /** - * generic_file_splice_write - splice data from a pipe to a file + * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. + * Description: + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * This one is ->write_iter-based. * */ ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, +iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - struct address_space *mapping = out->f_mapping; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + int nbufs = pipe->buffers; + struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); ssize_t ret; - ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - if (ret > 0) { - struct inode *inode = mapping->host; + if (unlikely(!array)) + return -ENOMEM; - *ppos += ret; + pipe_lock(pipe); - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; + splice_from_pipe_begin(&sd); + while (sd.total_len) { + struct iov_iter from; + struct kiocb kiocb; + size_t left; + int n, idx; + + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + if (unlikely(nbufs < pipe->buffers)) { + kfree(array); + nbufs = pipe->buffers; + array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + if (!array) { + ret = -ENOMEM; + break; + } + } - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + /* build the vector */ + left = sd.total_len; + for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { + struct pipe_buffer *buf = pipe->bufs + idx; + size_t this_len = buf->len; - if (err) - ret = err; + if (this_len > left) + this_len = left; + + if (idx == pipe->buffers - 1) + idx = -1; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + goto done; + } + + array[n].bv_page = buf->page; + array[n].bv_len = this_len; + array[n].bv_offset = buf->offset; + left -= this_len; + } + + /* ... iov_iter */ + from.type = ITER_BVEC | WRITE; + from.bvec = array; + from.nr_segs = n; + from.count = sd.total_len - left; + from.iov_offset = 0; + + /* ... and iocb */ + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + kiocb.ki_nbytes = sd.total_len - left; + + /* now, send it */ + ret = out->f_op->write_iter(&kiocb, &from); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret <= 0) + break; + + sd.num_spliced += ret; + sd.total_len -= ret; + *ppos = sd.pos = kiocb.ki_pos; + + /* dismiss the fully eaten buffers, adjust the partial one */ + while (ret) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + if (ret >= buf->len) { + const struct pipe_buf_operations *ops = buf->ops; + ret -= buf->len; + buf->len = 0; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd.need_wakeup = true; + } else { + buf->offset += ret; + buf->len -= ret; + ret = 0; + } } } +done: + kfree(array); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; return ret; } -EXPORT_SYMBOL(generic_file_splice_write); +EXPORT_SYMBOL(iter_file_splice_write); + +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret; + void *data; + loff_t tmp = sd->pos; + + data = kmap(buf->page); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); + kunmap(buf->page); + + return ret; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; + + return ret; +} /** * generic_splice_sendpage - splice data from a pipe to a socket - * @inode: pipe inode + * @pipe: pipe to splice from * @out: socket to write to + * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * - * Will send @len bytes from the pipe to a network socket. No data copying - * is involved. + * Description: + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -794,19 +1117,15 @@ EXPORT_SYMBOL(generic_splice_sendpage); static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - int ret; - - if (unlikely(!out->f_op || !out->f_op->splice_write)) - return -EINVAL; + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - - ret = rw_verify_area(WRITE, out, ppos, len); - if (unlikely(ret < 0)) - return ret; + if (out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else + splice_write = default_file_splice_write; - return out->f_op->splice_write(pipe, out, ppos, len, flags); + return splice_write(pipe, out, ppos, len, flags); } /* @@ -816,12 +1135,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t isize, left; + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -829,32 +1146,42 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - isize = i_size_read(in->f_mapping->host); - if (unlikely(*ppos >= isize)) - return 0; - - left = isize - *ppos; - if (unlikely(left < len)) - len = left; + if (in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else + splice_read = default_file_splice_read; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + return splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) +/** + * splice_direct_to_actor - splices data directly between two non-pipes + * @in: file to splice from + * @sd: actor information on where to splice to + * @actor: handles the data splicing + * + * Description: + * This is a special case helper to splice directly between two + * points, without requiring an explicit pipe. Internally an allocated + * pipe is cached in the process, and reused during the lifetime of + * that process. + * + */ +ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + splice_direct_actor *actor) { struct pipe_inode_info *pipe; long ret, bytes; - loff_t out_off; umode_t i_mode; - int i; + size_t len; + int i, flags; /* * We require the input being a regular file, as we don't want to * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! */ - i_mode = in->f_dentry->d_inode->i_mode; + i_mode = file_inode(in)->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; @@ -864,13 +1191,13 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, */ pipe = current->splice_pipe; if (unlikely(!pipe)) { - pipe = alloc_pipe_info(NULL); + pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff - * out of the pipe right after the move_to_pipe(). So set + * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; @@ -883,49 +1210,49 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, */ ret = 0; bytes = 0; - out_off = 0; + len = sd->total_len; + flags = sd->flags; - while (len) { - size_t read_len, max_read_len; + /* + * Don't block on output, we have to drain the direct pipe. + */ + sd->flags &= ~SPLICE_F_NONBLOCK; - /* - * Do at most PIPE_BUFFERS pages worth of transfer: - */ - max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + while (len) { + size_t read_len; + loff_t pos = sd->pos, prev_pos = pos; - ret = do_splice_to(in, ppos, pipe, max_read_len, flags); - if (unlikely(ret < 0)) + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) goto out_release; read_len = ret; + sd->total_len = read_len; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, &out_off, read_len, - flags & ~SPLICE_F_NONBLOCK); - if (unlikely(ret < 0)) + ret = actor(pipe, sd); + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; goto out_release; + } bytes += ret; len -= ret; + sd->pos = pos; - /* - * In nonblocking mode, if we got back a short read then - * that was due to either an IO error or due to the - * pagecache entry not being there. In the IO error case - * the _next_ splice attempt will produce a clean IO error - * return value (not a short read), so in both cases it's - * correct to break out of the loop here: - */ - if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) - break; + if (ret < read_len) { + sd->pos = prev_pos + ret; + goto out_release; + } } +done: pipe->nrbufs = pipe->curbuf = 0; - + file_accessed(in); return bytes; out_release: @@ -933,7 +1260,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -941,18 +1268,72 @@ out_release: buf->ops = NULL; } } - pipe->nrbufs = pipe->curbuf = 0; - /* - * If we transferred some data, return the number of bytes: - */ - if (bytes > 0) - return bytes; + if (!bytes) + bytes = ret; + + goto done; +} +EXPORT_SYMBOL(splice_direct_to_actor); + +static int direct_splice_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, sd->opos, sd->total_len, + sd->flags); +} + +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + */ +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + .opos = opos, + }; + long ret; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, opos, len); + if (unlikely(ret < 0)) + return ret; + + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + if (ret > 0) + *ppos = sd.pos; return ret; } -EXPORT_SYMBOL(do_splice_direct); +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); /* * Determine where to splice to/from. @@ -961,47 +1342,82 @@ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { - struct pipe_inode_info *pipe; - loff_t offset, *off; + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset; long ret; - pipe = in->f_dentry->d_inode->i_pipe; - if (pipe) { + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { - if (out->f_op->llseek == no_llseek) + if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } - ret = do_splice_from(pipe, out, off, len, flags); + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, &offset, len); + if (unlikely(ret < 0)) + return ret; + + file_start_write(out); + ret = do_splice_from(ipipe, out, &offset, len, flags); + file_end_write(out); + + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; } - pipe = out->f_dentry->d_inode->i_pipe; - if (pipe) { + if (opipe) { if (off_out) return -ESPIPE; if (off_in) { - if (in->f_op->llseek == no_llseek) + if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1010,174 +1426,557 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } -asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, - int fd_out, loff_t __user *off_out, - size_t len, unsigned int flags) +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, bool aligned, + unsigned int pipe_buffers) +{ + int buffers = 0, error = 0; + + while (nr_vecs) { + unsigned long off, npages; + struct iovec entry; + void __user *base; + size_t len; + int i; + + error = -EFAULT; + if (copy_from_user(&entry, iov, sizeof(entry))) + break; + + base = entry.iov_base; + len = entry.iov_len; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + error = 0; + if (unlikely(!len)) + break; + error = -EFAULT; + if (!access_ok(VERIFY_READ, base, len)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; + + error = get_user_pages_fast((unsigned long)base, npages, + 0, &pages[buffers]); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == pipe_buffers) + break; + + nr_vecs--; + iov++; + } + + if (buffers) + return buffers; + + return error; +} + +static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; +} + +/* + * For lack of a better implementation, implement vmsplice() to userspace + * as a simple copy of the pipes pages to the user iov. + */ +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct splice_desc sd; + long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + goto out; + + count = ret; + iov_iter_init(&iter, READ, iov, nr_segs, count); + + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); + pipe_unlock(pipe); + +out: + if (iov != iovstack) + kfree(iov); + + return ret; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + */ +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + .spd_release = spd_release_page, + }; + long ret; + + pipe = get_pipe_info(file); + if (!pipe) + return -EBADF; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, false, + spd.nr_pages_max); + if (spd.nr_pages <= 0) + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(&spd); + return ret; +} + +/* + * Note that vmsplice only really supports true splicing _from_ user memory + * to a pipe, not the other way around. Splicing from user memory is a simple + * operation that can be supported without any funky alignment restrictions + * or nasty vm tricks. We simply map in the user memory and fill them into + * a pipe. The reverse isn't quite as easy, though. There are two possible + * solutions for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Currently we punt and implement it as a normal copy, see pipe_to_user(). + * + */ +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + struct fd f; + long error; + + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + error = -EBADF; + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_WRITE) + error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); + else if (f.file->f_mode & FMODE_READ) + error = vmsplice_to_user(f.file, iov, nr_segs, flags); + + fdput(f); + } + + return error; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, + unsigned int, nr_segs, unsigned int, flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} +#endif + +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) { + struct fd in, out; long error; - struct file *in, *out; - int fput_in, fput_out; if (unlikely(!len)) return 0; error = -EBADF; - in = fget_light(fd_in, &fput_in); - if (in) { - if (in->f_mode & FMODE_READ) { - out = fget_light(fd_out, &fput_out); - if (out) { - if (out->f_mode & FMODE_WRITE) - error = do_splice(in, off_in, - out, off_out, + in = fdget(fd_in); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + out = fdget(fd_out); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_splice(in.file, off_in, + out.file, off_out, len, flags); - fput_light(out, fput_out); + fdput(out); } } + fdput(in); + } + return error; +} + +/* + * Make sure there's data to read. Wait for input if we can, otherwise + * return an appropriate error. + */ +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; - fput_light(in, fput_in); + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (!pipe->nrbufs) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } + pipe_wait(pipe); } - return error; + pipe_unlock(pipe); + return ret; } /* - * Link contents of ipipe to opipe. + * Make sure there's writeable room. Wait for room if we can, otherwise + * return an appropriate error. */ -static int link_pipe(struct pipe_inode_info *ipipe, - struct pipe_inode_info *opipe, - size_t len, unsigned int flags) +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +{ + int ret; + + /* + * Check ->nrbufs without the inode lock first. This function + * is speculative anyways, so missing one is ok. + */ + if (pipe->nrbufs < pipe->buffers) + return 0; + + ret = 0; + pipe_lock(pipe); + + while (pipe->nrbufs >= pipe->buffers) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + break; + } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + pipe_unlock(pipe); + return ret; +} + +/* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret, do_wakeup, i, ipipe_first; + int ret = 0, nbuf; + bool input_wakeup = false; + - ret = do_wakeup = ipipe_first = 0; +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - if (ipipe->inode < opipe->inode) { - ipipe_first = 1; - mutex_lock(&ipipe->inode->i_mutex); - mutex_lock(&opipe->inode->i_mutex); - } else { - mutex_lock(&opipe->inode->i_mutex); - mutex_lock(&ipipe->inode->i_mutex); - } + pipe_double_lock(ipipe, opipe); - for (i = 0;; i++) { + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - if (ipipe->nrbufs - i) { - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - /* - * If we have room, fill this buffer - */ - if (opipe->nrbufs < PIPE_BUFFERS) { - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + if (!ipipe->nrbufs && !ipipe->writers) + break; - /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. - */ - ibuf->ops->get(ipipe, ibuf); + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + /* Already processed some buffers, break */ + if (ret) + break; - obuf = opipe->bufs + nbuf; - *obuf = *ibuf; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } - if (obuf->len > len) - obuf->len = len; + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } - opipe->nrbufs++; - do_wakeup = 1; - ret += obuf->len; - len -= obuf->len; + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + obuf = opipe->bufs + nbuf; - if (!len) - break; - if (opipe->nrbufs < PIPE_BUFFERS) - continue; - } + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; /* - * We have input available, but no output room. - * If we already copied data, return that. If we - * need to drop the opipe lock, it must be ordered - * last to avoid deadlocks. + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. */ - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { - if (!ret) - ret = -EAGAIN; - break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - opipe->waiting_writers++; - pipe_wait(opipe); - opipe->waiting_writers--; - continue; + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; } + ret += obuf->len; + len -= obuf->len; + } while (len); - /* - * No input buffers, do the usual checks for available - * writers and blocking and wait if necessary - */ - if (!ipipe->writers) + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); + + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, i = 0, nbuf; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; break; - if (!ipipe->waiting_writers) { - if (ret) - break; } + /* - * pipe_wait() drops the ipipe mutex. To avoid deadlocks - * with another process, we can only safely do that if - * the ipipe lock is ordered last. + * If we have iterated all input buffers or ran out of + * output room, break. */ - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { - if (!ret) - ret = -EAGAIN; + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; - break; - } - if (waitqueue_active(&ipipe->wait)) - wake_up_interruptible_sync(&ipipe->wait); - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); - pipe_wait(ipipe); - } + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); - mutex_unlock(&ipipe->inode->i_mutex); - mutex_unlock(&opipe->inode->i_mutex); + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - } + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + ret += obuf->len; + len -= obuf->len; + i++; + } while (len); + + /* + * return EAGAIN if we have the potential of some data in the + * future, otherwise just return 0 + */ + if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) + ret = -EAGAIN; + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) + wakeup_pipe_readers(opipe); return ret; } @@ -1191,40 +1990,51 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; - struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); + int ret = -EINVAL; /* - * Link ipipe to the two output pipes, consuming as we go along. + * Duplicate the contents of ipipe to opipe without actually + * copying the data. */ - if (ipipe && opipe) - return link_pipe(ipipe, opipe, len, flags); + if (ipipe && opipe && ipipe != opipe) { + /* + * Keep going, unless we encounter an error. The ipipe/opipe + * ordering doesn't really matter. + */ + ret = ipipe_prep(ipipe, flags); + if (!ret) { + ret = opipe_prep(opipe, flags); + if (!ret) + ret = link_pipe(ipipe, opipe, len, flags); + } + } - return -EINVAL; + return ret; } -asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct file *in; - int error, fput_in; + struct fd in; + int error; if (unlikely(!len)) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); - if (in) { - if (in->f_mode & FMODE_READ) { - int fput_out; - struct file *out = fget_light(fdout, &fput_out); - - if (out) { - if (out->f_mode & FMODE_WRITE) - error = do_tee(in, out, len, flags); - fput_light(out, fput_out); + in = fdget(fdin); + if (in.file) { + if (in.file->f_mode & FMODE_READ) { + struct fd out = fdget(fdout); + if (out.file) { + if (out.file->f_mode & FMODE_WRITE) + error = do_tee(in.file, out.file, + len, flags); + fdput(out); } } - fput_light(in, fput_in); + fdput(in); } return error; |
