diff options
Diffstat (limited to 'fs')
128 files changed, 4688 insertions, 1950 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd01d90cada..57997fa14e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat); void v9fs_dentry_release(struct dentry *); -int v9fs_uflags2omode(int uflags); +int v9fs_uflags2omode(int uflags, int extended); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 0d55affe37d..52944d2249a 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags); + omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) inode->i_size = 0; inode->i_blocks = 0; } + if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + generic_file_llseek(file, 0, SEEK_END); } file->private_data = fid; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 40fa807bd92..c95295c6504 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) /** * v9fs_uflags2omode- convert posix open flags to plan 9 mode bits * @uflags: flags to convert - * + * @extended: if .u extensions are active */ -int v9fs_uflags2omode(int uflags) +int v9fs_uflags2omode(int uflags, int extended) { int ret; @@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags) break; } - if (uflags & O_EXCL) - ret |= P9_OEXCL; - if (uflags & O_TRUNC) ret |= P9_OTRUNC; - if (uflags & O_APPEND) - ret |= P9_OAPPEND; + if (extended) { + if (uflags & O_EXCL) + ret |= P9_OEXCL; + + if (uflags & O_APPEND) + ret |= P9_OAPPEND; + } return ret; } @@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags)); + v9fs_uflags2omode(flags, v9fs_extended(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; diff --git a/fs/Kconfig b/fs/Kconfig index cf12c403b8c..313b2e06ded 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -830,7 +830,7 @@ config NTFS_FS from the project web site. For more information see <file:Documentation/filesystems/ntfs.txt> - and <http://linux-ntfs.sourceforge.net/>. + and <http://www.linux-ntfs.org/>. To compile this file system support as a module, choose M here: the module will be called ntfs. @@ -930,7 +930,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 55e8ee1900a..3263084eef9 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU + depends on !MMU && (!FRV || BROKEN) help Support uClinux FLAT format binaries. diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da..277b079dec9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a78d5b236bb..587ef5123cd 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse <dwmw2@cambridge.redhat.com> + * Authors: David Woodhouse <dwmw2@infradead.org> * David Howells <dhowells@redhat.com> * */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 08db82e1343..bb47217f6a1 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -8,7 +8,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Authors: David Woodhouse <dwmw2@cambridge.redhat.com> + * Authors: David Woodhouse <dwmw2@infradead.org> * David Howells <dhowells@redhat.com> * */ diff --git a/fs/afs/super.c b/fs/afs/super.c index 4b572b801d8..7e3faeef681 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -10,7 +10,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Authors: David Howells <dhowells@redhat.com> - * David Woodhouse <dwmw2@redhat.com> + * David Woodhouse <dwmw2@infradead.org> * */ @@ -591,10 +591,6 @@ static void use_mm(struct mm_struct *mm) atomic_inc(&mm->mm_count); tsk->mm = mm; tsk->active_mm = mm; - /* - * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise - * it won't work. Update it accordingly if you change it here - */ switch_mm(active_mm, mm, tsk); task_unlock(tsk); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0fa95b198e6..d48ff5f370f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -16,7 +16,6 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/mman.h> -#include <linux/a.out.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/binfmts.h> @@ -548,7 +547,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) struct { struct elfhdr elf_ex; struct elfhdr interp_elf_ex; - struct exec interp_ex; } *loc; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -680,7 +678,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } /* Get the exec headers */ - loc->interp_ex = *((struct exec *)bprm->buf); loc->interp_elf_ex = *((struct elfhdr *)bprm->buf); break; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index ddd35d87339..d051a32e627 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -390,7 +390,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, } /* expand the stack mapping to use up the entire allocation granule */ - fullsize = ksize((char *) current->mm->start_brk); + fullsize = kobjsize((char *) current->mm->start_brk); if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, fullsize, 0, 0))) stack_size = fullsize; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 3b40d45a3a1..2cb1acda3a8 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -548,7 +548,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); /* Remap to use all availabe slack region space */ if (realdatastart && (realdatastart < (unsigned long)-4096)) { - reallen = ksize((void *)realdatastart); + reallen = kobjsize((void *)realdatastart); if (reallen > len) { realdatastart = do_mremap(realdatastart, len, reallen, MREMAP_FIXED, realdatastart); @@ -600,7 +600,7 @@ static int load_flat_file(struct linux_binprm * bprm, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); /* Remap to use all availabe slack region space */ if (textpos && (textpos < (unsigned long) -4096)) { - reallen = ksize((void *)textpos); + reallen = kobjsize((void *)textpos); if (reallen > len) { textpos = do_mremap(textpos, len, reallen, MREMAP_FIXED, textpos); @@ -683,7 +683,7 @@ static int load_flat_file(struct linux_binprm * bprm, */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; - current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; + current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; } if (flags & FLAT_FLAG_KTRACE) @@ -790,7 +790,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* zero the BSS, BRK and stack areas */ memset((void*)(datapos + data_len), 0, bss_len + - (memp + ksize((void *) memp) - stack_len - /* end brk */ + (memp + kobjsize((void *) memp) - stack_len - /* end brk */ libinfo->lib_list[id].start_brk) + /* start brk */ stack_len); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 00000000000..63e2ee63058 --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,719 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/blkdev.h> +#include <linux/mempool.h> +#include <linux/bio.h> +#include <linux/workqueue.h> + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? */ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, + unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, + DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, + unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len) */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, + unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, + unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. + */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); @@ -28,25 +28,10 @@ #include <linux/blktrace_api.h> #include <scsi/sg.h> /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 mempool_t *bio_split_pool __read_mostly; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { #undef BV /* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - -/* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; @@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1332,6 +1347,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/fs/block_dev.c b/fs/block_dev.c index 7d822fae776..10d8a0aa871 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -12,6 +12,7 @@ #include <linux/kmod.h> #include <linux/major.h> #include <linux/smp_lock.h> +#include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> #include <linux/module.h> @@ -928,9 +929,22 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) { struct module *owner = NULL; struct gendisk *disk; - int ret = -ENXIO; + int ret; int part; + int perm = 0; + + if (file->f_mode & FMODE_READ) + perm |= MAY_READ; + if (file->f_mode & FMODE_WRITE) + perm |= MAY_WRITE; + /* + * hooks: /n/, see "layering violations". + */ + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) + return ret; + ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); disk = get_gendisk(bdev->bd_dev, &part); diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f01..5fa1512cd9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. */ - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block(SWRITE_SYNC, 1, &bh); brelse(bh); spin_lock(lock); } @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, */ if (pos+copied > inode->i_size) { i_size_write(inode, pos+copied); - mark_inode_dirty(inode); + i_size_changed = 1; } unlock_page(page); page_cache_release(page); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + return copied; } EXPORT_SYMBOL(generic_write_end); @@ -2940,16 +2953,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE) + if (rw == SWRITE || rw == SWRITE_SYNC) lock_buffer(bh); else if (test_set_buffer_locked(bh)) continue; - if (rw == WRITE || rw == SWRITE) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); continue; } } else { @@ -2978,7 +2994,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); diff --git a/fs/char_dev.c b/fs/char_dev.c index 68e510b8845..3cb7cda3d78 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); + /* Check i_cdev again in case somebody beat us to it while + we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; @@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp) cdev_put(p); return -ENXIO; } - if (filp->f_op->open) { - lock_kernel(); + if (filp->f_op->open) ret = filp->f_op->open(inode,filp); - unlock_kernel(); - } if (ret) cdev_put(p); return ret; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 28e3d5c5fca..1f3465201fd 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,6 +2,11 @@ Version 1.53 ------------ DFS support added (Microsoft Distributed File System client support needed for referrals which enable a hierarchical name space among servers). +Disable temporary caching of mode bits to servers which do not support +storing of mode (e.g. Windows servers, when client mounts without cifsacl +mount option) and add new "dynperm" mount option to enable temporary caching +of mode (enable old behavior). Fix hang on mount caused when server crashes +tcp session during negotiate protocol. Version 1.52 ------------ diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cb52cbbe45f..f58e41d3ba4 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -186,6 +186,11 @@ asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) } } } + + /* don't trust len bigger than ctx buffer */ + if (*len > ctx->end - ctx->pointer) + return 0; + return 1; } @@ -203,6 +208,10 @@ asn1_header_decode(struct asn1_ctx *ctx, if (!asn1_length_decode(ctx, &def, &len)) return 0; + /* primitive shall be definite, indefinite shall be constructed */ + if (*con == ASN1_PRI && !def) + return 0; + if (def) *eoc = ctx->pointer + len; else @@ -389,6 +398,11 @@ asn1_oid_decode(struct asn1_ctx *ctx, unsigned long *optr; size = eoc - ctx->pointer + 1; + + /* first subid actually encodes first two subids */ + if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) + return 0; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); if (*oid == NULL) return 0; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 34902cff540..0e9fc2ba90e 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -34,11 +34,11 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, + {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"}, + {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} } ; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5df93fd6303..22857c639df 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -97,9 +97,6 @@ cifs_read_super(struct super_block *sb, void *data, { struct inode *inode; struct cifs_sb_info *cifs_sb; -#ifdef CONFIG_CIFS_DFS_UPCALL - int len; -#endif int rc = 0; /* BB should we make this contingent on mount parm? */ @@ -117,15 +114,17 @@ cifs_read_super(struct super_block *sb, void *data, * complex operation (mount), and in case of fail * just exit instead of doing mount and attempting * undo it if this copy fails?*/ - len = strlen(data); - cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); - if (cifs_sb->mountdata == NULL) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - return -ENOMEM; + if (data) { + int len = strlen(data); + cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + return -ENOMEM; + } + strncpy(cifs_sb->mountdata, data, len + 1); + cifs_sb->mountdata[len] = '\0'; } - strncpy(cifs_sb->mountdata, data, len + 1); - cifs_sb->mountdata[len] = '\0'; #endif rc = cifs_mount(sb, cifs_sb, data, devname); @@ -613,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(file, offset, origin); + return generic_file_llseek_unlocked(file, offset, origin); } struct file_system_type cifs_fs_type = { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 08914053242..9cfcf326ead 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -333,7 +333,6 @@ struct cifsFileInfo { bool messageMode:1; /* for pipes: message vs byte mode */ atomic_t wrtPending; /* handle in use - defer close */ struct semaphore fh_sem; /* prevents reopen race after dead ses*/ - char *search_resume_name; /* BB removeme BB */ struct cifs_search_info srch_inf; }; @@ -626,7 +625,7 @@ GLOBAL_EXTERN atomic_t tcpSesAllocCount; GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; -/* Various Debug counters to remove someday (BB) */ +/* Various Debug counters */ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ #ifdef CONFIG_CIFS_STATS2 GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 65d58b4e6a6..0f327c224da 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -79,6 +79,19 @@ #define TRANS2_GET_DFS_REFERRAL 0x10 #define TRANS2_REPORT_DFS_INCOSISTENCY 0x11 +/* SMB Transact (Named Pipe) subcommand codes */ +#define TRANS_SET_NMPIPE_STATE 0x0001 +#define TRANS_RAW_READ_NMPIPE 0x0011 +#define TRANS_QUERY_NMPIPE_STATE 0x0021 +#define TRANS_QUERY_NMPIPE_INFO 0x0022 +#define TRANS_PEEK_NMPIPE 0x0023 +#define TRANS_TRANSACT_NMPIPE 0x0026 +#define TRANS_RAW_WRITE_NMPIPE 0x0031 +#define TRANS_READ_NMPIPE 0x0036 +#define TRANS_WRITE_NMPIPE 0x0037 +#define TRANS_WAIT_NMPIPE 0x0053 +#define TRANS_CALL_NMPIPE 0x0054 + /* NT Transact subcommand codes */ #define NT_TRANSACT_CREATE 0x01 #define NT_TRANSACT_IOCTL 0x02 @@ -328,12 +341,13 @@ #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ #define CREATE_NO_EA_KNOWLEDGE 0x00000200 #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete - open for recovery flag - should - be zero */ + "open for recovery" flag - should + be zero in any case */ +#define CREATE_OPEN_FOR_RECOVERY 0x00000400 #define CREATE_RANDOM_ACCESS 0x00000800 #define CREATE_DELETE_ON_CLOSE 0x00001000 #define CREATE_OPEN_BY_ID 0x00002000 -#define CREATE_OPEN_BACKUP_INTN 0x00004000 +#define CREATE_OPEN_BACKUP_INTENT 0x00004000 #define CREATE_NO_COMPRESSION 0x00008000 #define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ #define OPEN_REPARSE_POINT 0x00200000 @@ -722,7 +736,6 @@ typedef struct smb_com_tconx_rsp_ext { #define SMB_CSC_CACHE_AUTO_REINT 0x0004 #define SMB_CSC_CACHE_VDO 0x0008 #define SMB_CSC_NO_CACHING 0x000C - #define SMB_UNIQUE_FILE_NAME 0x0010 #define SMB_EXTENDED_SIGNATURES 0x0020 @@ -806,7 +819,7 @@ typedef struct smb_com_findclose_req { #define ICOUNT_MASK 0x00FF #define PIPE_READ_MODE 0x0100 #define NAMED_PIPE_TYPE 0x0400 -#define PIPE_END_POINT 0x0800 +#define PIPE_END_POINT 0x4000 #define BLOCKING_NAMED_PIPE 0x8000 typedef struct smb_com_open_req { /* also handles create */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 9b8b4cfdf99..4511b708f0f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1728,7 +1728,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, { int rc = 0; LOCK_REQ *pSMB = NULL; - LOCK_RSP *pSMBr = NULL; +/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */ int bytes_returned; int timeout = 0; __u16 count; @@ -1739,8 +1739,6 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, if (rc) return rc; - pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */ - if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) { timeout = CIFS_ASYNC_OP; /* no response expected */ pSMB->Timeout = 0; @@ -1774,7 +1772,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, if (waitFlag) { rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned); + (struct smb_hdr *) pSMB, &bytes_returned); cifs_small_buf_release(pSMB); } else { rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB, @@ -3927,9 +3925,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, } ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); - if (ref->VersionNumber != 3) { + if (ref->VersionNumber != cpu_to_le16(3)) { cERROR(1, ("Referrals of V%d version are not supported," - "should be V3", ref->VersionNumber)); + "should be V3", le16_to_cpu(ref->VersionNumber))); rc = -EINVAL; goto parse_DFS_referrals_exit; } @@ -3977,7 +3975,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (rc) goto parse_DFS_referrals_exit; - ref += ref->Size; + ref += le16_to_cpu(ref->Size); } parse_DFS_referrals_exit: diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 023434f72c1..e8fa46c7cff 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -653,6 +653,7 @@ multi_t2_fnd: spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); /* don't exit until kthread_stop is called */ set_current_state(TASK_UNINTERRUPTIBLE); @@ -2120,6 +2121,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; } + if ((volume_info.cifs_acl) && (volume_info.dynperm)) + cERROR(1, ("mount option dynperm ignored if cifsacl " + "mount option supported")); + tcon = find_unc(sin_server.sin_addr.s_addr, volume_info.UNC, volume_info.username); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f0b5b5f3dd2..fb69c1fa85c 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -260,7 +260,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, buf, inode->i_sb, xid, &fileHandle); if (newinode) { - newinode->i_mode = mode; + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; if ((oplock & CIFS_CREATE_ACTION) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8636cec2642..0aac824371a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -546,7 +546,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 8; } - kfree(pSMBFile->search_resume_name); kfree(file->private_data); file->private_data = NULL; } else @@ -605,12 +604,6 @@ int cifs_closedir(struct inode *inode, struct file *file) else cifs_buf_release(ptmp); } - ptmp = pCFileStruct->search_resume_name; - if (ptmp) { - cFYI(1, ("closedir free resume name")); - pCFileStruct->search_resume_name = NULL; - kfree(ptmp); - } kfree(file->private_data); file->private_data = NULL; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 129dbfe4dca..2e904bd111c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - if (rc == -EREMOTE && !is_dfs_referral) { - is_dfs_referral = true; - cFYI(DBG2, ("DFS ref")); - /* for DFS, server does not give us real inode data */ - fill_fake_finddataunix(&find_data, sb); - rc = 0; - } - } + if (rc == -EREMOTE && !is_dfs_referral) { + is_dfs_referral = true; + cFYI(DBG2, ("DFS ref")); + /* for DFS, server does not give us real inode data */ + fill_fake_finddataunix(&find_data, sb); + rc = 0; + } else if (rc) + goto cgiiu_exit; + num_of_bytes = le64_to_cpu(find_data.NumOfBytes); end_of_file = le64_to_cpu(find_data.EndOfFile); @@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, *pinode = new_inode(sb); if (*pinode == NULL) { rc = -ENOMEM; - goto cgiiu_exit; + goto cgiiu_exit; } /* Is an i_ino of zero legal? */ /* note ino incremented to unique num in new_inode */ @@ -418,6 +418,7 @@ int cifs_get_inode_info(struct inode **pinode, char *buf = NULL; bool adjustTZ = false; bool is_dfs_referral = false; + umode_t default_mode; pTcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", full_path)); @@ -530,47 +531,42 @@ int cifs_get_inode_info(struct inode **pinode, inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; } - /* set default mode. will override for dirs below */ - if (atomic_read(&cifsInfo->inUse) == 0) - /* new inode, can safely set these fields */ - inode->i_mode = cifs_sb->mnt_file_mode; - else /* since we set the inode type below we need to mask off - to avoid strange results if type changes and both - get orred in */ - inode->i_mode &= ~S_IFMT; -/* if (attr & ATTR_REPARSE) */ - /* We no longer handle these as symlinks because we could not - follow them due to the absolute path with drive letter */ - if (attr & ATTR_DIRECTORY) { - /* override default perms since we do not do byte range locking - on dirs */ - inode->i_mode = cifs_sb->mnt_dir_mode; - inode->i_mode |= S_IFDIR; - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM) && - /* No need to le64 convert size of zero */ - (pfindData->EndOfFile == 0)) { - inode->i_mode = cifs_sb->mnt_file_mode; - inode->i_mode |= S_IFIFO; -/* BB Finish for SFU style symlinks and devices */ - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (cifsInfo->cifsAttrs & ATTR_SYSTEM)) { - if (decode_sfu_inode(inode, le64_to_cpu(pfindData->EndOfFile), - full_path, cifs_sb, xid)) - cFYI(1, ("Unrecognized sfu inode type")); - - cFYI(1, ("sfu mode 0%o", inode->i_mode)); + /* get default inode mode */ + if (attr & ATTR_DIRECTORY) + default_mode = cifs_sb->mnt_dir_mode; + else + default_mode = cifs_sb->mnt_file_mode; + + /* set permission bits */ + if (atomic_read(&cifsInfo->inUse) == 0 || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + inode->i_mode = default_mode; + else { + /* just reenable write bits if !ATTR_READONLY */ + if ((inode->i_mode & S_IWUGO) == 0 && + (attr & ATTR_READONLY) == 0) + inode->i_mode |= (S_IWUGO & default_mode); + inode->i_mode &= ~S_IFMT; + } + /* clear write bits if ATTR_READONLY is set */ + if (attr & ATTR_READONLY) + inode->i_mode &= ~S_IWUGO; + + /* set inode type */ + if ((attr & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { + /* no need to fix endianness on 0 */ + if (pfindData->EndOfFile == 0) + inode->i_mode |= S_IFIFO; + else if (decode_sfu_inode(inode, + le64_to_cpu(pfindData->EndOfFile), + full_path, cifs_sb, xid)) + cFYI(1, ("unknown SFU file type\n")); } else { - inode->i_mode |= S_IFREG; - /* treat dos attribute of read-only as read-only mode eg 555 */ - if (cifsInfo->cifsAttrs & ATTR_READONLY) - inode->i_mode &= ~(S_IWUGO); - else if ((inode->i_mode & S_IWUGO) == 0) - /* the ATTR_READONLY flag may have been */ - /* changed on server -- set any w bits */ - /* allowed by mnt_file_mode */ - inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); - /* BB add code to validate if device or weird share or device type? */ + if (attr & ATTR_DIRECTORY) + inode->i_mode |= S_IFDIR; + else + inode->i_mode |= S_IFREG; } spin_lock(&inode->i_lock); @@ -1019,8 +1015,11 @@ mkdir_get_info: CIFS_MOUNT_MAP_SPECIAL_CHR); } if (direntry->d_inode) { - direntry->d_inode->i_mode = mode; - direntry->d_inode->i_mode |= S_IFDIR; + if (cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_DYNPERM) + direntry->d_inode->i_mode = + (mode | S_IFDIR); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { direntry->d_inode->i_uid = @@ -1547,13 +1546,26 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) } else goto cifs_setattr_exit; } - if (attrs->ia_valid & ATTR_UID) { - cFYI(1, ("UID changed to %d", attrs->ia_uid)); - uid = attrs->ia_uid; - } - if (attrs->ia_valid & ATTR_GID) { - cFYI(1, ("GID changed to %d", attrs->ia_gid)); - gid = attrs->ia_gid; + + /* + * Without unix extensions we can't send ownership changes to the + * server, so silently ignore them. This is consistent with how + * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With + * CIFSACL support + proper Windows to Unix idmapping, we may be + * able to support this in the future. + */ + if (!pTcon->unix_ext && + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); + } else { + if (attrs->ia_valid & ATTR_UID) { + cFYI(1, ("UID changed to %d", attrs->ia_uid)); + uid = attrs->ia_uid; + } + if (attrs->ia_valid & ATTR_GID) { + cFYI(1, ("GID changed to %d", attrs->ia_gid)); + gid = attrs->ia_gid; + } } time_buf.Attributes = 0; @@ -1563,7 +1575,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid &= ~ATTR_MODE; if (attrs->ia_valid & ATTR_MODE) { - cFYI(1, ("Mode changed to 0x%x", attrs->ia_mode)); + cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); mode = attrs->ia_mode; } @@ -1578,18 +1590,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) rc = mode_to_acl(inode, full_path, mode); - else if ((mode & S_IWUGO) == 0) { -#else - if ((mode & S_IWUGO) == 0) { + else #endif - /* not writeable */ - if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) { - set_dosattr = true; - time_buf.Attributes = - cpu_to_le32(cifsInode->cifsAttrs | - ATTR_READONLY); - } - } else if (cifsInode->cifsAttrs & ATTR_READONLY) { + if (((mode & S_IWUGO) == 0) && + (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { + set_dosattr = true; + time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | + ATTR_READONLY); + /* fix up mode if we're not using dynperm */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + attrs->ia_mode = inode->i_mode & ~S_IWUGO; + } else if ((mode & S_IWUGO) && + (cifsInode->cifsAttrs & ATTR_READONLY)) { /* If file is readonly on server, we would not be able to write to it - so if any write bit is enabled for user or group or other we @@ -1600,6 +1612,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* Windows ignores set to zero */ if (time_buf.Attributes == 0) time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); + + /* reset local inode permissions to normal */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + attrs->ia_mode &= ~(S_IALLUGO); + if (S_ISDIR(inode->i_mode)) + attrs->ia_mode |= + cifs_sb->mnt_dir_mode; + else + attrs->ia_mode |= + cifs_sb->mnt_file_mode; + } + } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { + /* ignore mode change - ATTR_READONLY hasn't changed */ + attrs->ia_valid &= ~ATTR_MODE; } } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1d69b8014e0..4b17f8fe315 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -519,8 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pnotify = (struct file_notify_information *) ((char *)&pSMBr->hdr.Protocol + data_offset); cFYI(1, ("dnotify on %s Action: 0x%x", - pnotify->FileName, - pnotify->Action)); /* BB removeme BB */ + pnotify->FileName, pnotify->Action)); /* cifs_dump_mem("Rcvd notify Data: ",buf, sizeof(struct smb_hdr)+60); */ return true; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 713c2511019..83f30695488 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -132,6 +132,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, __u32 attr; __u64 allocation_size; __u64 end_of_file; + umode_t default_mode; /* save mtime and size */ local_mtime = tmp_inode->i_mtime; @@ -187,48 +188,54 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, if (atomic_read(&cifsInfo->inUse) == 0) { tmp_inode->i_uid = cifs_sb->mnt_uid; tmp_inode->i_gid = cifs_sb->mnt_gid; - /* set default mode. will override for dirs below */ - tmp_inode->i_mode = cifs_sb->mnt_file_mode; - } else { - /* mask off the type bits since it gets set - below and we do not want to get two type - bits set */ + } + + if (attr & ATTR_DIRECTORY) + default_mode = cifs_sb->mnt_dir_mode; + else + default_mode = cifs_sb->mnt_file_mode; + + /* set initial permissions */ + if ((atomic_read(&cifsInfo->inUse) == 0) || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) + tmp_inode->i_mode = default_mode; + else { + /* just reenable write bits if !ATTR_READONLY */ + if ((tmp_inode->i_mode & S_IWUGO) == 0 && + (attr & ATTR_READONLY) == 0) + tmp_inode->i_mode |= (S_IWUGO & default_mode); + tmp_inode->i_mode &= ~S_IFMT; } - if (attr & ATTR_DIRECTORY) { - *pobject_type = DT_DIR; - /* override default perms since we do not lock dirs */ - if (atomic_read(&cifsInfo->inUse) == 0) - tmp_inode->i_mode = cifs_sb->mnt_dir_mode; - tmp_inode->i_mode |= S_IFDIR; - } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) && - (attr & ATTR_SYSTEM)) { + /* clear write bits if ATTR_READONLY is set */ + if (attr & ATTR_READONLY) + tmp_inode->i_mode &= ~S_IWUGO; + + /* set inode type */ + if ((attr & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { if (end_of_file == 0) { - *pobject_type = DT_FIFO; tmp_inode->i_mode |= S_IFIFO; + *pobject_type = DT_FIFO; } else { - /* rather than get the type here, we mark the - inode as needing revalidate and get the real type - (blk vs chr vs. symlink) later ie in lookup */ - *pobject_type = DT_REG; + /* + * trying to get the type can be slow, so just call + * this a regular file for now, and mark for reval + */ tmp_inode->i_mode |= S_IFREG; + *pobject_type = DT_REG; cifsInfo->time = 0; } -/* we no longer mark these because we could not follow them */ -/* } else if (attr & ATTR_REPARSE) { - *pobject_type = DT_LNK; - tmp_inode->i_mode |= S_IFLNK; */ } else { - *pobject_type = DT_REG; - tmp_inode->i_mode |= S_IFREG; - if (attr & ATTR_READONLY) - tmp_inode->i_mode &= ~(S_IWUGO); - else if ((tmp_inode->i_mode & S_IWUGO) == 0) - /* the ATTR_READONLY flag may have been changed on */ - /* server -- set any w bits allowed by mnt_file_mode */ - tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode); - } /* could add code here - to validate if device or weird share type? */ + if (attr & ATTR_DIRECTORY) { + tmp_inode->i_mode |= S_IFDIR; + *pobject_type = DT_DIR; + } else { + tmp_inode->i_mode |= S_IFREG; + *pobject_type = DT_REG; + } + } /* can not fill in nlink here as in qpathinfo version and Unx search */ if (atomic_read(&cifsInfo->inUse) == 0) @@ -675,8 +682,6 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); } - kfree(cifsFile->search_resume_name); - cifsFile->search_resume_name = NULL; if (cifsFile->srch_inf.ntwrk_buf_start) { cFYI(1, ("freeing SMB ff cache buf on search rewind")); if (cifsFile->srch_inf.smallBuf) @@ -1043,9 +1048,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) } /* else { cifsFile->invalidHandle = true; CIFSFindClose(xid, pTcon, cifsFile->netfid); - } - kfree(cifsFile->search_resume_name); - cifsFile->search_resume_name = NULL; */ + } */ rc = find_cifs_entry(xid, pTcon, file, ¤t_entry, &num_to_fill); diff --git a/fs/dcache.c b/fs/dcache.c index 3ee588d5f58..6068c25b393 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -17,6 +17,7 @@ #include <linux/syscalls.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/slab.h> @@ -106,9 +107,10 @@ static void dentry_lru_remove(struct dentry *dentry) /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. - * Called with dcache_lock and per dentry lock held, drops both. */ static void dentry_iput(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { @@ -132,12 +134,13 @@ static void dentry_iput(struct dentry * dentry) * d_kill - kill dentry and return parent * @dentry: dentry to kill * - * Called with dcache_lock and d_lock, releases both. The dentry must - * already be unhashed and removed from the LRU. + * The dentry must already be unhashed and removed from the LRU. * * If this is the root of the dentry tree, return NULL. */ static struct dentry *d_kill(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) { struct dentry *parent; @@ -383,11 +386,11 @@ restart: * Try to prune ancestors as well. This is necessary to prevent * quadratic behavior of shrink_dcache_parent(), but is also expected * to be beneficial in reducing dentry cache fragmentation. - * - * Called with dcache_lock, drops it and then regains. - * Called with dentry->d_lock held, drops it. */ static void prune_one_dentry(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dcache_lock) + __acquires(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry); @@ -1604,10 +1607,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2) * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... - * - * On return, dcache_lock will have been unlocked. */ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) + __releases(dcache_lock) { struct mutex *m1 = NULL, *m2 = NULL; struct dentry *ret; @@ -1743,11 +1745,9 @@ out_nolock: shouldnt_be_hashed: spin_unlock(&dcache_lock); BUG(); - goto shouldnt_be_hashed; } -static int prepend(char **buffer, int *buflen, const char *str, - int namelen) +static int prepend(char **buffer, int *buflen, const char *str, int namelen) { *buflen -= namelen; if (*buflen < 0) @@ -1757,8 +1757,13 @@ static int prepend(char **buffer, int *buflen, const char *str, return 0; } +static int prepend_name(char **buffer, int *buflen, struct qstr *name) +{ + return prepend(buffer, buflen, name->name, name->len); +} + /** - * d_path - return the path of a dentry + * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: buffer to return value in @@ -1779,9 +1784,10 @@ char *__d_path(const struct path *path, struct path *root, { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; - char * end = buffer+buflen; - char * retval; + char *end = buffer + buflen; + char *retval; + spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); if (!IS_ROOT(dentry) && d_unhashed(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) @@ -1800,38 +1806,37 @@ char *__d_path(const struct path *path, struct path *root, break; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - spin_lock(&vfsmount_lock); if (vfsmnt->mnt_parent == vfsmnt) { - spin_unlock(&vfsmount_lock); goto global_root; } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; - spin_unlock(&vfsmount_lock); continue; } parent = dentry->d_parent; prefetch(parent); - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; retval = end; dentry = parent; } +out: + spin_unlock(&vfsmount_lock); return retval; global_root: retval += 1; /* hit the slash */ - if (prepend(&retval, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) + if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) goto Elong; root->mnt = vfsmnt; root->dentry = dentry; - return retval; + goto out; + Elong: - return ERR_PTR(-ENAMETOOLONG); + retval = ERR_PTR(-ENAMETOOLONG); + goto out; } /** @@ -1845,9 +1850,9 @@ Elong: * * Returns the buffer or an error code if the path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. */ -char *d_path(struct path *path, char *buf, int buflen) +char *d_path(const struct path *path, char *buf, int buflen) { char *res; struct path root; @@ -1915,16 +1920,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) retval = end-1; *retval = '/'; - for (;;) { - struct dentry *parent; - if (IS_ROOT(dentry)) - break; + while (!IS_ROOT(dentry)) { + struct dentry *parent = dentry->d_parent; - parent = dentry->d_parent; prefetch(parent); - - if ((prepend(&end, &buflen, dentry->d_name.name, - dentry->d_name.len) != 0) || + if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; @@ -1975,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) error = -ENOENT; /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { + if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebbcf38fd33..f976f303c19 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -15,6 +15,7 @@ #include <linux/poll.h> #include <linux/signal.h> #include <linux/spinlock.h> +#include <linux/smp_lock.h> #include <linux/dlm.h> #include <linux/dlm_device.h> @@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file) struct dlm_user_proc *proc; struct dlm_ls *ls; + lock_kernel(); ls = dlm_find_lockspace_device(iminor(inode)); - if (!ls) + if (!ls) { + unlock_kernel(); return -ENOENT; + } proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); if (!proc) { dlm_put_lockspace(ls); + unlock_kernel(); return -ENOMEM; } @@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file) spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); file->private_data = proc; + unlock_kernel(); return 0; } @@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait) static int ctl_device_open(struct inode *inode, struct file *file) { + cycle_kernel_lock(); file->private_data = NULL; return 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index cd62d75b2cc..e2832bc7869 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1906,9 +1906,9 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, goto out; } } - mutex_unlock(&key_tfm_list_mutex); (*tfm) = key_tfm->key_tfm; (*tfm_mutex) = &key_tfm->key_tfm_mutex; out: + mutex_unlock(&key_tfm_list_mutex); return rc; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 951ee33a022..c15c25745e0 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -660,8 +660,6 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, char *sig); -int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start, - int num_zeros); int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size); int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2258b8f654a..24749bf0668 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/smp_lock.h> #include "ecryptfs_kernel.h" /** @@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) int rc = 0; struct file *lower_file = NULL; + lock_kernel(); lower_file = ecryptfs_file_to_lower(file); if (lower_file->f_op && lower_file->f_op->fasync) rc = lower_file->f_op->fasync(fd, lower_file, flag); + unlock_kernel(); return rc; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 50c994a249a..09a4522f65e 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void) int rc; atomic_set(&ecryptfs_num_miscdev_opens, 0); - mutex_lock(&ecryptfs_daemon_hash_mux); rc = misc_register(&ecryptfs_miscdev); if (rc) printk(KERN_ERR "%s: Failed to register miscellaneous device " "for communications with userspace daemons; rc = [%d]\n", __func__, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index ebf55150be5..75c2ea9fee3 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -157,20 +157,6 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - if (start_offset_in_page) { - /* Read in the page from the lower - * into the eCryptfs inode page cache, - * decrypting */ - rc = ecryptfs_decrypt_page(ecryptfs_page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", - __func__, rc); - ClearPageUptodate(ecryptfs_page); - page_cache_release(ecryptfs_page); - goto out; - } - } ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); /* @@ -349,14 +335,6 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, ecryptfs_page_idx, rc); goto out; } - rc = ecryptfs_decrypt_page(ecryptfs_page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", __func__, rc); - ClearPageUptodate(ecryptfs_page); - page_cache_release(ecryptfs_page); - goto out; - } ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); memcpy((data + data_offset), ((char *)ecryptfs_page_virt + start_offset_in_page), diff --git a/fs/exec.c b/fs/exec.c index 3c2ba7ce11d..fd9234379e8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,7 +26,6 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mman.h> -#include <linux/a.out.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/smp_lock.h> @@ -61,6 +60,11 @@ #include <linux/kmod.h> #endif +#ifdef __alpha__ +/* for /sbin/loader handling in search_binary_handler() */ +#include <linux/a.out.h> +#endif + int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; int suid_dumpable = 0; @@ -606,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->exec -= stack_shift; down_write(&mm->mmap_sem); - vm_flags = vma->vm_flags; + vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for @@ -860,6 +864,7 @@ static int de_thread(struct task_struct *tsk) no_thread_group: exit_itimers(sig); + flush_itimer_signals(); if (leader) release_task(leader); @@ -1154,7 +1159,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) { int try,retval; struct linux_binfmt *fmt; -#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT) +#ifdef __alpha__ /* handle /sbin/loader.. */ { struct exec * eh = (struct exec *) bprm->buf; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 28cfd0b4052..77278e947e9 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -580,7 +580,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count % + EXT3_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fe3119a71ad..2845425077e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT3_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 30494c5da84..495ab21b983 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -43,6 +43,46 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } +static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + if (actual_group == block_group) + return 1; + return 0; +} + +static int ext4_group_used_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + ext4_fsblk_t tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* block bitmap, inode bitmap, and inode table blocks */ + int used_blocks = sbi->s_itb_per_group + 2; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + struct ext4_group_desc *gdp; + struct buffer_head *bh; + + gdp = ext4_get_group_desc(sb, block_group, &bh); + if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), + block_group)) + used_blocks--; + + if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), + block_group)) + used_blocks--; + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!ext4_block_in_group(sb, tmp, block_group)) + used_blocks -= 1; + } + } + return used_blocks; +} /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, @@ -81,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; + bit_max += ext4_bg_num_gdb(sb, block_group); } if (block_group == sbi->s_groups_count - 1) { @@ -105,20 +140,34 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, free_blocks = group_blocks - bit_max; if (bh) { - ext4_fsblk_t start; + ext4_fsblk_t start, tmp; + int flex_bg = 0; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); - /* Set bits for block and inode bitmaps, and inode table */ - ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data); - ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data); - for (bit = (ext4_inode_table(sb, gdp) - start), - bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || + ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + } /* * Also if the number of blocks within the group is * less than the blocksize * 8 ( which is the size @@ -126,8 +175,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - - return free_blocks - sbi->s_itb_per_group - 2; + return free_blocks - ext4_group_used_meta_blocks(sb, block_group); } @@ -242,7 +290,7 @@ err_out: return 0; } /** - * read_block_bitmap() + * ext4_read_block_bitmap() * @sb: super block * @block_group: given block group * @@ -252,7 +300,7 @@ err_out: * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * -read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc * desc; struct buffer_head * bh = NULL; @@ -356,8 +404,7 @@ restart: prev = rsv; } printk("Window map complete.\n"); - if (bad) - BUG(); + BUG_ON(bad); } #define rsv_window_dump(root, verbose) \ __rsv_window_dump((root), (verbose), __func__) @@ -641,7 +688,7 @@ do_more: count -= overflow; } brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; desc = ext4_get_group_desc (sb, block_group, &gd_bh); @@ -757,6 +804,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_journal_dirty_metadata(handle, bitmap_bh); @@ -1545,23 +1599,35 @@ out: /** * ext4_has_free_blocks() - * @sbi: in-core super block structure. + * @sbi: in-core super block structure. + * @nblocks: number of neeed blocks * - * Check if filesystem has at least 1 free block available for allocation. + * Check if filesystem has free blocks available for allocation. + * Return the number of blocks avaible for allocation for this request + * On success, return nblocks */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi) +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks) { - ext4_fsblk_t free_blocks, root_blocks; + ext4_fsblk_t free_blocks; + ext4_fsblk_t root_blocks = 0; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = ext4_r_blocks_count(sbi->s_es); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + + if (!capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { - return 0; - } - return 1; -} + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); +#ifdef CONFIG_SMP + if (free_blocks - root_blocks < FBC_BATCH) + free_blocks = + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); +#endif + if (free_blocks - root_blocks < nblocks) + return free_blocks - root_blocks; + return nblocks; + } + /** * ext4_should_retry_alloc() @@ -1577,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1586,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) } /** - * ext4_new_blocks_old() -- core block(s) allocation function + * ext4_old_new_blocks() -- core block bitmap based block allocation function + * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: target number of blocks to allocate * @errp: error code * - * ext4_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. + * ext4_old_new_blocks uses a goal block to assist allocation and look up + * the block bitmap directly to do block allocation. It tries to + * allocate block(s) from the block group contains the goal block first. If + * that fails, it will try to allocate block(s) from other block groups + * without any specific goal block. + * + * This function is called when -o nomballoc mount option is enabled * */ -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { struct buffer_head *bitmap_bh = NULL; @@ -1623,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, ext4_group_t ngroups; unsigned long num = *count; - *errp = -ENOSPC; sb = inode->i_sb; if (!sb) { + *errp = -ENODEV; printk("ext4_new_block: nonexistent device"); return 0; } + sbi = EXT4_SB(sb); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + *count = ext4_has_free_blocks(sbi, *count); + } + if (*count == 0) { + *errp = -ENOSPC; + return 0; /*return with ENOSPC error */ + } + num = *count; + /* * Check quota for allocation of this block. */ @@ -1653,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; - if (!ext4_has_free_blocks(sbi)) { - *errp = -ENOSPC; - goto out; - } - /* * First, test whether the goal block is free. */ @@ -1681,7 +1759,7 @@ retry_alloc: my_rsv = NULL; if (free_blocks > 0) { - bitmap_bh = read_block_bitmap(sb, group_no); + bitmap_bh = ext4_read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, @@ -1717,7 +1795,7 @@ retry_alloc: continue; brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, group_no); + bitmap_bh = ext4_read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; /* @@ -1829,7 +1907,15 @@ allocated: le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) + percpu_counter_sub(&sbi->s_freeblocks_counter, num); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group_no); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= num; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); err = ext4_journal_dirty_metadata(handle, gdp_bh); @@ -1862,46 +1948,104 @@ out: return 0; } -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) +#define EXT4_META_BLOCK 0x1 + +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp, int flags) { struct ext4_allocation_request ar; ext4_fsblk_t ret; if (!test_opt(inode->i_sb, MBALLOC)) { - unsigned long count = 1; - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); - return ret; + return ext4_old_new_blocks(handle, inode, goal, count, errp); } memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; ar.goal = goal; - ar.len = 1; + ar.len = *count; + ar.logical = iblock; + + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) + /* enable in-core preallocation for data block allocation */ + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + ret = ext4_mb_new_blocks(handle, &ar, errp); + *count = ar.len; return ret; } -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { - struct ext4_allocation_request ar; ext4_fsblk_t ret; - - if (!test_opt(inode->i_sb, MBALLOC)) { - ret = ext4_new_blocks_old(handle, inode, goal, count, errp); - return ret; + ret = do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); + /* + * Account for the allocated meta blocks + */ + if (!(*errp)) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += *count; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } - - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; return ret; } +/* + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @errp: error code + * + * Return allocated block number on success + */ +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + return ext4_new_meta_blocks(handle, inode, goal, &count, errp); +} + +/* + * ext4_new_blocks() -- allocate data blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ + +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp) +{ + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); +} /** * ext4_count_free_blocks() -- count filesystem free blocks @@ -1933,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; desc_count += le16_to_cpu(gdp->bg_free_blocks_count); brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); + bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2bf0331ea19..d3d23d73c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, + 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); @@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8158083f7ac..303e41cf7b1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -22,7 +22,7 @@ #include "ext4_i.h" /* - * The second extended filesystem constants/structures + * The fourth extended filesystem constants/structures */ /* @@ -45,7 +45,7 @@ #define ext4_debug(f, a...) \ do { \ printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __FUNCTION__); \ + __FILE__, __LINE__, __func__); \ printk (KERN_DEBUG f, ## a); \ } while (0) #else @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -170,6 +173,15 @@ struct ext4_group_desc __u32 bg_reserved2[3]; }; +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ @@ -527,6 +539,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -647,7 +660,10 @@ struct ext4_super_block { __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u32 s_reserved[163]; /* Padding to the end of the block */ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, int *errp); -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks); extern void ext4_free_blocks (handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, @@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); /* inode.c */ +void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode (struct inode *, int); extern int ext4_setattr (struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); extern void ext4_delete_inode (struct inode *); extern int ext4_sync_inode (handle_t *, struct inode *); extern void ext4_discard_reservation (struct inode *); extern void ext4_dirty_inode(struct inode *); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate (struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, +extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, } +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __FUNCTION__, (errno)); \ + __ext4_std_error((sb), __func__, (errno)); \ } while (0) /* @@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create, int extend_disksize); -extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, @@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 75333b595fa..6c166c0a54b 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 26a4ae255d7..ef7409f0e7e 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -79,7 +79,7 @@ struct ext4_ext_cache { }; /* - * third extended file system inode data in memory + * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ @@ -150,6 +150,7 @@ struct ext4_inode_info { */ struct rw_semaphore i_data_sem; struct inode vfs_inode; + struct jbd2_inode jinode; unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; @@ -162,6 +163,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9255a7d28b2..eb8bc3afe6e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where, handle_t *handle, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, (handle), (bh)) #define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, (handle), (bh)) #define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) + __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__FUNCTION__, (handle), (bh)) - -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + __ext4_journal_forget(__func__, (handle), (bh)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); @@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__FUNCTION__, (handle)) + __ext4_journal_stop(__func__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { @@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) return jbd2_journal_force_commit(journal); } +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 5802e69f219..6300226d553 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -25,7 +25,7 @@ #include <linux/rbtree.h> /* - * third extended-fs super-block data in memory + * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ @@ -143,6 +143,9 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; }; #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3da..42c4c0c892e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; if (handle->h_buffer_credits > needed) - return handle; - if (!ext4_journal_extend(handle, needed)) - return handle; - err = ext4_journal_restart(handle, needed); - - return handle; + return 0; + err = ext4_journal_extend(handle, needed); + if (err) + return err; + return ext4_journal_restart(handle, needed); } /* @@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, return bg_start + colour + block; } +/* + * Allocation for a meta data block + */ static ext4_fsblk_t -ext4_ext_new_block(handle_t *handle, struct inode *inode, +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex, int *err) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_block(handle, inode, goal, err); + newblock = ext4_new_meta_block(handle, inode, goal, err); return newblock; } @@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, alloc = 1; } path[0].p_hdr = eh; + path[0].p_bh = NULL; i = depth; /* walk through the tree */ @@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } path[ppos].p_depth = i; - path[ppos].p_hdr = eh; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); @@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); if (newblock == 0) return err; @@ -981,6 +1017,8 @@ repeat: /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, path, newext, i); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); @@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_journal_restart(handle, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret; unsigned long allocated = 0; struct ext4_allocation_request ar; + loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (allocated > max_blocks) allocated = max_blocks; - /* mark the buffer unwritten */ - __set_bit(BH_Unwritten, &bh_result->b_state); + set_buffer_unwritten(bh_result); goto out2; } @@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - __set_bit(BH_New, &bh_result->b_state); + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } + + set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ if (create != EXT4_CREATE_UNINITIALIZED_EXT) @@ -2733,7 +2774,7 @@ out: if (allocated > max_blocks) allocated = max_blocks; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); + set_buffer_mapped(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; out2: @@ -2744,7 +2785,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; @@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) */ err = ext4_writepage_trans_blocks(inode) + 3; handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; - } - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); @@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) * Probably we need not scan at all, * because page truncation is enough. */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; @@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) handle->h_sync = 1; out_stop: + up_write(&EXT4_I(inode)->i_data_sem); /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. @@ -2808,7 +2844,6 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - up_write(&EXT4_I(inode)->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); @@ -2911,7 +2946,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4159be6366a..430eb7978db 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -123,6 +123,23 @@ force_commit: return ret; } +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext4_file_mmap, .open = generic_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, + .getattr = ext4_getattr, #ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1c8ba48d4f8..a45c3737ad3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/jbd2.h> +#include <linux/blkdev.h> #include "ext4.h" #include "ext4_jbd2.h" @@ -45,6 +46,7 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); } out: return ret; diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 7eb0604e7ee..c2c0a8d06d0 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c6efbab0c80..a92eb305344 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) struct ext4_super_block * es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk ("ext4_free_inode: inode has count=%d\n", @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (best_flex < 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. * @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -600,7 +689,7 @@ got: /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); BUFFER_TRACE(block_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bh); @@ -676,6 +765,13 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; @@ -740,14 +836,10 @@ got: goto fail_free_drop; if (test_opt(sb, EXTENTS)) { - /* set extent flag only for diretory, file and normal symlink*/ + /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail_free_drop; } } @@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -838,6 +938,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d970774641..8ca2763df09 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -32,12 +32,23 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode) { handle_t *handle; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: @@ -584,8 +631,9 @@ failed_out: * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -934,7 +989,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_release_space(inode, retval, 0); + } + up_write((&EXT4_I(inode)->i_data_sem)); return retval; } @@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1225,8 +1302,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1236,15 +1313,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { /* @@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; } +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; +}; + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with __mpage_writepage() + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct mpage_data mpd_pp = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = mpd->get_block, + .use_writepage = 1, + }; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = __mpage_writepage(page, mpd->wbc, &mpd_pp); + + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + if (mpd_pp.bio) + mpage_bio_submit(WRITE, mpd_pp.bio); + + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + unsigned long index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + * The function ignores errors ->get_block() returns, thus real + * error handling is postponed to __mpage_writepage() + */ +static void mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + struct buffer_head *lbh = &mpd->lbh; + int err = 0, remain = lbh->b_size; + sector_t next = lbh->b_blocknr; + struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return; + + while (remain) { + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = remain; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + /* + * Rather than implement own error handling + * here, we just leave remaining blocks + * unallocated and try again with ->writepage() + */ + break; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); + + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += bh->b_size; + return; + } + + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_blocks(mpd); + + /* + * Now start a new extent + */ + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + lbh->b_blocknr = logical; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using __mpage_writepage() + */ + if (mpd->next_page != mpd->first_page) { + mpage_da_map_blocks(mpd); + mpage_da_submit_io(mpd); + } + + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; + + /* + * ... and blocks + */ + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + } + + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + /* + * There is no attached buffer heads yet (mmap?) + * we treat the page asfull of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + if (buffer_dirty(bh)) + mpage_add_bh_to_extent(mpd, logical, bh); + logical++; + } while ((bh = bh->b_this_page) != head); + } + + return 0; +} + +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocates non-allocated blocks, maps newly-allocated + * blocks to existing bhs and issue IO them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * In order to avoid duplication of logic that deals with partial pages, + * multiple bio per page, etc, we find non-allocated blocks, allocate + * them with minimal calls to ->get_block() and re-use __mpage_writepage() + * + * It's important that we call __mpage_writepage() only once for each + * involved page, otherwise we'd have to implement more complicated logic + * to deal with pages w/o PG_lock or w/ PG_writeback and so on. + * + * See comments to mpage_writepages() + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + get_block_t get_block) +{ + struct mpage_da_data mpd; + int ret; + + if (!get_block) + return generic_writepages(mapping, wbc); + + mpd.wbc = wbc; + mpd.inode = mapping->host; + mpd.lbh.b_size = 0; + mpd.lbh.b_state = 0; + mpd.lbh.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.get_block = get_block; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + + /* + * Handle last extent of pages + */ + if (mpd.next_page != mpd.first_page) { + mpage_da_map_blocks(&mpd); + mpage_da_submit_io(&mpd); + } + + return ret; +} + +/* + * this is a special callback for ->write_begin() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + if (!handle) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + BUG_ON(!ret); + } else { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + } + + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * get called vi ext4_da_writepages after taking page lock (have journal handle) + * get called via journal_submit_inode_data_buffers (no journal handle) + * get called via shrink_page_list via pdflush (no journal handle) + * or grab_page_cache when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation + * So redirty the page and return + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have mapping block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here + * so just redity the page and unlock + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * For now just follow the DIO way to estimate the max credits + * needed to write out EXT4_MAX_WRITEBACK_PAGES. + * todo: need to calculate the max credits need for + * extent based files, currently the DIO credits is based on + * indirect-blocks mapping way. + * + * Probably should have a generic way to calculate credits + * for DIO, writepages, and truncate + */ +#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS +#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int needed_blocks; + int ret = 0; + long to_write; + loff_t range_start = 0; + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages) + return 0; + + /* + * Estimate the worse case needed credits to write out + * EXT4_MAX_BUF_BLOCKS pages + */ + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + + to_write = wbc->nr_to_write; + if (!wbc->range_cyclic) { + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + range_start = wbc->range_start; + } + + while (!ret && to_write) { + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + if (ext4_should_order_data(inode)) { + /* + * With ordered mode we need to add + * the inode to the journal handle + * when we do block allocation. + */ + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out_writepages; + } + + } + /* + * set the max dirty pages could be write at a time + * to fit into the reserved transaction credits + */ + if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) + wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; + + to_write -= wbc->nr_to_write; + ret = mpage_da_writepages(mapping, wbc, + ext4_da_get_block_write); + ext4_journal_stop(handle); + if (wbc->nr_to_write) { + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + to_write += wbc->nr_to_write; + break; + } + wbc->nr_to_write = to_write; + } + +out_writepages: + wbc->nr_to_write = to_write; + if (range_start) + wbc->range_start = range_start; + return ret; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i=0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied -1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + /* * bmap() is special. It gets used by applications such as lilo and by @@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of @@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) -{ - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; -} - /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. * - * In all journalling modes block_write_full_page() will start the I/O. + * In all journaling modes block_write_full_page() will start the I/O. * * Problem: * @@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) * disastrous. Any write() or metadata operation will sync the fs for * us. * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ -static int ext4_ordered_writepage(struct page *page, +static int __ext4_normal_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - J_ASSERT(PageLocked(page)); - - /* - * We give up here if we're reentered, because it might be for a - * different filesystem. - */ - if (ext4_journal_current_handle()) - goto out_fail; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); + else + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); +} - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; - } +static int ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - - ret = block_write_full_page(page, ext4_get_block, wbc); - /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. - */ + if (!ext4_journal_current_handle()) + return __ext4_normal_writepage(page, wbc); - /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. - */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - if (!ret) - ret = err; - } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; - -out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); - return ret; + return 0; } -static int ext4_writeback_writepage(struct page *page, +static int __ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; int err; - if (ext4_journal_current_handle()) - goto out_fail; + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_fail; + goto out; } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); - else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - return ret; -out_fail: - redirty_page_for_writepage(wbc, page); + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: unlock_page(page); +out: return ret; } @@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + loff_t size = i_size_read(inode); + loff_t len; - if (ext4_journal_current_handle()) - goto no_write; + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + if (ext4_journal_current_handle()) goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { + if (PageChecked(page)) { /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); + return __ext4_journalled_writepage(page, wbc); } else { /* * It may be a page full of checkpoint-mode buffers. We don't * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; - no_write: redirty_page_for_writepage(wbc, page); -out_unlock: unlock_page(page); - goto out; + return 0; } static int ext4_readpage(struct file *file, struct page *page) @@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, + .writepage = ext4_normal_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = { .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) + if (ext4_should_order_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else @@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, struct page *page, +int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, err = ext4_journal_dirty_metadata(handle, bh); } else { if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } @@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext4_journal_dirty_metadata(handle, this_bh); + else + ext4_error(inode->i_sb, __func__, + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) this_bh->b_blocknr); } } @@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } +int ext4_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + /* * ext4_truncate() * @@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode) int n; ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext4_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!ext4_can_truncate(inode)) return; - /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + ext4_ext_truncate(inode); return; } handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) return; /* AKPM: return what? */ - } last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) @@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode) goto out_stop; /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the @@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode) */ ei->i_disksize = inode->i_size; - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait) * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { @@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!error) error = rc; ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } } rc = inode_setattr(inode, attr); @@ -3193,6 +4231,32 @@ err_out: return error; } +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} /* * How many blocks doth make a writepage()? @@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, NULL); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 873ad9b3418..8d141a25bbe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) static inline int mb_find_next_zero_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_zero_bit(addr, max, start) - fix; + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_bit(addr, max, start) - fix; + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) @@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (!buffer_uptodate(bh[i])) goto out; + err = 0; first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { int group; @@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, int pnum; int poff; struct page *page; + int ret; mb_debug("load group %lu\n", group); @@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; goto err; + } e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); mark_page_accessed(page); @@ -962,7 +982,7 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } static void ext4_mb_release_desc(struct ext4_buddy *e4b) @@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) } } -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int block = 0; @@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - + ext4_unlock_group(sb, e4b->bd_group); ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); + ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, } while (1); } mb_check_buddy(e4b); - - return 0; } static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, @@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } - - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; @@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) int rc; int size; + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; @@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb) sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); /* if we can't allocate history, then we simple won't use it */ } @@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) #define ext4_mb_history_init(sb) #endif + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Add a group to the existing groups. + * This function is used for online resize + */ +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + int err; + + /* Add group based on group descriptor*/ + err = ext4_mb_add_groupinfo(sb, group, desc); + if (err) + return err; + + /* + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap + * datas) are set not up to date so that they will be re-initilaized + * during the next call to ext4_mb_load_buddy + */ + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + return 0; +} + +/* + * Update an existing group. + * This function is used for online resize + */ +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) +{ + grp->bb_free += add; +} + static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t i; - int j, len, metalen; + int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); - int num_meta_group_infos = - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; struct ext4_group_info **meta_group_info; + struct ext4_group_desc *desc; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. + * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * - num_meta_group_infos, GFP_KERNEL); + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - /* - * calculate needed size. if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - goto err_freebuddy; - } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); - i++; goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; @@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned i; unsigned offset; unsigned max; + int ret; if (!test_opt(sb, MBALLOC)) return 0; @@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } while (i <= sb->s_blocksize_bits + 1); /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { + ret = ext4_mb_init_backend(sb); + if (ret != 0) { clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - return i; + return ret; } spin_lock_init(&sbi->s_md_lock); @@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) ext4_lock_group(sb, md->group); for (i = 0; i < md->num; i++) { mb_debug(" %u", md->blocks[i]); - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); - BUG_ON(err != 0); + mb_free_blocks(NULL, &e4b, md->blocks[i], 1); } mb_debug("\n"); ext4_unlock_group(sb, md->group); @@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb) -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ +#define MB_PROC_FOPS(name) \ +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ { \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ -} - -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ + struct ext4_sb_info *sbi = m->private; \ + \ + seq_printf(m, "%ld\n", sbi->s_mb_##name); \ + return 0; \ +} \ + \ +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ +{ \ + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ +} \ + \ +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ + const char __user *buf, size_t cnt, loff_t *ppos) \ { \ - struct ext4_sb_info *sbi = data; \ + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ char str[32]; \ long value; \ if (cnt >= sizeof(str)) \ @@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \ return -ERANGE; \ sbi->s_mb_##name = value; \ return cnt; \ -} +} \ + \ +static const struct file_operations ext4_mb_##name##_proc_fops = { \ + .owner = THIS_MODULE, \ + .open = ext4_mb_##name##_proc_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = ext4_mb_##name##_proc_write, \ +}; -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); +MB_PROC_FOPS(stats); +MB_PROC_FOPS(max_to_scan); +MB_PROC_FOPS(min_to_scan); +MB_PROC_FOPS(order2_reqs); +MB_PROC_FOPS(stream_request); +MB_PROC_FOPS(group_prealloc); #define MB_PROC_HANDLER(name, var) \ do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ + proc = proc_create_data(name, mode, sbi->s_mb_proc, \ + &ext4_mb_##var##_proc_fops, sbi); \ if (proc == NULL) { \ printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ goto err_out; \ } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ } while (0) static int ext4_mb_init_per_dev_proc(struct super_block *sb) @@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) struct proc_dir_entry *proc; char devname[64]; + if (proc_root_ext4 == NULL) { + sbi->s_mb_proc = NULL; + return -EINVAL; + } bdevname(sb->s_bdev, devname); sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); @@ -2745,11 +2893,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sbi = EXT4_SB(sb); es = sbi->s_es; - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; @@ -2762,6 +2908,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; + ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + gdp->bg_free_blocks_count); + err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; @@ -2815,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + + /* + * free blocks account has already be reduced/reserved + * at write_begin() time for delayed allocation + * do not double accounting + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + percpu_counter_sub(&sbi->s_freeblocks_counter, + ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } err = ext4_journal_dirty_metadata(handle, bitmap_bh); if (err) @@ -3094,8 +3259,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { - unsigned len = ac->ac_o_ex.fe_len; - + unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); @@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - if (next > end) - next = end; start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + le32_to_cpu(sbi->s_es->s_first_data_block); mb_debug(" free preallocated %u/%u in group %u\n", @@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -3743,7 +3905,7 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); BUG_ON(err != 0); /* error handling here */ - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { /* error handling here */ ext4_mb_release_desc(&e4b); @@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sbi = EXT4_SB(sb); if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, + block = ext4_old_new_blocks(handle, ar->inode, ar->goal, &(ar->len), errp); return block; } + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + ar->len = ext4_has_free_blocks(sbi, ar->len); + } + + if (ar->len == 0) { + *errp = -ENOSPC; + return 0; + } while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; @@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out1; } ext4_mb_poll_new_transaction(sb, handle); @@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out; + goto out2; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: @@ -4085,11 +4261,12 @@ repeat: ext4_mb_release_context(ac); -out: +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); - kmem_cache_free(ext4_ac_cachep, ac); return block; } static void ext4_mb_poll_new_transaction(struct super_block *sb, @@ -4242,7 +4419,7 @@ do_more: overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; gdp = ext4_get_group_desc(sb, block_group, &gd_bh); @@ -4309,10 +4486,9 @@ do_more: ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); } else { ext4_lock_group(sb, block_group); - err = mb_free_blocks(inode, &e4b, bit, count); + mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); - BUG_ON(err != 0); } spin_lock(sb_bgl_lock(sbi, block_group)); @@ -4321,6 +4497,13 @@ do_more: spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + ext4_mb_release_desc(&e4b); *freed += count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ab16beaa830..387ad98350c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len)); +} + +/* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ @@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* @@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, /* - * p is at least 6 bytes before the end of page - */ -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) -{ - return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); -} - -/* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. @@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) - if (ext4_match (namelen, name, de)) { - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); + for (; de < top; de = ext4_next_entry(de)) { + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext4_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } brelse (bh); /* Check to see if we should continue to search */ diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9f086a6a472..f000fbe2cd9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -563,7 +563,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ @@ -854,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) */ /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)primary->b_data + gdb_off; + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ @@ -864,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + if (test_opt(sb, MBALLOC)) { + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + } + /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. @@ -955,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle_t *handle; int err; unsigned long freed_blocks; + ext4_group_t group; + struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -986,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } /* Handle the remaining blocks in the last group only. */ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, __func__, @@ -1058,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; + + /* + * Mark mballoc pages as not up to date so that they will be updated + * next time they are loaded by ext4_mb_load_buddy. + */ + if (test_opt(sb, MBALLOC)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + int blocks_per_page; + int block; + int pnum; + struct page *page; + + /* Set buddy page as not up to date */ + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + block = group * 2; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Set bitmap page as not up to date */ + block++; + pnum = block / blocks_per_page; + page = find_get_page(inode->i_mapping, pnum); + if (page != NULL) { + ClearPageUptodate(page); + page_cache_release(page); + } + + /* Get the info on the last group */ + grp = ext4_get_group_info(sb, group); + + /* Update free blocks in group info */ + ext4_mb_update_group_info(grp, add); + } + if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 09d9359c805..1cb371dcd60 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb) ext4_ext_release(sb); ext4_xattr_put_super(sb); jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } @@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) @@ -729,8 +739,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); + /* + * We're changing the default of barrier mount option, so + * let's always display its mount state so it's clear what its + * status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); if (!test_opt(sb, EXTENTS)) @@ -739,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); @@ -886,7 +906,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, }; static match_table_t tokens = { @@ -945,6 +965,8 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, {Opt_err, NULL}, }; @@ -982,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif + ext4_fsblk_t last_block; if (!options) return 1; @@ -1301,15 +1324,39 @@ set_qf_format: clear_opt(sbi->s_mount_opt, NOBH); break; case Opt_extents: + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_warning(sb, __func__, + "extents feature not enabled " + "on this filesystem, use tune2fs\n"); + return 0; + } set_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } clear_opt (sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; break; + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); + break; case Opt_mballoc: set_opt(sbi->s_mount_opt, MBALLOC); break; @@ -1323,6 +1370,9 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1435,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, return res; } +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + int groups_per_flex = 0; + __u64 block_bitmap = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / + groups_per_flex; + sbi->s_flex_groups = kmalloc(flex_group_count * + sizeof(struct flex_groups), GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory\n"); + goto failed; + } + memset(sbi->s_flex_groups, 0, flex_group_count * + sizeof(struct flex_groups)); + + gdp = ext4_get_group_desc(sb, 1, &bh); + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; +} + __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, struct ext4_group_desc *gdp) { @@ -1802,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) } static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) + __releases(kernel_lock) + __acquires(kernel_lock) { struct buffer_head * bh; @@ -1843,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto out_fail; } - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); - goto out_fail; - } - /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. @@ -1907,18 +2000,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_resgid = le16_to_cpu(es->s_def_resgid); set_opt(sbi->s_mount_opt, RESERVATION); + set_opt(sbi->s_mount_opt, BARRIER); /* * turn on extents feature by default in ext4 filesystem - * User -o noextents to turn it off + * only if feature flag already set by mkfs or tune2fs. + * Use -o noextents to turn it off */ - set_opt(sbi->s_mount_opt, EXTENTS); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + set_opt(sbi->s_mount_opt, EXTENTS); + else + ext4_warning(sb, __func__, + "extents feature not enabled on this filesystem, " + "use tune2fs.\n"); /* - * turn on mballoc feature by default in ext4 filesystem - * User -o nomballoc to turn it off + * turn on mballoc code by default in ext4 filesystem + * Use -o nomballoc to turn it off */ set_opt(sbi->s_mount_opt, MBALLOC); + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + set_opt(sbi->s_mount_opt, DELALLOC); + + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, NULL, 0)) goto failed_mount; @@ -2129,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -2189,6 +2304,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; + if (!(sb->s_flags & MS_RDONLY) && + EXT4_SB(sb)->s_journal->j_failed_commit) { + printk(KERN_CRIT "EXT4-fs error (device %s): " + "ext4_fill_super: Journal transaction " + "%u is corrupt\n", sb->s_id, + EXT4_SB(sb)->s_journal->j_failed_commit); + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT + "Mounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + } + if (test_opt(sb, ERRORS_PANIC)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 1); + printk(KERN_CRIT + "EXT4-fs (device %s): mount failed\n", + sb->s_id); + goto failed_mount4; + } + } } else if (journal_inum) { if (ext4_create_journal(sb, es, journal_inum)) goto failed_mount3; @@ -2326,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_ext_init(sb); ext4_mb_init(sb, needs_recovery); @@ -2340,6 +2485,7 @@ cantfind_ext4: failed_mount4: jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -3293,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_journal_dirty_metadata(handle, bh); else { /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); + err = ext4_jbd2_file_inode(handle, inode); mark_buffer_dirty(bh); } brelse(bh); @@ -3305,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) { i_size_write(inode, off+len-towrite); EXT4_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff08633f398..93c5fdcdad2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -810,7 +810,7 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_block(handle, inode, + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, goal, &error); if (error) goto cleanup; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index fff33382cad..ac1a52cf2a3 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -13,13 +13,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 67be723fcc4..d91aa61b42a 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -12,13 +12,11 @@ #include "ext4.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index fda25479af2..3a9ecac8d61 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -61,7 +61,7 @@ void fat_cache_destroy(void) static inline struct fat_cache *fat_cache_alloc(struct inode *inode) { - return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); + return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); } static inline void fat_cache_free(struct fat_cache *cache) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 486725ee99a..34541d06e62 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, loff_t cpos; int ret = 0; - lock_kernel(); + lock_super(sb); cpos = filp->f_pos; /* Fake . and .. for the root directory. */ @@ -654,7 +654,7 @@ FillFailed: if (unicode) __putname(unicode); out: - unlock_kernel(); + unlock_super(sb); return ret; } diff --git a/fs/fat/file.c b/fs/fat/file.c index 27cc1164ec3..c672df4036e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,7 +11,6 @@ #include <linux/mount.h> #include <linux/time.h> #include <linux/msdos_fs.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/backing-dev.h> @@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode) nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; - lock_kernel(); fat_free(inode, nr_clusters); - unlock_kernel(); fat_flush_inodes(inode->i_sb, inode, NULL); } @@ -257,26 +254,34 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) } EXPORT_SYMBOL_GPL(fat_getattr); -static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode, - mode_t mode) +static int fat_sanitize_mode(const struct msdos_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) { - mode_t mask, req = mode & ~S_IFMT; + mode_t mask, perm; - if (S_ISREG(mode)) + /* + * Note, the basic check is already done by a caller of + * (attr->ia_mode & ~MSDOS_VALID_MODE) + */ + + if (S_ISREG(inode->i_mode)) mask = sbi->options.fs_fmask; else mask = sbi->options.fs_dmask; + perm = *mode_ptr & ~(S_IFMT | mask); + /* * Of the r and x bits, all (subject to umask) must be present. Of the * w bits, either all (subject to umask) or none must be present. */ - req &= ~mask; - if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) return -EPERM; - if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask))) + if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask))) return -EPERM; + *mode_ptr &= S_IFMT | perm; + return 0; } @@ -299,11 +304,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; - int mask, error = 0; + int error = 0; unsigned int ia_valid; - lock_kernel(); - /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the @@ -332,12 +335,13 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) error = 0; goto out; } + if (((attr->ia_valid & ATTR_UID) && (attr->ia_uid != sbi->options.fs_uid)) || ((attr->ia_valid & ATTR_GID) && (attr->ia_gid != sbi->options.fs_gid)) || ((attr->ia_valid & ATTR_MODE) && - fat_check_mode(sbi, inode, attr->ia_mode) < 0)) + (attr->ia_mode & ~MSDOS_VALID_MODE))) error = -EPERM; if (error) { @@ -346,17 +350,17 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) goto out; } - error = inode_setattr(inode, attr); - if (error) - goto out; + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } - if (S_ISDIR(inode->i_mode)) - mask = sbi->options.fs_dmask; - else - mask = sbi->options.fs_fmask; - inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); + error = inode_setattr(inode, attr); out: - unlock_kernel(); return error; } EXPORT_SYMBOL_GPL(fat_setattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4e0a3dd9d67..46a4508ffd2 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode) static void fat_clear_inode(struct inode *inode) { - struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); - lock_kernel(); spin_lock(&sbi->inode_hash_lock); fat_cache_inval_inode(inode); hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); - unlock_kernel(); } static void fat_write_super(struct super_block *sb) @@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; - ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; return &ei->vfs_inode; @@ -567,7 +566,7 @@ retry: if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) return 0; - lock_kernel(); + lock_super(sb); bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { printk(KERN_ERR "FAT: unable to read inode block " @@ -579,7 +578,7 @@ retry: if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); - unlock_kernel(); + unlock_super(sb); goto retry; } @@ -606,7 +605,7 @@ retry: err = sync_dirty_buffer(bh); brelse(bh); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) static struct dentry *fat_get_parent(struct dentry *child) { + struct super_block *sb = child->d_sb; struct buffer_head *bh; struct msdos_dir_entry *de; loff_t i_pos; @@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child) struct inode *inode; int err; - lock_kernel(); + lock_super(sb); err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); if (err) { parent = ERR_PTR(err); goto out; } - inode = fat_build_inode(child->d_sb, de, i_pos); + inode = fat_build_inode(sb, de, i_pos); brelse(bh); if (IS_ERR(inode)) { parent = ERR_CAST(inode); @@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); } out: - unlock_kernel(); + unlock_super(sb); return parent; } @@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, long error; char buf[50]; + /* + * GFP_KERNEL is ok here, because while we do hold the + * supeblock lock, memory pressure can't call back into + * the filesystem, since we're only just about to mount + * it and have no inodes etc active! + */ sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/fcntl.c b/fs/fcntl.c index bfd776509a7..330a7d78259 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/security.h> @@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) if (error) return error; - lock_kernel(); if ((arg ^ filp->f_flags) & FASYNC) { if (filp->f_op && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); @@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg) filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); out: - unlock_kernel(); return error; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fb77e096213..3141690558c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -488,7 +488,12 @@ static struct fuse_conn *new_conn(struct super_block *sb) err = bdi_init(&fc->bdi); if (err) goto error_kfree; - err = bdi_register_dev(&fc->bdi, fc->dev); + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); + } if (err) goto error_bdi_destroy; /* @@ -586,7 +591,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; - fc->max_write = min_t(unsigned, 4096, fc->max_write); + fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } fuse_put_request(fc, req); @@ -662,7 +667,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; - fc->max_read = min_t(unsigned, 4096, d.max_read); + fc->max_read = max_t(unsigned, 4096, d.max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c19184f2e70..bec76b1c2bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, } -static inline unsigned int zero_metapath_length(const struct metapath *mp, - unsigned height) +static inline unsigned int metapath_branch_start(const struct metapath *mp) { - unsigned int i; - for (i = 0; i < height - 1; i++) { - if (mp->mp_list[i] != 0) - return i; - } - return height; + if (mp->mp_list[0] == 0) + return 2; + return 1; } /** @@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = mp->mp_bh[0]; u64 bn, dblock = 0; - unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; unsigned dblks = 0; unsigned ptrs_per_blk; const unsigned end_of_metadata = height - 1; @@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Building up tree height */ state = ALLOC_GROW_HEIGHT; iblks = height - ip->i_height; - zmpl = zero_metapath_length(mp, height); - iblks -= zmpl; - iblks += height; + branch_start = metapath_branch_start(mp); + iblks += (height - branch_start); } } @@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, sizeof(struct gfs2_meta_header)); *ptr = zero_bn; state = ALLOC_GROW_DEPTH; - for(i = zmpl; i < height; i++) { + for(i = branch_start; i < height; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } - i = zmpl; + i = branch_start; } if (n == 0) break; diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a06..24dd5945008 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); return error; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a315..3401628d742 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ ulong_aligned: depending on architecture. I've experimented with several ways of writing this section such as using an else before the goto but this one seems to be the fastest. */ - while ((unsigned char *)plong < end - 1) { + while ((unsigned char *)plong < end - sizeof(unsigned long)) { prefetch(plong + 1); if (((*plong) & LBITMASK) != lskipval) break; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022c..91389c8aee8 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4d99685fdce..f8b3be87322 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -22,6 +22,8 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/crc32.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -80,21 +82,6 @@ nope: } /* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - -/* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write @@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { @@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal, spin_unlock(&journal->j_state_lock); /* And try again, without the barrier */ + lock_buffer(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); ret = submit_bh(WRITE, bh); @@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) } /* - * Wait for all submitted IO to complete. + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_inode_data_buffers(struct address_space *mapping) { - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + .for_writepages = 1, + }; + + ret = generic_writepages(mapping, &wbc); return ret; - } +} -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - int i; + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + err = journal_submit_inode_data_buffers(mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } + spin_unlock(&journal->j_list_lock); + return ret; } /* - * Submit all the data buffers to disk + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) { - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; + jinode->i_transaction = NULL; } } spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); + + return ret; } static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) @@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); @@ -546,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: @@ -573,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -747,15 +660,19 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -767,7 +684,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -826,7 +743,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -853,7 +770,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -873,9 +790,9 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); @@ -996,7 +913,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a7..b26c6d9fe6a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) } /* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + +/* * debugfs tunables */ #ifdef CONFIG_JBD2_DEBUG diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 5d0405a9e7c..058f50f65b7 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -344,6 +344,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, obh->b_size); } + put_bh(obh); } return 0; } @@ -610,9 +611,8 @@ static int do_one_pass(journal_t *journal, chksum_err = chksum_seen = 0; if (info->end_transaction) { - printk(KERN_ERR "JBD: Transaction %u " - "found to be corrupt.\n", - next_commit_ID - 1); + journal->j_failed_commit = + info->end_transaction; brelse(bh); break; } @@ -643,10 +643,8 @@ static int do_one_pass(journal_t *journal, if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ - printk(KERN_ERR - "JBD: Transaction %u " - "found to be corrupt.\n", - next_commit_ID); + journal->j_failed_commit = + next_commit_ID; brelse(bh); break; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e6780..4f7cadbb19f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); * new transaction and we can't block without protecting against other * processes trying to touch the journal while it is in transition. * - * Called under j_state_lock */ static transaction_t * @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -943,183 +943,6 @@ out: } /** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. - * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - -/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. */ @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_del_buffer(list, jh); @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1656,12 +1465,43 @@ out: return; } +/* + * jbd2_journal_try_to_free_buffers() could race with + * jbd2_journal_commit_transaction(). The later might still hold the + * reference count to the buffers when inspecting them on + * t_syncdata_list or t_locked_list. + * + * jbd2_journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * try to free that buffer. + * + * Called with journal->j_state_lock hold. + */ +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + jbd2_log_wait_commit(journal, tid); +} /** * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1690,9 +1530,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ int jbd2_journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). + * jbd2_journal_remove_journal_head() and + * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); if (!jh) @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where jbd2_journal_try_to_free_buffers() + * could race with jbd2_journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + jbd2_journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; + /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } /* * If it is committing, we simply cannot touch it. We * can remove it's next_transaction pointer from the @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_add_buffer(list, jh); @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * This function must be called when inode is journaled in ordered mode + * before truncation happens. It starts writeout of truncated part in + * case it is in the committing transaction so that we stand to ordered + * mode consistency guarantees. + */ +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + journal_t *journal; + transaction_t *commit_trans; + int ret = 0; + + if (!inode->i_transaction && !inode->i_next_transaction) + goto out; + journal = inode->i_transaction->t_journal; + spin_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + spin_unlock(&journal->j_state_lock); + if (inode->i_transaction == commit_trans) { + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/fs/libfs.c b/fs/libfs.c index b004dfadd89..baeb71ee1cd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count) mntput(mnt); } +/** + * simple_read_from_buffer - copy data from the buffer to user space + * @to: the user space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The simple_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the user space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { @@ -528,6 +542,37 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, return count; } +/** + * memory_read_from_buffer - copy data from the buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * buffer @from at offset @ppos into the kernel space address starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy(to, from + pos, count); + *ppos = pos + count; + + return count; +} + /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then @@ -800,6 +845,7 @@ EXPORT_SYMBOL(simple_statfs); EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(memory_read_from_buffer); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/fs/locks.c b/fs/locks.c index 11dbf08651b..dce8c747371 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) /* insert into file's list */ fl->fl_next = *pos; *pos = fl; - - if (fl->fl_ops && fl->fl_ops->fl_insert) - fl->fl_ops->fl_insert(fl); } /* @@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_fasync = NULL; } - if (fl->fl_ops && fl->fl_ops->fl_remove) - fl->fl_ops->fl_remove(fl); - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a..dbcc7af76a1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. */ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 05ff4f1d702..1f7f2956412 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, dentry->d_op = &msdos_dentry_operations; - lock_kernel(); + lock_super(sb); res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (res == -ENOENT) goto add; @@ -232,7 +232,7 @@ add: if (dentry) dentry->d_op = &msdos_dentry_operations; out: - unlock_kernel(); + unlock_super(sb); if (!res) return dentry; return ERR_PTR(res); @@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) err = fat_flush_inodes(sb, dir, inode); return err; @@ -324,11 +324,12 @@ out: /***** Remove a directory */ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { + struct super_block *sb = dir->i_sb; struct inode *inode = dentry->d_inode; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); /* * Check whether the directory is not in use, then check * whether it is empty. @@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, is_hid, cluster; - lock_kernel(); + lock_super(sb); err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, msdos_name, &MSDOS_SB(sb)->options); @@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); fat_flush_inodes(sb, dir, inode); return 0; out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -419,10 +420,11 @@ out: static int msdos_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb= inode->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); if (err) goto out; @@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) inode->i_ctime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(inode->i_sb, dir, inode); + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -618,10 +620,11 @@ error_inode: static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + struct super_block *sb = old_dir->i_sb; unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; int err, is_hid; - lock_kernel(); + lock_super(sb); err = msdos_format_name(old_dentry->d_name.name, old_dentry->d_name.len, old_msdos_name, @@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, new_dir, new_msdos_name, new_dentry, is_hid); out: - unlock_kernel(); + unlock_super(sb); if (!err) - err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); + err = fat_flush_inodes(sb, old_dir, new_dir); return err; } diff --git a/fs/namei.c b/fs/namei.c index c7e43536c49..01e67dddcc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -581,15 +581,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd int result; /* make sure the stuff we saved doesn't go away */ - dget(save.dentry); - mntget(save.mnt); + path_get(&save); result = __link_path_walk(name, nd); if (result == -ESTALE) { /* nd->path had been dropped */ nd->path = save; - dget(nd->path.dentry); - mntget(nd->path.mnt); + path_get(&nd->path); nd->flags |= LOOKUP_REVAL; result = __link_path_walk(name, nd); } @@ -1216,8 +1214,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->flags = flags; nd->depth = 0; - nd->path.mnt = mntget(mnt); - nd->path.dentry = dget(dentry); + nd->path.dentry = dentry; + nd->path.mnt = mnt; + path_get(&nd->path); retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && @@ -2857,16 +2856,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct nameidata nd; void *cookie; + int res; nd.depth = 0; cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (!IS_ERR(cookie)) { - int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); - cookie = ERR_PTR(res); - } - return PTR_ERR(cookie); + if (IS_ERR(cookie)) + return PTR_ERR(cookie); + + res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + if (dentry->d_inode->i_op->put_link) + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + return res; } int vfs_follow_link(struct nameidata *nd, const char *link) diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e..4f6f7635b59 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -750,7 +750,7 @@ struct proc_fs_info { const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b3..6a7d901f193 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/ncp_fs.h> #include "ncplib_kernel.h" @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { return 0; } +static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations ncp_file_operations = { - .llseek = remote_llseek, + .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, .ioctl = ncp_ioctl, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 58d43daec08..982a2064fe4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -204,7 +204,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) * Note: assumes we have exclusive access to this mapping either * through inode->i_mutex or some other mechanism. */ - if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { /* Should never happen */ nfs_zap_mapping(inode, inode->i_mapping); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d84a3d8f32a..4e98a56a177 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + loff_t loff; /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; @@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); + lock_kernel(); /* BKL needed? */ + loff = generic_file_llseek_unlocked(filp, offset, origin); + unlock_kernel(); + return loff; } /* diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502c..779d2eb649c 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; + unsigned size; if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { + size = ntohl(*p++); + if (size <= NFS3_FHSIZE && size != 0) { fh->size = size; memcpy(fh->data, p, size); } else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7..614efeed543 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + } else mntfh->size = NFS2_FHSIZE; - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, { struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; - security_init_mnt_opts(&data.lsm_opts); + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); + server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + nfs_fill_super(s, data); } - mntroot = nfs_get_root(s, &mntfh); + mntroot = nfs_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->nfs_server.hostname); + kfree(data->mount_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1959,26 +1963,31 @@ out_no_client_address: static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; struct super_block *s; struct nfs_server *server; - struct nfs_fh mntfh; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); + error = nfs4_validate_mount_data(raw_data, data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); + server = nfs4_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->client_address); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_free: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e325..f333848fd3b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); - if (status < 0) - nfs_set_pageerror(page); return status; } diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c index 9101807dc81..e2f72ca9803 100644 --- a/fs/ntfs/upcase.c +++ b/fs/ntfs/upcase.c @@ -77,11 +77,10 @@ ntfschar *generate_default_upcase(void) uc[i] = cpu_to_le16(i); for (r = 0; uc_run_table[r][0]; r++) for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) + - uc_run_table[r][2]); + le16_add_cpu(&uc[i], uc_run_table[r][2]); for (r = 0; uc_dup_table[r][0]; r++) for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) - uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1); + le16_add_cpu(&uc[i + 1], -1); for (r = 0; uc_word_table[r][0]; r++) uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); return uc; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cf9401e8cd0..cfdb08b484e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -21,7 +21,6 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/sysctl.h> #include <linux/configfs.h> #include "tcp.h" @@ -36,65 +35,6 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -#define OCFS2_MAX_HB_CTL_PATH 256 -static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; - -static ctl_table ocfs2_nm_table[] = { - { - .ctl_name = 1, - .procname = "hb_ctl_path", - .data = ocfs2_hb_ctl_path, - .maxlen = OCFS2_MAX_HB_CTL_PATH, - .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, - }, - { .ctl_name = 0 } -}; - -static ctl_table ocfs2_mod_table[] = { - { - .ctl_name = FS_OCFS2_NM, - .procname = "nm", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_nm_table - }, - { .ctl_name = 0} -}; - -static ctl_table ocfs2_kern_table[] = { - { - .ctl_name = FS_OCFS2, - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { .ctl_name = 0} -}; - -static ctl_table ocfs2_root_table[] = { - { - .ctl_name = CTL_FS, - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header *ocfs2_table_header = NULL; - -const char *o2nm_get_hb_ctl_path(void) -{ - return ocfs2_hb_ctl_path; -} -EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -941,9 +881,6 @@ void o2nm_undepend_this_node(void) static void __exit exit_o2nm(void) { - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); - /* XXX sync with hb callbacks and shut down hb? */ o2net_unregister_hb_callbacks(); configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); @@ -964,16 +901,9 @@ static int __init init_o2nm(void) if (ret) goto out; - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); - if (!ocfs2_table_header) { - printk(KERN_ERR "nodemanager: unable to register sysctl\n"); - ret = -ENOMEM; /* or something. */ - goto out_o2net; - } - ret = o2net_register_hb_callbacks(); if (ret) - goto out_sysctl; + goto out_o2net; config_group_init(&o2nm_cluster_group.cs_subsys.su_group); mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); @@ -990,8 +920,6 @@ static int __init init_o2nm(void) configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); out_callbacks: o2net_unregister_hb_callbacks(); -out_sysctl: - unregister_sysctl_table(ocfs2_table_header); out_o2net: o2net_exit(); out: diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 7c860361b8d..c992ea0da4a 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,10 +33,6 @@ #include <linux/configfs.h> #include <linux/rbtree.h> -#define FS_OCFS2_NM 1 - -const char *o2nm_get_hb_ctl_path(void); - struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1e44ad14881..a27d61581bd 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -142,53 +142,43 @@ static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); -static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ #ifdef CONFIG_DEBUG_FS +void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; nst->st_msg_type = msgtype; nst->st_msg_key = msgkey; nst->st_node = node; -#endif } -static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_sock_time); -#endif } -static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_send_time); -#endif } -static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { -#ifdef CONFIG_DEBUG_FS do_gettimeofday(&nst->st_status_time); -#endif } -static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { -#ifdef CONFIG_DEBUG_FS nst->st_sc = sc; -#endif } -static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { -#ifdef CONFIG_DEBUG_FS nst->st_id = msg_id; -#endif } +#endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) { diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index a705d5d1903..fd6179eb26d 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -128,23 +128,23 @@ void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static int o2net_debugfs_init(void) +static inline int o2net_debugfs_init(void) { return 0; } -static void o2net_debugfs_exit(void) +static inline void o2net_debugfs_exit(void) { } -static void o2net_debug_add_nst(struct o2net_send_tracking *nst) +static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst) { } -static void o2net_debug_del_nst(struct o2net_send_tracking *nst) +static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst) { } -static void o2net_debug_add_sc(struct o2net_sock_container *sc) +static inline void o2net_debug_add_sc(struct o2net_sock_container *sc) { } -static void o2net_debug_del_sc(struct o2net_sock_container *sc) +static inline void o2net_debug_del_sc(struct o2net_sock_container *sc) { } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 8d58cfe410b..18307ff81b7 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,10 +224,42 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; + +void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node); +void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); +void o2net_set_nst_send_time(struct o2net_send_tracking *nst); +void o2net_set_nst_status_time(struct o2net_send_tracking *nst); +void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc); +void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); + #else struct o2net_send_tracking { u32 dummy; }; + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index d34a62a3a62..8c686d22f9c 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -60,25 +60,25 @@ void dlm_destroy_debugfs_root(void); #else -static int dlm_debug_init(struct dlm_ctxt *dlm) +static inline int dlm_debug_init(struct dlm_ctxt *dlm) { return 0; } -static void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) { } -static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { return 0; } -static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) +static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } -static int dlm_create_debugfs_root(void) +static inline int dlm_create_debugfs_root(void) { return 0; } -static void dlm_destroy_debugfs_root(void) +static inline void dlm_destroy_debugfs_root(void) { } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index efc015c6128..44f87caf368 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; + spin_lock(&dlm->spinlock); list_add_tail(&res->tracking, &dlm->tracking_list); + spin_unlock(&dlm->spinlock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 394d25a131a..80e20d9f278 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1554,8 +1554,8 @@ out: */ int ocfs2_file_lock(struct file *file, int ex, int trylock) { - int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; - unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; + int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0; unsigned long flags; struct ocfs2_file_private *fp = file->private_data; struct ocfs2_lock_res *lockres = &fp->fp_flock; @@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) * Get the lock at NLMODE to start - that way we * can cancel the upconvert request if need be. */ - ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0); if (ret < 0) { mlog_errno(ret); goto out; @@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) } lockres->l_action = OCFS2_AST_CONVERT; - lkm_flags |= LKM_CONVERT; + lkm_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); @@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file) if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) return; - if (lockres->l_level == LKM_NLMODE) + if (lockres->l_level == DLM_LOCK_NL) return; mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", @@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file) lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); lockres->l_blocking = DLM_LOCK_EX; - gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); + gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL); lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); + ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen); if (ret) { mlog_errno(ret); return; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bbd1667aa7d..fcd120f1493 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -317,8 +317,7 @@ out: return rc; } -static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, - int hangup_pending) +static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) { struct dlm_ctxt *dlm = conn->cc_lockspace; struct o2dlm_private *priv = conn->cc_private; @@ -333,43 +332,6 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, return 0; } -static void o2hb_stop(const char *group) -{ - int ret; - char *argv[5], *envp[3]; - - argv[0] = (char *)o2nm_get_hb_ctl_path(); - argv[1] = "-K"; - argv[2] = "-u"; - argv[3] = (char *)group; - argv[4] = NULL; - - mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); - - /* minimal command environment taken from cpu_run_sbin_hotplug */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - if (ret < 0) - mlog_errno(ret); -} - -/* - * Hangup is a hack for tools compatibility. Older ocfs2-tools software - * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This - * happens regardless of whether the DLM got started, so we can't do it - * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into - * the glue and provide a "hangup" API for super.c to call. - * - * Other stacks will eventually provide a NULL ->hangup() pointer. - */ -static void o2cb_cluster_hangup(const char *group, int grouplen) -{ - o2hb_stop(group); -} - static int o2cb_cluster_this_node(unsigned int *node) { int node_num; @@ -388,7 +350,6 @@ static int o2cb_cluster_this_node(unsigned int *node) static struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, - .hangup = o2cb_cluster_hangup, .this_node = o2cb_cluster_this_node, .dlm_lock = o2cb_dlm_lock, .dlm_unlock = o2cb_dlm_unlock, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index b503772cd0e..bd7e0f3acfc 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -21,6 +21,7 @@ #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/mutex.h> +#include <linux/smp_lock.h> #include <linux/reboot.h> #include <asm/uaccess.h> @@ -61,7 +62,7 @@ * negotiated by the client. The client negotiates based on the maximum * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major * number from the "SETV" message must match - * user_stack.sp_proto->lp_max_version.pv_major, and the minor number + * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number * must be less than or equal to ...->lp_max_version.pv_minor. * * Once this information has been set, mounts will be allowed. From this @@ -153,7 +154,7 @@ union ocfs2_control_message { struct ocfs2_control_message_down u_down; }; -static struct ocfs2_stack_plugin user_stack; +static struct ocfs2_stack_plugin ocfs2_user_plugin; static atomic_t ocfs2_control_opened; static int ocfs2_control_this_node = -1; @@ -399,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file, char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = - &user_stack.sp_proto->lp_max_version; + &ocfs2_user_plugin.sp_proto->lp_max_version; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_PROTOCOL) @@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file) return -ENOMEM; p->op_this_node = -1; + lock_kernel(); mutex_lock(&ocfs2_control_lock); file->private_data = p; list_add(&p->op_list, &ocfs2_control_private_list); mutex_unlock(&ocfs2_control_lock); + unlock_kernel(); return 0; } @@ -680,7 +683,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg) struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); int status = lksb->sb_status; - BUG_ON(user_stack.sp_proto == NULL); + BUG_ON(ocfs2_user_plugin.sp_proto == NULL); /* * For now we're punting on the issue of other non-standard errors @@ -693,16 +696,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg) */ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) - user_stack.sp_proto->lp_unlock_ast(astarg, 0); + ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); else - user_stack.sp_proto->lp_lock_ast(astarg); + ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); } static void fsdlm_blocking_ast_wrapper(void *astarg, int level) { - BUG_ON(user_stack.sp_proto == NULL); + BUG_ON(ocfs2_user_plugin.sp_proto == NULL); - user_stack.sp_proto->lp_blocking_ast(astarg, level); + ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); } static int user_dlm_lock(struct ocfs2_cluster_connection *conn, @@ -816,8 +819,7 @@ out: return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, - int hangup_pending) +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { dlm_release_lockspace(conn->cc_lockspace, 2); conn->cc_lockspace = NULL; @@ -838,7 +840,7 @@ static int user_cluster_this_node(unsigned int *this_node) return 0; } -static struct ocfs2_stack_operations user_stack_ops = { +static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .connect = user_cluster_connect, .disconnect = user_cluster_disconnect, .this_node = user_cluster_this_node, @@ -849,20 +851,20 @@ static struct ocfs2_stack_operations user_stack_ops = { .dump_lksb = user_dlm_dump_lksb, }; -static struct ocfs2_stack_plugin user_stack = { +static struct ocfs2_stack_plugin ocfs2_user_plugin = { .sp_name = "user", - .sp_ops = &user_stack_ops, + .sp_ops = &ocfs2_user_plugin_ops, .sp_owner = THIS_MODULE, }; -static int __init user_stack_init(void) +static int __init ocfs2_user_plugin_init(void) { int rc; rc = ocfs2_control_init(); if (!rc) { - rc = ocfs2_stack_glue_register(&user_stack); + rc = ocfs2_stack_glue_register(&ocfs2_user_plugin); if (rc) ocfs2_control_exit(); } @@ -870,14 +872,14 @@ static int __init user_stack_init(void) return rc; } -static void __exit user_stack_exit(void) +static void __exit ocfs2_user_plugin_exit(void) { - ocfs2_stack_glue_unregister(&user_stack); + ocfs2_stack_glue_unregister(&ocfs2_user_plugin); ocfs2_control_exit(); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); MODULE_LICENSE("GPL"); -module_init(user_stack_init); -module_exit(user_stack_exit); +module_init(ocfs2_user_plugin_init); +module_exit(ocfs2_user_plugin_exit); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 119f60cea9c..10e149ae5e3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/sysctl.h> #include "ocfs2_fs.h" @@ -33,11 +34,13 @@ #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" +#define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_locking_protocol *lproto; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, @@ -349,7 +352,7 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, BUG_ON(conn == NULL); - ret = active_stack->sp_ops->disconnect(conn, hangup_pending); + ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? */ if (!ret) { @@ -362,13 +365,48 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); +/* + * Leave the group for this filesystem. This is executed by a userspace + * program (stored in ocfs2_hb_ctl_path). + */ +static void ocfs2_leave_group(const char *group) +{ + int ret; + char *argv[5], *envp[3]; + + argv[0] = ocfs2_hb_ctl_path; + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = (char *)group; + argv[4] = NULL; + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (ret < 0) { + printk(KERN_ERR + "ocfs2: Error %d running user helper " + "\"%s %s %s %s\"\n", + ret, argv[0], argv[1], argv[2], argv[3]); + } +} + +/* + * Hangup is a required post-umount. ocfs2-tools software expects the + * filesystem to call "ocfs2_hb_ctl" during unmount. This happens + * regardless of whether the DLM got started, so we can't do it + * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does + * the actual work. + */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); - if (active_stack->sp_ops->hangup) - active_stack->sp_ops->hangup(group, grouplen); + ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); @@ -548,10 +586,83 @@ error: return ret; } +/* + * Sysctl bits + * + * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't + * make as much sense in a multiple cluster stack world, but it's safer + * and easier to preserve the name. + */ + +#define FS_OCFS2_NM 1 + +static ctl_table ocfs2_nm_table[] = { + { + .ctl_name = 1, + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { .ctl_name = 0 } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .ctl_name = FS_OCFS2_NM, + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_kern_table[] = { + { + .ctl_name = FS_OCFS2, + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + + +/* + * Initialization + */ + static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); + ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + if (!ocfs2_table_header) { + printk(KERN_ERR + "ocfs2 stack glue: unable to register sysctl\n"); + return -ENOMEM; /* or something. */ + } + return ocfs2_sysfs_init(); } @@ -559,6 +670,8 @@ static void __exit ocfs2_stack_glue_exit(void) { lproto = NULL; ocfs2_sysfs_exit(); + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 005e4f170e0..db56281dd1b 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -134,22 +134,10 @@ struct ocfs2_stack_operations { * be freed. Thus, a stack must not return from ->disconnect() * until it will no longer reference the conn pointer. * - * If hangup_pending is zero, ocfs2_cluster_disconnect() will also - * be dropping the reference on the module. + * Once this call returns, the stack glue will be dropping this + * connection's reference on the module. */ - int (*disconnect)(struct ocfs2_cluster_connection *conn, - int hangup_pending); - - /* - * ocfs2_cluster_hangup() exists for compatibility with older - * ocfs2 tools. Only the classic stack really needs it. As such - * ->hangup() is not required of all stacks. See the comment by - * ocfs2_cluster_hangup() for more details. - * - * Note that ocfs2_cluster_hangup() can only be called if - * hangup_pending was passed to ocfs2_cluster_disconnect(). - */ - void (*hangup)(const char *group, int grouplen); + int (*disconnect)(struct ocfs2_cluster_connection *conn); /* * ->this_node() returns the cluster's unique identifier for the @@ -258,4 +246,5 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) /* Used by stack plugins */ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); + #endif /* STACKGLUE_H */ diff --git a/fs/open.c b/fs/open.c index a1450086e92..a99ad09c319 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> +#include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/vfs.h> @@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { struct nameidata nd; int old_fsuid, old_fsgid; - kernel_cap_t old_cap; + kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ @@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_fsuid = current->fsuid; old_fsgid = current->fsgid; - old_cap = current->cap_effective; current->fsuid = current->uid; current->fsgid = current->gid; - /* - * Clear the capabilities if we switch to a non-root user - * - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_walk can sleep. - */ - if (current->uid) - cap_clear(current->cap_effective); - else - current->cap_effective = current->cap_permitted; + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* + * Clear the capabilities if we switch to a non-root user + */ +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + /* + * FIXME: There is a race here against sys_capset. The + * capabilities can change yet we will restore the old + * value below. We should hold task_capabilities_lock, + * but we cannot because user_path_walk can sleep. + */ +#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ + if (current->uid) + old_cap = cap_set_effective(__cap_empty_set); + else + old_cap = cap_set_effective(current->cap_permitted); + } res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (res) @@ -478,7 +483,9 @@ out_path_release: out: current->fsuid = old_fsuid; current->fsgid = old_fsgid; - current->cap_effective = old_cap; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_set_effective(old_cap); return res; } diff --git a/fs/pipe.c b/fs/pipe.c index ec228bc9f88..700f4e0d957 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1003,8 +1003,7 @@ struct file *create_write_pipe(void) void free_write_pipe(struct file *f) { free_pipe_info(f->f_dentry->d_inode); - dput(f->f_path.dentry); - mntput(f->f_path.mnt); + path_put(&f->f_path); put_filp(f); } @@ -1015,8 +1014,8 @@ struct file *create_read_pipe(struct file *wrf) return ERR_PTR(-ENFILE); /* Grab pipe from the writer */ - f->f_path.mnt = mntget(wrf->f_path.mnt); - f->f_path.dentry = dget(wrf->f_path.dentry); + f->f_path = wrf->f_path; + path_get(&wrf->f_path); f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; @@ -1068,8 +1067,7 @@ int do_pipe(int *fd) err_fdr: put_unused_fd(fdr); err_read_pipe: - dput(fr->f_dentry); - mntput(fr->f_vfsmnt); + path_put(&fr->f_path); put_filp(fr); err_write_pipe: free_write_pipe(fw); diff --git a/fs/proc/array.c b/fs/proc/array.c index 9e3b8c33c24..797d775e035 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_printf(m, "%s", header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } seq_printf(m, "\n"); } diff --git a/fs/proc/base.c b/fs/proc/base.c index c447e0743a3..58c3e6a8e15 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -127,6 +127,25 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = &proc_##OTYPE } ) +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. + */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + int maps_protect; EXPORT_SYMBOL(maps_protect); @@ -214,7 +233,7 @@ static int check_mem_permission(struct task_struct *task) */ if (task->parent == current && (task->ptrace & PT_PTRACED) && task_is_stopped_or_traced(task) && - ptrace_may_attach(task)) + ptrace_may_access(task, PTRACE_MODE_ATTACH)) return 0; /* @@ -232,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) task_lock(task); if (task->mm != mm) goto out; - if (task->mm != current->mm && __ptrace_may_attach(task) < 0) + if (task->mm != current->mm && + __ptrace_may_access(task, PTRACE_MODE_READ) < 0) goto out; task_unlock(task); return mm; @@ -499,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + allowed = ptrace_may_access(task, PTRACE_MODE_READ); put_task_struct(task); } return allowed; @@ -885,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; ret = -ENOMEM; @@ -2585,10 +2605,9 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 5; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff)); dentry->d_op = &pid_dentry_operations; @@ -2816,10 +2835,9 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 4; -#ifdef CONFIG_SECURITY - inode->i_nlink += 1; -#endif + + inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff)); dentry->d_op = &pid_dentry_operations; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6f4e8dc97da..b08d1001791 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -425,7 +425,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } + } else + module_put(de->owner); return inode; out_ino: diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d2b85..c652d469dc0 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +int __attribute__((weak)) arch_report_meminfo(char *page) +{ + return 0; +} + static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -139,7 +144,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, #define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); - committed = atomic_read(&vm_committed_space); + committed = atomic_long_read(&vm_committed_space); allowed = ((totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100) + total_swap_pages; @@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, len += hugetlb_report_meminfo(page + len); + len += arch_report_meminfo(page + len); + return proc_calc_metrics(page, start, off, count, eof, len); #undef K } @@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = { }; #endif +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v) sum += temp; per_irq_sum[j] += temp; } + sum += arch_irq_stat_cpu(i); } + sum += arch_irq_stat(); seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), @@ -716,7 +732,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; @@ -726,7 +742,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, if (!ppage) pcount = 0; else - pcount = atomic_read(&ppage->_count); + pcount = page_mapcount(ppage); if (put_user(pcount, out++)) { ret = -EFAULT; @@ -782,7 +798,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) - return -EIO; + return -EINVAL; while (count > 0) { ppage = NULL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 88717c0f941..164bd9f9ede 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; if (file) { @@ -315,9 +315,9 @@ struct mem_size_stats { }; static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct mem_size_stats *mss = private; + struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; @@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; - static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; int ret; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &smaps_walk, &mss); + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); ret = show_map(m, v); if (ret) @@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = { }; static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) + unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = private; + struct vm_area_struct *vma = walk->private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; if (!is_vm_hugetlb_page(vma)) - walk_page_range(mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -496,7 +502,7 @@ const struct file_operations proc_clear_refs_operations = { }; struct pagemapread { - char __user *out, *end; + u64 __user *out, *end; }; #define PM_ENTRY_BYTES sizeof(u64) @@ -519,28 +525,18 @@ struct pagemapread { static int add_to_pagemap(unsigned long addr, u64 pfn, struct pagemapread *pm) { - /* - * Make sure there's room in the buffer for an - * entire entry. Otherwise, only copy part of - * the pfn. - */ - if (pm->out + PM_ENTRY_BYTES >= pm->end) { - if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) - return -EFAULT; - pm->out = pm->end; - return PM_END_OF_BUFFER; - } - if (put_user(pfn, pm->out)) return -EFAULT; - pm->out += PM_ENTRY_BYTES; + pm->out++; + if (pm->out >= pm->end) + return PM_END_OF_BUFFER; return 0; } static int pagemap_pte_hole(unsigned long start, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; for (addr = start; addr < end; addr += PAGE_SIZE) { @@ -557,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); } +static unsigned long pte_to_pagemap_entry(pte_t pte) +{ + unsigned long pme = 0; + if (is_swap_pte(pte)) + pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + else if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - void *private) + struct mm_walk *walk) { - struct pagemapread *pm = private; + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); for (; addr != end; addr += PAGE_SIZE) { u64 pfn = PM_NOT_PRESENT; - pte = pte_offset_map(pmd, addr); - if (is_swap_pte(*pte)) - pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; - else if (pte_present(*pte)) - pfn = PM_PFRAME(pte_pfn(*pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - /* unmap so we're not in atomic when we copy to userspace */ - pte_unmap(pte); + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) + vma = find_vma(walk->mm, addr); + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pfn = pte_to_pagemap_entry(*pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } err = add_to_pagemap(addr, pfn, pm); if (err) return err; @@ -585,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } -static struct mm_walk pagemap_walk = { - .pmd_entry = pagemap_pte_range, - .pte_hole = pagemap_pte_hole -}; - /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -624,17 +636,22 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, struct pagemapread pm; int pagecount; int ret = -ESRCH; + struct mm_walk pagemap_walk; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; if (!task) goto out; ret = -EACCES; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_task; ret = -EINVAL; /* file position must be aligned */ - if (*ppos % PM_ENTRY_BYTES) + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; @@ -642,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!mm) goto out_task; - ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; uend = (unsigned long)(buf + count); pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; - pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + ret = 0; + if (pagecount == 0) + goto out_mm; + pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + ret = -ENOMEM; if (!pages) goto out_mm; @@ -664,36 +685,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out_pages; } - pm.out = buf; - pm.end = buf + count; + pm.out = (u64 *)buf; + pm.end = (u64 *)(buf + count); - if (!ptrace_may_attach(task)) { - ret = -EIO; - } else { - unsigned long src = *ppos; - unsigned long svpfn = src / PM_ENTRY_BYTES; - unsigned long start_vaddr = svpfn << PAGE_SHIFT; - unsigned long end_vaddr = TASK_SIZE_OF(task); - - /* watch out for wraparound */ - if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) - start_vaddr = end_vaddr; - - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ - ret = walk_page_range(mm, start_vaddr, end_vaddr, - &pagemap_walk, &pm); - if (ret == PM_END_OF_BUFFER) - ret = 0; - /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += pm.out - buf; - if (!ret) - ret = pm.out - buf; - } + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += (char *)pm.out - buf; + if (!ret) + ret = (char *)pm.out - buf; out_pages: for (; pagecount; pagecount--) { @@ -726,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return show_numa_map(m, v); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f18..5d84e7121df 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return nommu_vma_show(m, vml->vma); diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 9590b902430..78f613cb9c7 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = { .mmap = generic_file_mmap, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0989bc2c2f6..52312ec93ff 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = { .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c6..9ba495d5a29 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { loff_t retval; struct inode *inode = file->f_mapping->host; - mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: offset += inode->i_size; @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { + /* Special lock needed here? */ if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } - mutex_unlock(&inode->i_mutex); return retval; } +EXPORT_SYMBOL(generic_file_llseek_unlocked); -EXPORT_SYMBOL(generic_file_llseek); - -loff_t remote_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t retval; - - lock_kernel(); - switch (origin) { - case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); - break; - case SEEK_CUR: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } - unlock_kernel(); - return retval; + loff_t n; + mutex_lock(&file->f_dentry->d_inode->i_mutex); + n = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return n; } -EXPORT_SYMBOL(remote_llseek); +EXPORT_SYMBOL(generic_file_llseek); loff_t no_llseek(struct file *file, loff_t offset, int origin) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 57917932212..192269698a8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode) goto out; reiserfs_update_inode_transaction(inode); + reiserfs_discard_prealloc(&th, inode); + err = reiserfs_delete_object(&th, inode); /* Do quota update inside a transaction for journaled quotas. We must do that diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ed424d708e6..1d40f2bd197 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; diff --git a/fs/select.c b/fs/select.c index 8dda969614a..da0e88201c3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) retval++; } } - cond_resched(); } if (res_in) *rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) *routp = res_out; if (res_ex) *rexp = res_ex; + cond_resched(); } wait = NULL; if (retval || !*timeout || signal_pending(current)) diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7..2294783320c 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) return error; } +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations smb_file_operations = { - .llseek = remote_llseek, + .llseek = smb_remote_llseek, .read = do_sync_read, .aio_read = smb_file_aio_read, .write = do_sync_write, diff --git a/fs/splice.c b/fs/splice.c index 78150038b58..399442179d8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, */ wait_on_page_writeback(page); - if (PagePrivate(page)) - try_to_release_page(page, GFP_KERNEL); + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) + goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag @@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ +out_unlock: unlock_page(page); return 1; } @@ -378,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, lock_page(page); /* - * page was truncated, stop here. if this isn't the - * first page, we'll just complete what we already - * added + * Page was truncated, or invalidated by the + * filesystem. Redo the find/create, but this time the + * page is kept locked, so there's no chance of another + * race with truncate/invalidate. */ if (!page->mapping) { unlock_page(page); - break; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + + if (!page) { + error = -ENOMEM; + break; + } + page_cache_release(pages[page_nr]); + pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -983,7 +993,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, while (len) { size_t read_len; - loff_t pos = sd->pos; + loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) @@ -998,15 +1008,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); - if (unlikely(ret <= 0)) + if (unlikely(ret <= 0)) { + sd->pos = prev_pos; goto out_release; + } bytes += ret; len -= ret; sd->pos = pos; - if (ret < read_len) + if (ret < read_len) { + sd->pos = prev_pos + ret; goto out_release; + } } done: @@ -1072,7 +1086,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 7a5f69be6ac..44cc702f96c 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent) /* * Check whether there is an anchor block in the given block */ -static int udf_check_anchor_block(struct super_block *sb, sector_t block, - bool varconv) +static int udf_check_anchor_block(struct super_block *sb, sector_t block) { - struct buffer_head *bh = NULL; - tag *t; + struct buffer_head *bh; uint16_t ident; - uint32_t location; - if (varconv) { - if (udf_fixed_to_variable(block) >= - sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; - bh = sb_bread(sb, udf_fixed_to_variable(block)); - } - else - bh = sb_bread(sb, block); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && + udf_fixed_to_variable(block) >= + sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) + return 0; + bh = udf_read_tagged(sb, block, block, &ident); if (!bh) return 0; - - t = (tag *)bh->b_data; - ident = le16_to_cpu(t->tagIdent); - location = le32_to_cpu(t->tagLocation); brelse(bh); - if (ident != TAG_IDENT_AVDP) - return 0; - return location == block; + + return ident == TAG_IDENT_AVDP; } /* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, - sector_t lastblock) +static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock) { sector_t last[6]; int i; @@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, sb->s_blocksize_bits) continue; - if (udf_check_anchor_block(sb, last[i], varconv)) { + if (udf_check_anchor_block(sb, last[i])) { sbi->s_anchor[0] = last[i]; sbi->s_anchor[1] = last[i] - 256; return last[i]; @@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv, if (last[i] < 256) continue; - if (udf_check_anchor_block(sb, last[i] - 256, varconv)) { + if (udf_check_anchor_block(sb, last[i] - 256)) { sbi->s_anchor[1] = last[i] - 256; return last[i]; } } - if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 256)) { sbi->s_anchor[0] = sbi->s_session + 256; return last[0]; } - if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) { + if (udf_check_anchor_block(sb, sbi->s_session + 512)) { sbi->s_anchor[0] = sbi->s_session + 512; return last[0]; } @@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb) int i; struct udf_sb_info *sbi = UDF_SB(sb); - lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); if (lastblock) goto check_anchor; /* No anchor found? Try VARCONV conversion of block numbers */ + UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, 1, + lastblock = udf_scan_anchors(sb, udf_variable_to_fixed(sbi->s_last_block)); - if (lastblock) { - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + if (lastblock) goto check_anchor; - } /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block); - if (lastblock) - UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_scan_anchors(sb, sbi->s_last_block); + if (!lastblock) { + /* VARCONV didn't help. Clear it. */ + UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); + } check_anchor: /* diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8fa9c2d7091..8ec865de5f1 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -16,7 +16,7 @@ #define UDF_PREALLOCATE #define UDF_DEFAULT_PREALLOC_BLOCKS 8 -#define UDFFS_DEBUG +#undef UDFFS_DEBUG #ifdef UDFFS_DEBUG #define udf_debug(f, a...) \ diff --git a/fs/utimes.c b/fs/utimes.c index af059d5cb48..b6b664e7145 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -40,14 +40,9 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) #endif -static bool nsec_special(long nsec) -{ - return nsec == UTIME_OMIT || nsec == UTIME_NOW; -} - static bool nsec_valid(long nsec) { - if (nsec_special(nsec)) + if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; @@ -102,7 +97,11 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (error) goto dput_and_out; - /* Don't worry, the checks are done in inode_change_ok() */ + if (times && times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_NOW) + times = NULL; + + /* In most cases, the checks are done in inode_change_ok() */ newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { error = -EPERM; @@ -124,28 +123,34 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - } - /* - * If times is NULL or both times are either UTIME_OMIT or - * UTIME_NOW, then need to check permissions, because - * inode_change_ok() won't do it. - */ - if (!times || (nsec_special(times[0].tv_nsec) && - nsec_special(times[1].tv_nsec))) { + /* + * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT + * cases, we need to make an extra check that is not done by + * inode_change_ok(). + */ + if (((times[0].tv_nsec == UTIME_NOW && + times[1].tv_nsec == UTIME_OMIT) + || + (times[0].tv_nsec == UTIME_OMIT && + times[1].tv_nsec == UTIME_NOW)) + && !is_owner_or_cap(inode)) + goto mnt_drop_write_and_out; + } else { + + /* + * If times is NULL (or both times are UTIME_NOW), + * then we need to check permissions, because + * inode_change_ok() won't do it. + */ error = -EACCES; if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { - if (f) { - if (!(f->f_mode & FMODE_WRITE)) - goto mnt_drop_write_and_out; - } else { - error = vfs_permission(&nd, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + error = permission(inode, MAY_WRITE, NULL); + if (error) + goto mnt_drop_write_and_out; } } mutex_lock(&inode->i_mutex); @@ -169,14 +174,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __ if (utimes) { if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) return -EFAULT; - if ((tstimes[0].tv_nsec == UTIME_OMIT || - tstimes[0].tv_nsec == UTIME_NOW) && - tstimes[0].tv_sec != 0) - return -EINVAL; - if ((tstimes[1].tv_nsec == UTIME_OMIT || - tstimes[1].tv_nsec == UTIME_NOW) && - tstimes[1].tv_sec != 0) - return -EINVAL; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a3522727ea5..b546ba69be8 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, if (len == 0) return -ENOENT; - slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); + slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS); if (slots == NULL) return -ENOMEM; @@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, struct dentry *alias; int err, table; - lock_kernel(); + lock_super(sb); table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; dentry->d_op = &vfat_dentry_ops[table]; @@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); brelse(sinfo.bh); if (IS_ERR(inode)) { - unlock_kernel(); + unlock_super(sb); return ERR_CAST(inode); } alias = d_find_alias(inode); @@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, dput(alias); else { iput(inode); - unlock_kernel(); + unlock_super(sb); return alias; } } error: - unlock_kernel(); + unlock_super(sb); dentry->d_op = &vfat_dentry_ops[table]; dentry->d_time = dentry->d_parent->d_inode->i_version; dentry = d_splice_alias(inode, dentry); @@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, struct timespec ts; int err; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); @@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); out: - unlock_kernel(); + unlock_super(sb); return err; } static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = fat_dir_empty(inode); if (err) @@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -791,10 +792,11 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; + struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; - lock_kernel(); + lock_super(sb); err = vfat_find(dir, &dentry->d_name, &sinfo); if (err) @@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct timespec ts; int err, cluster; - lock_kernel(); + lock_super(sb); ts = CURRENT_TIME_SEC; cluster = fat_alloc_new_dir(dir, &ts); @@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); - unlock_kernel(); + unlock_super(sb); return 0; out_free: fat_free_clusters(dir, cluster); out: - unlock_kernel(); + unlock_super(sb); return err; } @@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct timespec ts; loff_t dotdot_i_pos, new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; + struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; - lock_kernel(); + lock_super(sb); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) goto out; @@ -951,7 +954,7 @@ out: brelse(sinfo.bh); brelse(dotdot_bh); brelse(old_sinfo.bh); - unlock_kernel(); + unlock_super(sb); return err; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5105015a75a..98e0e86093b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -387,6 +387,8 @@ _xfs_buf_lookup_pages( if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); return -ENOMEM; } @@ -416,17 +418,24 @@ _xfs_buf_lookup_pages( ASSERT(!PagePrivate(page)); if (!PageUptodate(page)) { page_count--; - if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) { + if (blocksize >= PAGE_CACHE_SIZE) { + if (flags & XBF_READ) + bp->b_flags |= _XBF_PAGE_LOCKED; + } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; } } - unlock_page(page); bp->b_pages[i] = page; offset = 0; } + if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); + } + if (page_count == bp->b_page_count) bp->b_flags |= XBF_DONE; @@ -746,6 +755,7 @@ xfs_buf_associate_memory( bp->b_count_desired = len; bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; + bp->b_flags &= ~_XBF_PAGE_LOCKED; return 0; } @@ -1093,8 +1103,10 @@ _xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + bp->b_flags &= ~_XBF_PAGE_LOCKED; xfs_buf_ioend(bp, schedule); + } } STATIC void @@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io( if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); + + if (bp->b_flags & _XBF_PAGE_LOCKED) + unlock_page(page); } while (bvec >= bio->bi_io_vec); _xfs_buf_ioend(bp, 1); @@ -1163,7 +1178,8 @@ _xfs_buf_ioapply( * filesystem block size is not smaller than the page size. */ if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - (bp->b_flags & XBF_READ) && + ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == + (XBF_READ|_XBF_PAGE_LOCKED)) && (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 841d7883528..f948ec7ba9a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -66,6 +66,25 @@ typedef enum { _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ + + /* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. + */ + _XBF_PAGE_LOCKED = (1 << 22), } xfs_buf_flags_t; typedef enum { diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 65e78c13d4a..5f60363b934 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -184,19 +184,24 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } +/* + * We ignore the datasync flag here because a datasync is effectively + * identical to an fsync. That is, datasync implies that we need to write + * only the metadata needed to be able to access the data that is written + * if we crash after the call completes. Hence if we are writing beyond + * EOF we have to log the inode size change as well, which makes it a + * full fsync. If we don't write beyond EOF, the inode core will be + * clean in memory and so we don't need to log the inode, just like + * fsync. + */ STATIC int xfs_file_fsync( struct file *filp, struct dentry *dentry, int datasync) { - int flags = FSYNC_WAIT; - - if (datasync) - flags |= FSYNC_DATA; xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); - return -xfs_fsync(XFS_I(dentry->d_inode), flags, - (xfs_off_t)0, (xfs_off_t)-1); + return -xfs_fsync(XFS_I(dentry->d_inode)); } /* diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 9d73cb5c0fc..25eb2a9e8d9 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) #define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ /* - * Flags to vop_fsync/reclaim. - */ -#define FSYNC_NOWAIT 0 /* asynchronous flush */ -#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ -#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ -#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ - -/* * Tracking vnode activity. */ #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index cf0bb9c1d62..e569bf5d6cf 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2974,6 +2974,7 @@ xfs_iflush_cluster( xfs_mount_t *mp = ip->i_mount; xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); unsigned long first_index, mask; + unsigned long inodes_per_cluster; int ilist_size; xfs_inode_t **ilist; xfs_inode_t *iq; @@ -2985,8 +2986,9 @@ xfs_iflush_cluster( ASSERT(pag->pagi_inodeok); ASSERT(pag->pag_ici_init); - ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL); + inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) return 0; @@ -2995,8 +2997,7 @@ xfs_iflush_cluster( read_lock(&pag->pag_ici_lock); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, - first_index, - XFS_INODE_CLUSTER_SIZE(mp)); + first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index afaee301b0e..ad3d26ddfe3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2427,13 +2427,20 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (atomic_read(&iclog->ic_refcnt) == 1) { + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ spin_unlock(&log->l_icloglock); - if ((error = xlog_state_release_iclog(log, iclog))) + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 70702a60b4b..e475e3717eb 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -856,18 +856,14 @@ xfs_readlink( /* * xfs_fsync * - * This is called to sync the inode and its data out to disk. - * We need to hold the I/O lock while flushing the data, and - * the inode lock while flushing the inode. The inode lock CANNOT - * be held while flushing the data, so acquire after we're done - * with that. + * This is called to sync the inode and its data out to disk. We need to hold + * the I/O lock while flushing the data, and the inode lock while flushing the + * inode. The inode lock CANNOT be held while flushing the data, so acquire + * after we're done with that. */ int xfs_fsync( - xfs_inode_t *ip, - int flag, - xfs_off_t start, - xfs_off_t stop) + xfs_inode_t *ip) { xfs_trans_t *tp; int error; @@ -875,103 +871,79 @@ xfs_fsync( xfs_itrace_entry(ip); - ASSERT(start >= 0 && stop >= -1); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); - if (flag & FSYNC_DATA) - filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); + /* capture size updates in I/O completion before writing the inode. */ + error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); + if (error) + return XFS_ERROR(error); /* - * We always need to make sure that the required inode state - * is safe on disk. The vnode might be clean but because - * of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional - * changes to the inode core that have to go to disk. + * We always need to make sure that the required inode state is safe on + * disk. The vnode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. * - * The following code depends on one assumption: that - * any transaction that changes an inode logs the core - * because it has to change some field in the inode core - * (typically nextents or nblocks). That assumption - * implies that any transactions against an inode will - * catch any non-transactional updates. If inode-altering - * transactions exist that violate this assumption, the - * code breaks. Right now, it figures that if the involved - * update_* field is clear and the inode is unpinned, the - * inode is clean. Either it's been flushed or it's been - * committed and the commit has hit the disk unpinning the inode. - * (Note that xfs_inode_item_format() called at commit clears - * the update_* fields.) + * This code relies on the assumption that if the update_* fields + * of the inode are clear and the inode is unpinned then it is clean + * and no action is required. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - /* If we are flushing data then we care about update_size - * being set, otherwise we care about update_core - */ - if ((flag & FSYNC_DATA) ? - (ip->i_update_size == 0) : - (ip->i_update_core == 0)) { + if (!(ip->i_update_size || ip->i_update_core)) { /* - * Timestamps/size haven't changed since last inode - * flush or inode transaction commit. That means - * either nothing got written or a transaction - * committed which caught the updates. If the - * latter happened and the transaction hasn't - * hit the disk yet, the inode will be still - * be pinned. If it is, force the log. + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will be still be pinned. If it is, + * force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { - _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | - ((flag & FSYNC_WAIT) - ? XFS_LOG_SYNC : 0), + error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC, &log_flushed); } else { /* - * If the inode is not pinned and nothing - * has changed we don't need to flush the - * cache. + * If the inode is not pinned and nothing has changed + * we don't need to flush the cache. */ changed = 0; } - error = 0; } else { /* - * Kick off a transaction to log the inode - * core to get the updates. Make it - * sync if FSYNC_WAIT is passed in (which - * is done by everybody but specfs). The - * sync transaction will also force the log. + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - if ((error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), - 0, 0, 0))) { + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); /* - * Note - it's possible that we might have pushed - * ourselves out of the way during trans_reserve - * which would flush the inode. But there's no - * guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer - * could be pinned anyway if it's part of an - * inode in another recent transaction. So we - * play it safe and fire off the transaction anyway. + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. */ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flag & FSYNC_WAIT) - xfs_trans_set_sync(tp); + xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); xfs_iunlock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 8abe8f186e2..57335ba4ce5 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip); int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, struct cred *credp); int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start, - xfs_off_t stop); +int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, |