From 6dda9266913ad57e09afc1a10d6473f10c806a63 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 11 Aug 2011 15:14:39 -0700 Subject: pstore: defer inserting OOPS entries into pstore Life is simple for all the kernel-terminating types of kmsg_dump callbacks - pstore just saves the tail end of the console log. But for "oops" the situation is more complex - the kernel may carry on running (possibly forever). So we'd like to make the logged copy of the oops appear in the pstore filesystem - so that the user has a handle to clear the entry from the persistent backing store (if we don't, the store may fill with "oops" entries (that are also safely stashed in /var/log/messages), leaving no space for real errors). Current code calls pstore_mkfile() immediately. But this may not be safe. The oops could have happened with arbitrary locks held, or in interrupt or NMI context. So allocating memory and calling into generic filesystem code seems unwise. This patch defers making the entry appear. At the time of the oops, we merely set a flag "pstore_new_entry" noting that a new entry has been added. A periodic timer checks once a minute to see if the flag is set - if so, it schedules a work item to rescan the backing store and make all new entries appear in the pstore filesystem. 
Signed-off-by: Tony Luck --- fs/pstore/inode.c | 40 ++++++++++++++++++++++++++++++++++---- fs/pstore/internal.h | 2 +- fs/pstore/platform.c | 54 +++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 82 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 893b961dcfd..379a02dc121 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,13 +33,18 @@ #include #include #include +#include #include #include "internal.h" #define PSTORE_NAMELEN 64 +static DEFINE_SPINLOCK(allpstore_lock); +static LIST_HEAD(allpstore); + struct pstore_private { + struct list_head list; struct pstore_info *psi; enum pstore_type_id type; u64 id; @@ -81,8 +87,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { + struct pstore_private *p = inode->i_private; + unsigned long flags; + end_writeback(inode); - kfree(inode->i_private); + if (p) { + spin_lock_irqsave(&allpstore_lock, flags); + list_del(&p->list); + spin_unlock_irqrestore(&allpstore_lock, flags); + kfree(p); + } } static const struct inode_operations pstore_dir_inode_operations = { @@ -182,9 +196,23 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; - int rc; + int rc = 0; char name[PSTORE_NAMELEN]; - struct pstore_private *private; + struct pstore_private *private, *pos; + unsigned long flags; + + spin_lock_irqsave(&allpstore_lock, flags); + list_for_each_entry(pos, &allpstore, list) { + if (pos->type == type && + pos->id == id && + pos->psi == psi) { + rc = -EEXIST; + break; + } + } + spin_unlock_irqrestore(&allpstore_lock, flags); + if (rc) + return rc; rc = -ENOMEM; inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0); @@ -229,6 +257,10 @@ int pstore_mkfile(enum pstore_type_id type, char 
*psname, u64 id, d_add(dentry, inode); + spin_lock_irqsave(&allpstore_lock, flags); + list_add(&private->list, &allpstore); + spin_unlock_irqrestore(&allpstore_lock, flags); + mutex_unlock(&root->d_inode->i_mutex); return 0; @@ -277,7 +309,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) goto fail; } - pstore_get_records(); + pstore_get_records(0); return 0; fail: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 611c1b3c46f..3bde461c3f3 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -1,5 +1,5 @@ extern void pstore_set_kmsg_bytes(int); -extern void pstore_get_records(void); +extern void pstore_get_records(int); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, struct timespec time, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c5300ec3169..ca60ebcfb15 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -25,11 +25,28 @@ #include #include #include +#include #include #include +#include #include "internal.h" +/* + * We defer making "oops" entries appear in pstore - see + * whether the system is actually still running well enough + * to let someone see the entry + */ +#define PSTORE_INTERVAL (60 * HZ) + +static int pstore_new_entry; + +static void pstore_timefunc(unsigned long); +static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0); + +static void pstore_dowork(struct work_struct *); +static DECLARE_WORK(pstore_work, pstore_dowork); + /* * pstore_lock just protects "psinfo" during * calls to pstore_register() @@ -100,9 +117,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, id = psinfo->write(PSTORE_TYPE_DMESG, part, hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, - psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo); + pstore_new_entry = 1; l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; @@ -148,19 
+163,24 @@ int pstore_register(struct pstore_info *psi) } if (pstore_is_mounted()) - pstore_get_records(); + pstore_get_records(0); kmsg_dump_register(&pstore_dumper); + pstore_timer.expires = jiffies + PSTORE_INTERVAL; + add_timer(&pstore_timer); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); /* - * Read all the records from the persistent store. Create and - * file files in our filesystem. + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. */ -void pstore_get_records(void) +void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; ssize_t size; @@ -178,8 +198,9 @@ void pstore_get_records(void) goto out; while ((size = psi->read(&id, &type, &time, psi)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi)) + rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + time, psi); + if (rc && (rc != -EEXIST || !quiet)) failed++; } psi->close(psi); @@ -191,6 +212,21 @@ out: failed, psi->name); } +static void pstore_dowork(struct work_struct *work) +{ + pstore_get_records(1); +} + +static void pstore_timefunc(unsigned long dummy) +{ + if (pstore_new_entry) { + pstore_new_entry = 0; + schedule_work(&pstore_work); + } + + mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); +} + /* * Call platform driver to write a record to the * persistent store. -- cgit v1.2.3-18-g5258 From abd4d5587be911f63592537284dad78766d97d62 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Aug 2011 10:54:51 -0700 Subject: pstore: change mutex locking to spin_locks pstore was using mutex locking to protect read/write access to the backend plug-ins. This causes problems when pstore is executed in an NMI context through panic() -> kmsg_dump(). This patch changes the mutex to a spin_lock_irqsave then also checks to see if we are in an NMI context. 
If we are in an NMI and can't get the lock, just print a message stating that and blow by the locking. All this is probably a hack around the bigger locking problem but it solves my current situation of trying to sleep in an NMI context. Tested by loading the lkdtm module and executing a HARDLOCKUP which will cause the machine to panic inside the nmi handler. Signed-off-by: Don Zickus Acked-by: Matthew Garrett Signed-off-by: Tony Luck --- fs/pstore/platform.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index ca60ebcfb15..0472924024c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "internal.h" @@ -88,13 +89,20 @@ static void pstore_dump(struct kmsg_dumper *dumper, u64 id; int hsize; unsigned int part = 1; + unsigned long flags = 0; + int is_locked = 0; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; else why = "Unknown"; - mutex_lock(&psinfo->buf_mutex); + if (in_nmi()) { + is_locked = spin_trylock(&psinfo->buf_lock); + if (!is_locked) + pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); + } else + spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; @@ -123,7 +131,11 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += l1_cpy + l2_cpy; part++; } - mutex_unlock(&psinfo->buf_mutex); + if (in_nmi()) { + if (is_locked) + spin_unlock(&psinfo->buf_lock); + } else + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -188,11 +200,12 @@ void pstore_get_records(int quiet) enum pstore_type_id type; struct timespec time; int failed = 0, rc; + unsigned long flags; if (!psi) return; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); rc = psi->open(psi); if (rc) goto out; @@ -205,7 +218,7 @@ void 
pstore_get_records(int quiet) } psi->close(psi); out: - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", @@ -233,7 +246,8 @@ static void pstore_timefunc(unsigned long dummy) */ int pstore_write(enum pstore_type_id type, char *buf, size_t size) { - u64 id; + u64 id; + unsigned long flags; if (!psinfo) return -ENODEV; @@ -241,13 +255,13 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) if (size > psinfo->bufsize) return -EFBIG; - mutex_lock(&psinfo->buf_mutex); + spin_lock_irqsave(&psinfo->buf_lock, flags); memcpy(psinfo->buf, buf, size); id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); - mutex_unlock(&psinfo->buf_mutex); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); return 0; } -- cgit v1.2.3-18-g5258 From 5a0143a4f00517ea433bf459a80742ccc623a665 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 28 Jul 2011 17:47:10 +0200 Subject: ext3: Remove i_mutex from ext3_sync_file() ext3_sync_file() does not need i_mutex for anything so just drop it. Signed-off-by: Jan Kara --- fs/ext3/fsync.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d494c554c6e..1860ed35632 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -61,13 +61,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - /* - * Taking the mutex here just to keep consistent with how fsync was - * called previously, however it looks like we don't need to take - * i_mutex at all. - */ - mutex_lock(&inode->i_mutex); - J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -85,7 +78,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * safe in-journal, which is all fsync() needs to ensure. 
*/ if (ext3_should_journal_data(inode)) { - mutex_unlock(&inode->i_mutex); ret = ext3_force_commit(inode->i_sb); goto out; } @@ -108,8 +100,6 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - - mutex_unlock(&inode->i_mutex); out: trace_ext3_sync_file_exit(inode, ret); return ret; -- cgit v1.2.3-18-g5258 From 1cde201da4e97f10a5dd2434cff4ceff381603d1 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Tue, 2 Aug 2011 18:16:57 +0900 Subject: ext3: fix message in ext3_remount for rw-remount case If there are some inodes in the orphan list while a filesystem is being read-only mounted, we should recommend that people umount and then mount it when they try to remount with read-write. But the current message and comment recommend that they umount and then remount it, which may be slightly misleading. Signed-off-by: Toshiyuki Okajima Signed-off-by: Jan Kara --- fs/ext3/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7beb69ae001..2043bcc8771 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2669,13 +2669,13 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, - * require a full umount/remount for now. + * require a full umount & mount for now. */ if (es->s_last_orphan) { ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. 
Please " - "umount/remount instead."); + "umount & mount instead."); err = -EINVAL; goto restore_opts; } -- cgit v1.2.3-18-g5258 From 6e3d6ca0bf91bcce0453fff9b597154ff6bb9731 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:32 +0200 Subject: fs/ext2/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ... when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext2/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 8f44cef1b3e..a8cbe1bc6ad 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -421,7 +421,7 @@ static inline int rsv_is_empty(struct ext2_reserve_window *rsv) void ext2_init_block_alloc_info(struct inode *inode) { struct ext2_inode_info *ei = EXT2_I(inode); - struct ext2_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext2_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-18-g5258 From 46130222df8567ffde773216044c7611a1e71d51 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 4 Aug 2011 12:29:31 +0200 Subject: fs/ext3/balloc.c: delete useless initialization Delete nontrivial initialization that is immediately overwritten by the result of an allocation function. The semantic match that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; identifier i; expression e; @@ ( T i = \(0\|NULL\|ERR_PTR(...)\); | -T i = e; +T i; ) ... 
when != i i = \(kzalloc\|kcalloc\|kmalloc\)(...); // Signed-off-by: Julia Lawall Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 6386d76f44a..caedc00520e 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -427,7 +427,7 @@ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); - struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info; + struct ext3_block_alloc_info *block_i; struct super_block *sb = inode->i_sb; block_i = kmalloc(sizeof(*block_i), GFP_NOFS); -- cgit v1.2.3-18-g5258 From fbc854027c91fa2813ae7f9de43cc0b5c1119f41 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 16 Aug 2011 18:08:06 +0200 Subject: ext3: remove deprecated oldalloc For a long time now orlov is the default block allocator in the ext3. It performs better than the old one and no one seems to claim otherwise so we can safely drop it and make oldalloc and orlov mount option deprecated. Signed-off-by: Lukas Czerner Signed-off-by: Jan Kara --- fs/ext3/ialloc.c | 45 +++------------------------------------------ fs/ext3/super.c | 8 ++++---- 2 files changed, 7 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bf09cbf938c..635bd8ce6d5 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -177,42 +177,6 @@ error_return: ext3_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. 
- */ -static int find_group_dir(struct super_block *sb, struct inode *parent) -{ - int ngroups = EXT3_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext3_group_desc *desc, *best_desc = NULL; - int group, best_group = -1; - - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext3_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - best_group = group; - best_desc = desc; - } - } - return best_group; -} - /* * Orlov's allocator for directories. * @@ -436,12 +400,9 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, sbi = EXT3_SB(sb); es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - group = find_group_dir(sb, dir); - else - group = find_group_orlov(sb, dir); - } else + if (S_ISDIR(mode)) + group = find_group_orlov(sb, dir); + else group = find_group_other(sb, dir); err = -ENOSPC; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2043bcc8771..922d289aeeb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -652,8 +652,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouid32"); if (test_opt(sb, DEBUG)) seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT3_FS_XATTR if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,10 +1047,12 @@ static int parse_options (char *options, struct super_block *sb, set_opt (sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring deprecated oldalloc option"); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + ext3_msg(sb, KERN_WARNING, + "Ignoring 
deprecated orlov option"); break; #ifdef CONFIG_EXT3_FS_XATTR case Opt_user_xattr: -- cgit v1.2.3-18-g5258 From d37854cf99319966f34bb19c7a897b87d478b56c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 22 Aug 2011 16:23:56 +0300 Subject: UBIFS: introduce a helper to dump scanning info This commit adds 'dbg_dump_sleb()' helper function to dump scanning information. Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 16 ++++++++++++++++ fs/ubifs/debug.h | 5 +++++ 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a92..b09ba2dd8b6 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 45174b53437..2bf84211e32 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct 
ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, -- cgit v1.2.3-18-g5258 From d27769ec3df1a8de9ca450d2dcd72d1ab259ba32 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Aug 2011 20:01:04 +0200 Subject: block: add GENHD_FL_NO_PART_SCAN There are cases where suppressing partition scan is useful - e.g. for lo devices and pseudo SATA devices which advertise to be a disk but get upset on partition scan (some port multiplier control devices show such behavior). This patch adds GENHD_FL_NO_PART_SCAN which suppresses partition scan regardless of the number of possible partitions. disk_partitionable() is renamed to disk_part_scan_enabled() as suppressing partition scan doesn't imply the device can't be partitioned using BLKPG_ADD/DEL_PARTITION calls from userland. show_partition() now directly tests disk_max_parts() to maintain backward-compatibility. -v2: Updated to make it clear that only partition scan is suppressed not partitioning itself as suggested by Kay Sievers. 
Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index ff77262e887..0bed0d4588d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) if (!bdev->bd_disk) return; - if (disk_partitionable(bdev->bd_disk)) + if (disk_part_scan_enabled(bdev->bd_disk)) bdev->bd_invalidated = 1; } -- cgit v1.2.3-18-g5258 From 7606f85a701ed8feeac065e133ff9a51c267aa0d Mon Sep 17 00:00:00 2001 From: srimugunthan dhandapani Date: Fri, 26 Aug 2011 16:08:39 +0530 Subject: UBIFS: fix the dark space calculation The dark space calculation should be 64 bit type-casted, when assigning to tmp64 (similar to how total_free is calculated). Overflow will occur for very large flashes. Signed-off-by: srimugunthan Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 2 +- fs/ubifs/sb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d932..ee4f43f4bb9 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption. + * clean_an_unclean_leb - read and write a LEB to remove corruption. 
* @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2..6094c5a5d7a 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); -- cgit v1.2.3-18-g5258 From f32948ddd1179ac0b105ceacc235cfc3f98ebea3 Mon Sep 17 00:00:00 2001 From: Li Haifeng Date: Tue, 30 Aug 2011 17:32:50 +0200 Subject: ext2: fix the outdated comment in ext2_nfs_get_inode() Signed-off-by: Li Haifeng Signed-off-by: Jan Kara --- fs/ext2/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1dd62ed35b8..bd8ac164a3b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -327,10 +327,10 @@ static struct inode *ext2_nfs_get_inode(struct super_block *sb, if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count)) return ERR_PTR(-ESTALE); - /* iget isn't really right if the inode is currently unallocated!! - * ext2_read_inode currently does appropriate checks, but - * it might be "neater" to call ext2_get_inode first and check - * if the inode is valid..... + /* + * ext2_iget isn't quite right if the inode is currently unallocated! + * However ext2_iget currently does appropriate checks to handle stale + * inodes so everything is OK. 
*/ inode = ext2_iget(sb, ino); if (IS_ERR(inode)) -- cgit v1.2.3-18-g5258 From 1cd9f0976aa4606db8d6e3dc3edd0aca8019372a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:54:51 -0400 Subject: ext2,ext3,ext4: don't inherit APPEND_FL or IMMUTABLE_FL for new inodes This doesn't make much sense, and it exposes a bug in the kernel where attempts to create a new file in an append-only directory using O_CREAT will fail (but still leave a zero-length file). This was discovered when xfstests #79 was generalized so it could run on all file systems. Signed-off-by: "Theodore Ts'o" Cc:stable@kernel.org --- fs/ext4/ext4.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7d7bd0f066..5c38120c389 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -358,8 +358,7 @@ struct flex_groups { /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ - EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) -- cgit v1.2.3-18-g5258 From 84ebd795613488992b273220c2937d575d27d2a9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 11:56:51 -0400 Subject: ext4: fake direct I/O mode for data=journal Currently attempts to open a file with O_DIRECT in data=journal mode causes the open to fail with -EINVAL. This makes it very hard to test data=journal mode. So we will let the open succeed, but then always fall back to O_DSYNC buffered writes. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18d2558b762..b84f127c085 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2854,6 +2854,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; + /* + * If we are doing data journalling we don't support O_DIRECT + */ + if (ext4_should_journal_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -2923,6 +2929,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; -- cgit v1.2.3-18-g5258 From bcaa992975041e40449be8c010c26192b8c8b409 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 11:58:51 -0400 Subject: ext4: ext4_rename should dirty dir_bh with the correct directory When ext4_rename performs a directory rename (move), dir_bh is a buffer that is modified to update the '..' link in the directory being moved (old_inode). However, ext4_handle_dirty_metadata is called with the old parent directory inode (old_dir) and dir_bh, which is incorrect because dir_bh does not belong to the parent inode. Fix this error. Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8068c7bae9..09f930b7a78 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2529,7 +2529,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); + retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; -- cgit v1.2.3-18-g5258 From f9287c1f2d329f4d78a3bbc9cf0db0ebae6f146a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Aug 2011 12:00:51 -0400 Subject: ext4: ext4_mkdir should dirty dir_block with newly created directory inode ext4_mkdir calls ext4_handle_dirty_metadata with dir_block and the inode "dir". Unfortunately, dir_block belongs to the newly created directory (which is "inode"), not the parent directory (which is "dir"). Fix the incorrect association. Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 09f930b7a78..f0abe432313 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1862,7 +1862,7 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, dir_block); + err = ext4_handle_dirty_metadata(handle, inode, dir_block); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); -- cgit v1.2.3-18-g5258 From 5930ea643805feb50a2f8383ae12eb6f10935e49 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 31 Aug 2011 12:02:51 -0400 Subject: ext4: call ext4_handle_dirty_metadata with correct inode in ext4_dx_add_entry ext4_dx_add_entry manipulates bh2 and frames[0].bh, which are two buffer_heads that point to directory blocks assigned to the directory inode. However, the function calls ext4_handle_dirty_metadata with the inode of the file that's being added to the directory, not the directory inode itself. Therefore, correct the code to dirty the directory buffers with the directory inode, not the file inode. Signed-off-by: Darrick J. 
Wong Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f0abe432313..a067835bbac 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1585,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, inode, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1611,7 +1611,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); + err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; -- cgit v1.2.3-18-g5258 From 4e96b2dbbf1d7e81f22047a50f862555a6cb87cb Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:51:09 -0400 Subject: ext4: Add new ext4_discard_partial_page_buffers routines This patch adds two new routines: ext4_discard_partial_page_buffers and ext4_discard_partial_page_buffers_no_lock. The ext4_discard_partial_page_buffers routine is a wrapper function to ext4_discard_partial_page_buffers_no_lock. The wrapper function locks the page and passes it to ext4_discard_partial_page_buffers_no_lock. Calling functions that already have the page locked can call ext4_discard_partial_page_buffers_no_lock directly. The ext4_discard_partial_page_buffers_no_lock function zeroes a specified range in a page, and unmaps the corresponding buffer heads. Only block-aligned regions of the page will have their buffer heads unmapped. Non-block-aligned regions will be mapped if needed so that they can be updated with the partial zero-out. 
This function is meant to be used to update a page and its buffer heads to be zeroed and unmapped when the corresponding blocks have been released or will be released. This routine is used in the following scenarios: * A hole is punched and the non page aligned regions of the head and tail of the hole need to be discarded * The file is truncated and the partial page beyond EOF needs to be discarded * The end of a hole is in the same page as EOF. After the page is flushed, the partial page beyond EOF needs to be discarded. * A write operation begins or ends inside a hole and the partial page appearing before or after the write needs to be discarded * A write operation extends EOF and the partial page beyond EOF needs to be discarded This function takes a flag EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED which is used when a write operation begins or ends in a hole. When the EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED flag is used, only buffer heads that are already unmapped will have the corresponding regions of the page zeroed. 
Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 +++ fs/ext4/inode.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 235 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5c38120c389..ccfa81f33bb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -528,6 +528,11 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +/* + * Flags used by ext4_discard_partial_page_buffers + */ +#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 + /* * ioctl commands */ @@ -1838,6 +1843,12 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b84f127c085..d1b1ef71e5b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2966,6 +2966,230 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_journalled_aops; } + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. 
+ */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. Non-block-aligned regions + * will have the corresponding buffer head mapped if needed so that + * that region of the page can be updated with the partial zero out. + * + * This function assumes that the page has already been locked. + * The range to be discarded must be contained within the given page. + * If the specified range exceeds the end of the page it will be shortened + * to the end of the page that corresponds to 'from'. This function is + * appropriate for updating a page and its buffer heads to be unmapped and + * zeroed for blocks that have been either released, or are going to be + * released. + * + * handle: The journal handle + * inode: The file's inode + * page: A locked page that contains the offset "from" + * from: The starting byte offset (from the beginning of the file) + * to begin discarding + * length: The length of bytes to discard + * flags: Optional flags that may be used: + * + * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED + * Only zero the regions of the page whose buffer heads + * have already been unmapped. This flag is appropriate + * for updating the contents of a page whose blocks may + * have already been released, and we only want to zero + * out the regions that correspond to those released blocks. 
+ * + * Returns zero on success or negative on failure. + */ +int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned int offset = from & (PAGE_CACHE_SIZE-1); + unsigned int blocksize, max, pos; + unsigned int end_of_block, range_to_discard; + ext4_lblk_t iblock; + struct buffer_head *bh; + int err = 0; + + blocksize = inode->i_sb->s_blocksize; + max = PAGE_CACHE_SIZE - offset; + + if (index != page->index) + return -EINVAL; + + /* + * correct length if it does not fall between + * 'from' and the end of the page + */ + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) { + /* + * If the range to be discarded covers a partial block + * we need to get the page buffers. This is because + * partial blocks cannot be released and the page needs + * to be updated with the contents of the block before + * we write the zeros on top of it. 
+ */ + if ((from & (blocksize - 1)) || + ((from + length) & (blocksize - 1))) { + create_empty_buffers(page, blocksize, 0); + } else { + /* + * If there are no partial blocks, + * there is nothing to update, + * so we can return now + */ + return 0; + } + } + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + pos = offset; + while (pos < offset + length) { + err = 0; + + /* The length of space left to zero and unmap */ + range_to_discard = offset + length - pos; + + /* The length of space until the end of the block */ + end_of_block = blocksize - (pos & (blocksize-1)); + + /* + * Do not unmap or zero past end of block + * for this buffer head + */ + if (range_to_discard > end_of_block) + range_to_discard = end_of_block; + + + /* + * Skip this buffer head if we are only zeroing unmapped + * regions of the page + */ + if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && + buffer_mapped(bh)) + goto next; + + /* If the range is block aligned, unmap */ + if (range_to_discard == blocksize) { + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); + clear_buffer_uptodate(bh); + zero_user(page, pos, range_to_discard); + BUFFER_TRACE(bh, "Buffer discarded"); + goto next; + } + + /* + * If this block is not completely contained in the range + * to be discarded, then it is not going to be released. Because + * we need to keep this block, we need to make sure this part + * of the page is uptodate before we modify it by writing + * partial zeros on it. + */ + if (!buffer_mapped(bh)) { + /* + * Buffer head must be mapped before we can read + * from the block + */ + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? 
It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto next; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt.*/ + if (!buffer_uptodate(bh)) + goto next; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto next; + } + + zero_user(page, pos, range_to_discard); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + if (ext4_should_order_data(inode) && + EXT4_I(inode)->jinode) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + + BUFFER_TRACE(bh, "Partial buffer zeroed"); +next: + bh = bh->b_this_page; + iblock++; + pos += range_to_discard; + } + + return err; +} + /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. -- cgit v1.2.3-18-g5258 From ba06208a1315ab2d2217e09c79582b886c9f629e Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:55:59 -0400 Subject: ext4: fix xfstests 75, 112, 127 punch hole failure This patch addresses a bug found by xfstests 75, 112, 127 when blocksize = 1k This bug happens because the punch hole code only zeros out non block aligned regions of the page. This means that if the blocks are smaller than a page, then the block aligned regions of the page inside the hole are left un-zeroed, and their buffer heads are still mapped. 
This bug is corrected by using ext4_discard_partial_page_buffers to properly zero the partial page at the head and tail of the hole, and unmap the corresponding buffer heads. This patch also addresses a bug reported by Lukas while working on a new patch to add discard support for loop devices using punch hole. The bug happened because the first and last block numbers needed to be cast to a larger data type before calculating the byte offset, but since now we only need the byte offsets of the pages, we no longer even need to be calculating the byte offsets of the blocks. The code to do the block offset calculations is removed in this patch. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 61 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 57cf568a98a..18f7e04a4fa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4162,17 +4162,14 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) struct address_space *mapping = inode->i_mapping; struct ext4_map_blocks map; handle_t *handle; - loff_t first_block_offset, last_block_offset, block_len; - loff_t first_page, last_page, first_page_offset, last_page_offset; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); - last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4211,24 +4208,44 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) goto out; /* - * Now we need to zero out the un block aligned data. 
- * If the file is smaller than a block, just - * zero out the middle + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. */ - if (first_block > last_block) - ext4_block_zero_page_range(handle, mapping, offset, length); - else { - /* zero out the head of the hole before the first block */ - block_len = first_block_offset - offset; - if (block_len > 0) - ext4_block_zero_page_range(handle, mapping, - offset, block_len); - - /* zero out the tail of the hole after the last block */ - block_len = offset + length - last_block_offset; - if (block_len > 0) { - ext4_block_zero_page_range(handle, mapping, - last_block_offset, block_len); + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; } } -- cgit v1.2.3-18-g5258 From 2be4751b21ae1cacb002da48cfc5bf6743fee8c1 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sat, 3 Sep 2011 11:56:52 -0400 Subject: ext4: fix 2nd xfstests 127 punch hole failure This patch fixes a second punch hole bug found by xfstests 127. This bug happens because punch hole needs to flush the pages of the hole to avoid race conditions. 
But if the end of the hole is in the same page as i_size, the buffer heads beyond i_size need to be unmapped and the page needs to be zeroed after it is flushed. To correct this, the new ext4_discard_partial_page_buffers routine is used to zero and unmap the partial page beyond i_size if the end of the hole appears in the same page as i_size. The code has also been optimized to set the end of the hole to the page after i_size if the specified hole exceeds i_size, and the code that flushes the pages has been simplified. Signed-off-by: Allison Henderson --- fs/ext4/extents.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 18f7e04a4fa..9124cd24e09 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4166,6 +4166,20 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) loff_t first_page_offset, last_page_offset; int ret, credits, blocks_released, err = 0; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4182,11 +4196,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) */ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { err = filemap_write_and_wait_range(mapping, - first_page_offset == 0 ? 
0 : first_page_offset-1, - last_page_offset); + offset, offset + length - 1); - if (err) - return err; + if (err) + return err; } /* Now release the pages */ @@ -4249,6 +4262,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) } } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; + } + } + /* If there are no blocks to remove, return now */ if (first_block >= last_block) goto out; -- cgit v1.2.3-18-g5258 From 56889787cfa77dfd96f0b3a3e6a4f26c2e4a5134 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Sep 2011 18:22:38 -0400 Subject: ext4: improve handling of conflicting mount options If the user explicitly specifies conflicting mount options for delalloc or dioread_nolock and data=journal, fail the mount, instead of printing a warning and continuing (since many users won't look at dmesg and notice the warning). Also, print a single warning that data=journal implies that delayed allocation is not on by default (since it's not supported), and furthermore that O_DIRECT is not supported. Improve the text in Documentation/filesystems/ext4.txt so this is clear there as well. Similarly, if the dioread_nolock mount option is specified when the file system block size != PAGE_SIZE, fail the mount instead of printing a warning message and ignoring the mount option. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ fs/ext4/super.c | 50 +++++++++++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ccfa81f33bb..48ae98819d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -922,6 +922,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 44d0c8db223..ee2f74a7084 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1801,6 +1801,7 @@ set_qf_format: break; case Opt_nodelalloc: clear_opt(sb, DELALLOC); + clear_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_mblk_io_submit: set_opt(sb, MBLK_IO_SUBMIT); @@ -1817,6 +1818,7 @@ set_qf_format: break; case Opt_delalloc: set_opt(sb, DELALLOC); + set_opt2(sb, EXPLICIT_DELALLOC); break; case Opt_block_validity: set_opt(sb, BLOCK_VALIDITY); @@ -3224,6 +3226,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) &journal_ioprio, NULL, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << 
le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3265,8 +3294,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -3679,25 +3706,6 @@ no_journal: "available"); } - if (test_opt(sb, DELALLOC) && - (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " - "requested data journaling mode"); - clear_opt(sb, DELALLOC); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - requested data journaling mode"); - clear_opt(sb, DIOREAD_NOLOCK); - } - if (sb->s_blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " - "option - block size is too small"); - clear_opt(sb, DIOREAD_NOLOCK); - } - } - err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " -- cgit v1.2.3-18-g5258 From 9ea7a0df63630ad8197716cd313ea66e28906fc0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 4 Sep 2011 10:18:14 -0400 Subject: jbd2: add debugging information to jbd2_journal_dirty_metadata() Add debugging information in case jbd2_journal_dirty_metadata() is called with a buffer_head which didn't have jbd2_journal_get_write_access() called on it, or if the journal_head has the wrong transaction in it. In addition, return an error code. This won't change anything for ocfs2, which will BUG_ON() the non-zero exit code. 
For ext4, the caller of this function is ext4_handle_dirty_metadata(), and on seeing a non-zero return code, will call __ext4_journal_stop(), which will print the function and line number of the (buggy) calling function and abort the journal. This will allow us to recover instead of bug halting, which is better from a robustness and reliability point of view. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 8 ++++--- fs/ext4/extents.c | 10 ++++++--- fs/jbd2/transaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index f5240aa1560..aca17901758 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -109,9 +109,11 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (err) { + /* Errors can only happen if there is a bug */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } } else { if (inode) mark_buffer_dirty_inode(bh, inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9124cd24e09..2c5216a8d03 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -96,13 +96,17 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; if (path->p_bh) { /* path points to block */ - err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else 
{ /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2d7109414cd..cb56fe9aaab 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1049,6 +1049,10 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, * mark dirty metadata which needs to be journaled as part of the current * transaction. * + * The buffer must have previously had jbd2_journal_get_write_access() + * called so that it has a valid journal_head attached to the buffer + * head. + * * The buffer is placed on the transaction's metadata list and is marked * as belonging to the transaction. * @@ -1065,11 +1069,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; struct journal_head *jh = bh2jh(bh); + int ret = 0; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; + if (!buffer_jbd(bh)) { + ret = -EUCLEAN; + goto out; + } jbd_lock_bh_state(bh); @@ -1093,8 +1102,20 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) { JBUFFER_TRACE(jh, "fastpath"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_running_transaction); + if (unlikely(jh->b_transaction != + journal->j_running_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_running_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_running_transaction, + journal->j_running_transaction ? 
+ journal->j_running_transaction->t_tid : 0); + ret = -EINVAL; + } goto out_unlock_bh; } @@ -1108,9 +1129,32 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + if (unlikely(jh->b_transaction != + journal->j_committing_transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_transaction (%llu, %p, %u) != " + "journal->j_committing_transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_transaction, + jh->b_transaction ? jh->b_transaction->t_tid : 0, + journal->j_committing_transaction, + journal->j_committing_transaction ? + journal->j_committing_transaction->t_tid : 0); + ret = -EINVAL; + } + if (unlikely(jh->b_next_transaction != transaction)) { + printk(KERN_EMERG "JBD: %s: " + "jh->b_next_transaction (%llu, %p, %u) != " + "transaction (%p, %u)", + journal->j_devname, + (unsigned long long) bh->b_blocknr, + jh->b_next_transaction, + jh->b_next_transaction ? + jh->b_next_transaction->t_tid : 0, + transaction, transaction->t_tid); + ret = -EINVAL; + } /* And this case is illegal: we can't reuse another * transaction's data buffer, ever. 
*/ goto out_unlock_bh; @@ -1127,7 +1171,9 @@ out_unlock_bh: jbd_unlock_bh_state(bh); out: JBUFFER_TRACE(jh, "exit"); - return 0; + if (ret) + __WARN(); /* All errors are bugs, so dump the stack */ + return ret; } /* -- cgit v1.2.3-18-g5258 From d2159fb7b8bac12684aabdf41d84b56da9f5c062 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 4 Sep 2011 10:20:14 -0400 Subject: jbd2: use gfp_t instead of int This silences some Sparse warnings: fs/jbd2/transaction.c:135:69: warning: incorrect type in argument 2 (different base types) fs/jbd2/transaction.c:135:69: expected restricted gfp_t [usertype] flags fs/jbd2/transaction.c:135:69: got int [signed] gfp_mask Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cb56fe9aaab..b01fd610408 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -115,7 +115,7 @@ static inline void update_t_max_wait(transaction_t *transaction, */ static int start_this_handle(journal_t *journal, handle_t *handle, - int gfp_mask) + gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; tid_t tid; @@ -320,7 +320,7 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -443,7 +443,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; -- cgit v1.2.3-18-g5258 From decbd919f4bb9cb6