diff options
Diffstat (limited to 'fs/ubifs')
34 files changed, 7360 insertions, 4414 deletions
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5b..ba66d508006 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -11,18 +11,12 @@ config UBIFS_FS help UBIFS is a file system for flash devices which works on top of UBI. -config UBIFS_FS_XATTR - bool "Extended attributes support" - depends on UBIFS_FS - help - This option enables support of extended attributes. - config UBIFS_FS_ADVANCED_COMPR bool "Advanced compression options" depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, - are enabled in UBIFS. Removing compressors means inbility to read + are enabled in UBIFS. Removing compressors means inability to read existing file systems. If unsure, say 'N'. @@ -32,7 +26,7 @@ config UBIFS_FS_LZO depends on UBIFS_FS default y help - LZO compressor is generally faster then zlib but compresses worse. + LZO compressor is generally faster than zlib but compresses worse. Say 'Y' if unsure. config UBIFS_FS_ZLIB @@ -40,33 +34,4 @@ config UBIFS_FS_ZLIB depends on UBIFS_FS default y help - Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. - -# Debugging-related stuff -config UBIFS_FS_DEBUG - bool "Enable debugging" - depends on UBIFS_FS - select DEBUG_FS - select KALLSYMS_ALL - help - This option enables UBIFS debugging. - -config UBIFS_FS_DEBUG_MSG_LVL - int "Default message level (0 = no extra messages, 3 = lots)" - depends on UBIFS_FS_DEBUG - default "0" - help - This controls the amount of debugging messages produced by UBIFS. - If reporting bugs, please try to have available a full dump of the - messages at level 1 while the misbehaviour was occurring. Level 2 - may become necessary if level 1 messages were not enough to find the - bug. Generally Level 3 should be avoided. - -config UBIFS_FS_DEBUG_CHKS - bool "Enable extra checks" - depends on UBIFS_FS_DEBUG - help - If extra checks are enabled UBIFS will check the consistency of its - internal data structures during operation. However, UBIFS performance - is dramatically slower when this option is selected especially if the - file system is large. + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 80e93c35e49..2c6f0cb816b 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -3,7 +3,4 @@ obj-$(CONFIG_UBIFS_FS) += ubifs.o ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o -ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o - -ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o -ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d81fb9ed2b8..eb997e9c4ab 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -32,18 +32,15 @@ #include "ubifs.h" #include <linux/writeback.h> -#include <asm/div64.h> +#include <linux/math64.h> /* * When pessimistic budget calculations say that there is no enough space, * UBIFS starts writing back dirty inodes and pages, doing garbage collection, - * or committing. The below constants define maximum number of times UBIFS + * or committing. The below constant defines maximum number of times UBIFS * repeats the operations. */ -#define MAX_SHRINK_RETRIES 8 -#define MAX_GC_RETRIES 4 -#define MAX_CMT_RETRIES 2 -#define MAX_NOSPC_RETRIES 1 +#define MAX_MKSPC_RETRIES 3 /* * The below constant defines amount of dirty pages which should be written @@ -52,73 +49,24 @@ #define NR_TO_WRITE 16 /** - * struct retries_info - information about re-tries while making free space. - * @prev_liability: previous liability - * @shrink_cnt: how many times the liability was shrinked - * @shrink_retries: count of liability shrink re-tries (increased when - * liability does not shrink) - * @try_gc: GC should be tried first - * @gc_retries: how many times GC was run - * @cmt_retries: how many times commit has been done - * @nospc_retries: how many times GC returned %-ENOSPC - * - * Since we consider budgeting to be the fast-path, and this structure has to - * be allocated on stack and zeroed out, we make it smaller using bit-fields. - */ -struct retries_info { - long long prev_liability; - unsigned int shrink_cnt; - unsigned int shrink_retries:5; - unsigned int try_gc:1; - unsigned int gc_retries:4; - unsigned int cmt_retries:3; - unsigned int nospc_retries:1; -}; - -/** * shrink_liability - write-back some dirty pages/inodes. * @c: UBIFS file-system description object * @nr_to_write: how many dirty pages to write-back * * This function shrinks UBIFS liability by means of writing back some amount - * of dirty inodes and their pages. Returns the amount of pages which were - * written back. The returned value does not include dirty inodes which were - * synchronized. + * of dirty inodes and their pages. * * Note, this function synchronizes even VFS inodes which are locked * (@i_mutex) by the caller of the budgeting function, because write-back does * not touch @i_mutex. */ -static int shrink_liability(struct ubifs_info *c, int nr_to_write) +static void shrink_liability(struct ubifs_info *c, int nr_to_write) { - int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - - if (!nr_written) { - /* - * Re-try again but wait on pages/inodes which are being - * written-back concurrently (e.g., by pdflush). - */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - } - - dbg_budg("%d pages were written back", nr_written); - return nr_written; + down_read(&c->vfs_sb->s_umount); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); + up_read(&c->vfs_sb->s_umount); } - /** * run_gc - run garbage collector. * @c: UBIFS file-system description object @@ -147,13 +95,29 @@ static int run_gc(struct ubifs_info *c) } /** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** * make_free_space - make more free space on the file-system. * @c: UBIFS file-system description object - * @ri: information about previous invocations of this function * * This function is called when an operation cannot be budgeted because there * is supposedly no free space. But in most cases there is some free space: - * o budgeting is pessimistic, so it always budgets more then it is actually + * o budgeting is pessimistic, so it always budgets more than it is actually * needed, so shrinking the liability is one way to make free space - the * cached data will take less space then it was budgeted for; * o GC may turn some dark space into free space (budgeting treats dark space @@ -165,129 +129,74 @@ static int run_gc(struct ubifs_info *c) * Returns %-ENOSPC if it couldn't do more free space, and other negative error * codes on failures. */ -static int make_free_space(struct ubifs_info *c, struct retries_info *ri) +static int make_free_space(struct ubifs_info *c) { - int err; - - /* - * If we have some dirty pages and inodes (liability), try to write - * them back unless this was tried too many times without effect - * already. - */ - if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { - long long liability; + int err, retries = 0; + long long liab1, liab2; - spin_lock(&c->space_lock); - liability = c->budg_idx_growth + c->budg_data_growth + - c->budg_dd_growth; - spin_unlock(&c->space_lock); - - if (ri->prev_liability >= liability) { - /* Liability does not shrink, next time try GC then */ - ri->shrink_retries += 1; - if (ri->gc_retries < MAX_GC_RETRIES) - ri->try_gc = 1; - dbg_budg("liability did not shrink: retries %d of %d", - ri->shrink_retries, MAX_SHRINK_RETRIES); - } - - dbg_budg("force write-back (count %d)", ri->shrink_cnt); - shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); - ri->prev_liability = liability; - ri->shrink_cnt += 1; - return -EAGAIN; - } + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; - /* - * Try to run garbage collector unless it was already tried too many - * times. - */ - if (ri->gc_retries < MAX_GC_RETRIES) { - ri->gc_retries += 1; - dbg_budg("run GC, retries %d of %d", - ri->gc_retries, MAX_GC_RETRIES); + dbg_budg("new liability %lld (not shrunk)", liab2); - ri->try_gc = 0; + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); err = run_gc(c); if (!err) return -EAGAIN; - if (err == -EAGAIN) { - dbg_budg("GC asked to commit"); - err = ubifs_run_commit(c); - if (err) - return err; - return -EAGAIN; - } - - if (err != -ENOSPC) + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ return err; - /* - * GC could not make any progress. If this is the first time, - * then it makes sense to try to commit, because it might make - * some dirty space. - */ - dbg_budg("GC returned -ENOSPC, retries %d", - ri->nospc_retries); - if (ri->nospc_retries >= MAX_NOSPC_RETRIES) - return err; - ri->nospc_retries += 1; - } - - /* Neither GC nor write-back helped, try to commit */ - if (ri->cmt_retries < MAX_CMT_RETRIES) { - ri->cmt_retries += 1; - dbg_budg("run commit, retries %d of %d", - ri->cmt_retries, MAX_CMT_RETRIES); + dbg_budg("Run commit (retries %d)", retries); err = ubifs_run_commit(c); if (err) return err; - return -EAGAIN; - } + } while (retries++ < MAX_MKSPC_RETRIES); + return -ENOSPC; } /** - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. * @c: UBIFS file-system description object * - * This function calculates and returns the number of eraseblocks which should - * be kept for index usage. + * This function calculates and returns the number of LEBs which should be kept + * for index usage. */ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int ret; - uint64_t idx_size; - - idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - - /* And make sure we have twice the index size of space reserved */ - idx_size <<= 1; + int idx_lebs; + long long idx_size; + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; + /* And make sure we have thrice the index size of space reserved */ + idx_size += idx_size << 1; /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. */ - if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) - ret = idx_size + 1; - else - ret = idx_size; + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. */ - ret += 1; - /* - * At present the index needs at least 2 LEBs: one for the index head - * and one for in-the-gaps method (which currently does not cater for - * the index head and so excludes it from consideration). - */ - if (ret < 2) - ret = 2; - return ret; + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; } /** @@ -302,18 +211,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) int subtract_lebs; long long available; - /* - * Force the amount available to the total size reported if the used - * space is zero. - */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && - c->budg_data_growth + c->budg_dd_growth == 0) { - /* Do the same calculation as for c->block_cnt */ - available = c->main_lebs - 2; - available *= c->leb_size - c->dark_wm; - return available; - } - available = c->main_bytes - c->lst.total_used; /* @@ -375,8 +272,8 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) */ static int can_use_rp(struct ubifs_info *c) { - if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || - (c->rp_gid != 0 && in_group_p(c->rp_gid))) + if (uid_eq(current_fsuid(), c->rp_uid) || capable(CAP_SYS_RESOURCE) || + (!gid_eq(c->rp_gid, GLOBAL_ROOT_GID) && in_group_p(c->rp_gid))) return 1; return 0; } @@ -385,23 +282,23 @@ static int can_use_rp(struct ubifs_info *c) * do_budget_space - reserve flash space for index and data growth. * @c: UBIFS file-system description object * - * This function makes sure UBIFS has enough free eraseblocks for index growth - * and data. + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. * - * When budgeting index space, UBIFS reserves twice as more LEBs as the index + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of - * budgeted index space to the size of the current index, multiplies this by 2, - * and makes sure this does not exceed the amount of free eraseblocks. + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free LEBs. * - * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be - * consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -426,31 +323,32 @@ static int do_budget_space(struct ubifs_info *c) * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - * @c->lst.taken_empty_lebs * - * @empty_lebs are available because they are empty. @freeable_cnt are - * available because they contain only free and dirty space and the - * index allocation always occurs after wbufs are synch'ed. - * @idx_gc_cnt are available because they are index LEBs that have been - * garbage collected (including trivial GC) and are awaiting the commit - * before they can be unmapped - note that the in-the-gaps method will - * grab these if it needs them. @taken_empty_lebs are empty_lebs that - * have already been allocated for some purpose (also includes those - * LEBs on the @idx_gc list). + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. + * + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). * - * Note, @taken_empty_lebs may temporarily be higher by one because of - * the way we serialize LEB allocations and budgeting. See a comment in - * 'ubifs_find_free_space()'. + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. */ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; if (unlikely(rsvd_idx_lebs > lebs)) { - dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " - "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, - rsvd_idx_lebs); + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d", + min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs); return -ENOSPC; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; + outstanding = c->bi.data_growth + c->bi.dd_growth; if (unlikely(available < outstanding)) { dbg_budg("out of data space: available %lld, outstanding %lld", @@ -461,7 +359,7 @@ static int do_budget_space(struct ubifs_info *c) if (available - outstanding <= c->rp_size && !can_use_rp(c)) return -ENOSPC; - c->min_idx_lebs = min_idx_lebs; + c->bi.min_idx_lebs = min_idx_lebs; return 0; } @@ -494,11 +392,11 @@ static int calc_data_growth(const struct ubifs_info *c, { int data_growth; - data_growth = req->new_ino ? c->inode_budget : 0; + data_growth = req->new_ino ? c->bi.inode_budget : 0; if (req->new_page) - data_growth += c->page_budget; + data_growth += c->bi.page_budget; if (req->new_dent) - data_growth += c->dent_budget; + data_growth += c->bi.dent_budget; data_growth += req->new_ino_d; return data_growth; } @@ -514,12 +412,12 @@ static int calc_dd_growth(const struct ubifs_info *c, { int dd_growth; - dd_growth = req->dirtied_page ? c->page_budget : 0; + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; if (req->dirtied_ino) - dd_growth += c->inode_budget << (req->dirtied_ino - 1); + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); if (req->mod_dent) - dd_growth += c->dent_budget; + dd_growth += c->bi.dent_budget; dd_growth += req->dirtied_ino_d; return dd_growth; } @@ -539,35 +437,40 @@ static int calc_dd_growth(const struct ubifs_info *c, */ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) { - int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); - int err, idx_growth, data_growth, dd_growth; - struct retries_info ri; - + int err, idx_growth, data_growth, dd_growth, retried = 0; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); data_growth = calc_data_growth(c, req); dd_growth = calc_dd_growth(c, req); if (!data_growth && !dd_growth) return 0; idx_growth = calc_idx_growth(c, req); - memset(&ri, 0, sizeof(struct retries_info)); again: spin_lock(&c->space_lock); - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->budg_dd_growth >= 0); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); - if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { dbg_budg("no space"); spin_unlock(&c->space_lock); return -ENOSPC; } - c->budg_idx_growth += idx_growth; - c->budg_data_growth += data_growth; - c->budg_dd_growth += dd_growth; + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; err = do_budget_space(c); if (likely(!err)) { @@ -579,9 +482,9 @@ again: } /* Restore the old values */ - c->budg_idx_growth -= idx_growth; - c->budg_data_growth -= data_growth; - c->budg_dd_growth -= dd_growth; + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; spin_unlock(&c->space_lock); if (req->fast) { @@ -589,16 +492,21 @@ again: return err; } - err = make_free_space(c, &ri); + err = make_free_space(c); + cond_resched(); if (err == -EAGAIN) { dbg_budg("try again"); - cond_resched(); goto again; } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } dbg_budg("FS is full, -ENOSPC"); - c->nospace = 1; + c->bi.nospace = 1; if (can_use_rp(c) || c->rp_size == 0) - c->nospace_rp = 1; + c->bi.nospace_rp = 1; smp_wmb(); } else ubifs_err("cannot budget space, error %d", err); @@ -613,13 +521,21 @@ again: * This function releases the space budgeted by 'ubifs_budget_space()'. Note, * since the index changes (which were budgeted for in @req->idx_growth) will * only be written to the media on commit, this function moves the index budget - * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be - * zeroed by the commit operation. + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. */ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) { + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); if (!req->recalculate) { ubifs_assert(req->idx_growth >= 0); ubifs_assert(req->data_growth >= 0); @@ -635,19 +551,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) if (!req->data_growth && !req->dd_growth) return; - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); spin_lock(&c->space_lock); - c->budg_idx_growth -= req->idx_growth; - c->budg_uncommitted_idx += req->idx_growth; - c->budg_data_growth -= req->data_growth; - c->budg_dd_growth -= req->dd_growth; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); - - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->min_idx_lebs < c->main_lebs); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); spin_unlock(&c->space_lock); } @@ -656,7 +576,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) * @c: UBIFS file-system description object * * This function converts budget which was allocated for a new page of data to - * the budget of changing an existing page of data. The latter is smaller then + * the budget of changing an existing page of data. The latter is smaller than * the former, so this function only does simple re-calculation and does not * involve any write-back. */ @@ -664,13 +584,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c) { spin_lock(&c->space_lock); /* Release the index growth reservation */ - c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; /* Release the data growth reservation */ - c->budg_data_growth -= c->page_budget; + c->bi.data_growth -= c->bi.page_budget; /* Increase the dirty data growth reservation instead */ - c->budg_dd_growth += c->page_budget; + c->bi.dd_growth += c->bi.page_budget; /* And re-calculate the indexing space reservation */ - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); } @@ -681,47 +601,108 @@ void ubifs_convert_page_budget(struct ubifs_info *c) * * This function releases budget corresponding to a dirty inode. It is usually * called when after the inode has been written to the media and marked as - * clean. + * clean. It also causes the "no space" flags to be cleared. */ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_inode *ui) { - struct ubifs_budget_req req = {.dd_growth = c->inode_budget, - .dirtied_ino_d = ui->data_len}; + struct ubifs_budget_req req; + memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, long long free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reserves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + return div_u64(free, divisor); +} + +/** + * ubifs_get_free_space_nolock - return amount of free space. * @c: UBIFS file-system description object * - * This function returns amount of free space on the file-system. + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space_nolock(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs; + int rsvd_idx_lebs, lebs; long long available, outstanding, free; - /* Do exactly the same calculations as in 'do_budget_space()' */ - spin_lock(&c->space_lock); - min_idx_lebs = ubifs_calc_min_idx_lebs(c); + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; - - if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - - c->lst.taken_empty_lebs) { - spin_unlock(&c->space_lock); - return 0; - } - - available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; - c->min_idx_lebs = min_idx_lebs; - spin_unlock(&c->space_lock); + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); if (available > outstanding) free = ubifs_reported_space(c, available - outstanding); @@ -729,3 +710,21 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c) free = 0; return free; } + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and returns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 3b516316c9b..ff8229340cd 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -45,8 +45,59 @@ #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/slab.h> #include "ubifs.h" +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && ubifs_zn_dirty(c->zroot.znode)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + /** * do_commit - commit the journal. * @c: UBIFS file-system description object @@ -62,11 +113,19 @@ static int do_commit(struct ubifs_info *c) struct ubifs_lp_stats lst; dbg_cmt("start"); - if (c->ro_media) { + ubifs_assert(!c->ro_media && !c->ro_mount); + + if (c->ro_error) { err = -EROFS; goto out_up; } + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + /* Sync all write buffers (necessary for recovery) */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); @@ -74,6 +133,7 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + c->cmt_no += 1; err = ubifs_gc_start_commit(c); if (err) goto out_up; @@ -115,14 +175,14 @@ static int do_commit(struct ubifs_info *c) goto out; mutex_lock(&c->mst_mutex); - c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); c->mst_node->root_offs = cpu_to_le32(zroot.offs); c->mst_node->root_len = cpu_to_le32(zroot.len); c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); - c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); @@ -158,12 +218,12 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; +out_cancel: spin_lock(&c->cs_lock); c->cmt_state = COMMIT_RESTING; wake_up(&c->cmt_wq); dbg_cmt("commit end"); spin_unlock(&c->cs_lock); - return 0; out_up: @@ -268,7 +328,7 @@ int ubifs_bg_thread(void *info) cond_resched(); } - dbg_msg("background thread \"%s\" stops", c->bgt_name); + ubifs_msg("background thread \"%s\" stops", c->bgt_name); return 0; } @@ -358,7 +418,7 @@ int ubifs_run_commit(struct ubifs_info *c) spin_lock(&c->cs_lock); if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out; } @@ -384,7 +444,7 @@ int ubifs_run_commit(struct ubifs_info *c) * re-check it. */ if (c->cmt_state == COMMIT_BROKEN) { - err = -EINVAL; + err = -EROFS; goto out_cmt_unlock; } @@ -436,7 +496,9 @@ int ubifs_gc_should_commit(struct ubifs_info *c) return ret; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * struct idx_node - hold index nodes during index tree traversal. @@ -452,7 +514,7 @@ struct idx_node { struct list_head list; int iip; union ubifs_key upper_key; - struct ubifs_idx_node idx __attribute__((aligned(8))); + struct ubifs_idx_node idx __aligned(8); }; /** @@ -469,12 +531,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) { struct ubifs_idx_node *idx; int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; - c->old_zroot = *zroot; - - lnum = c->old_zroot.lnum; - offs = c->old_zroot.offs; - len = c->old_zroot.len; + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); if (!idx) @@ -484,8 +546,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) if (err) goto out; - c->old_zroot_level = le16_to_cpu(idx->level); - c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); out: kfree(idx); return err; @@ -508,15 +570,16 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) { int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; int first = 1, iip; - union ubifs_key lower_key, upper_key, l_key, u_key; + struct ubifs_debug_info *d = c->dbg; + union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key; unsigned long long uninitialized_var(last_sqnum); struct ubifs_idx_node *idx; struct list_head list; struct idx_node *i; size_t sz; - if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) - goto out; + if (!dbg_is_chk_index(c)) + return 0; INIT_LIST_HEAD(&list); @@ -524,9 +587,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) UBIFS_IDX_NODE_SZ; /* Start at the old zroot */ - lnum = c->old_zroot.lnum; - offs = c->old_zroot.offs; - len = c->old_zroot.len; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; iip = 0; /* @@ -559,11 +622,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) if (first) { first = 0; /* Check root level and sqnum */ - if (le16_to_cpu(idx->level) != c->old_zroot_level) { + if (le16_to_cpu(idx->level) != d->old_zroot_level) { err = 2; goto out_dump; } - if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { err = 3; goto out_dump; } @@ -653,14 +716,14 @@ out: return 0; out_dump: - dbg_err("dumping index node (iip=%d)", i->iip); - dbg_dump_node(c, idx); + ubifs_err("dumping index node (iip=%d)", i->iip); + ubifs_dump_node(c, idx); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); - dbg_err("dumping parent index node"); - dbg_dump_node(c, &i->idx); + ubifs_err("dumping parent index node"); + ubifs_dump_node(c, &i->idx); } out_free: while (!list_empty(&list)) { @@ -673,5 +736,3 @@ out_free: err = -EINVAL; return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 5bb51dac3c1..2bfa0953335 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -33,7 +33,7 @@ /* Fake description object for the "none" compressor */ static struct ubifs_compressor none_compr = { .compr_type = UBIFS_COMPR_NONE, - .name = "no compression", + .name = "none", .capi_name = "", }; @@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex); static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, .comp_mutex = &lzo_mutex, - .name = "LZO", + .name = "lzo", .capi_name = "lzo", }; #else static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, - .name = "LZO", + .name = "lzo", }; #endif @@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; * * Note, if the input buffer was not compressed, it is copied to the output * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. - * - * This functions returns %0 on success or a negative error code on failure. */ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type) @@ -110,21 +108,20 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, if (compr->comp_mutex) mutex_lock(compr->comp_mutex); err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, - out_len); + (unsigned int *)out_len); if (compr->comp_mutex) mutex_unlock(compr->comp_mutex); if (unlikely(err)) { - ubifs_warn("cannot compress %d bytes, compressor %s, " - "error %d, leave data uncompressed", + ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", in_len, compr->name, err); goto no_compr; } /* - * Presently, we just require that compression results in less data, - * rather than any defined minimum compression ratio or amount. + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. */ - if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) goto no_compr; return; @@ -174,12 +171,12 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, if (compr->decomp_mutex) mutex_lock(compr->decomp_mutex); err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, - out_len); + (unsigned int *)out_len); if (compr->decomp_mutex) mutex_unlock(compr->decomp_mutex); if (err) - ubifs_err("cannot decompress %d bytes, compressor %s, " - "error %d", in_len, compr->name, err); + ubifs_err("cannot decompress %d bytes, compressor %s, error %d", + in_len, compr->name, err); return err; } @@ -246,7 +243,7 @@ out_lzo: /** * ubifs_compressors_exit - de-initialize UBIFS compressors. */ -void __exit ubifs_compressors_exit(void) +void ubifs_compressors_exit(void) { compr_exit(&lzo_compr); compr_exit(&zlib_compr); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4e3aaeba4ec..177b0152fef 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -27,30 +27,14 @@ * various local functions of those subsystems. */ -#define UBIFS_DBG_PRESERVE_UBI - -#include "ubifs.h" #include <linux/module.h> -#include <linux/moduleparam.h> - -#ifdef CONFIG_UBIFS_FS_DEBUG - -DEFINE_SPINLOCK(dbg_lock); - -static char dbg_key_buf0[128]; -static char dbg_key_buf1[128]; - -unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; -unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; -unsigned int ubifs_tst_flags; - -module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); -module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); -module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); +#include <linux/debugfs.h> +#include <linux/math64.h> +#include <linux/uaccess.h> +#include <linux/random.h> +#include "ubifs.h" -MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); -MODULE_PARM_DESC(debug_chks, "Debug check flags"); -MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); +static DEFINE_SPINLOCK(dbg_lock); static const char *get_key_fmt(int fmt) { @@ -92,8 +76,30 @@ static const char *get_key_type(int type) } } -static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, - char *buffer) +static const char *get_dent_type(int type) +{ + switch (type) { + case UBIFS_ITYPE_REG: + return "file"; + case UBIFS_ITYPE_DIR: + return "dir"; + case UBIFS_ITYPE_LNK: + return "symlink"; + case UBIFS_ITYPE_BLK: + return "blkdev"; + case UBIFS_ITYPE_CHR: + return "char dev"; + case UBIFS_ITYPE_FIFO: + return "fifo"; + case UBIFS_ITYPE_SOCK: + return "socket"; + default: + return "unknown/invalid type"; + } +} + +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len) { char *p = buffer; int type = key_type(c, key); @@ -101,42 +107,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { switch (type) { case UBIFS_INO_KEY: - sprintf(p, "(%lu, %s)", key_inum(c, key), - get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; case UBIFS_DENT_KEY: case UBIFS_XENT_KEY: - sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key), - get_key_type(type), key_hash(c, key)); + len -= snprintf(p, len, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); break; case UBIFS_DATA_KEY: - sprintf(p, "(%lu, %s, %u)", key_inum(c, key), - get_key_type(type), key_block(c, key)); + len -= snprintf(p, len, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); break; case UBIFS_TRUN_KEY: - sprintf(p, "(%lu, %s)", - key_inum(c, key), get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; default: - sprintf(p, "(bad key type: %#08x, %#08x)", - key->u32[0], key->u32[1]); + len -= snprintf(p, len, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); } } else - sprintf(p, "bad key format %d", c->key_fmt); -} - -const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf0); - return dbg_key_buf0; -} - -const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf1); - return dbg_key_buf1; + len -= snprintf(p, len, "bad key format %d", c->key_fmt); + ubifs_assert(len > 0); + return p; } const char *dbg_ntype(int type) @@ -205,62 +203,112 @@ const char *dbg_cstate(int cmt_state) } } +const char *dbg_jhead(int jhead) +{ + switch (jhead) { + case GCHD: + return "0 (GC)"; + case BASEHD: + return "1 (base)"; + case DATAHD: + return "2 (data)"; + default: + return "unknown journal head"; + } +} + static void dump_ch(const struct ubifs_ch *ch) { - printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); - printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); - printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + pr_err("\tmagic %#x\n", le32_to_cpu(ch->magic)); + pr_err("\tcrc %#x\n", le32_to_cpu(ch->crc)); + pr_err("\tnode_type %d (%s)\n", ch->node_type, dbg_ntype(ch->node_type)); - printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + pr_err("\tgroup_type %d (%s)\n", ch->group_type, dbg_gtype(ch->group_type)); - printk(KERN_DEBUG "\tsqnum %llu\n", + pr_err("\tsqnum %llu\n", (unsigned long long)le64_to_cpu(ch->sqnum)); - printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); + pr_err("\tlen %u\n", le32_to_cpu(ch->len)); } -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); + struct qstr nm = { .name = NULL }; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + int count = 2; - printk(KERN_DEBUG "inode %lu\n", inode->i_ino); - printk(KERN_DEBUG "size %llu\n", + pr_err("Dump in-memory inode:"); + pr_err("\tinode %lu\n", inode->i_ino); + pr_err("\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); - printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); - printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); - printk(KERN_DEBUG "atime %u.%u\n", + pr_err("\tnlink %u\n", inode->i_nlink); + pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode)); + pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode)); + pr_err("\tatime %u.%u\n", (unsigned int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_DEBUG "mtime %u.%u\n", + pr_err("\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_DEBUG "ctime %u.%u\n", + pr_err("\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); - printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); - printk(KERN_DEBUG "dirty %u\n", ui->dirty); - printk(KERN_DEBUG "xattr %u\n", ui->xattr); - printk(KERN_DEBUG "flags %d\n", ui->flags); - printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); - printk(KERN_DEBUG "data_len %d\n", ui->data_len); + pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); + pr_err("\txattr_size %u\n", ui->xattr_size); + pr_err("\txattr_cnt %u\n", ui->xattr_cnt); + pr_err("\txattr_names %u\n", ui->xattr_names); + pr_err("\tdirty %u\n", ui->dirty); + pr_err("\txattr %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + pr_err("\tui_size %llu\n", + (unsigned long long)ui->ui_size); + pr_err("\tflags %d\n", ui->flags); + pr_err("\tcompr_type %d\n", ui->compr_type); + pr_err("\tlast_page_read %lu\n", ui->last_page_read); + pr_err("\tread_in_a_row %lu\n", ui->read_in_a_row); + pr_err("\tdata_len %d\n", ui->data_len); + + if (!S_ISDIR(inode->i_mode)) + return; + + pr_err("List of directory entries:\n"); + ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); + + lowest_dent_key(c, &key, inode->i_ino); + while (1) { + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + if (PTR_ERR(dent) != -ENOENT) + pr_err("error %ld\n", PTR_ERR(dent)); + break; + } + + pr_err("\t%d: %s (%s)\n", + count++, dent->name, get_dent_type(dent->type)); + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); } -void dbg_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node) { int i, n; union ubifs_key key; const struct ubifs_ch *ch = node; - - if (dbg_failure_mode) - return; + char key_buf[DBG_KEY_BUF_LEN]; /* If the magic is incorrect, just hexdump the first bytes */ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { - printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1, (void *)node, UBIFS_CH_SZ, 1); return; } @@ -273,8 +321,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_pad_node *pad = node; - printk(KERN_DEBUG "\tpad_len %u\n", - le32_to_cpu(pad->pad_len)); + pr_err("\tpad_len %u\n", le32_to_cpu(pad->pad_len)); break; } case UBIFS_SB_NODE: @@ -282,115 +329,77 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_sb_node *sup = node; unsigned int sup_flags = le32_to_cpu(sup->flags); - printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + pr_err("\tkey_hash %d (%s)\n", (int)sup->key_hash, get_key_hash(sup->key_hash)); - printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + pr_err("\tkey_fmt %d (%s)\n", (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); - printk(KERN_DEBUG "\tflags %#x\n", sup_flags); - printk(KERN_DEBUG "\t big_lpt %u\n", + pr_err("\tflags %#x\n", sup_flags); + pr_err("\t big_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); - printk(KERN_DEBUG "\tmin_io_size %u\n", - le32_to_cpu(sup->min_io_size)); - printk(KERN_DEBUG "\tleb_size %u\n", - le32_to_cpu(sup->leb_size)); - printk(KERN_DEBUG "\tleb_cnt %u\n", - le32_to_cpu(sup->leb_cnt)); - printk(KERN_DEBUG "\tmax_leb_cnt %u\n", - le32_to_cpu(sup->max_leb_cnt)); - printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + pr_err("\t space_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); + pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); + pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(sup->leb_cnt)); + pr_err("\tmax_leb_cnt %u\n", le32_to_cpu(sup->max_leb_cnt)); + pr_err("\tmax_bud_bytes %llu\n", (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); - printk(KERN_DEBUG "\tlog_lebs %u\n", - le32_to_cpu(sup->log_lebs)); - printk(KERN_DEBUG "\tlpt_lebs %u\n", - le32_to_cpu(sup->lpt_lebs)); - printk(KERN_DEBUG "\torph_lebs %u\n", - le32_to_cpu(sup->orph_lebs)); - printk(KERN_DEBUG "\tjhead_cnt %u\n", - le32_to_cpu(sup->jhead_cnt)); - printk(KERN_DEBUG "\tfanout %u\n", - le32_to_cpu(sup->fanout)); - printk(KERN_DEBUG "\tlsave_cnt %u\n", - le32_to_cpu(sup->lsave_cnt)); - printk(KERN_DEBUG "\tdefault_compr %u\n", + pr_err("\tlog_lebs %u\n", le32_to_cpu(sup->log_lebs)); + pr_err("\tlpt_lebs %u\n", le32_to_cpu(sup->lpt_lebs)); + pr_err("\torph_lebs %u\n", le32_to_cpu(sup->orph_lebs)); + pr_err("\tjhead_cnt %u\n", le32_to_cpu(sup->jhead_cnt)); + pr_err("\tfanout %u\n", le32_to_cpu(sup->fanout)); + pr_err("\tlsave_cnt %u\n", le32_to_cpu(sup->lsave_cnt)); + pr_err("\tdefault_compr %u\n", (int)le16_to_cpu(sup->default_compr)); - printk(KERN_DEBUG "\trp_size %llu\n", + pr_err("\trp_size %llu\n", (unsigned long long)le64_to_cpu(sup->rp_size)); - printk(KERN_DEBUG "\trp_uid %u\n", - le32_to_cpu(sup->rp_uid)); - printk(KERN_DEBUG "\trp_gid %u\n", - le32_to_cpu(sup->rp_gid)); - printk(KERN_DEBUG "\tfmt_version %u\n", - le32_to_cpu(sup->fmt_version)); - printk(KERN_DEBUG "\ttime_gran %u\n", - le32_to_cpu(sup->time_gran)); - printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", - sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], - sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], - sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], - sup->uuid[12], sup->uuid[13], sup->uuid[14], - sup->uuid[15]); + pr_err("\trp_uid %u\n", le32_to_cpu(sup->rp_uid)); + pr_err("\trp_gid %u\n", le32_to_cpu(sup->rp_gid)); + pr_err("\tfmt_version %u\n", le32_to_cpu(sup->fmt_version)); + pr_err("\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); + pr_err("\tUUID %pUB\n", sup->uuid); break; } case UBIFS_MST_NODE: { const struct ubifs_mst_node *mst = node; - printk(KERN_DEBUG "\thighest_inum %llu\n", + pr_err("\thighest_inum %llu\n", (unsigned long long)le64_to_cpu(mst->highest_inum)); - printk(KERN_DEBUG "\tcommit number %llu\n", + pr_err("\tcommit number %llu\n", (unsigned long long)le64_to_cpu(mst->cmt_no)); - printk(KERN_DEBUG "\tflags %#x\n", - le32_to_cpu(mst->flags)); - printk(KERN_DEBUG "\tlog_lnum %u\n", - le32_to_cpu(mst->log_lnum)); - printk(KERN_DEBUG "\troot_lnum %u\n", - le32_to_cpu(mst->root_lnum)); - printk(KERN_DEBUG "\troot_offs %u\n", - le32_to_cpu(mst->root_offs)); - printk(KERN_DEBUG "\troot_len %u\n", - le32_to_cpu(mst->root_len)); - printk(KERN_DEBUG "\tgc_lnum %u\n", - le32_to_cpu(mst->gc_lnum)); - printk(KERN_DEBUG "\tihead_lnum %u\n", - le32_to_cpu(mst->ihead_lnum)); - printk(KERN_DEBUG "\tihead_offs %u\n", - le32_to_cpu(mst->ihead_offs)); - printk(KERN_DEBUG "\tindex_size %u\n", - le32_to_cpu(mst->index_size)); - printk(KERN_DEBUG "\tlpt_lnum %u\n", - le32_to_cpu(mst->lpt_lnum)); - printk(KERN_DEBUG "\tlpt_offs %u\n", - le32_to_cpu(mst->lpt_offs)); - printk(KERN_DEBUG "\tnhead_lnum %u\n", - le32_to_cpu(mst->nhead_lnum)); - printk(KERN_DEBUG "\tnhead_offs %u\n", - le32_to_cpu(mst->nhead_offs)); - printk(KERN_DEBUG "\tltab_lnum %u\n", - le32_to_cpu(mst->ltab_lnum)); - printk(KERN_DEBUG "\tltab_offs %u\n", - le32_to_cpu(mst->ltab_offs)); - printk(KERN_DEBUG "\tlsave_lnum %u\n", - le32_to_cpu(mst->lsave_lnum)); - printk(KERN_DEBUG "\tlsave_offs %u\n", - le32_to_cpu(mst->lsave_offs)); - printk(KERN_DEBUG "\tlscan_lnum %u\n", - le32_to_cpu(mst->lscan_lnum)); - printk(KERN_DEBUG "\tleb_cnt %u\n", - le32_to_cpu(mst->leb_cnt)); - printk(KERN_DEBUG "\tempty_lebs %u\n", - le32_to_cpu(mst->empty_lebs)); - printk(KERN_DEBUG "\tidx_lebs %u\n", - le32_to_cpu(mst->idx_lebs)); - printk(KERN_DEBUG "\ttotal_free %llu\n", + pr_err("\tflags %#x\n", le32_to_cpu(mst->flags)); + pr_err("\tlog_lnum %u\n", le32_to_cpu(mst->log_lnum)); + pr_err("\troot_lnum %u\n", le32_to_cpu(mst->root_lnum)); + pr_err("\troot_offs %u\n", le32_to_cpu(mst->root_offs)); + pr_err("\troot_len %u\n", le32_to_cpu(mst->root_len)); + pr_err("\tgc_lnum %u\n", le32_to_cpu(mst->gc_lnum)); + pr_err("\tihead_lnum %u\n", le32_to_cpu(mst->ihead_lnum)); + pr_err("\tihead_offs %u\n", le32_to_cpu(mst->ihead_offs)); + pr_err("\tindex_size %llu\n", + (unsigned long long)le64_to_cpu(mst->index_size)); + pr_err("\tlpt_lnum %u\n", le32_to_cpu(mst->lpt_lnum)); + pr_err("\tlpt_offs %u\n", le32_to_cpu(mst->lpt_offs)); + pr_err("\tnhead_lnum %u\n", le32_to_cpu(mst->nhead_lnum)); + pr_err("\tnhead_offs %u\n", le32_to_cpu(mst->nhead_offs)); + pr_err("\tltab_lnum %u\n", le32_to_cpu(mst->ltab_lnum)); + pr_err("\tltab_offs %u\n", le32_to_cpu(mst->ltab_offs)); + pr_err("\tlsave_lnum %u\n", le32_to_cpu(mst->lsave_lnum)); + pr_err("\tlsave_offs %u\n", le32_to_cpu(mst->lsave_offs)); + pr_err("\tlscan_lnum %u\n", le32_to_cpu(mst->lscan_lnum)); + pr_err("\tleb_cnt %u\n", le32_to_cpu(mst->leb_cnt)); + pr_err("\tempty_lebs %u\n", le32_to_cpu(mst->empty_lebs)); + pr_err("\tidx_lebs %u\n", le32_to_cpu(mst->idx_lebs)); + pr_err("\ttotal_free %llu\n", (unsigned long long)le64_to_cpu(mst->total_free)); - printk(KERN_DEBUG "\ttotal_dirty %llu\n", + pr_err("\ttotal_dirty %llu\n", (unsigned long long)le64_to_cpu(mst->total_dirty)); - printk(KERN_DEBUG "\ttotal_used %llu\n", + pr_err("\ttotal_used %llu\n", (unsigned long long)le64_to_cpu(mst->total_used)); - printk(KERN_DEBUG "\ttotal_dead %llu\n", + pr_err("\ttotal_dead %llu\n", (unsigned long long)le64_to_cpu(mst->total_dead)); - printk(KERN_DEBUG "\ttotal_dark %llu\n", + pr_err("\ttotal_dark %llu\n", (unsigned long long)le64_to_cpu(mst->total_dark)); break; } @@ -398,12 +407,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_ref_node *ref = node; - printk(KERN_DEBUG "\tlnum %u\n", - le32_to_cpu(ref->lnum)); - printk(KERN_DEBUG "\toffs %u\n", - le32_to_cpu(ref->offs)); - printk(KERN_DEBUG "\tjhead %u\n", - le32_to_cpu(ref->jhead)); + pr_err("\tlnum %u\n", le32_to_cpu(ref->lnum)); + pr_err("\toffs %u\n", le32_to_cpu(ref->offs)); + pr_err("\tjhead %u\n", le32_to_cpu(ref->jhead)); break; } case UBIFS_INO_NODE: @@ -411,40 +417,32 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ino_node *ino = node; key_read(c, &ino->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); - printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tcreat_sqnum %llu\n", (unsigned long long)le64_to_cpu(ino->creat_sqnum)); - printk(KERN_DEBUG "\tsize %llu\n", + pr_err("\tsize %llu\n", (unsigned long long)le64_to_cpu(ino->size)); - printk(KERN_DEBUG "\tnlink %u\n", - le32_to_cpu(ino->nlink)); - printk(KERN_DEBUG "\tatime %lld.%u\n", + pr_err("\tnlink %u\n", le32_to_cpu(ino->nlink)); + pr_err("\tatime %lld.%u\n", (long long)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); - printk(KERN_DEBUG "\tmtime %lld.%u\n", + pr_err("\tmtime %lld.%u\n", (long long)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); - printk(KERN_DEBUG "\tctime %lld.%u\n", + pr_err("\tctime %lld.%u\n", (long long)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); - printk(KERN_DEBUG "\tuid %u\n", - le32_to_cpu(ino->uid)); - printk(KERN_DEBUG "\tgid %u\n", - le32_to_cpu(ino->gid)); - printk(KERN_DEBUG "\tmode %u\n", - le32_to_cpu(ino->mode)); - printk(KERN_DEBUG "\tflags %#x\n", - le32_to_cpu(ino->flags)); - printk(KERN_DEBUG "\txattr_cnt %u\n", - le32_to_cpu(ino->xattr_cnt)); - printk(KERN_DEBUG "\txattr_size %u\n", - le32_to_cpu(ino->xattr_size)); - printk(KERN_DEBUG "\txattr_names %u\n", - le32_to_cpu(ino->xattr_names)); - printk(KERN_DEBUG "\tcompr_type %#x\n", + pr_err("\tuid %u\n", le32_to_cpu(ino->uid)); + pr_err("\tgid %u\n", le32_to_cpu(ino->gid)); + pr_err("\tmode %u\n", le32_to_cpu(ino->mode)); + pr_err("\tflags %#x\n", le32_to_cpu(ino->flags)); + pr_err("\txattr_cnt %u\n", le32_to_cpu(ino->xattr_cnt)); + pr_err("\txattr_size %u\n", le32_to_cpu(ino->xattr_size)); + pr_err("\txattr_names %u\n", le32_to_cpu(ino->xattr_names)); + pr_err("\tcompr_type %#x\n", (int)le16_to_cpu(ino->compr_type)); - printk(KERN_DEBUG "\tdata len %u\n", - le32_to_cpu(ino->data_len)); + pr_err("\tdata len %u\n", le32_to_cpu(ino->data_len)); break; } case UBIFS_DENT_NODE: @@ -454,21 +452,21 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int nlen = le16_to_cpu(dent->nlen); key_read(c, &dent->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); - printk(KERN_DEBUG "\tinum %llu\n", + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tinum %llu\n", (unsigned long long)le64_to_cpu(dent->inum)); - printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); - printk(KERN_DEBUG "\tnlen %d\n", nlen); - printk(KERN_DEBUG "\tname "); + pr_err("\ttype %d\n", (int)dent->type); + pr_err("\tnlen %d\n", nlen); + pr_err("\tname "); if (nlen > UBIFS_MAX_NLEN) - printk(KERN_DEBUG "(bad name length, not printing, " - "bad or corrupted node)"); + pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk("%c", dent->name[i]); + pr_cont("%c", dent->name[i]); } - printk("\n"); + pr_cont("\n"); break; } @@ -478,15 +476,14 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); - printk(KERN_DEBUG "\tsize %u\n", - le32_to_cpu(dn->size)); - printk(KERN_DEBUG "\tcompr_typ %d\n", + pr_err("\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + pr_err("\tsize %u\n", le32_to_cpu(dn->size)); + pr_err("\tcompr_typ %d\n", (int)le16_to_cpu(dn->compr_type)); - printk(KERN_DEBUG "\tdata size %d\n", - dlen); - printk(KERN_DEBUG "\tdata:\n"); - print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + pr_err("\tdata size %d\n", dlen); + pr_err("\tdata:\n"); + print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, (void *)&dn->data, dlen, 0); break; } @@ -494,11 +491,10 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_trun_node *trun = node; - printk(KERN_DEBUG "\tinum %u\n", - le32_to_cpu(trun->inum)); - printk(KERN_DEBUG "\told_size %llu\n", + pr_err("\tinum %u\n", le32_to_cpu(trun->inum)); + pr_err("\told_size %llu\n", (unsigned long long)le64_to_cpu(trun->old_size)); - printk(KERN_DEBUG "\tnew_size %llu\n", + pr_err("\tnew_size %llu\n", (unsigned long long)le64_to_cpu(trun->new_size)); break; } @@ -507,19 +503,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_idx_node *idx = node; n = le16_to_cpu(idx->child_cnt); - printk(KERN_DEBUG "\tchild_cnt %d\n", n); - printk(KERN_DEBUG "\tlevel %d\n", - (int)le16_to_cpu(idx->level)); - printk(KERN_DEBUG "\tBranches:\n"); + pr_err("\tchild_cnt %d\n", n); + pr_err("\tlevel %d\n", (int)le16_to_cpu(idx->level)); + pr_err("\tBranches:\n"); for (i = 0; i < n && i < c->fanout - 1; i++) { const struct ubifs_branch *br; br = ubifs_idx_branch(c, idx, i); key_read(c, &br->key, &key); - printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + pr_err("\t%d: LEB %d:%d len %d key %s\n", i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), - le32_to_cpu(br->len), DBGKEY(&key)); + le32_to_cpu(br->len), + dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); } break; } @@ -529,159 +526,333 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) { const struct ubifs_orph_node *orph = node; - printk(KERN_DEBUG "\tcommit number %llu\n", + pr_err("\tcommit number %llu\n", (unsigned long long) le64_to_cpu(orph->cmt_no) & LLONG_MAX); - printk(KERN_DEBUG "\tlast node flag %llu\n", + pr_err("\tlast node flag %llu\n", (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; - printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + pr_err("\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) - printk(KERN_DEBUG "\t ino %llu\n", - le64_to_cpu(orph->inos[i])); + pr_err("\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: - printk(KERN_DEBUG "node type %d was not recognized\n", + pr_err("node type %d was not recognized\n", (int)ch->node_type); } spin_unlock(&dbg_lock); } -void dbg_dump_budget_req(const struct ubifs_budget_req *req) +void ubifs_dump_budget_req(const struct ubifs_budget_req *req) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n", req->new_ino, req->dirtied_ino); - printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + pr_err("\tnew_ino_d %d, dirtied_ino_d %d\n", req->new_ino_d, req->dirtied_ino_d); - printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + pr_err("\tnew_page %d, dirtied_page %d\n", req->new_page, req->dirtied_page); - printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + pr_err("\tnew_dent %d, mod_dent %d\n", req->new_dent, req->mod_dent); - printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); - printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + pr_err("\tidx_growth %d\n", req->idx_growth); + pr_err("\tdata_growth %d dd_growth %d\n", req->data_growth, req->dd_growth); spin_unlock(&dbg_lock); } -void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", - lst->empty_lebs, lst->idx_lebs); - printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " - "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, - lst->total_dirty); - printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " - "total_dead %lld\n", lst->total_used, lst->total_dark, - lst->total_dead); + pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs %d\n", + current->pid, lst->empty_lebs, lst->idx_lebs); + pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n", + lst->taken_empty_lebs, lst->total_free, lst->total_dirty); + pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n", + lst->total_used, lst->total_dark, lst->total_dead); spin_unlock(&dbg_lock); } -void dbg_dump_budg(struct ubifs_info *c) +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) { int i; struct rb_node *rb; struct ubifs_bud *bud; struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + spin_lock(&c->space_lock); spin_lock(&dbg_lock); - printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " - "budg_dd_growth %lld, budg_idx_growth %lld\n", - c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); - printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " - "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, - c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, - c->freeable_cnt); - printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " - "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, - c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); - printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " - "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n", + current->pid, bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n", + bi->data_growth, bi->dd_growth, bi->idx_growth); + pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n", + bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx); + pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, bi->dent_budget); + pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp); + pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); + pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n", + atomic_long_read(&c->dirty_pg_cnt), atomic_long_read(&c->dirty_zn_cnt), atomic_long_read(&c->clean_zn_cnt)); - printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", - c->dark_wm, c->dead_wm, c->max_idx_node_sz); - printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", - c->gc_lnum, c->ihead_lnum); - for (i = 0; i < c->jhead_cnt; i++) - printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", - c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); + + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + pr_err("\tjhead %s\t LEB %d\n", + dbg_jhead(c->jheads[i].wbuf.jhead), + c->jheads[i].wbuf.lnum); for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { bud = rb_entry(rb, struct ubifs_bud, rb); - printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + pr_err("\tbud LEB %d\n", bud->lnum); } list_for_each_entry(bud, &c->old_buds, list) - printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + pr_err("\told bud LEB %d\n", bud->lnum); list_for_each_entry(idx_gc, &c->idx_gc, list) - printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + pr_err("\tGC'ed idx LEB %d unmap %d\n", idx_gc->lnum, idx_gc->unmap); - printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + pr_err("\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; + free = ubifs_get_free_space_nolock(c); + pr_err("Budgeting predictions:\n"); + pr_err("\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); +out_unlock: spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); } -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { - printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " - "flags %#x\n", lp->lnum, lp->free, lp->dirty, - c->leb_size - lp->free - lp->dirty, lp->flags); + int i, spc, dark = 0, dead = 0; + struct rb_node *rb; + struct ubifs_bud *bud; + + spc = lp->free + lp->dirty; + if (spc < c->dead_wm) + dead = spc; + else + dark = ubifs_calc_dark(c, spc); + + if (lp->flags & LPROPS_INDEX) + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + lp->flags); + else + pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (", + lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc, + dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags); + + if (lp->flags & LPROPS_TAKEN) { + if (lp->flags & LPROPS_INDEX) + pr_cont("index, taken"); + else + pr_cont("taken"); + } else { + const char *s; + + if (lp->flags & LPROPS_INDEX) { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_DIRTY_IDX: + s = "dirty index"; + break; + case LPROPS_FRDI_IDX: + s = "freeable index"; + break; + default: + s = "index"; + } + } else { + switch (lp->flags & LPROPS_CAT_MASK) { + case LPROPS_UNCAT: + s = "not categorized"; + break; + case LPROPS_DIRTY: + s = "dirty"; + break; + case LPROPS_FREE: + s = "free"; + break; + case LPROPS_EMPTY: + s = "empty"; + break; + case LPROPS_FREEABLE: + s = "freeable"; + break; + default: + s = NULL; + break; + } + } + pr_cont("%s", s); + } + + for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + if (bud->lnum == lp->lnum) { + int head = 0; + for (i = 0; i < c->jhead_cnt; i++) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { + pr_cont(", jhead %s", dbg_jhead(i)); + head = 1; + } + } + if (!head) + pr_cont(", bud of jhead %s", + dbg_jhead(bud->jhead)); + } + } + if (lp->lnum == c->gc_lnum) + pr_cont(", GC LEB"); + pr_cont(")\n"); } -void dbg_dump_lprops(struct ubifs_info *c) +void ubifs_dump_lprops(struct ubifs_info *c) { int lnum, err; struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_DEBUG "Dumping LEB properties\n"); + pr_err("(pid %d) start dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); - dbg_dump_lstats(&lst); + ubifs_dump_lstats(&lst); for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); - if (err) + if (err) { ubifs_err("cannot read lprops for LEB %d", lnum); + continue; + } - dbg_dump_lprop(c, &lp); + ubifs_dump_lprop(c, &lp); } + pr_err("(pid %d) finish dumping LEB properties\n", current->pid); } -void dbg_dump_leb(const struct ubifs_info *c, int lnum) +void ubifs_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + pr_err("(pid %d) dumping LPT information\n", current->pid); + pr_err("\tlpt_sz: %lld\n", c->lpt_sz); + pr_err("\tpnode_sz: %d\n", c->pnode_sz); + pr_err("\tnnode_sz: %d\n", c->nnode_sz); + pr_err("\tltab_sz: %d\n", c->ltab_sz); + pr_err("\tlsave_sz: %d\n", c->lsave_sz); + pr_err("\tbig_lpt: %d\n", c->big_lpt); + pr_err("\tlpt_hght: %d\n", c->lpt_hght); + pr_err("\tpnode_cnt: %d\n", c->pnode_cnt); + pr_err("\tnnode_cnt: %d\n", c->nnode_cnt); + pr_err("\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + pr_err("\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + pr_err("\tlsave_cnt: %d\n", c->lsave_cnt); + pr_err("\tspace_bits: %d\n", c->space_bits); + pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + pr_err("\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + pr_err("\tpcnt_bits: %d\n", c->pcnt_bits); + pr_err("\tlnum_bits: %d\n", c->lnum_bits); + pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + pr_err("\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + pr_err("\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n", + i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty, + c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + pr_err("Dumping node at LEB %d:%d len %d\n", + sleb->lnum, snod->offs, snod->len); + ubifs_dump_node(c, snod->node); + } +} + +void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; + void *buf; - if (dbg_failure_mode) - return; + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - printk(KERN_DEBUG "Dumping LEB %d\n", lnum); + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); - return; + goto out; } - printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + pr_err("LEB %d has %d nodes ending at %d\n", lnum, sleb->nodes_cnt, sleb->endpt); list_for_each_entry(snod, &sleb->nodes, list) { cond_resched(); - printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, + pr_err("Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); } + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); + +out: + vfree(buf); return; } -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode) +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { int n; const struct ubifs_zbranch *zbr; + char key_buf[DBG_KEY_BUF_LEN]; spin_lock(&dbg_lock); if (znode->parent) @@ -689,109 +860,203 @@ void dbg_dump_znode(const struct ubifs_info *c, else zbr = &c->zroot; - printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" - " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, - zbr->len, znode->parent, znode->iip, znode->level, - znode->child_cnt, znode->flags); + pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n", + znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip, + znode->level, znode->child_cnt, znode->flags); if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { spin_unlock(&dbg_lock); return; } - printk(KERN_DEBUG "zbranches:\n"); + pr_err("zbranches:\n"); for (n = 0; n < znode->child_cnt; n++) { zbr = &znode->zbranch[n]; if (znode->level > 0) - printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " - "%s\n", n, zbr->znode, zbr->lnum, - zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); else - printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " - "%s\n", n, zbr->znode, zbr->lnum, - zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n", + n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, + dbg_snprintf_key(c, &zbr->key, key_buf, + DBG_KEY_BUF_LEN)); } spin_unlock(&dbg_lock); } -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", - cat, heap->cnt); + pr_err("(pid %d) start dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; - printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " - "flags %d\n", i, lprops->lnum, lprops->hpos, - lprops->free, lprops->dirty, lprops->flags); + pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n", + i, lprops->lnum, lprops->hpos, lprops->free, + lprops->dirty, lprops->flags); } + pr_err("(pid %d) finish dumping heap\n", current->pid); } -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip) +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) { int i; - printk(KERN_DEBUG "Dumping pnode:\n"); - printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + pr_err("(pid %d) dumping pnode:\n", current->pid); + pr_err("\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); - printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pr_err("\tflags %lu iip %d level %d num %d\n", pnode->flags, iip, pnode->level, pnode->num); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { struct ubifs_lprops *lp = &pnode->lprops[i]; - printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + pr_err("\t%d: free %d dirty %d flags %d lnum %d\n", i, lp->free, lp->dirty, lp->flags, lp->lnum); } } -void dbg_dump_tnc(struct ubifs_info *c) +void ubifs_dump_tnc(struct ubifs_info *c) { struct ubifs_znode *znode; int level; - printk(KERN_DEBUG "\n"); - printk(KERN_DEBUG "Dumping the TNC tree\n"); + pr_err("\n"); + pr_err("(pid %d) start dumping TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; - printk(KERN_DEBUG "== Level %d ==\n", level); + pr_err("== Level %d ==\n", level); while (znode) { if (level != znode->level) { level = znode->level; - printk(KERN_DEBUG "== Level %d ==\n", level); + pr_err("== Level %d ==\n", level); } - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } - - printk(KERN_DEBUG "\n"); + pr_err("(pid %d) finish dumping TNC tree\n", current->pid); } static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return 0; } /** - * dbg_dump_index - dump the on-flash index. + * ubifs_dump_index - dump the on-flash index. * @c: UBIFS file-system description object * - * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()' * which dumps only in-memory znodes and does not read znodes which from flash. */ -void dbg_dump_index(struct ubifs_info *c) +void ubifs_dump_index(struct ubifs_info *c) { dbg_walk_index(c, NULL, dump_znode, NULL); } /** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + int freeable_cnt; + + spin_lock(&c->space_lock); + memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; + + /* + * We use a dirty hack here and zero out @c->freeable_cnt, because it + * affects the free space calculations, and UBIFS might not know about + * all freeable eraseblocks. Indeed, we know about freeable eraseblocks + * only when we read their lprops, and we do this only lazily, upon the + * need. So at any given point of time @c->freeable_cnt might be not + * exactly accurate. + * + * Just one example about the issue we hit when we did not zero + * @c->freeable_cnt. + * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the + * amount of free space in @d->saved_free + * 2. We re-mount R/W, which makes UBIFS to read the "lsave" + * information from flash, where we cache LEBs from various + * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()' + * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()' + * -> 'ubifs_get_pnode()' -> 'update_cats()' + * -> 'ubifs_add_to_cat()'). + * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt + * becomes %1. + * 4. We calculate the amount of free space when the re-mount is + * finished in 'dbg_check_space_info()' and it does not match + * @d->saved_free. + */ + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + d->saved_free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long free; + int freeable_cnt; + + spin_lock(&c->space_lock); + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; + spin_unlock(&c->space_lock); + + if (free != d->saved_free) { + ubifs_err("free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg("saved lprops statistics dump"); + ubifs_dump_lstats(&d->saved_lst); + ubifs_msg("saved budgeting info dump"); + ubifs_dump_budg(c, &d->saved_bi); + ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); + ubifs_msg("current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); + ubifs_dump_lstats(&lst); + ubifs_msg("current budgeting info dump"); + ubifs_dump_budg(c, &c->bi); + dump_stack(); + return -EINVAL; +} + +/** * dbg_check_synced_i_size - check synchronized inode size. + * @c: UBIFS file-system description object * @inode: inode to check * * If inode is clean, synchronized inode size has to be equivalent to current @@ -799,12 +1064,12 @@ void dbg_dump_index(struct ubifs_info *c) * has to be locked). Returns %0 if synchronized inode size if correct, and * %-EINVAL if not. */ -int dbg_check_synced_i_size(struct inode *inode) +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) { int err = 0; struct ubifs_inode *ui = ubifs_inode(inode); - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISREG(inode->i_mode)) return 0; @@ -812,11 +1077,11 @@ int dbg_check_synced_i_size(struct inode *inode) mutex_lock(&ui->ui_mutex); spin_lock(&ui->ui_lock); if (ui->ui_size != ui->synced_i_size && !ui->dirty) { - ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode " - "is clean", ui->ui_size, ui->synced_i_size); + ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean", + ui->ui_size, ui->synced_i_size); ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); - dbg_dump_stack(); + dump_stack(); err = -EINVAL; } spin_unlock(&ui->ui_lock); @@ -837,7 +1102,7 @@ int dbg_check_synced_i_size(struct inode *inode) * Note, it is good idea to make sure the @dir->i_mutex is locked before * calling this function. */ -int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) { unsigned int nlink = 2; union ubifs_key key; @@ -845,7 +1110,7 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) struct qstr nm = { .name = NULL }; loff_t size = UBIFS_INO_NODE_SZ; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (!S_ISDIR(dir->i_mode)) @@ -875,16 +1140,17 @@ int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) kfree(pdent); if (i_size_read(dir) != size) { - ubifs_err("directory inode %lu has size %llu, " - "but calculated size is %llu", dir->i_ino, - (unsigned long long)i_size_read(dir), + ubifs_err("directory inode %lu has size %llu, but calculated size is %llu", + dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { - ubifs_err("directory inode %lu has nlink %u, but calculated " - "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u", + dir->i_ino, dir->i_nlink, nlink); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } @@ -911,6 +1177,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, int err, nlen1, nlen2, cmp; struct ubifs_dent_node *dent1, *dent2; union ubifs_key key; + char key_buf[DBG_KEY_BUF_LEN]; ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); @@ -940,22 +1207,26 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, err = 1; key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { - dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); - dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr1->key)); - dbg_dump_node(c, dent1); - goto out_free; + ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent1); + goto out_free; } key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { - dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); - dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr2->key)); - dbg_dump_node(c, dent2); - goto out_free; + ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent2); + goto out_free; } nlen1 = le16_to_cpu(dent1->nlen); @@ -967,15 +1238,15 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, goto out_free; } if (cmp == 0 && nlen1 == nlen2) - dbg_err("2 xent/dent nodes with the same name"); + ubifs_err("2 xent/dent nodes with the same name"); else - dbg_err("bad order of colliding key %s", - DBGKEY(&key)); + ubifs_err("bad order of colliding key %s", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); - dbg_dump_node(c, dent1); - dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); - dbg_dump_node(c, dent2); + ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); + ubifs_dump_node(c, dent1); + ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); + ubifs_dump_node(c, dent2); out_free: kfree(dent2); @@ -1086,7 +1357,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) /* * Make sure the last key in our znode is less or - * equivalent than the the key in zbranch which goes + * equivalent than the key in the zbranch which goes * after our pointing zbranch. */ cmp = keys_cmp(c, max, @@ -1178,10 +1449,10 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) out: ubifs_err("failed, error %d", err); ubifs_msg("dump of the znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zp) { ubifs_msg("dump of the parent znode"); - dbg_dump_znode(c, zp); + ubifs_dump_znode(c, zp); } dump_stack(); return -EINVAL; @@ -1201,7 +1472,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) long clean_cnt = 0, dirty_cnt = 0; int err, last; - if (!(ubifs_chk_flags & UBIFS_CHK_TNC)) + if (!dbg_is_chk_index(c)) return 0; ubifs_assert(mutex_is_locked(&c->tnc_mutex)); @@ -1248,9 +1519,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) return err; if (err) { ubifs_msg("first znode"); - dbg_dump_znode(c, prev); + ubifs_dump_znode(c, prev); ubifs_msg("second znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return -EINVAL; } } @@ -1279,7 +1550,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) * @c: UBIFS file-system description object * @leaf_cb: called for each leaf node * @znode_cb: called for each indexing node - * @priv: private date which is passed to callbacks + * @priv: private data which is passed to callbacks * * This function walks the UBIFS index and calls the @leaf_cb for each leaf * node and @znode_cb for each indexing node. Returns zero in case of success @@ -1337,9 +1608,9 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, if (znode_cb) { err = znode_cb(c, znode, priv); if (err) { - ubifs_err("znode checking function returned " - "error %d", err); - dbg_dump_znode(c, znode); + ubifs_err("znode checking function returned error %d", + err); + ubifs_dump_znode(c, znode); goto out_dump; } } @@ -1348,9 +1619,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, zbr = &znode->zbranch[idx]; err = leaf_cb(c, zbr, priv); if (err) { - ubifs_err("leaf checking function " - "returned error %d, for leaf " - "at LEB %d:%d", + ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d", err, zbr->lnum, zbr->offs); goto out_dump; } @@ -1407,7 +1676,7 @@ out_dump: else zbr = &c->zroot; ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); out_unlock: mutex_unlock(&c->tnc_mutex); return err; @@ -1448,7 +1717,7 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) int err; long long calc = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ)) + if (!dbg_is_chk_index(c)) return 0; err = dbg_walk_index(c, NULL, add_size, &calc); @@ -1458,8 +1727,8 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) } if (calc != idx_size) { - ubifs_err("index size check failed: calculated size is %lld, " - "should be %lld", calc, idx_size); + ubifs_err("index size check failed: calculated size is %lld, should be %lld", + calc, idx_size); dump_stack(); return -EINVAL; } @@ -1529,6 +1798,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, struct rb_node **p, *parent = NULL; struct fsck_inode *fscki; ino_t inum = key_inum_flash(c, &ino->key); + struct inode *inode; + struct ubifs_inode *ui; p = &fsckd->inodes.rb_node; while (*p) { @@ -1544,7 +1815,7 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, if (inum > c->highest_inum) { ubifs_err("too high inode number, max. is %lu", - c->highest_inum); + (unsigned long)c->highest_inum); return ERR_PTR(-EINVAL); } @@ -1552,19 +1823,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, if (!fscki) return ERR_PTR(-ENOMEM); + inode = ilookup(c->vfs_sb, inum); + fscki->inum = inum; - fscki->nlink = le32_to_cpu(ino->nlink); - fscki->size = le64_to_cpu(ino->size); - fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); - fscki->xattr_sz = le32_to_cpu(ino->xattr_size); - fscki->xattr_nms = le32_to_cpu(ino->xattr_names); - fscki->mode = le32_to_cpu(ino->mode); + /* + * If the inode is present in the VFS inode cache, use it instead of + * the on-flash inode which might be out-of-date. E.g., the size might + * be out-of-date. If we do not do this, the following may happen, for + * example: + * 1. A power cut happens + * 2. We mount the file-system R/O, the replay process fixes up the + * inode size in the VFS cache, but on on-flash. + * 3. 'check_leaf()' fails because it hits a data node beyond inode + * size. + */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + if (S_ISDIR(fscki->mode)) { fscki->calc_sz = UBIFS_INO_NODE_SZ; fscki->calc_cnt = 2; } + rb_link_node(&fscki->rb, parent, p); rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; } @@ -1623,16 +1921,18 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c, ino_key_init(c, &key, inum); err = ubifs_lookup_level0(c, &key, &znode, &n); if (!err) { - ubifs_err("inode %lu not found in index", inum); + ubifs_err("inode %lu not found in index", (unsigned long)inum); return ERR_PTR(-ENOENT); } else if (err < 0) { - ubifs_err("error %d while looking up inode %lu", err, inum); + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)inum); return ERR_PTR(err); } zbr = &znode->zbranch[n]; if (zbr->len < UBIFS_INO_NODE_SZ) { - ubifs_err("bad node %lu node length %d", inum, zbr->len); + ubifs_err("bad node %lu node length %d", + (unsigned long)inum, zbr->len); return ERR_PTR(-EINVAL); } @@ -1652,7 +1952,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c, kfree(ino); if (IS_ERR(fscki)) { ubifs_err("error %ld while adding inode %lu node", - PTR_ERR(fscki), inum); + PTR_ERR(fscki), (unsigned long)inum); return fscki; } @@ -1740,8 +2040,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing data node and " - "trying to find inode node %lu", err, inum); + ubifs_err("error %d while processing data node and trying to find inode node %lu", + err, (unsigned long)inum); goto out_dump; } @@ -1750,9 +2050,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, blk_offs <<= UBIFS_BLOCK_SHIFT; blk_offs += le32_to_cpu(dn->size); if (blk_offs > fscki->size) { - ubifs_err("data node at LEB %d:%d is not within inode " - "size %lld", zbr->lnum, zbr->offs, - fscki->size); + ubifs_err("data node at LEB %d:%d is not within inode size %lld", + zbr->lnum, zbr->offs, fscki->size); err = -EINVAL; goto out_dump; } @@ -1773,8 +2072,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing entry node and " - "trying to find inode node %lu", err, inum); + ubifs_err("error %d while processing entry node and trying to find inode node %lu", + err, (unsigned long)inum); goto out_dump; } @@ -1784,10 +2083,9 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum_flash(c, &dent->key); fscki1 = read_add_inode(c, priv, inum); if (IS_ERR(fscki1)) { - err = PTR_ERR(fscki); - ubifs_err("error %d while processing entry node and " - "trying to find parent inode node %lu", - err, inum); + err = PTR_ERR(fscki1); + ubifs_err("error %d while processing entry node and trying to find parent inode node %lu", + err, (unsigned long)inum); goto out_dump; } @@ -1810,7 +2108,7 @@ out: out_dump: ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_node(c, node); + ubifs_dump_node(c, node); out_free: kfree(node); return err; @@ -1822,26 +2120,10 @@ out_free: */ static void free_inodes(struct fsck_data *fsckd) { - struct rb_node *this = fsckd->inodes.rb_node; - struct fsck_inode *fscki; + struct fsck_inode *fscki, *n; - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - fscki = rb_entry(this, struct fsck_inode, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &fscki->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(fscki); - } - } + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); } /** @@ -1876,58 +2158,53 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) */ if (fscki->inum != UBIFS_ROOT_INO && fscki->references != 1) { - ubifs_err("directory inode %lu has %d " - "direntries which refer it, but " - "should be 1", fscki->inum, + ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1", + (unsigned long)fscki->inum, fscki->references); goto out_dump; } if (fscki->inum == UBIFS_ROOT_INO && fscki->references != 0) { - ubifs_err("root inode %lu has non-zero (%d) " - "direntries which refer it", - fscki->inum, fscki->references); + ubifs_err("root inode %lu has non-zero (%d) direntries which refer it", + (unsigned long)fscki->inum, + fscki->references); goto out_dump; } if (fscki->calc_sz != fscki->size) { - ubifs_err("directory inode %lu size is %lld, " - "but calculated size is %lld", - fscki->inum, fscki->size, - fscki->calc_sz); + ubifs_err("directory inode %lu size is %lld, but calculated size is %lld", + (unsigned long)fscki->inum, + fscki->size, fscki->calc_sz); goto out_dump; } if (fscki->calc_cnt != fscki->nlink) { - ubifs_err("directory inode %lu nlink is %d, " - "but calculated nlink is %d", - fscki->inum, fscki->nlink, - fscki->calc_cnt); + ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->calc_cnt); goto out_dump; } } else { if (fscki->references != fscki->nlink) { - ubifs_err("inode %lu nlink is %d, but " - "calculated nlink is %d", fscki->inum, + ubifs_err("inode %lu nlink is %d, but calculated nlink is %d", + (unsigned long)fscki->inum, fscki->nlink, fscki->references); goto out_dump; } } if (fscki->xattr_sz != fscki->calc_xsz) { - ubifs_err("inode %lu has xattr size %u, but " - "calculated size is %lld", - fscki->inum, fscki->xattr_sz, + ubifs_err("inode %lu has xattr size %u, but calculated size is %lld", + (unsigned long)fscki->inum, fscki->xattr_sz, fscki->calc_xsz); goto out_dump; } if (fscki->xattr_cnt != fscki->calc_xcnt) { - ubifs_err("inode %lu has %u xattrs, but " - "calculated count is %lld", fscki->inum, + ubifs_err("inode %lu has %u xattrs, but calculated count is %lld", + (unsigned long)fscki->inum, fscki->xattr_cnt, fscki->calc_xcnt); goto out_dump; } if (fscki->xattr_nms != fscki->calc_xnms) { - ubifs_err("inode %lu has xattr names' size %u, but " - "calculated names' size is %lld", - fscki->inum, fscki->xattr_nms, + ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld", + (unsigned long)fscki->inum, fscki->xattr_nms, fscki->calc_xnms); goto out_dump; } @@ -1940,11 +2217,12 @@ out_dump: ino_key_init(c, &key, fscki->inum); err = ubifs_lookup_level0(c, &key, &znode, &n); if (!err) { - ubifs_err("inode %lu not found in index", fscki->inum); + ubifs_err("inode %lu not found in index", + (unsigned long)fscki->inum); return -ENOENT; } else if (err < 0) { ubifs_err("error %d while looking up inode %lu", - err, fscki->inum); + err, (unsigned long)fscki->inum); return err; } @@ -1962,8 +2240,8 @@ out_dump: } ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", - fscki->inum, zbr->lnum, zbr->offs); - dbg_dump_node(c, ino); + (unsigned long)fscki->inum, zbr->lnum, zbr->offs); + ubifs_dump_node(c, ino); kfree(ino); return -EINVAL; } @@ -1986,7 +2264,7 @@ int dbg_check_filesystem(struct ubifs_info *c) int err; struct fsck_data fsckd; - if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + if (!dbg_is_chk_fs(c)) return 0; fsckd.inodes = RB_ROOT; @@ -2008,282 +2286,815 @@ out_free: return err; } -static int invocation_cnt; - -int dbg_force_in_the_gaps(void) +/** + * dbg_check_data_nodes_order - check that list of data nodes is sorted. + * @c: UBIFS file-system description object + * @head: the list of nodes ('struct ubifs_scan_node' objects) + * + * This function returns zero if the list of data nodes is sorted correctly, + * and %-EINVAL if not. + */ +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) { - if (!dbg_force_in_the_gaps_enabled) - return 0; - /* Force in-the-gaps every 8th commit */ - return !((invocation_cnt++) & 0x7); -} + struct list_head *cur; + struct ubifs_scan_node *sa, *sb; -/* Failure mode for recovery testing */ + if (!dbg_is_chk_gen(c)) + return 0; -#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) + for (cur = head->next; cur->next != head; cur = cur->next) { + ino_t inuma, inumb; + uint32_t blka, blkb; -struct failure_mode_info { - struct list_head list; - struct ubifs_info *c; -}; + cond_resched(); + sa = container_of(cur, struct ubifs_scan_node, list); + sb = container_of(cur->next, struct ubifs_scan_node, list); -static LIST_HEAD(fmi_list); -static DEFINE_SPINLOCK(fmi_lock); + if (sa->type != UBIFS_DATA_NODE) { + ubifs_err("bad node type %d", sa->type); + ubifs_dump_node(c, sa->node); + return -EINVAL; + } + if (sb->type != UBIFS_DATA_NODE) { + ubifs_err("bad node type %d", sb->type); + ubifs_dump_node(c, sb->node); + return -EINVAL; + } -static unsigned int next; + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); -static int simple_rand(void) -{ - if (next == 0) - next = current->pid; - next = next * 1103515245 + 12345; - return (next >> 16) & 32767; -} + if (inuma < inumb) + continue; + if (inuma > inumb) { + ubifs_err("larger inum %lu goes before inum %lu", + (unsigned long)inuma, (unsigned long)inumb); + goto error_dump; + } -void dbg_failure_mode_registration(struct ubifs_info *c) -{ - struct failure_mode_info *fmi; + blka = key_block(c, &sa->key); + blkb = key_block(c, &sb->key); - fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); - if (!fmi) { - dbg_err("Failed to register failure mode - no memory"); - return; + if (blka > blkb) { + ubifs_err("larger block %u goes before %u", blka, blkb); + goto error_dump; + } + if (blka == blkb) { + ubifs_err("two data nodes for the same block"); + goto error_dump; + } } - fmi->c = c; - spin_lock(&fmi_lock); - list_add_tail(&fmi->list, &fmi_list); - spin_unlock(&fmi_lock); + + return 0; + +error_dump: + ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sb->node); + return -EINVAL; } -void dbg_failure_mode_deregistration(struct ubifs_info *c) +/** + * dbg_check_nondata_nodes_order - check that list of data nodes is sorted. + * @c: UBIFS file-system description object + * @head: the list of nodes ('struct ubifs_scan_node' objects) + * + * This function returns zero if the list of non-data nodes is sorted correctly, + * and %-EINVAL if not. + */ +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) { - struct failure_mode_info *fmi, *tmp; + struct list_head *cur; + struct ubifs_scan_node *sa, *sb; + + if (!dbg_is_chk_gen(c)) + return 0; + + for (cur = head->next; cur->next != head; cur = cur->next) { + ino_t inuma, inumb; + uint32_t hasha, hashb; - spin_lock(&fmi_lock); - list_for_each_entry_safe(fmi, tmp, &fmi_list, list) - if (fmi->c == c) { - list_del(&fmi->list); - kfree(fmi); + cond_resched(); + sa = container_of(cur, struct ubifs_scan_node, list); + sb = container_of(cur->next, struct ubifs_scan_node, list); + + if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && + sa->type != UBIFS_XENT_NODE) { + ubifs_err("bad node type %d", sa->type); + ubifs_dump_node(c, sa->node); + return -EINVAL; + } + if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && + sa->type != UBIFS_XENT_NODE) { + ubifs_err("bad node type %d", sb->type); + ubifs_dump_node(c, sb->node); + return -EINVAL; } - spin_unlock(&fmi_lock); -} -static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) -{ - struct failure_mode_info *fmi; + if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { + ubifs_err("non-inode node goes before inode node"); + goto error_dump; + } - spin_lock(&fmi_lock); - list_for_each_entry(fmi, &fmi_list, list) - if (fmi->c->ubi == desc) { - struct ubifs_info *c = fmi->c; + if (sa->type == UBIFS_INO_NODE && sb->type != UBIFS_INO_NODE) + continue; - spin_unlock(&fmi_lock); - return c; + if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { + /* Inode nodes are sorted in descending size order */ + if (sa->len < sb->len) { + ubifs_err("smaller inode node goes first"); + goto error_dump; + } + continue; } - spin_unlock(&fmi_lock); - return NULL; + + /* + * This is either a dentry or xentry, which should be sorted in + * ascending (parent ino, hash) order. + */ + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma < inumb) + continue; + if (inuma > inumb) { + ubifs_err("larger inum %lu goes before inum %lu", + (unsigned long)inuma, (unsigned long)inumb); + goto error_dump; + } + + hasha = key_block(c, &sa->key); + hashb = key_block(c, &sb->key); + + if (hasha > hashb) { + ubifs_err("larger hash %u goes before %u", + hasha, hashb); + goto error_dump; + } + } + + return 0; + +error_dump: + ubifs_msg("dumping first node"); + ubifs_dump_node(c, sa->node); + ubifs_msg("dumping second node"); + ubifs_dump_node(c, sb->node); + return -EINVAL; + return 0; } -static int in_failure_mode(struct ubi_volume_desc *desc) +static inline int chance(unsigned int n, unsigned int out_of) { - struct ubifs_info *c = dbg_find_info(desc); + return !!((prandom_u32() % out_of) + 1 <= n); - if (c && dbg_failure_mode) - return c->failure_mode; - return 0; } -static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) { - struct ubifs_info *c = dbg_find_info(desc); + struct ubifs_debug_info *d = c->dbg; - if (!c || !dbg_failure_mode) - return 0; - if (c->failure_mode) - return 1; - if (!c->fail_cnt) { - /* First call - decide delay to failure */ + ubifs_assert(dbg_is_tst_rcvry(c)); + + if (!d->pc_cnt) { + /* First call - decide delay to the power cut */ if (chance(1, 2)) { - unsigned int delay = 1 << (simple_rand() >> 11); + unsigned long delay; if (chance(1, 2)) { - c->fail_delay = 1; - c->fail_timeout = jiffies + - msecs_to_jiffies(delay); - dbg_rcvry("failing after %ums", delay); + d->pc_delay = 1; + /* Fail withing 1 minute */ + delay = prandom_u32() % 60000; + d->pc_timeout = jiffies; + d->pc_timeout += msecs_to_jiffies(delay); + ubifs_warn("failing after %lums", delay); } else { - c->fail_delay = 2; - c->fail_cnt_max = delay; - dbg_rcvry("failing after %u calls", delay); + d->pc_delay = 2; + delay = prandom_u32() % 10000; + /* Fail within 10000 operations */ + d->pc_cnt_max = delay; + ubifs_warn("failing after %lu calls", delay); } } - c->fail_cnt += 1; + + d->pc_cnt += 1; } + /* Determine if failure delay has expired */ - if (c->fail_delay == 1) { - if (time_before(jiffies, c->fail_timeout)) + if (d->pc_delay == 1 && time_before(jiffies, d->pc_timeout)) return 0; - } else if (c->fail_delay == 2) - if (c->fail_cnt++ < c->fail_cnt_max) + if (d->pc_delay == 2 && d->pc_cnt++ < d->pc_cnt_max) return 0; + if (lnum == UBIFS_SB_LNUM) { - if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(19, 20)) + if (write && chance(1, 2)) return 0; - dbg_rcvry("failing in super block LEB %d", lnum); + if (chance(19, 20)) + return 0; + ubifs_warn("failing in super block LEB %d", lnum); } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { if (chance(19, 20)) return 0; - dbg_rcvry("failing in master LEB %d", lnum); + ubifs_warn("failing in master LEB %d", lnum); } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { - if (write) { - if (chance(99, 100)) - return 0; - } else if (chance(399, 400)) + if (write && chance(99, 100)) + return 0; + if (chance(399, 400)) return 0; - dbg_rcvry("failing in log LEB %d", lnum); + ubifs_warn("failing in log LEB %d", lnum); } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { - if (write) { - if (chance(7, 8)) - return 0; - } else if (chance(19, 20)) + if (write && chance(7, 8)) return 0; - dbg_rcvry("failing in LPT LEB %d", lnum); + if (chance(19, 20)) + return 0; + ubifs_warn("failing in LPT LEB %d", lnum); } else if (lnum >= c->orph_first && lnum <= c->orph_last) { - if (write) { - if (chance(1, 2)) - return 0; - } else if (chance(9, 10)) + if (write && chance(1, 2)) + return 0; + if (chance(9, 10)) return 0; - dbg_rcvry("failing in orphan LEB %d", lnum); + ubifs_warn("failing in orphan LEB %d", lnum); } else if (lnum == c->ihead_lnum) { if (chance(99, 100)) return 0; - dbg_rcvry("failing in index head LEB %d", lnum); + ubifs_warn("failing in index head LEB %d", lnum); } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { if (chance(9, 10)) return 0; - dbg_rcvry("failing in GC head LEB %d", lnum); + ubifs_warn("failing in GC head LEB %d", lnum); } else if (write && !RB_EMPTY_ROOT(&c->buds) && !ubifs_search_bud(c, lnum)) { if (chance(19, 20)) return 0; - dbg_rcvry("failing in non-bud LEB %d", lnum); + ubifs_warn("failing in non-bud LEB %d", lnum); } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || c->cmt_state == COMMIT_RUNNING_REQUIRED) { if (chance(999, 1000)) return 0; - dbg_rcvry("failing in bud LEB %d commit running", lnum); + ubifs_warn("failing in bud LEB %d commit running", lnum); } else { if (chance(9999, 10000)) return 0; - dbg_rcvry("failing in bud LEB %d commit not running", lnum); + ubifs_warn("failing in bud LEB %d commit not running", lnum); } - ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); - c->failure_mode = 1; + + d->pc_happened = 1; + ubifs_warn("========== Power cut emulated =========="); dump_stack(); return 1; } -static void cut_data(const void *buf, int len) +static int corrupt_data(const struct ubifs_info *c, const void *buf, + unsigned int len) { - int flen, i; + unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - flen = (len * (long long)simple_rand()) >> 15; - for (i = flen; i < len; i++) - p[i] = 0xff; -} + from = prandom_u32() % len; + /* Corruption span max to end of write unit */ + to = min(len, ALIGN(from + 1, c->max_write_size)); -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check) -{ - if (in_failure_mode(desc)) - return -EIO; - return ubi_leb_read(desc, lnum, buf, offset, len, check); + ubifs_warn("filled bytes %u-%u with %s", from, to - 1, + ffs ? "0xFFs" : "random data"); + + if (ffs) + memset(p + from, 0xFF, to - from); + else + prandom_bytes(p + from, to - from); + + return to; } -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype) +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, + int offs, int len) { - int err; + int err, failing; + + if (c->dbg->pc_happened) + return -EROFS; - if (in_failure_mode(desc)) - return -EIO; - if (do_fail(desc, lnum, 1)) - cut_data(buf, len); - err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + failing = power_cut_emulated(c, lnum, 1); + if (failing) { + len = corrupt_data(c, buf, len); + ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)", + len, lnum, offs); + } + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); if (err) return err; - if (in_failure_mode(desc)) - return -EIO; + if (failing) + return -EROFS; return 0; } -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, - int len, int dtype) +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, + int len) { int err; - if (do_fail(desc, lnum, 1)) - return -EIO; - err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; + err = ubi_leb_change(c->ubi, lnum, buf, len); if (err) return err; - if (do_fail(desc, lnum, 1)) - return -EIO; + if (power_cut_emulated(c, lnum, 1)) + return -EROFS; return 0; } -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_unmap(struct ubifs_info *c, int lnum) { int err; - if (do_fail(desc, lnum, 0)) - return -EIO; - err = ubi_leb_erase(desc, lnum); + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); if (err) return err; - if (do_fail(desc, lnum, 0)) - return -EIO; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; return 0; } -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +int dbg_leb_map(struct ubifs_info *c, int lnum) { int err; - if (do_fail(desc, lnum, 0)) - return -EIO; - err = ubi_leb_unmap(desc, lnum); + if (c->dbg->pc_happened) + return -EROFS; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + err = ubi_leb_map(c->ubi, lnum); if (err) return err; - if (do_fail(desc, lnum, 0)) - return -EIO; + if (power_cut_emulated(c, lnum, 0)) + return -EROFS; + return 0; +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; + +static int dfs_file_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +/** + * provide_user_output - provide output to the user reading a debugfs file. + * @val: boolean value for the answer + * @u: the buffer to store the answer at + * @count: size of the buffer + * @ppos: position in the @u output buffer + * + * This is a simple helper function which stores @val boolean value in the user + * buffer when the user reads one of UBIFS debugfs files. Returns amount of + * bytes written to @u in case of success and a negative error code in case of + * failure. + */ +static int provide_user_output(int val, char __user *u, size_t count, + loff_t *ppos) +{ + char buf[3]; + + if (val) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + + return simple_read_from_buffer(u, count, ppos, buf, 2); +} + +static ssize_t dfs_file_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + int val; + + if (dent == d->dfs_chk_gen) + val = d->chk_gen; + else if (dent == d->dfs_chk_index) + val = d->chk_index; + else if (dent == d->dfs_chk_orph) + val = d->chk_orph; + else if (dent == d->dfs_chk_lprops) + val = d->chk_lprops; + else if (dent == d->dfs_chk_fs) + val = d->chk_fs; + else if (dent == d->dfs_tst_rcvry) + val = d->tst_rcvry; + else if (dent == d->dfs_ro_error) + val = c->ro_error; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); +} + +/** + * interpret_user_input - interpret user debugfs file input. + * @u: user-provided buffer with the input + * @count: buffer size + * + * This is a helper function which interpret user input to a boolean UBIFS + * debugfs file. Returns %0 or %1 in case of success and a negative error code + * in case of failure. + */ +static int interpret_user_input(const char __user *u, size_t count) +{ + size_t buf_size; + char buf[8]; + + buf_size = min_t(size_t, count, (sizeof(buf) - 1)); + if (copy_from_user(buf, u, buf_size)) + return -EFAULT; + + if (buf[0] == '1') + return 1; + else if (buf[0] == '0') + return 0; + + return -EINVAL; +} + +static ssize_t dfs_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + struct dentry *dent = file->f_path.dentry; + int val; + + /* + * TODO: this is racy - the file-system might have already been + * unmounted and we'd oops in this case. The plan is to fix it with + * help of 'iterate_supers_type()' which we should have in v3.0: when + * a debugfs opened, we rember FS's UUID in file->private_data. Then + * whenever we access the FS via a debugfs file, we iterate all UBIFS + * superblocks and fine the one with the same UUID, and take the + * locking right. + * + * The other way to go suggested by Al Viro is to create a separate + * 'ubifs-debug' file-system instead. + */ + if (file->f_path.dentry == d->dfs_dump_lprops) { + ubifs_dump_lprops(c); + return count; + } + if (file->f_path.dentry == d->dfs_dump_budg) { + ubifs_dump_budg(c, &c->bi); + return count; + } + if (file->f_path.dentry == d->dfs_dump_tnc) { + mutex_lock(&c->tnc_mutex); + ubifs_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + return count; + } + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == d->dfs_chk_gen) + d->chk_gen = val; + else if (dent == d->dfs_chk_index) + d->chk_index = val; + else if (dent == d->dfs_chk_orph) + d->chk_orph = val; + else if (dent == d->dfs_chk_lprops) + d->chk_lprops = val; + else if (dent == d->dfs_chk_fs) + d->chk_fs = val; + else if (dent == d->dfs_tst_rcvry) + d->tst_rcvry = val; + else if (dent == d->dfs_ro_error) + c->ro_error = !!val; + else + return -EINVAL; + + return count; +} + +static const struct file_operations dfs_fops = { + .open = dfs_file_open, + .read = dfs_file_read, + .write = dfs_file_write, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. + * + * Note, the only reason we have not merged this function with the + * 'ubifs_debugging_init()' function is because it is better to initialize + * debugfs interfaces at the very end of the mount process, and remove them at + * the very beginning of the mount process. + */ +int dbg_debugfs_init_fs(struct ubifs_info *c) +{ + int err, n; + const char *fname; + struct dentry *dent; + struct ubifs_debug_info *d = c->dbg; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); + if (n == UBIFS_DFS_DIR_LEN) { + /* The array size is too small */ + fname = UBIFS_DFS_DIR_NAME; + dent = ERR_PTR(-EINVAL); + goto out; + } + + fname = d->dfs_dir_name; + dent = debugfs_create_dir(fname, dfs_rootdir); + if (IS_ERR_OR_NULL(dent)) + goto out; + d->dfs_dir = dent; + + fname = "dump_lprops"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_lprops = dent; + + fname = "dump_budg"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_budg = dent; + + fname = "dump_tnc"; + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_dump_tnc = dent; + + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_tst_rcvry = dent; + + fname = "ro_error"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c, + &dfs_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + d->dfs_ro_error = dent; + return 0; + +out_remove: + debugfs_remove_recursive(d->dfs_dir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", + fname, err); + return err; +} + +/** + * dbg_debugfs_exit_fs - remove all debugfs files. + * @c: UBIFS file-system description object + */ +void dbg_debugfs_exit_fs(struct ubifs_info *c) +{ + if (IS_ENABLED(CONFIG_DEBUG_FS)) + debugfs_remove_recursive(c->dbg->dfs_dir); +} + +struct ubifs_global_debug_info ubifs_dbg; + +static struct dentry *dfs_chk_gen; +static struct dentry *dfs_chk_index; +static struct dentry *dfs_chk_orph; +static struct dentry *dfs_chk_lprops; +static struct dentry *dfs_chk_fs; +static struct dentry *dfs_tst_rcvry; + +static ssize_t dfs_global_file_read(struct file *file, char __user *u, + size_t count, loff_t *ppos) +{ + struct dentry *dent = file->f_path.dentry; + int val; + + if (dent == dfs_chk_gen) + val = ubifs_dbg.chk_gen; + else if (dent == dfs_chk_index) + val = ubifs_dbg.chk_index; + else if (dent == dfs_chk_orph) + val = ubifs_dbg.chk_orph; + else if (dent == dfs_chk_lprops) + val = ubifs_dbg.chk_lprops; + else if (dent == dfs_chk_fs) + val = ubifs_dbg.chk_fs; + else if (dent == dfs_tst_rcvry) + val = ubifs_dbg.tst_rcvry; + else + return -EINVAL; + + return provide_user_output(val, u, count, ppos); } -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +static ssize_t dfs_global_file_write(struct file *file, const char __user *u, + size_t count, loff_t *ppos) { - if (in_failure_mode(desc)) - return -EIO; - return ubi_is_mapped(desc, lnum); + struct dentry *dent = file->f_path.dentry; + int val; + + val = interpret_user_input(u, count); + if (val < 0) + return val; + + if (dent == dfs_chk_gen) + ubifs_dbg.chk_gen = val; + else if (dent == dfs_chk_index) + ubifs_dbg.chk_index = val; + else if (dent == dfs_chk_orph) + ubifs_dbg.chk_orph = val; + else if (dent == dfs_chk_lprops) + ubifs_dbg.chk_lprops = val; + else if (dent == dfs_chk_fs) + ubifs_dbg.chk_fs = val; + else if (dent == dfs_tst_rcvry) + ubifs_dbg.tst_rcvry = val; + else + return -EINVAL; + + return count; } -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +static const struct file_operations dfs_global_fops = { + .read = dfs_global_file_read, + .write = dfs_global_file_write, + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) { int err; + const char *fname; + struct dentry *dent; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + fname = "ubifs"; + dent = debugfs_create_dir(fname, NULL); + if (IS_ERR_OR_NULL(dent)) + goto out; + dfs_rootdir = dent; + + fname = "chk_general"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_gen = dent; + + fname = "chk_index"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_index = dent; + + fname = "chk_orphans"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_orph = dent; + + fname = "chk_lprops"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_lprops = dent; + + fname = "chk_fs"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_chk_fs = dent; + + fname = "tst_recovery"; + dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, dfs_rootdir, NULL, + &dfs_global_fops); + if (IS_ERR_OR_NULL(dent)) + goto out_remove; + dfs_tst_rcvry = dent; - if (do_fail(desc, lnum, 0)) - return -EIO; - err = ubi_leb_map(desc, lnum, dtype); - if (err) - return err; - if (do_fail(desc, lnum, 0)) - return -EIO; return 0; + +out_remove: + debugfs_remove_recursive(dfs_rootdir); +out: + err = dent ? PTR_ERR(dent) : -ENODEV; + ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", + fname, err); + return err; } -#endif /* CONFIG_UBIFS_FS_DEBUG */ +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. + */ +void dbg_debugfs_exit(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_FS)) + debugfs_remove_recursive(dfs_rootdir); +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + return 0; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + kfree(c->dbg); +} diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 3c4f1e93c9e..e03d5179769 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -23,381 +23,293 @@ #ifndef __UBIFS_DEBUG_H__ #define __UBIFS_DEBUG_H__ -#ifdef CONFIG_UBIFS_FS_DEBUG +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); -#define UBIFS_DBG(op) op +/* + * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" + * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + */ +#define UBIFS_DFS_DIR_NAME "ubi%d_%d" +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) + +/** + * ubifs_debug_info - per-FS debugging information. + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * + * @pc_happened: non-zero if an emulated power cut happened + * @pc_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @pc_timeout: time in jiffies when delay of failure mode expires + * @pc_cnt: current number of calls to failure mode I/O functions + * @pc_cnt_max: number of calls by which to delay failure mode + * + * @chk_lpt_sz: used by LPT tree size checker + * @chk_lpt_sz2: used by LPT tree size checker + * @chk_lpt_wastage: used by LPT tree size checker + * @chk_lpt_lebs: used by LPT tree size checker + * @new_nhead_offs: used by LPT tree size checker + * @new_ihead_lnum: used by debugging to check @c->ihead_lnum + * @new_ihead_offs: used by debugging to check @c->ihead_offs + * + * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt + * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + * + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_chk_gen: debugfs knob to enable UBIFS general extra checks + * @dfs_chk_index: debugfs knob to enable UBIFS index extra checks + * @dfs_chk_orph: debugfs knob to enable UBIFS orphans extra checks + * @dfs_chk_lprops: debugfs knob to enable UBIFS LEP properties extra checks + * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks + * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing + * @dfs_ro_error: debugfs knob to switch UBIFS to R/O mode (different to + * re-mounting to R/O mode because it does not flush any buffers + * and UBIFS just starts returning -EROFS on all write + * operations) + */ +struct ubifs_debug_info { + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + + int pc_happened; + int pc_delay; + unsigned long pc_timeout; + unsigned int pc_cnt; + unsigned int pc_cnt_max; + + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_offs; + int new_ihead_lnum; + int new_ihead_offs; + + struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; + long long saved_free; + int saved_idx_gc_cnt; + + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; + + char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; + struct dentry *dfs_dir; + struct dentry *dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; + struct dentry *dfs_chk_gen; + struct dentry *dfs_chk_index; + struct dentry *dfs_chk_orph; + struct dentry *dfs_chk_lprops; + struct dentry *dfs_chk_fs; + struct dentry *dfs_tst_rcvry; + struct dentry *dfs_ro_error; +}; -#define ubifs_assert(expr) do { \ +/** + * ubifs_global_debug_info - global (not per-FS) UBIFS debugging information. + * + * @chk_gen: if general extra checks are enabled + * @chk_index: if index xtra checks are enabled + * @chk_orph: if orphans extra checks are enabled + * @chk_lprops: if lprops extra checks are enabled + * @chk_fs: if UBIFS contents extra checks are enabled + * @tst_rcvry: if UBIFS recovery testing mode enabled + */ +struct ubifs_global_debug_info { + unsigned int chk_gen:1; + unsigned int chk_index:1; + unsigned int chk_orph:1; + unsigned int chk_lprops:1; + unsigned int chk_fs:1; + unsigned int tst_rcvry:1; +}; + +#define ubifs_assert(expr) do { \ if (unlikely(!(expr))) { \ - printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + pr_crit("UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ - dbg_dump_stack(); \ + dump_stack(); \ } \ } while (0) #define ubifs_assert_cmt_locked(c) do { \ if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ up_write(&(c)->commit_sem); \ - printk(KERN_CRIT "commit lock is not locked!\n"); \ + pr_crit("commit lock is not locked!\n"); \ ubifs_assert(0); \ } \ } while (0) -#define dbg_dump_stack() do { \ - if (!dbg_failure_mode) \ - dump_stack(); \ -} while (0) - -/* Generic debugging messages */ -#define dbg_msg(fmt, ...) do { \ - spin_lock(&dbg_lock); \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ - __func__, ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ -} while (0) - -#define dbg_do_msg(typ, fmt, ...) do { \ - if (ubifs_msg_flags & typ) \ - dbg_msg(fmt, ##__VA_ARGS__); \ -} while (0) +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid, \ + ##__VA_ARGS__) -#define dbg_err(fmt, ...) do { \ - spin_lock(&dbg_lock); \ - ubifs_err(fmt, ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ +#define DBG_KEY_BUF_LEN 48 +#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ + char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ + pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid, \ + ##__VA_ARGS__, \ + dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ } while (0) -const char *dbg_key_str0(const struct ubifs_info *c, - const union ubifs_key *key); -const char *dbg_key_str1(const struct ubifs_info *c, - const union ubifs_key *key); - -/* - * DBGKEY macros require dbg_lock to be held, which it is in the dbg message - * macros. - */ -#define DBGKEY(key) dbg_key_str0(c, (key)) -#define DBGKEY1(key) dbg_key_str1(c, (key)) - /* General messages */ -#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) - +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ -#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) - +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +#define dbg_jnlk(key, fmt, ...) \ + ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__) /* Additional TNC messages */ -#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) - +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +#define dbg_tnck(key, fmt, ...) \ + ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__) /* Additional lprops messages */ -#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) - +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) /* Additional LEB find messages */ -#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) - +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) /* Additional mount messages */ -#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) - +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +#define dbg_mntk(key, fmt, ...) \ + ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__) /* Additional I/O messages */ -#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) - +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) /* Additional commit messages */ -#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) - +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) /* Additional budgeting messages */ -#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) - +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) /* Additional log messages */ -#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) - +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) /* Additional gc messages */ -#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) - +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) /* Additional scan messages */ -#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) - +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) - -/* - * Debugging message type flags (must match msg_type_names in debug.c). - * - * UBIFS_MSG_GEN: general messages - * UBIFS_MSG_JNL: journal messages - * UBIFS_MSG_MNT: mount messages - * UBIFS_MSG_CMT: commit messages - * UBIFS_MSG_FIND: LEB find messages - * UBIFS_MSG_BUDG: budgeting messages - * UBIFS_MSG_GC: garbage collection messages - * UBIFS_MSG_TNC: TNC messages - * UBIFS_MSG_LP: lprops messages - * UBIFS_MSG_IO: I/O messages - * UBIFS_MSG_LOG: log messages - * UBIFS_MSG_SCAN: scan messages - * UBIFS_MSG_RCVRY: recovery messages - */ -enum { - UBIFS_MSG_GEN = 0x1, - UBIFS_MSG_JNL = 0x2, - UBIFS_MSG_MNT = 0x4, - UBIFS_MSG_CMT = 0x8, - UBIFS_MSG_FIND = 0x10, - UBIFS_MSG_BUDG = 0x20, - UBIFS_MSG_GC = 0x40, - UBIFS_MSG_TNC = 0x80, - UBIFS_MSG_LP = 0x100, - UBIFS_MSG_IO = 0x200, - UBIFS_MSG_LOG = 0x400, - UBIFS_MSG_SCAN = 0x800, - UBIFS_MSG_RCVRY = 0x1000, -}; - -/* Debugging message type flags for each default debug message level */ -#define UBIFS_MSG_LVL_0 0 -#define UBIFS_MSG_LVL_1 0x1 -#define UBIFS_MSG_LVL_2 0x7f -#define UBIFS_MSG_LVL_3 0xffff - -/* - * Debugging check flags (must match chk_names in debug.c). - * - * UBIFS_CHK_GEN: general checks - * UBIFS_CHK_TNC: check TNC - * UBIFS_CHK_IDX_SZ: check index size - * UBIFS_CHK_ORPH: check orphans - * UBIFS_CHK_OLD_IDX: check the old index - * UBIFS_CHK_LPROPS: check lprops - * UBIFS_CHK_FS: check the file-system - */ -enum { - UBIFS_CHK_GEN = 0x1, - UBIFS_CHK_TNC = 0x2, - UBIFS_CHK_IDX_SZ = 0x4, - UBIFS_CHK_ORPH = 0x8, - UBIFS_CHK_OLD_IDX = 0x10, - UBIFS_CHK_LPROPS = 0x20, - UBIFS_CHK_FS = 0x40, -}; - -/* - * Special testing flags (must match tst_names in debug.c). - * - * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method - * UBIFS_TST_RCVRY: failure mode for recovery testing - */ -enum { - UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, - UBIFS_TST_RCVRY = 0x4, -}; +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) -#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 -#else -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 -#endif +extern struct ubifs_global_debug_info ubifs_dbg; -#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS -#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff -#else -#define UBIFS_CHK_FLAGS_DEFAULT 0 -#endif - -extern spinlock_t dbg_lock; +static inline int dbg_is_chk_gen(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_gen || c->dbg->chk_gen); +} +static inline int dbg_is_chk_index(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_index || c->dbg->chk_index); +} +static inline int dbg_is_chk_orph(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_orph || c->dbg->chk_orph); +} +static inline int dbg_is_chk_lprops(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_lprops || c->dbg->chk_lprops); +} +static inline int dbg_is_chk_fs(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.chk_fs || c->dbg->chk_fs); +} +static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) +{ + return !!(ubifs_dbg.tst_rcvry || c->dbg->tst_rcvry); +} +static inline int dbg_is_power_cut(const struct ubifs_info *c) +{ + return !!c->dbg->pc_happened; +} -extern unsigned int ubifs_msg_flags; -extern unsigned int ubifs_chk_flags; -extern unsigned int ubifs_tst_flags; +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); /* Dump functions */ - const char *dbg_ntype(int type); const char *dbg_cstate(int cmt_state); +const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); -void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); -void dbg_dump_node(const struct ubifs_info *c, const void *node); -void dbg_dump_budget_req(const struct ubifs_budget_req *req); -void dbg_dump_lstats(const struct ubifs_lp_stats *lst); -void dbg_dump_budg(struct ubifs_info *c); -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); -void dbg_dump_lprops(struct ubifs_info *c); -void dbg_dump_leb(const struct ubifs_info *c, int lnum); -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode); -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip); -void dbg_dump_tnc(struct ubifs_info *c); -void dbg_dump_index(struct ubifs_info *c); - -/* Checking helper functions */ - -typedef int (*dbg_leaf_callback)(struct ubifs_info *c, - struct ubifs_zbranch *zbr, void *priv); -typedef int (*dbg_znode_callback)(struct ubifs_info *c, - struct ubifs_znode *znode, void *priv); +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len); +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); +void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_budget_req(const struct ubifs_budget_req *req); +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void ubifs_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp); +void ubifs_dump_lprops(struct ubifs_info *c); +void ubifs_dump_lpt_info(struct ubifs_info *c); +void ubifs_dump_leb(const struct ubifs_info *c, int lnum); +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + int cat); +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void ubifs_dump_tnc(struct ubifs_info *c); +void ubifs_dump_index(struct ubifs_info *c); +void ubifs_dump_lpt_lebs(const struct ubifs_info *c); int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); /* Checking functions */ - +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); int dbg_check_lprops(struct ubifs_info *c); - int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); - int dbg_check_cats(struct ubifs_info *c); - int dbg_check_ltab(struct ubifs_info *c); - -int dbg_check_synced_i_size(struct inode *inode); - -int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); - +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); +int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode); +int dbg_check_dir(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); - int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); - int dbg_check_filesystem(struct ubifs_info *c); - void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, int add_pos); - -int dbg_check_lprops(struct ubifs_info *c); int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, int row, int col); - -/* Force the use of in-the-gaps method for testing */ - -#define dbg_force_in_the_gaps_enabled \ - (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) - -int dbg_force_in_the_gaps(void); - -/* Failure mode for recovery testing */ - -#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) - -void dbg_failure_mode_registration(struct ubifs_info *c); -void dbg_failure_mode_deregistration(struct ubifs_info *c); - -#ifndef UBIFS_DBG_PRESERVE_UBI - -#define ubi_leb_read dbg_leb_read -#define ubi_leb_write dbg_leb_write -#define ubi_leb_change dbg_leb_change -#define ubi_leb_erase dbg_leb_erase -#define ubi_leb_unmap dbg_leb_unmap -#define ubi_is_mapped dbg_is_mapped -#define ubi_leb_map dbg_leb_map - -#endif - -int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, - int len, int check); -int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, - int offset, int len, int dtype); -int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, - int len, int dtype); -int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum); -int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum); -int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); - -static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf, - int offset, int len) -{ - return dbg_leb_read(desc, lnum, buf, offset, len, 0); -} - -static inline int dbg_write(struct ubi_volume_desc *desc, int lnum, - const void *buf, int offset, int len) -{ - return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN); -} - -static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, - const void *buf, int len) -{ - return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); -} - -#else /* !CONFIG_UBIFS_FS_DEBUG */ - -#define UBIFS_DBG(op) -#define ubifs_assert(expr) ({}) -#define ubifs_assert_cmt_locked(c) -#define dbg_dump_stack() -#define dbg_err(fmt, ...) ({}) -#define dbg_msg(fmt, ...) ({}) -#define dbg_key(c, key, fmt, ...) ({}) - -#define dbg_gen(fmt, ...) ({}) -#define dbg_jnl(fmt, ...) ({}) -#define dbg_tnc(fmt, ...) ({}) -#define dbg_lp(fmt, ...) ({}) -#define dbg_find(fmt, ...) ({}) -#define dbg_mnt(fmt, ...) ({}) -#define dbg_io(fmt, ...) ({}) -#define dbg_cmt(fmt, ...) ({}) -#define dbg_budg(fmt, ...) ({}) -#define dbg_log(fmt, ...) ({}) -#define dbg_gc(fmt, ...) ({}) -#define dbg_scan(fmt, ...) ({}) -#define dbg_rcvry(fmt, ...) ({}) - -#define dbg_ntype(type) "" -#define dbg_cstate(cmt_state) "" -#define dbg_get_key_dump(c, key) ({}) -#define dbg_dump_inode(c, inode) ({}) -#define dbg_dump_node(c, node) ({}) -#define dbg_dump_budget_req(req) ({}) -#define dbg_dump_lstats(lst) ({}) -#define dbg_dump_budg(c) ({}) -#define dbg_dump_lprop(c, lp) ({}) -#define dbg_dump_lprops(c) ({}) -#define dbg_dump_leb(c, lnum) ({}) -#define dbg_dump_znode(c, znode) ({}) -#define dbg_dump_heap(c, heap, cat) ({}) -#define dbg_dump_pnode(c, pnode, parent, iip) ({}) -#define dbg_dump_tnc(c) ({}) -#define dbg_dump_index(c) ({}) - -#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 - -#define dbg_old_index_check_init(c, zroot) 0 -#define dbg_check_old_index(c, zroot) 0 - -#define dbg_check_cats(c) 0 - -#define dbg_check_ltab(c) 0 - -#define dbg_check_synced_i_size(inode) 0 - -#define dbg_check_dir_size(c, dir) 0 - -#define dbg_check_tnc(c, x) 0 - -#define dbg_check_idx_size(c, idx_size) 0 - -#define dbg_check_filesystem(c) 0 - -#define dbg_check_heap(c, heap, cat, add_pos) ({}) - -#define dbg_check_lprops(c) 0 -#define dbg_check_lpt_nodes(c, cnode, row, col) 0 - -#define dbg_force_in_the_gaps_enabled 0 -#define dbg_force_in_the_gaps() 0 - -#define dbg_failure_mode 0 -#define dbg_failure_mode_registration(c) ({}) -#define dbg_failure_mode_deregistration(c) ({}) - -#endif /* !CONFIG_UBIFS_FS_DEBUG */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size); +int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); +int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); + +int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); +int dbg_leb_unmap(struct ubifs_info *c, int lnum); +int dbg_leb_map(struct ubifs_info *c, int lnum); + +/* Debugfs-related stuff */ +int dbg_debugfs_init(void); +void dbg_debugfs_exit(void); +int dbg_debugfs_init_fs(struct ubifs_info *c); +void dbg_debugfs_exit_fs(struct ubifs_info *c); #endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e90374be7d3..ea41649e4ca 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -56,7 +56,7 @@ * * This function returns the inherited flags. */ -static int inherit_flags(const struct inode *dir, int mode) +static int inherit_flags(const struct inode *dir, umode_t mode) { int flags; const struct ubifs_inode *ui = ubifs_inode(dir); @@ -86,7 +86,7 @@ static int inherit_flags(const struct inode *dir, int mode) * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode) + umode_t mode) { struct inode *inode; struct ubifs_inode *ui; @@ -102,16 +102,9 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, * UBIFS has to fully control "clean <-> dirty" transitions of inodes * to make budgeting work. */ - inode->i_flags |= (S_NOCMTIME); + inode->i_flags |= S_NOCMTIME; - inode->i_uid = current->fsuid; - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current->fsgid; - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; @@ -161,11 +154,10 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, return ERR_PTR(-EINVAL); } ubifs_warn("running out of inode numbers (current %lu, max %d)", - c->highest_inum, INUM_WATERMARK); + (unsigned long)c->highest_inum, INUM_WATERMARK); } inode->i_ino = ++c->highest_inum; - inode->i_generation = ++c->vfs_gen; /* * The creation sequence number remains with this inode for its * lifetime. All nodes for this inode have a greater sequence number, @@ -178,11 +170,11 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, return inode; } -#ifdef CONFIG_UBIFS_FS_DEBUG - -static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) +static int dbg_check_name(const struct ubifs_info *c, + const struct ubifs_dent_node *dent, + const struct qstr *nm) { - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; if (le16_to_cpu(dent->nlen) != nm->len) return -EINVAL; @@ -191,14 +183,8 @@ static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm) return 0; } -#else - -#define dbg_check_name(dent, nm) 0 - -#endif - static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { int err; union ubifs_key key; @@ -206,8 +192,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct ubifs_dent_node *dent; struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("'%.*s' in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, dir->i_ino); + dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); if (dentry->d_name.len > UBIFS_MAX_NLEN) return ERR_PTR(-ENAMETOOLONG); @@ -220,22 +205,14 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { - /* - * Do not hash the direntry if parent 'i_nlink' is zero, because - * this has side-effects - '->delete_inode()' call will not be - * called for the parent orphan inode, because 'd_count' of its - * direntry will stay 1 (it'll be negative direntry I guess) - * and prevent 'iput_final()' until the dentry is destroyed due - * to unmount or memory pressure. - */ - if (err == -ENOENT && dir->i_nlink != 0) { + if (err == -ENOENT) { dbg_gen("not found"); goto done; } goto out; } - if (dbg_check_name(dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &dentry->d_name)) { err = -EINVAL; goto out; } @@ -247,8 +224,8 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, * checking. */ err = PTR_ERR(inode); - ubifs_err("dead directory entry '%.*s', error %d", - dentry->d_name.len, dentry->d_name.name, err); + ubifs_err("dead directory entry '%pd', error %d", + dentry, err); ubifs_ro_mode(c, err); goto out; } @@ -267,8 +244,8 @@ out: return ERR_PTR(err); } -static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -282,8 +259,8 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, * parent directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); if (err) @@ -368,38 +345,46 @@ static unsigned int vfs_dent_type(uint8_t type) * This means that UBIFS cannot support NFS which requires full * 'seekdir()'/'telldir()' support. */ -static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err, over = 0; + int err; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; - struct inode *dir = file->f_path.dentry->d_inode; + struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); - if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) /* * The directory was seek'ed to a senseless position or there * are no more entries. */ return 0; - /* File positions 0 and 1 correspond to "." and ".." */ - if (file->f_pos == 0) { - ubifs_assert(!file->private_data); - over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); - if (over) - return 0; - file->f_pos = 1; + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; } - if (file->f_pos == 1) { + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (ctx->pos < 2) { ubifs_assert(!file->private_data); - over = filldir(dirent, "..", 2, 1, - parent_ino(file->f_path.dentry), DT_DIR); - if (over) + if (!dir_emit_dots(file, ctx)) return 0; /* Find the first entry in TNC and save it */ @@ -411,7 +396,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -419,31 +404,30 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. - * Find the entry corresponding to @file->f_pos or the - * closest one. + * Find the entry corresponding to @ctx->pos or the closest one. */ - dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); nm.name = NULL; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } while (1) { dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, le64_to_cpu(dent->inum), + dent->name, (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); - ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); + ubifs_assert(le64_to_cpu(dent->ch.sqnum) > + ubifs_inode(dir)->creat_sqnum); nm.len = le16_to_cpu(dent->nlen); - over = filldir(dirent, dent->name, nm.len, file->f_pos, + if (!dir_emit(ctx, dent->name, nm.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type)); - if (over) + vfs_dent_type(dent->type))) return 0; /* Switch to the next entry */ @@ -456,7 +440,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } kfree(file->private_data); - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); } @@ -469,18 +453,11 @@ out: kfree(file->private_data); file->private_data = NULL; - file->f_pos = 2; + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2; return 0; } -/* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) -{ - kfree(file->private_data); - file->private_data = NULL; - return generic_file_llseek(file, offset, origin); -} - /* Free saved readdir() state when the directory is closed */ static int ubifs_dir_release(struct inode *dir, struct file *file) { @@ -490,30 +467,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) } /** - * lock_2_inodes - lock two UBIFS inodes. + * lock_2_inodes - a wrapper for locking two UBIFS inodes. * @inode1: first inode * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. */ static void lock_2_inodes(struct inode *inode1, struct inode *inode2) { - if (inode1->i_ino < inode2->i_ino) { - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3); - } else { - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3); - } + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); } /** - * unlock_2_inodes - unlock two UBIFS inodes inodes. + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. * @inode1: first inode * @inode2: second inode */ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } static int ubifs_link(struct dentry *old_dentry, struct inode *dir, @@ -525,17 +501,20 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; /* * Budget request settings: new direntry, changing the target inode, * changing the parent inode. */ - dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, inode->i_ino, + dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", + dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - err = dbg_check_synced_i_size(inode); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -545,7 +524,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); inc_nlink(inode); - atomic_inc(&inode->i_count); + ihold(inode); inode->i_ctime = ubifs_current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; @@ -577,6 +556,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + unsigned int saved_nlink = inode->i_nlink; /* * Budget request settings: deletion direntry, deletion inode (+1 for @@ -585,10 +565,12 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) * deletions. */ - dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, inode->i_ino, + dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", + dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - err = dbg_check_synced_i_size(inode); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); + err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -596,7 +578,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) if (err) { if (err != -ENOSPC) return err; - err = 0; budgeted = 0; } @@ -615,7 +596,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; @@ -623,7 +604,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) out_cancel: dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - inc_nlink(inode); + set_nlink(inode, saved_nlink); unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); @@ -674,9 +655,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) * because we have extra space reserved for deletions. */ - dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, - dentry->d_name.name, inode->i_ino, dir->i_ino); - + dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, + inode->i_ino, dir->i_ino); + ubifs_assert(mutex_is_locked(&dir->i_mutex)); + ubifs_assert(mutex_is_locked(&inode->i_mutex)); err = check_dir_empty(c, dentry->d_inode); if (err) return err; @@ -704,7 +686,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; @@ -713,30 +695,28 @@ out_cancel: dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inc_nlink(dir); - inc_nlink(inode); - inc_nlink(inode); + set_nlink(inode, 2); unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); return err; } -static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .dirtied_ino_d = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; /* * Budget request settings: new inode, new direntry and changing parent * directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); if (err) @@ -779,7 +759,7 @@ out_budg: } static int ubifs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct inode *inode; struct ubifs_inode *ui; @@ -789,15 +769,15 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = devlen, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent * directory inode. */ - dbg_gen("dent '%.*s' in dir ino %lu", - dentry->d_name.len, dentry->d_name.name, dir->i_ino); + dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino); if (!new_valid_dev(rdev)) return -EINVAL; @@ -863,15 +843,16 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, int err, len = strlen(symname); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = len, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent * directory inode. */ - dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len, - dentry->d_name.name, symname, dir->i_ino); + dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, + symname, dir->i_ino); if (len > UBIFS_MAX_INO_DATA) return -ENAMETOOLONG; @@ -930,59 +911,30 @@ out_budg: } /** - * lock_3_inodes - lock three UBIFS inodes for rename. + * lock_3_inodes - a wrapper for locking three UBIFS inodes. * @inode1: first inode * @inode2: second inode * @inode3: third inode * - * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may - * be null. + * This function is used for 'ubifs_rename()' and @inode1 may be the same as + * @inode2 whereas @inode3 may be %NULL. + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. */ static void lock_3_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3) { - struct inode *i1, *i2, *i3; - - if (!inode3) { - if (inode1 != inode2) { - lock_2_inodes(inode1, inode2); - return; - } - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - return; - } - - if (inode1 == inode2) { - lock_2_inodes(inode1, inode3); - return; - } - - /* 3 different inodes */ - if (inode1 < inode2) { - i3 = inode2; - if (inode1 < inode3) { - i1 = inode1; - i2 = inode3; - } else { - i1 = inode3; - i2 = inode1; - } - } else { - i3 = inode1; - if (inode2 < inode3) { - i1 = inode2; - i2 = inode3; - } else { - i1 = inode3; - i2 = inode2; - } - } - mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1); - lock_2_inodes(i2, i3); + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); } /** - * unlock_3_inodes - unlock three UBIFS inodes for rename. + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. * @inode1: first inode * @inode2: second inode * @inode3: third inode @@ -990,11 +942,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2, static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, struct inode *inode3) { - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); - if (inode1 != inode2) - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); if (inode3) mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -1012,8 +964,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, - .dirtied_ino_d = old_inode_ui->data_len }; + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; + unsigned int uninitialized_var(saved_nlink); /* * Budget request settings: deletion direntry, new direntry, removing @@ -1024,10 +977,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, * separately. */ - dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " - "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, - old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, - new_dentry->d_name.name, new_dir->i_ino); + dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", + old_dentry, old_inode->i_ino, old_dir->i_ino, + new_dentry, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + if (unlink && is_dir) { err = check_dir_empty(c, new_inode); @@ -1091,13 +1048,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) { /* * Directories cannot have hard-links, so if this is a - * directory, decrement its @i_nlink twice because an empty - * directory has @i_nlink 2. + * directory, just clear @i_nlink. */ + saved_nlink = new_inode->i_nlink; if (is_dir) + clear_nlink(new_inode); + else drop_nlink(new_inode); new_inode->i_ctime = time; - drop_nlink(new_inode); } else { new_dir->i_size += new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1129,14 +1087,12 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); return err; out_cancel: if (unlink) { - if (is_dir) - inc_nlink(new_inode); - inc_nlink(new_inode); + set_nlink(new_inode, saved_nlink); } else { new_dir->i_size -= new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1167,16 +1123,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - stat->dev = inode->i_sb->s_dev; - stat->ino = inode->i_ino; - stat->mode = inode->i_mode; - stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + generic_fillattr(inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; @@ -1207,7 +1154,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -struct inode_operations ubifs_dir_inode_operations = { +const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, .link = ubifs_link, @@ -1219,19 +1166,17 @@ struct inode_operations ubifs_dir_inode_operations = { .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; -struct file_operations ubifs_dir_operations = { - .llseek = ubifs_dir_llseek, +const struct file_operations ubifs_dir_operations = { + .llseek = generic_file_llseek, .release = ubifs_dir_release, .read = generic_read_dir, - .readdir = ubifs_readdir, + .iterate = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8565e586e53..b5b593c4527 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -21,39 +21,39 @@ */ /* - * This file implements VFS file and inode operations of regular files, device + * This file implements VFS file and inode operations for regular files, device * nodes and symlinks as well as address space operations. * - * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the - * page is dirty and is used for budgeting purposes - dirty pages should not be - * budgeted. The PG_checked flag is set if full budgeting is required for the - * page e.g., when it corresponds to a file hole or it is just beyond the file - * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to - * fail in this function, and the budget is released in 'ubifs_write_end()'. So - * the PG_private and PG_checked flags carry the information about how the page - * was budgeted, to make it possible to release the budget properly. + * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if + * the page is dirty and is used for optimization purposes - dirty pages are + * not budgeted so the flag shows that 'ubifs_write_end()' should not release + * the budget for this page. The @PG_checked flag is set if full budgeting is + * required for the page e.g., when it corresponds to a file hole or it is + * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because + * it is OK to fail in this function, and the budget is released in + * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry + * information about how the page was budgeted, to make it possible to release + * the budget properly. * - * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations - * we implement. However, this is not true for '->writepage()', which might be - * called with 'i_mutex' unlocked. For example, when pdflush is performing - * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the - * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is - * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim - * path'. So, in '->writepage()' we are only guaranteed that the page is - * locked. + * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we + * implement. However, this is not true for 'ubifs_writepage()', which may be + * called with @i_mutex unlocked. For example, when flusher thread is doing + * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. + * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. + * in the "sys_write -> alloc_pages -> direct reclaim path". So, in + * 'ubifs_writepage()' we are only guaranteed that the page is locked. * - * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., - * readahead path does not have it locked ("sys_read -> generic_file_aio_read - * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is - * not set as well. However, UBIFS disables readahead. - * - * This, for example means that there might be 2 concurrent '->writepage()' - * calls for the same inode, but different inode dirty pages. + * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the + * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not + * set as well. However, UBIFS disables readahead. */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/slab.h> static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) @@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, return err; } - ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum); - + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); len = le32_to_cpu(dn->size); if (len <= 0 || len > UBIFS_BLOCK_SIZE) goto dump; @@ -98,7 +98,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err("bad data node (block %u, inode %lu)", block, inode->i_ino); - dbg_dump_node(c, dn); + ubifs_dump_node(c, dn); return -EINVAL; } @@ -147,6 +147,12 @@ static int do_readpage(struct page *page) err = ret; if (err != -ENOENT) break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); } } if (++i >= UBIFS_BLOCKS_PER_PAGE) @@ -207,13 +213,14 @@ static void release_new_page_budget(struct ubifs_info *c) */ static void release_existing_page_budget(struct ubifs_info *c) { - struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; ubifs_release_budget(c, &req); } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -241,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) { ubifs_release_budget(c, &req); return -ENOMEM; } if (!PageUptodate(page)) { - if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) SetPageChecked(page); else { err = do_readpage(page); @@ -423,31 +430,35 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; struct page *page; - ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + ubifs_assert(!c->ro_media && !c->ro_mount); - if (unlikely(c->ro_media)) + if (unlikely(c->ro_error)) return -EROFS; /* Try out the fast-path part first */ - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) return -ENOMEM; if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { /* * We change whole page so no need to load it. But we - * have to set the @PG_checked flag to make the further - * code the page is new. This might be not true, but it - * is better to budget more that to read the page from - * the media. + * do not know whether this page exists on the media or + * not, so we assume the latter because it requires + * larger budget. The assumption is that it is better + * to budget a bit more than to read the page from the + * media. Thus, we are setting the @PG_checked flag + * here. */ SetPageChecked(page); - else { + skipped_read = 1; + } else { err = do_readpage(page); if (err) { unlock_page(page); @@ -464,6 +475,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) { ubifs_assert(err == -ENOSPC); /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the * request. Write-back cannot be done now, while we have the @@ -477,12 +496,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, pagep, flags); } /* - * Whee, we aquired budgeting quickly - without involving - * garbage-collection, committing or forceing write-back. We return + * Whee, we acquired budgeting quickly - without involving + * garbage-collection, committing or forcing write-back. We return * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ @@ -543,10 +562,11 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, dbg_gen("copied %d instead of %d, read page and repeat", copied, len); cancel_budget(c, page, ui, appending); + ClearPageChecked(page); /* * Return 0 to force VFS to repeat the whole operation, or the - * error code if 'do_readpage()' failes. + * error code if 'do_readpage()' fails. */ copied = do_readpage(page); goto out; @@ -577,8 +597,297 @@ out: return copied; } +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. + */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read information + * @page1: first page to read + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, + struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + int err, page_idx, page_cnt, ret = 0, n = 0; + int allocate = bu->buf ? 0 : 1; + loff_t isize; + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. + */ + goto out_bu_off; + } + + if (bu->cnt) { + if (allocate) { + /* + * Allocate bulk-read buffer depending on how many data + * nodes we are going to read. + */ + bu->buf_len = bu->zbranch[bu->cnt - 1].offs + + bu->zbranch[bu->cnt - 1].len - + bu->zbranch[0].offs; + ubifs_assert(bu->buf_len > 0); + ubifs_assert(bu->buf_len <= c->leb_size); + bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN); + if (!bu->buf) + goto out_bu_off; + } + + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + if (allocate) + kfree(bu->buf); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; + +out_bu_off: + ui->read_in_a_row = ui->bulk_read = 0; + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + struct bu_info *bu; + int err = 0, allocated = 0; + + ui->last_page_read = index; + if (!c->bulk_read) + return 0; + + /* + * Bulk-read is protected by @ui->ui_mutex, but it is an optimization, + * so don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + + /* + * If possible, try to use pre-allocated bulk-read information, which + * is protected by @c->bu_mutex. + */ + if (mutex_trylock(&c->bu_mutex)) + bu = &c->bu; + else { + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN); + if (!bu) + goto out_unlock; + + bu->buf = NULL; + allocated = 1; + } + + bu->buf_len = c->max_bu_buf_len; + data_key_init(c, &bu->key, inode->i_ino, + page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, page); + + if (!allocated) + mutex_unlock(&c->bu_mutex); + else + kfree(bu); + +out_unlock: + mutex_unlock(&ui->ui_mutex); + return err; +} + static int ubifs_readpage(struct file *file, struct page *page) { + if (ubifs_bulk_read(page)) + return 0; do_readpage(page); unlock_page(page); return 0; @@ -594,8 +903,9 @@ static int do_writepage(struct page *page, int len) struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG + struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT); spin_unlock(&ui->ui_lock); #endif @@ -654,7 +964,7 @@ static int do_writepage(struct page *page, int len) * whole index and correct all inode sizes, which is long an unacceptable. * * To prevent situations like this, UBIFS writes pages back only if they are - * within last synchronized inode size, i.e. the the size which has been + * within the last synchronized inode size, i.e. the size which has been * written to the flash media last time. Otherwise, UBIFS forces inode * write-back, thus making sure the on-flash inode contains current inode size, * and then keeps writing pages back. @@ -663,11 +973,15 @@ static int do_writepage(struct page *page, int len) * the page locked, and it locks @ui_mutex. However, write-back does take inode * @i_mutex, which means other VFS operations may be run on this inode at the * same time. And the problematic one is truncation to smaller size, from where - * we have to call 'vmtruncate()', which first changes @inode->i_size, then - * drops the truncated pages. And while dropping the pages, it takes the page - * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with - * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This - * means that @inode->i_size is changed while @ui_mutex is unlocked. + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * XXX(truncate): with the new truncate sequence this is not true anymore, + * and the calls to truncate_setsize can be move around freely. They should + * be moved to the very end of the truncate sequence. * * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond * inode size. How do we do this if @inode->i_size may became smaller while we @@ -708,7 +1022,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? */ if (page->index < end_index) { if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; /* @@ -730,13 +1044,13 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (i_size > synced_i_size) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; } @@ -793,7 +1107,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int err; struct ubifs_budget_req req; loff_t old_size = inode->i_size, new_size = attr->ia_size; - int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,12 +1125,17 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, /* A funny way to budget for truncation node */ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; err = ubifs_budget_space(c, &req); - if (err) - return err; + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } - err = vmtruncate(inode, new_size); - if (err) - goto out_budg; + truncate_setsize(inode, new_size); if (offset) { pgoff_t index = new_size >> PAGE_CACHE_SHIFT; @@ -863,13 +1182,18 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); - /* The other attributes may be changed at the same time as well */ + /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); - err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); + out_budg: - ubifs_release_budget(c, &req); + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } return err; } @@ -890,7 +1214,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, loff_t new_size = attr->ia_size; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) @@ -898,16 +1222,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (attr->ia_valid & ATTR_SIZE) { dbg_gen("size %lld -> %lld", inode->i_size, new_size); - err = vmtruncate(inode, new_size); - if (err) - goto out; + truncate_setsize(inode, new_size); } mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); - /* 'vmtruncate()' changed @i_size, update @ui_size */ + /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -927,11 +1249,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (release) ubifs_release_budget(c, &req); if (IS_SYNC(inode)) - err = inode->i_sb->s_op->write_inode(inode, 1); - return err; - -out: - ubifs_release_budget(c, &req); + err = inode->i_sb->s_op->write_inode(inode, NULL); return err; } @@ -941,12 +1259,13 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; - dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); err = inode_change_ok(inode, attr); if (err) return err; - err = dbg_check_synced_i_size(inode); + err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -959,13 +1278,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) return err; } -static void ubifs_invalidatepage(struct page *page, unsigned long offset) +static void ubifs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; ubifs_assert(PagePrivate(page)); - if (offset) + if (offset || length < PAGE_CACHE_SIZE) /* Partial page remains dirty */ return; @@ -987,22 +1307,31 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; int err; dbg_gen("syncing inode %lu", inode->i_ino); - /* - * VFS has already synchronized dirty pages for this inode. Synchronize - * the inode unless this is a 'datasync()' call. - */ + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ + return 0; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&inode->i_mutex); + + /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) - return err; + goto out; } /* @@ -1010,10 +1339,9 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) * them. */ err = ubifs_sync_wbufs_by_inode(c, inode); - if (err) - return err; - - return 0; +out: + mutex_unlock(&inode->i_mutex); + return err; } /** @@ -1036,22 +1364,22 @@ static inline int mctime_update_needed(const struct inode *inode, /** * update_ctime - update mtime and ctime of an inode. - * @c: UBIFS file-system description object * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to * current time. Returns zero in case of success and a negative error code in * case of failure. */ -static int update_mctime(struct ubifs_info *c, struct inode *inode) +static int update_mctime(struct inode *inode) { struct timespec now = ubifs_current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; if (mctime_update_needed(inode, &now)) { int err, release; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) @@ -1069,29 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) return 0; } -static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int err; - ssize_t ret; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct ubifs_info *c = inode->i_sb->s_fs_info; - - err = update_mctime(c, inode); + int err = update_mctime(file_inode(iocb->ki_filp)); if (err) return err; - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret < 0) - return ret; - - if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { - err = ubifs_sync_wbufs_by_inode(c, inode); - if (err) - return err; - } - - return ret; + return generic_file_write_iter(iocb, from); } static int ubifs_set_page_dirty(struct page *page) @@ -1123,12 +1435,14 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) } /* - * mmap()d file has taken write protection fault and is being made - * writable. UBIFS must ensure page is budgeted for. + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; @@ -1136,10 +1450,10 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, i_size_read(inode)); - ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); + ubifs_assert(!c->ro_media && !c->ro_mount); - if (unlikely(c->ro_media)) - return -EROFS; + if (unlikely(c->ro_error)) + return VM_FAULT_SIGBUS; /* -EROFS */ /* * We have not locked @page so far so we may budget for changing the @@ -1170,9 +1484,9 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) err = ubifs_budget_space(c, &req); if (unlikely(err)) { if (err == -ENOSPC) - ubifs_warn("out of space for mmapped file " - "(inode number %lu)", inode->i_ino); - return err; + ubifs_warn("out of space for mmapped file (inode number %lu)", + inode->i_ino); + return VM_FAULT_SIGBUS; } lock_page(page); @@ -1206,25 +1520,28 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) ubifs_release_dirty_inode_budget(c, ui); } - unlock_page(page); - return 0; + wait_for_stable_page(page); + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); ubifs_release_budget(c, &req); + if (err) + err = VM_FAULT_SIGBUS; return err; } -static struct vm_operations_struct ubifs_file_vm_ops = { +static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; - /* 'generic_file_mmap()' takes care of NOMMU case */ err = generic_file_mmap(file, vma); if (err) return err; @@ -1232,7 +1549,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -struct address_space_operations ubifs_file_address_operations = { +const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, .write_begin = ubifs_write_begin, @@ -1242,34 +1559,33 @@ struct address_space_operations ubifs_file_address_operations = { .releasepage = ubifs_releasepage, }; -struct inode_operations ubifs_file_inode_operations = { +const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; -struct inode_operations ubifs_symlink_inode_operations = { +const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ubifs_follow_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, }; -struct file_operations ubifs_file_operations = { +const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ubifs_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ubifs_write_iter, .mmap = ubifs_file_mmap, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 10394c54836..2dcf3d473fe 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * dirty index heap, and it falls-back to LPT scanning if the heaps are empty * or do not have an LEB which satisfies the @min_space criteria. * - * Note: - * o LEBs which have less than dead watermark of dirty space are never picked - * by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. * * The additional @pick_free argument controls if this function has to return a * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * * In addition @pick_free is set to %2 by the recovery process in order to * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". */ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int min_space, int pick_free) @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int lebs, rsvd_idx_lebs = 0; spin_lock(&c->space_lock); - lebs = c->lst.empty_lebs; + lebs = c->lst.empty_lebs + c->idx_gc_cnt; lebs += c->freeable_cnt - c->lst.taken_empty_lebs; /* @@ -254,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, * But if the index takes fewer LEBs than it is reserved for it, * this function must avoid picking those reserved LEBs. */ - if (c->min_idx_lebs >= c->lst.idx_lebs) { - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; exclude_index = 1; } spin_unlock(&c->space_lock); @@ -278,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, pick_free = 0; } else { spin_lock(&c->space_lock); - exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); spin_unlock(&c->space_lock); } @@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, idx_lp = idx_heap->arr[0]; sum = idx_lp->free + idx_lp->dirty; /* - * Since we reserve twice as more space for the index than it + * Since we reserve thrice as much space for the index than it * actually takes, it does not make sense to pick indexing LEBs - * with less than half LEB of dirty space. + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. */ if (sum < min_space || sum < c->half_leb_size) idx_lp = NULL; @@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, lp = idx_lp; if (lp) { - ubifs_assert(lp->dirty >= c->dead_wm); + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); goto found; } @@ -475,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * ubifs_find_free_space - find a data LEB with free space. * @c: the UBIFS file-system description object * @min_space: minimum amount of required free space - * @free: contains amount of free space in the LEB on exit + * @offs: contains offset of where free space starts on exit * @squeeze: whether to try to find space in a non-empty LEB first * * This function looks for an LEB with at least @min_space bytes of free space. @@ -487,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * failed to find a LEB with @min_space bytes of free space and other a negative * error codes in case of failure. */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze) { const struct ubifs_lprops *lprops; @@ -498,13 +501,12 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, /* Check if there are enough empty LEBs for commit */ spin_lock(&c->space_lock); - if (c->min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; - ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); if (rsvd_idx_lebs < lebs) /* * OK to allocate an empty LEB, but we still don't want to go @@ -556,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, spin_unlock(&c->space_lock); } - *free = lprops->free; + *offs = c->leb_size - lprops->free; ubifs_release_lprops(c); - if (*free == c->leb_size) { + if (*offs == 0) { /* * Ensure that empty LEBs have been unmapped. They may not have * been, for example, because of an unclean unmount. Also @@ -571,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, return err; } - dbg_find("found LEB %d, free %d", lnum, *free); - ubifs_assert(*free >= min_space); + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); return lnum; out: @@ -679,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c) if (!lprops) { lprops = ubifs_fast_find_freeable(c); if (!lprops) { - ubifs_assert(c->freeable_cnt == 0); - if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + /* + * The first condition means the following: go scan the + * LPT if there are uncategorized lprops, which means + * there may be freeable LEBs there (UBIFS does not + * store the information about freeable LEBs in the + * master node). + */ + if (c->in_a_category_cnt != c->main_lebs || + c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { + ubifs_assert(c->freeable_cnt == 0); lprops = scan_for_leb_for_idx(c); if (IS_ERR(lprops)) { err = PTR_ERR(lprops); @@ -899,11 +909,11 @@ static int get_idx_gc_leb(struct ubifs_info *c) * it is needed now for this commit. */ lp = ubifs_lpt_lookup_dirty(c, lnum); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, lp->flags | LPROPS_INDEX, -1); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); @@ -937,8 +947,8 @@ static int find_dirtiest_idx_leb(struct ubifs_info *c) } dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); - ubifs_assert(lp->flags | LPROPS_TAKEN); - ubifs_assert(lp->flags | LPROPS_INDEX); + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(lp->flags & LPROPS_INDEX); return lnum; } diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac2908..9718da86ad0 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -31,21 +31,35 @@ * to be reused. Garbage collection will cause the number of dirty index nodes * to grow, however sufficient space is reserved for the index to ensure the * commit will never run out of space. + * + * Notes about dead watermark. At current UBIFS implementation we assume that + * LEBs which have less than @c->dead_wm bytes of free + dirty space are full + * and not worth garbage-collecting. The dead watermark is one min. I/O unit + * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS + * Garbage Collector has to synchronize the GC head's write buffer before + * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can + * actually reclaim even very small pieces of dirty space by garbage collecting + * enough dirty LEBs, but we do not bother doing this at this implementation. + * + * Notes about dark watermark. The results of GC work depends on how big are + * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed, + * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would + * have to waste large pieces of free space at the end of LEB B, because nodes + * from LEB A would not fit. And the worst situation is when all nodes are of + * maximum size. So dark watermark is the amount of free + dirty space in LEB + * which are guaranteed to be reclaimable. If LEB has less space, the GC might + * be unable to reclaim it. So, LEBs with free + dirty greater than dark + * watermark are "good" LEBs from GC's point of few. The other LEBs are not so + * good, and GC takes extra care when moving them. */ +#include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/list_sort.h> #include "ubifs.h" /* - * GC tries to optimize the way it fit nodes to available space, and it sorts - * nodes a little. The below constants are watermarks which define "large", - * "medium", and "small" nodes. - */ -#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) -#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ - -/* - * GC may need to move more then one LEB to make progress. The below constants + * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. */ @@ -86,86 +100,257 @@ static int switch_gc_head(struct ubifs_info *c) if (err) return err; + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; c->gc_lnum = -1; - err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM); + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0); return err; } /** - * move_nodes - move nodes. + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. + */ +static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + ubifs_assert(sa->type == UBIFS_DATA_NODE); + ubifs_assert(sb->type == UBIFS_DATA_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. + */ +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + if (a == b) + return 0; + + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + + ubifs_assert(key_type(c, &sa->key) != UBIFS_DATA_KEY && + key_type(c, &sb->key) != UBIFS_DATA_KEY); + ubifs_assert(sa->type != UBIFS_DATA_NODE && + sb->type != UBIFS_DATA_NODE); + + /* Inodes go before directory entries */ + if (sa->type == UBIFS_INO_NODE) { + if (sb->type == UBIFS_INO_NODE) + return sb->len - sa->len; + return -1; + } + if (sb->type == UBIFS_INO_NODE) + return 1; + + ubifs_assert(key_type(c, &sa->key) == UBIFS_DENT_KEY || + key_type(c, &sa->key) == UBIFS_XENT_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DENT_KEY || + key_type(c, &sb->key) == UBIFS_XENT_KEY); + ubifs_assert(sa->type == UBIFS_DENT_NODE || + sa->type == UBIFS_XENT_NODE); + ubifs_assert(sb->type == UBIFS_DENT_NODE || + sb->type == UBIFS_XENT_NODE); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. * @c: UBIFS file-system description object - * @sleb: describes nodes to move + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here * - * This function moves valid nodes from data LEB described by @sleb to the GC - * journal head. The obsolete nodes are dropped. + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; * - * When moving nodes we have to deal with classical bin-packing problem: the - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", - * where the nodes in the @sleb->nodes list are the elements which should be - * fit optimally to the bins. This function uses the "first fit decreasing" - * strategy, although it does not really sort the nodes but just split them on - * 3 classes - large, medium, and small, so they are roughly sorted. + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. * - * This function returns zero in case of success, %-EAGAIN if commit is - * required, and other negative error codes in case of other failures. + * This function returns zero in case of success and a negative error code in + * case of failure. */ -static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) { + int err; struct ubifs_scan_node *snod, *tmp; - struct list_head large, medium, small; - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; - int avail, err, min = INT_MAX; - INIT_LIST_HEAD(&large); - INIT_LIST_HEAD(&medium); - INIT_LIST_HEAD(&small); + *min = INT_MAX; + /* Separate data nodes and non-data nodes */ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - struct list_head *lst; + ubifs_assert(snod->type == UBIFS_INO_NODE || + snod->type == UBIFS_DATA_NODE || + snod->type == UBIFS_DENT_NODE || + snod->type == UBIFS_XENT_NODE || + snod->type == UBIFS_TRUN_NODE); + + if (snod->type != UBIFS_INO_NODE && + snod->type != UBIFS_DATA_NODE && + snod->type != UBIFS_DENT_NODE && + snod->type != UBIFS_XENT_NODE) { + /* Probably truncation node, zap it */ + list_del(&snod->list); + kfree(snod); + continue; + } - ubifs_assert(snod->type != UBIFS_IDX_NODE); - ubifs_assert(snod->type != UBIFS_REF_NODE); - ubifs_assert(snod->type != UBIFS_CS_NODE); + ubifs_assert(key_type(c, &snod->key) == UBIFS_DATA_KEY || + key_type(c, &snod->key) == UBIFS_INO_KEY || + key_type(c, &snod->key) == UBIFS_DENT_KEY || + key_type(c, &snod->key) == UBIFS_XENT_KEY); err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, snod->offs, 0); if (err < 0) - goto out; + return err; - lst = &snod->list; - list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ + list_del(&snod->list); kfree(snod); continue; } - /* - * Sort the list of nodes so that large nodes go first, and - * small nodes go last. - */ - if (snod->len > MEDIUM_NODE_WM) - list_add(lst, &large); - else if (snod->len > SMALL_NODE_WM) - list_add(lst, &medium); - else - list_add(lst, &small); - - /* And find the smallest node */ - if (snod->len < min) - min = snod->len; + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); } - /* - * Join the tree lists so that we'd have one roughly sorted list - * ('large' will be the head of the joined list). - */ - list_splice(&medium, large.prev); - list_splice(&small, large.prev); + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + + err = dbg_check_data_nodes_order(c, &sleb->nodes); + if (err) + return err; + err = dbg_check_nondata_nodes_order(c, nondata); + if (err) + return err; + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; if (wbuf->lnum == -1) { /* @@ -174,42 +359,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ err = switch_gc_head(c); if (err) - goto out; + return err; } + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + /* Write nodes to their new location. Use the first-fit strategy */ while (1) { - avail = c->leb_size - wbuf->offs - wbuf->used; - list_for_each_entry_safe(snod, tmp, &large, list) { - int new_lnum, new_offs; + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; if (avail < min) break; - if (snod->len > avail) - /* This node does not fit */ + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; continue; + } - cond_resched(); - - new_lnum = wbuf->lnum; - new_offs = wbuf->offs + wbuf->used; - err = ubifs_wbuf_write_nolock(wbuf, snod->node, - snod->len); + err = move_node(c, sleb, snod, wbuf); if (err) goto out; - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, - snod->offs, new_lnum, new_offs, - snod->len); - if (err) - goto out; - - avail = c->leb_size - wbuf->offs - wbuf->used; - list_del(&snod->list); - kfree(snod); } - if (list_empty(&large)) + if (list_empty(&sleb->nodes) && list_empty(&nondata)) break; /* @@ -224,10 +426,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) return 0; out: - list_for_each_entry_safe(snod, tmp, &large, list) { - list_del(&snod->list); - kfree(snod); - } + list_splice_tail(&nondata, &sleb->nodes); return err; } @@ -279,11 +478,42 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) ubifs_assert(c->gc_lnum != lnum); ubifs_assert(wbuf->lnum != lnum); + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. + */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + /* * We scan the entire LEB even though we only really need to scan up to * (c->leb_size - lp->free). */ - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); if (IS_ERR(sleb)) return PTR_ERR(sleb); @@ -319,7 +549,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) /* * Don't release the LEB until after the next commit, because - * it may contain date which is needed for recovery. So + * it may contain data which is needed for recovery. So * although we freed this LEB, it will become usable only after * the commit. */ @@ -334,15 +564,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) err = move_nodes(c, sleb); if (err) - goto out; + goto out_inc_seq; err = gc_sync_wbufs(c); if (err) - goto out; + goto out_inc_seq; err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); if (err) - goto out; + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); if (c->gc_lnum == -1) { c->gc_lnum = lnum; @@ -363,6 +599,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) out: ubifs_scan_destroy(sleb); return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; } /** @@ -408,13 +652,14 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; ubifs_assert_cmt_locked(c); + ubifs_assert(!c->ro_media && !c->ro_mount); if (ubifs_gc_should_commit(c)) return -EAGAIN; mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); - if (c->ro_media) { + if (c->ro_error) { ret = -EROFS; goto out_unlock; } @@ -423,8 +668,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ubifs_assert(!wbuf->used); for (i = 0; ; i++) { - int space_before = c->leb_size - wbuf->offs - wbuf->used; - int space_after; + int space_before, space_after; cond_resched(); @@ -469,40 +713,9 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) break; } - dbg_gc("found LEB %d: free %d, dirty %d, sum %d " - "(min. space %d)", lp.lnum, lp.free, lp.dirty, - lp.free + lp.dirty, min_space); - - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - dbg_gc("LEB %d is free, return it", lp.lnum); - /* - * ubifs_find_dirty_leb() doesn't return freeable index - * LEBs. - */ - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - if (lp.free != c->leb_size) { - /* - * Write buffers must be sync'd before - * unmapping freeable LEBs, because one of them - * may contain data which obsoletes something - * in 'lp.pnum'. - */ - ret = gc_sync_wbufs(c); - if (ret) - goto out; - ret = ubifs_change_one_lp(c, lp.lnum, - c->leb_size, 0, 0, 0, - 0); - if (ret) - goto out; - } - ret = ubifs_leb_unmap(c, lp.lnum); - if (ret) - goto out; - ret = lp.lnum; - break; - } + dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)", + lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, + min_space); space_before = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum == -1) @@ -510,14 +723,12 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) ret = ubifs_garbage_collect_leb(c, &lp); if (ret < 0) { - if (ret == -EAGAIN || ret == -ENOSPC) { + if (ret == -EAGAIN) { /* - * These codes are not errors, so we have to - * return the LEB to lprops. But if the - * 'ubifs_return_leb()' function fails, its - * failure code is propagated to the caller - * instead of the original '-EAGAIN' or - * '-ENOSPC'. + * This is not error, so we have to return the + * LEB to lprops. But if 'ubifs_return_leb()' + * fails, its failure code is propagated to the + * caller instead of the original '-EAGAIN'. */ err = ubifs_return_leb(c, lp.lnum); if (err) @@ -607,8 +818,8 @@ out_unlock: out: ubifs_assert(ret < 0); ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); - ubifs_ro_mode(c, ret); ubifs_wbuf_sync_nolock(wbuf); + ubifs_ro_mode(c, ret); mutex_unlock(&wbuf->io_mutex); ubifs_return_leb(c, lp.lnum); return ret; @@ -639,7 +850,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -651,7 +862,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) if (err) goto out; lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -666,7 +877,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Record index freeable LEBs for unmapping after commit */ while (1) { lp = ubifs_fast_find_frdi_idx(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -682,7 +893,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Don't release the LEB until after the next commit */ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); kfree(idx_gc); goto out; @@ -734,8 +945,9 @@ out: * ubifs_destroy_idx_gc - destroy idx_gc list. * @c: UBIFS file-system description object * - * This function destroys the idx_gc list. It is called when unmounting or - * remounting read-only so locks are not needed. + * This function destroys the @c->idx_gc list. It is called when unmounting + * so locks are not needed. Returns zero in case of success and a negative + * error code in case of failure. */ void ubifs_destroy_idx_gc(struct ubifs_info *c) { @@ -748,7 +960,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c) list_del(&idx_gc->list); kfree(idx_gc); } - } /** diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3374f91b670..2290d586672 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -29,7 +29,27 @@ * would have been wasted for padding to the nearest minimal I/O unit boundary. * Instead, data first goes to the write-buffer and is flushed when the * buffer is full or when it is not used for some time (by timer). This is - * similarto the mechanism is used by JFFS2. + * similar to the mechanism is used by JFFS2. + * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code @@ -46,20 +66,154 @@ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it * uses padding nodes or padding bytes, if the padding node does not fit. * - * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes - * every time they are read from the flash media. + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. */ #include <linux/crc32.h> +#include <linux/slab.h> #include "ubifs.h" /** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_error) { + c->ro_error = 1; + c->no_chk_data_crc = 0; + c->vfs_sb->s_flags |= MS_RDONLY; + ubifs_warn("switched to read-only mode, error %d", err); + dump_stack(); + } +} + +/* + * Below are simple wrappers over UBI I/O functions which include some + * additional checks and UBIFS debugging stuff. See corresponding UBI function + * for more information. + */ + +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg) +{ + int err; + + err = ubi_read(c->ubi, lnum, buf, offs, len); + /* + * In case of %-EBADMSG print the error message only if the + * @even_ebadmsg is true. + */ + if (err && (err != -EBADMSG || even_ebadmsg)) { + ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", + len, lnum, offs, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); + else + err = dbg_leb_write(c, lnum, buf, offs, len); + if (err) { + ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", + len, lnum, offs, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_change(c->ubi, lnum, buf, len); + else + err = dbg_leb_change(c, lnum, buf, len); + if (err) { + ubifs_err("changing %d bytes in LEB %d failed, error %d", + len, lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_unmap(struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_unmap(c->ubi, lnum); + else + err = dbg_leb_unmap(c, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_leb_map(struct ubifs_info *c, int lnum) +{ + int err; + + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; + if (!dbg_is_tst_rcvry(c)) + err = ubi_leb_map(c->ubi, lnum); + else + err = dbg_leb_map(c, lnum); + if (err) { + ubifs_err("mapping LEB %d failed, error %d", lnum, err); + ubifs_ro_mode(c, err); + dump_stack(); + } + return err; +} + +int ubifs_is_mapped(const struct ubifs_info *c, int lnum) +{ + int err; + + err = ubi_is_mapped(c->ubi, lnum); + if (err < 0) { + ubifs_err("ubi_is_mapped failed for LEB %d, error %d", + lnum, err); + dump_stack(); + } + return err; +} + +/** * ubifs_check_node - check node. * @c: UBIFS file-system description object * @buf: node to check * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages + * @must_chk_crc: indicates whether to always check the CRC * * This function checks node magic number and CRC checksum. This function also * validates node length to prevent UBIFS from becoming crazy when an attacker @@ -67,11 +221,21 @@ * node length in the common header could cause UBIFS to read memory outside of * allocated buffer when checking the CRC checksum. * - * This function returns zero in case of success %-EUCLEAN in case of bad CRC - * or magic. + * This function may skip data nodes CRC checking if @c->no_chk_data_crc is + * true, which is controlled by corresponding UBIFS mount option. However, if + * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. + * + * This function returns zero in case of success and %-EUCLEAN in case of bad + * CRC or magic. */ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet) + int offs, int quiet, int must_chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -107,6 +271,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) { @@ -125,8 +293,8 @@ out_len: out: if (!quiet) { ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); } return err; } @@ -267,13 +435,15 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) * * This function is called when the write-buffer timer expires. */ -static void wbuf_timer_callback_nolock(unsigned long data) +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) { - struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); + dbg_io("jhead %s", dbg_jhead(wbuf->jhead)); wbuf->need_sync = 1; wbuf->c->need_wbuf_sync = 1; ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; } /** @@ -282,13 +452,17 @@ static void wbuf_timer_callback_nolock(unsigned long data) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { - ubifs_assert(!timer_pending(&wbuf->timer)); + ubifs_assert(!hrtimer_active(&wbuf->timer)); - if (!wbuf->timeout) + if (wbuf->no_timer) return; - - wbuf->timer.expires = jiffies + wbuf->timeout; - add_timer(&wbuf->timer); + dbg_io("set timer for jhead %s, %llu-%llu millisecs", + dbg_jhead(wbuf->jhead), + div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, + USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); } /** @@ -297,13 +471,10 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) */ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { - /* - * If the syncer is waiting for the lock (from the background thread's - * context) and another task is changing write-buffer then the syncing - * should be canceled. - */ + if (wbuf->no_timer) + return; wbuf->need_sync = 0; - del_timer(&wbuf->timer); + hrtimer_cancel(&wbuf->timer); } /** @@ -312,41 +483,68 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * * This function synchronizes write-buffer @buf and returns zero in case of * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. */ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) { struct ubifs_info *c = wbuf->c; - int err, dirt; + int err, dirt, sync_len; cancel_wbuf_timer_nolock(wbuf); if (!wbuf->used || wbuf->lnum == -1) /* Write-buffer is empty or not seeked */ return 0; - dbg_io("LEB %d:%d, %d bytes", - wbuf->lnum, wbuf->offs, wbuf->used); - ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); + dbg_io("LEB %d:%d, %d bytes, jhead %s", + wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(wbuf->avail & 7)); - ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); - - if (c->ro_media) + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); + + if (c->ro_error) return -EROFS; - ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d", - c->min_io_size, wbuf->lnum, wbuf->offs); - dbg_dump_stack(); + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len); + if (err) return err; - } - - dirt = wbuf->avail; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -362,37 +560,34 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @wbuf: write-buffer * @lnum: logical eraseblock number to seek to * @offs: logical eraseblock offset to seek to - * @dtype: data type * - * This function targets the write buffer to logical eraseblock @lnum:@offs. - * The write-buffer is synchronized if it is not empty. Returns zero in case of - * success and a negative error code in case of failure. + * This function targets the write-buffer to logical eraseblock @lnum:@offs. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. */ -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype) +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs) { const struct ubifs_info *c = wbuf->c; - dbg_io("LEB %d:%d", lnum, offs); + dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead)); ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); ubifs_assert(offs >= 0 && offs <= c->leb_size); ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); ubifs_assert(lnum != wbuf->lnum); - - if (wbuf->used > 0) { - int err = ubifs_wbuf_sync_nolock(wbuf); - - if (err) - return err; - } + ubifs_assert(wbuf->used == 0); spin_lock(&wbuf->lock); wbuf->lnum = lnum; wbuf->offs = offs; - wbuf->avail = c->min_io_size; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); - wbuf->dtype = dtype; return 0; } @@ -409,11 +604,12 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c) { int err, i; + ubifs_assert(!c->ro_media && !c->ro_mount); if (!c->need_wbuf_sync) return 0; c->need_wbuf_sync = 0; - if (c->ro_media) { + if (c->ro_error) { err = -EROFS; goto out_timers; } @@ -468,8 +664,9 @@ out_timers: * * This function writes data to flash via write-buffer @wbuf. This means that * the last piece of the node won't reach the flash media immediately if it - * does not take whole minimal I/O unit. Instead, the node will sit in RAM - * until the write-buffer is synchronized (e.g., by timer). + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). * * This function returns zero in case of success and a negative error code in * case of failure. If the node cannot be written because there is no more @@ -478,16 +675,23 @@ out_timers: int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8), offs; + int err, written, n, aligned_len = ALIGN(len, 8); - dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len, - dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum, - wbuf->offs + wbuf->used); + dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used); ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); - ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -496,7 +700,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) cancel_wbuf_timer_nolock(wbuf); - if (c->ro_media) + if (c->ro_error) return -EROFS; if (aligned_len <= wbuf->avail) { @@ -507,17 +711,20 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) memcpy(wbuf->buf + wbuf->used, buf, len); if (aligned_len == wbuf->avail) { - dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, - wbuf->offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, c->min_io_size, - wbuf->dtype); + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, + wbuf->offs, wbuf->size); if (err) goto out; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -531,38 +738,63 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - /* - * The node is large enough and does not fit entirely within current - * minimal I/O unit. We have to fill and flush write-buffer and switch - * to the next min. I/O unit. - */ - dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); - memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); - if (err) - goto out; + written = 0; - offs = wbuf->offs + c->min_io_size; - len -= wbuf->avail; - aligned_len -= wbuf->avail; - written = wbuf->avail; + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, + wbuf->size); + if (err) + goto out; + + wbuf->offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } /* - * The remaining data may take more whole min. I/O units, so write the - * remains multiple to min. I/O unit size directly to the flash media. + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. * We align node length to 8-byte boundary because we anyway flash wbuf * if the remaining space is less than 8 bytes. */ - n = aligned_len >> c->min_io_shift; + n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->min_io_shift; - dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, - wbuf->dtype); + n <<= c->max_write_shift; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, n); if (err) goto out; - offs += n; + wbuf->offs += n; aligned_len -= n; len -= n; written += n; @@ -572,14 +804,17 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len) /* * And now we have what's left and what does not take whole - * min. I/O unit, so write it to the write-buffer and we are + * max. write unit, so write it to the write-buffer and we are * done. */ memcpy(wbuf->buf, buf + written, len); - wbuf->offs = offs; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; wbuf->used = aligned_len; - wbuf->avail = c->min_io_size - aligned_len; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -600,9 +835,9 @@ exit: out: ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - dbg_dump_node(c, buf); - dbg_dump_stack(); - dbg_dump_leb(c, wbuf->lnum); + ubifs_dump_node(c, buf); + dump_stack(); + ubifs_dump_leb(c, wbuf->lnum); return err; } @@ -613,7 +848,6 @@ out: * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock - * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) * * This function automatically fills node magic number, assigns sequence * number, and calculates node CRC checksum. The length of the @buf buffer has @@ -622,7 +856,7 @@ out: * success and a negative error code in case of failure. */ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, - int offs, int dtype) + int offs) { int err, buf_len = ALIGN(len, c->min_io_size); @@ -631,18 +865,16 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, buf_len); ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + ubifs_assert(!c->ro_media && !c->ro_mount); + ubifs_assert(!c->space_fixup); - if (c->ro_media) + if (c->ro_error) return -EROFS; ubifs_prepare_node(c, buf, len, 1); - err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); - if (err) { - ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", - buf_len, lnum, offs, err); - dbg_dump_node(c, buf); - dbg_dump_stack(); - } + err = ubifs_leb_write(c, lnum, buf, offs, buf_len); + if (err) + ubifs_dump_node(c, buf); return err; } @@ -669,7 +901,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int err, rlen, overlap; struct ubifs_ch *ch = buf; - dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs, + dbg_ntype(type), len, dbg_jhead(wbuf->jhead)); ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); @@ -693,13 +926,9 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, if (rlen > 0) { /* Read everything that goes before write-buffer */ - err = ubi_read(c->ubi, lnum, buf, offs, rlen); - if (err && err != -EBADMSG) { - ubifs_err("failed to read node %d from LEB %d:%d, " - "error %d", type, lnum, offs, err); - dbg_dump_stack(); + err = ubifs_leb_read(c, lnum, buf, offs, rlen, 0); + if (err && err != -EBADMSG) return err; - } } if (type != ch->node_type) { @@ -708,7 +937,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; @@ -724,8 +953,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return -EINVAL; } @@ -754,37 +983,37 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); - err = ubi_read(c->ubi, lnum, buf, offs, len); - if (err && err != -EBADMSG) { - ubifs_err("cannot read node %d from LEB %d:%d, error %d", - type, lnum, offs, err); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); + if (err && err != -EBADMSG) return err; - } if (type != ch->node_type) { - ubifs_err("bad node type (%d but expected %d)", - ch->node_type, type); + ubifs_errc(c, "bad node type (%d but expected %d)", + ch->node_type, type); goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { - ubifs_err("expected node type %d", type); + ubifs_errc(c, "expected node type %d", type); return err; } l = le32_to_cpu(ch->len); if (l != len) { - ubifs_err("bad node length %d, expected %d", l, len); + ubifs_errc(c, "bad node length %d, expected %d", l, len); goto out; } return 0; out: - ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, + offs, ubi_is_mapped(c->ubi, lnum)); + if (!c->probing) { + ubifs_dump_node(c, buf); + dump_stack(); + } return -EINVAL; } @@ -793,18 +1022,18 @@ out: * @c: UBIFS file-system description object * @wbuf: write-buffer to initialize * - * This function initializes write buffer. Returns zero in case of success + * This function initializes write-buffer. Returns zero in case of success * %-ENOMEM in case of failure. */ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); if (!wbuf->buf) return -ENOMEM; - size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); wbuf->inodes = kmalloc(size, GFP_KERNEL); if (!wbuf->inodes) { kfree(wbuf->buf); @@ -814,25 +1043,32 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = c->min_io_size; - wbuf->dtype = UBI_UNKNOWN; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); spin_lock_init(&wbuf->lock); - wbuf->c = c; - init_timer(&wbuf->timer); - wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->timer.data = (unsigned long)wbuf; - wbuf->timeout = DEFAULT_WBUF_TIMEOUT; wbuf->next_ino = 0; + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); + wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; + wbuf->delta *= 1000000000ULL; + ubifs_assert(wbuf->delta <= ULONG_MAX); return 0; } /** * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. - * @wbuf: the write-buffer whereto add + * @wbuf: the write-buffer where to add * @inum: the inode number * * This function adds an inode number to the inode array of the write-buffer. diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 5e82cffe969..648b143606c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -25,7 +25,6 @@ /* This file implements EXT2-compatible extended attribute ioctl() calls */ #include <linux/compat.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include "ubifs.h" @@ -148,19 +147,20 @@ out_unlock: long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int flags, err; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); switch (cmd) { case FS_IOC_GETFLAGS: flags = ubifs2ioctl(ubifs_inode(inode)->flags); + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); return put_user(flags, (int __user *) arg); case FS_IOC_SETFLAGS: { if (IS_RDONLY(inode)) return -EROFS; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -173,11 +173,12 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * Make sure the file-system is read-write and make sure it * will not become read-only while we are changing the flags. */ - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) return err; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); err = setflags(inode, flags); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return err; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 283155abe5f..0e045e75abd 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; /* @@ -122,11 +122,12 @@ static int reserve_space(struct ubifs_info *c, int jhead, int len) * better to try to allocate space at the ends of eraseblocks. This is * what the squeeze parameter does. */ + ubifs_assert(!c->ro_media && !c->ro_mount); squeeze = (jhead == BASEHD); again: mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); - if (c->ro_media) { + if (c->ro_error) { err = -EROFS; goto out_unlock; } @@ -139,16 +140,9 @@ again: * Write buffer wasn't seek'ed or there is no enough space - look for an * LEB with some empty space. */ - lnum = ubifs_find_free_space(c, len, &free, squeeze); - if (lnum >= 0) { - /* Found an LEB, add it to the journal head */ - offs = c->leb_size - free; - err = ubifs_add_bud_to_log(c, jhead, lnum, offs); - if (err) - goto out_return; - /* A new bud was successfully allocated and added to the log */ + lnum = ubifs_find_free_space(c, len, &offs, squeeze); + if (lnum >= 0) goto out; - } err = lnum; if (err != -ENOSPC) @@ -159,7 +153,7 @@ again: * some. But the write-buffer mutex has to be unlocked because * GC also takes it. */ - dbg_jnl("no free space jhead %d, run GC", jhead); + dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead)); mutex_unlock(&wbuf->io_mutex); lnum = ubifs_garbage_collect(c, 0); @@ -174,7 +168,8 @@ again: * because we dropped @wbuf->io_mutex, so try once * again. */ - dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); + dbg_jnl("GC couldn't make a free LEB for jhead %s", + dbg_jhead(jhead)); if (retries++ < 2) { dbg_jnl("retry (%d)", retries); goto again; @@ -185,13 +180,13 @@ again: } mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); - dbg_jnl("got LEB %d for jhead %d", lnum, jhead); + dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead)); avail = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum != -1 && avail >= len) { /* * Someone else has switched the journal head and we have - * enough space now. This happens when more then one process is + * enough space now. This happens when more than one process is * trying to write to the same journal head at the same time. */ dbg_jnl("return LEB %d back, already have LEB %d:%d", @@ -202,13 +197,24 @@ again: return 0; } - err = ubifs_add_bud_to_log(c, jhead, lnum, 0); - if (err) - goto out_return; offs = 0; out: - err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. + */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs); if (err) goto out_unlock; @@ -256,7 +262,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len, *lnum = c->jheads[jhead].wbuf.lnum; *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; - dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); ubifs_prepare_node(c, node, len, 0); return ubifs_wbuf_write_nolock(wbuf, node, len); @@ -286,7 +293,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, *lnum = c->jheads[jhead].wbuf.lnum; *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; - dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + dbg_jnl("jhead %s, LEB %d:%d, len %d", + dbg_jhead(jhead), *lnum, *offs, len); err = ubifs_wbuf_write_nolock(wbuf, buf, len); if (err) @@ -377,11 +385,9 @@ out: if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ down_write(&c->commit_sem); - spin_lock(&c->space_lock); - dbg_dump_stack(); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - dbg_dump_lprops(c); + dump_stack(); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } @@ -447,13 +453,11 @@ static int get_dent_type(int mode) * @ino: buffer in which to pack inode node * @inode: inode to pack * @last: indicates the last node of the group - * @last_reference: non-zero if this is a deletion inode */ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, - const struct inode *inode, int last, - int last_reference) + const struct inode *inode, int last) { - int data_len = 0; + int data_len = 0, last_reference = !inode->i_nlink; struct ubifs_inode *ui = ubifs_inode(inode); ino->ch.node_type = UBIFS_INO_NODE; @@ -465,8 +469,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ino->uid = cpu_to_le32(inode->i_uid); - ino->gid = cpu_to_le32(inode->i_gid); + ino->uid = cpu_to_le32(i_uid_read(inode)); + ino->gid = cpu_to_le32(i_gid_read(inode)); ino->mode = cpu_to_le32(inode->i_mode); ino->flags = cpu_to_le32(ui->flags); ino->size = cpu_to_le64(ui->ui_size); @@ -596,9 +600,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ubifs_prep_grp_node(c, dent, dlen, 0); ino = (void *)dent + aligned_dlen; - pack_inode(c, ino, inode, 0, last_reference); + pack_inode(c, ino, inode, 0); ino = (void *)ino + aligned_ilen; - pack_inode(c, ino, dir, 1, 0); + pack_inode(c, ino, dir, 1); if (last_reference) { err = ubifs_add_orphan(c, inode->i_ino); @@ -606,6 +610,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, release_head(c, BASEHD); goto out_finish; } + ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); @@ -664,6 +669,7 @@ out_free: out_release: release_head(c, BASEHD); + kfree(dent); out_ro: ubifs_ro_mode(c, err); if (last_reference) @@ -688,23 +694,33 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len; - int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); - dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key), - key_block(c, key), len, DBGKEY(key)); + dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", + (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); - data = kmalloc(dlen, GFP_NOFS); - if (!data) - return -ENOMEM; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); data->size = cpu_to_le32(len); zero_data_node_unused(data); - if (!(ui->flags && UBIFS_COMPR_FL)) + if (!(ui->flags & UBIFS_COMPR_FL)) /* Compression is disabled for this inode */ compr_type = UBIFS_COMPR_NONE; else @@ -733,7 +749,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, goto out_ro; finish_reservation(c); - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return 0; out_release: @@ -742,7 +761,10 @@ out_ro: ubifs_ro_mode(c, err); finish_reservation(c); out_free: - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return err; } @@ -750,30 +772,25 @@ out_free: * ubifs_jnl_write_inode - flush inode to the journal. * @c: UBIFS file-system description object * @inode: inode to flush - * @deletion: inode has been deleted * * This function writes inode @inode to the journal. If the inode is * synchronous, it also synchronizes the write-buffer. Returns zero in case of * success and a negative error code in case of failure. */ -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int deletion) +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) { - int err, len, lnum, offs, sync = 0; + int err, lnum, offs; struct ubifs_ino_node *ino; struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; - dbg_jnl("ino %lu%s", inode->i_ino, - deletion ? " (last reference)" : ""); - if (deletion) - ubifs_assert(inode->i_nlink == 0); + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); - len = UBIFS_INO_NODE_SZ; /* * If the inode is being deleted, do not write the attached data. No * need to synchronize the write-buffer either. */ - if (!deletion) { + if (!last_reference) { len += ui->data_len; sync = IS_SYNC(inode); } @@ -786,7 +803,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 1, deletion); + pack_inode(c, ino, inode, 1); err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); if (err) goto out_release; @@ -795,7 +812,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, inode->i_ino); release_head(c, BASEHD); - if (deletion) { + if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) goto out_ro; @@ -828,6 +845,65 @@ out_free: } /** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** * ubifs_jnl_rename - rename a directory entry. * @c: UBIFS file-system description object * @old_dir: parent inode of directory entry to rename @@ -857,10 +933,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, int move = (old_dir != new_dir); struct ubifs_inode *uninitialized_var(new_ui); - dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu", - old_dentry->d_name.len, old_dentry->d_name.name, - old_dir->i_ino, new_dentry->d_name.len, - new_dentry->d_name.name, new_dir->i_ino); + dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", + old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); ubifs_assert(ubifs_inode(old_dir)->data_len == 0); ubifs_assert(ubifs_inode(new_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); @@ -917,16 +991,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p = (void *)dent2 + aligned_dlen2; if (new_inode) { - pack_inode(c, p, new_inode, 0, last_reference); + pack_inode(c, p, new_inode, 0); p += ALIGN(ilen, 8); } if (!move) - pack_inode(c, p, old_dir, 1, 0); + pack_inode(c, p, old_dir, 1); else { - pack_inode(c, p, old_dir, 0, 0); + pack_inode(c, p, old_dir, 0); p += ALIGN(plen, 8); - pack_inode(c, p, new_dir, 1, 0); + pack_inode(c, p, new_dir, 1); } if (last_reference) { @@ -935,6 +1009,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, release_head(c, BASEHD); goto out_finish; } + new_ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); @@ -1074,7 +1149,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ino_t inum = inode->i_ino; unsigned int blk; - dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size); + dbg_jnl("ino %lu, size %lld -> %lld", + (unsigned long)inum, old_size, new_size); ubifs_assert(!ui->data_len); ubifs_assert(S_ISREG(inode->i_mode)); ubifs_assert(mutex_is_locked(&ui->ui_mutex)); @@ -1098,7 +1174,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, dn = (void *)trun + UBIFS_TRUN_NODE_SZ; blk = new_size >> UBIFS_BLOCK_SHIFT; data_key_init(c, &key, inum, blk); - dbg_jnl("last block key %s", DBGKEY(&key)); + dbg_jnlk(&key, "last block key "); err = ubifs_tnc_lookup(c, &key, dn); if (err == -ENOENT) dlen = 0; /* Not found (so it is a hole) */ @@ -1131,7 +1207,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 0, 0); + pack_inode(c, ino, inode, 0); ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); if (dlen) ubifs_prep_grp_node(c, dn, dlen, 1); @@ -1164,7 +1240,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, data_key_init(c, &key, inum, blk); bit = old_size & (UBIFS_BLOCK_SIZE - 1); - blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1); data_key_init(c, &to_key, inum, blk); err = ubifs_tnc_remove_range(c, &key, &to_key); @@ -1189,7 +1265,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_XATTR /** * ubifs_jnl_delete_xattr - delete an extended attribute. @@ -1251,9 +1326,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, ubifs_prep_grp_node(c, xent, xlen, 0); ino = (void *)xent + aligned_xlen; - pack_inode(c, ino, inode, 0, 1); + pack_inode(c, ino, inode, 0); ino = (void *)ino + UBIFS_INO_NODE_SZ; - pack_inode(c, ino, host, 1, 0); + pack_inode(c, ino, host, 1); err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); if (!sync && !err) @@ -1310,7 +1385,7 @@ out_ro: * @host: host inode * * This function writes the updated version of an extended attribute inode and - * the host inode tho the journal (to the base head). The host inode is written + * the host inode to the journal (to the base head). The host inode is written * after the extended attribute inode in order to guarantee that the extended * attribute will be flushed when the inode is synchronized by 'fsync()' and * consequently, the write-buffer is synchronized. This function returns zero @@ -1320,7 +1395,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, const struct inode *host) { int err, len1, len2, aligned_len, aligned_len1, lnum, offs; - struct ubifs_inode *host_ui = ubifs_inode(inode); + struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_ino_node *ino; union ubifs_key key; int sync = IS_DIRSYNC(host); @@ -1344,8 +1419,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, host, 0, 0); - pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); if (!sync && !err) { @@ -1384,4 +1459,3 @@ out_free: return err; } -#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 8f747600754..92a8491a8f8 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -38,6 +38,22 @@ #define __UBIFS_KEY_H__ /** + * key_mask_hash - mask a valid hash value. + * @val: value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. + */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** * key_r5_hash - R5 hash function (borrowed from reiserfs). * @s: direntry name * @len: name length @@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len) str++; } - a &= UBIFS_S_KEY_HASH_MASK; - - /* - * We use hash values as offset in directories, so values %0 and %1 are - * reserved for "." and "..". %2 is reserved for "end of readdir" - * marker. - */ - if (unlikely(a >= 0 && a <= 2)) - a += 3; - return a; + return key_mask_hash(a); } /** @@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len) len = min_t(uint32_t, len, 4); memcpy(&a, str, len); - a &= UBIFS_S_KEY_HASH_MASK; - if (unlikely(a >= 0 && a <= 2)) - a += 3; - return a; + return key_mask_hash(a); } /** @@ -225,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c, } /** - * xent_key_init_hash - initialize extended attribute entry key without - * re-calculating hash function. - * @c: UBIFS file-system description object - * @key: key to initialize - * @inum: host inode number - * @hash: extended attribute entry name hash - */ -static inline void xent_key_init_hash(const struct ubifs_info *c, - union ubifs_key *key, ino_t inum, - uint32_t hash) -{ - ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); - key->u32[0] = inum; - key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS); -} - -/** * xent_key_init_flash - initialize on-flash extended attribute entry key. * @c: UBIFS file-system description object * @k: key to initialize @@ -291,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c, } /** - * data_key_init_flash - initialize on-flash data key. + * highest_data_key - get the highest possible data key for an inode. * @c: UBIFS file-system description object - * @k: key to initialize + * @key: key to initialize * @inum: inode number - * @block: block number */ -static inline void data_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, unsigned int block) +static inline void highest_data_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) { - union ubifs_key *key = k; - - ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK)); - key->j32[0] = cpu_to_le32(inum); - key->j32[1] = cpu_to_le32(block | - (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS)); - memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); + data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK); } /** @@ -326,6 +306,20 @@ static inline void trun_key_init(const struct ubifs_info *c, } /** + * invalid_key_init - initialize invalid node key. + * @c: UBIFS file-system description object + * @key: key to initialize + * + * This is a helper function which marks a @key object as invalid. + */ +static inline void invalid_key_init(const struct ubifs_info *c, + union ubifs_key *key) +{ + key->u32[0] = 0xDEADBEAF; + key->u32[1] = UBIFS_INVALID_KEY; +} + +/** * key_type - get key type. * @c: UBIFS file-system description object * @key: key to get type of @@ -345,7 +339,7 @@ static inline int key_type_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; - return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS; + return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS; } /** @@ -377,8 +371,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) * @c: UBIFS file-system description object * @key: the key to get hash from */ -static inline int key_hash(const struct ubifs_info *c, - const union ubifs_key *key) +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) { return key->u32[1] & UBIFS_S_KEY_HASH_MASK; } @@ -388,7 +382,7 @@ static inline int key_hash(const struct ubifs_info *c, * @c: UBIFS file-system description object * @k: the key to get hash from */ -static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; @@ -416,7 +410,7 @@ static inline unsigned int key_block_flash(const struct ubifs_info *c, { const union ubifs_key *key = k; - return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK; + return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK; } /** @@ -484,7 +478,7 @@ static inline void key_copy(const struct ubifs_info *c, * @key2: the second key to compare * * This function compares 2 keys and returns %-1 if @key1 is less than - * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. */ static inline int keys_cmp(const struct ubifs_info *c, const union ubifs_key *key1, @@ -503,6 +497,26 @@ static inline int keys_cmp(const struct ubifs_info *c, } /** + * keys_eq - determine if keys are equivalent. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** * is_hash_key - is a key vulnerable to hash collisions. * @c: UBIFS file-system description object * @key: key @@ -530,4 +544,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c) return 0; } } + #endif /* !__UBIFS_KEY_H__ */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36857b9ed59..a902c5919e4 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -29,11 +29,7 @@ #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_bud_bytes(struct ubifs_info *c); -#else -#define dbg_check_bud_bytes(c) 0 -#endif /** * ubifs_search_bud - search bud LEB. @@ -100,20 +96,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) } /** - * next_log_lnum - switch to the next log LEB. - * @c: UBIFS file-system description object - * @lnum: current log LEB - */ -static inline int next_log_lnum(const struct ubifs_info *c, int lnum) -{ - lnum += 1; - if (lnum > c->log_last) - lnum = UBIFS_LOG_LNUM; - - return lnum; -} - -/** * empty_log_bytes - calculate amount of empty space in the log. * @c: UBIFS file-system description object */ @@ -159,7 +141,7 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) jhead = &c->jheads[bud->jhead]; list_add_tail(&bud->list, &jhead->buds_list); } else - ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY)); + ubifs_assert(c->replaying && c->ro_mount); /* * Note, although this is a new bud, we anyway account this space now, @@ -169,28 +151,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) */ c->bud_bytes += c->leb_size - bud->start; - dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum, - bud->start, bud->jhead, c->bud_bytes); - spin_unlock(&c->buds_lock); -} - -/** - * ubifs_create_buds_lists - create journal head buds lists for remount rw. - * @c: UBIFS file-system description object - */ -void ubifs_create_buds_lists(struct ubifs_info *c) -{ - struct rb_node *p; - - spin_lock(&c->buds_lock); - p = rb_first(&c->buds); - while (p) { - struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); - struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; - - list_add_tail(&bud->list, &jhead->buds_list); - p = rb_next(p); - } + dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum, + bud->start, dbg_jhead(bud->jhead), c->bud_bytes); spin_unlock(&c->buds_lock); } @@ -223,8 +185,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } mutex_lock(&c->log_mutex); - - if (c->ro_media) { + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) { err = -EROFS; goto out_unlock; } @@ -239,7 +201,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } /* - * Make sure the the amount of space in buds will not exceed + * Make sure the amount of space in buds will not exceed the * 'c->max_bud_bytes' limit, because we want to guarantee mount time * limits. * @@ -277,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) ref->jhead = cpu_to_le32(jhead); if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -296,7 +258,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) * an unclean reboot, because the target LEB might have been * unmapped, but not yet physically erased. */ - err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM); + err = ubifs_leb_map(c, bud->lnum); if (err) goto out_unlock; } @@ -304,7 +266,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) dbg_log("write ref LEB %d:%d", c->lhead_lnum, c->lhead_offs); err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, - c->lhead_offs, UBI_SHORTTERM); + c->lhead_offs); if (err) goto out_unlock; @@ -353,19 +315,16 @@ static void remove_buds(struct ubifs_info *c) * heads (non-closed buds). */ c->cmt_bud_bytes += wbuf->offs - bud->start; - dbg_log("preserve %d:%d, jhead %d, bud bytes %d, " - "cmt_bud_bytes %lld", bud->lnum, bud->start, - bud->jhead, wbuf->offs - bud->start, - c->cmt_bud_bytes); + dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + wbuf->offs - bud->start, c->cmt_bud_bytes); bud->start = wbuf->offs; } else { c->cmt_bud_bytes += c->leb_size - bud->start; - dbg_log("remove %d:%d, jhead %d, bud bytes %d, " - "cmt_bud_bytes %lld", bud->lnum, bud->start, - bud->jhead, c->leb_size - bud->start, - c->cmt_bud_bytes); + dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld", + bud->lnum, bud->start, dbg_jhead(bud->jhead), + c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); - list_del(&bud->list); /* * If the commit does not finish, the recovery will need * to replay the journal, in which case the old buds @@ -373,7 +332,7 @@ static void remove_buds(struct ubifs_info *c) * commit i.e. do not allow them to be garbage * collected. */ - list_add(&bud->list, &c->old_buds); + list_move(&bud->list, &c->old_buds); } } spin_unlock(&c->buds_lock); @@ -410,7 +369,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - cs->cmt_no = cpu_to_le64(c->cmt_no + 1); + cs->cmt_no = cpu_to_le64(c->cmt_no); ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); /* @@ -428,7 +387,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) if (lnum == -1 || offs == c->leb_size) continue; - dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i); + dbg_log("add ref to LEB %d:%d for jhead %s", + lnum, offs, dbg_jhead(i)); ref = buf + len; ref->ch.node_type = UBIFS_REF_NODE; ref->lnum = cpu_to_le32(lnum); @@ -443,7 +403,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) /* Switch to the next log LEB */ if (c->lhead_offs) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -456,7 +416,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) len = ALIGN(len, c->min_io_size); dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); - err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len); if (err) goto out; @@ -464,7 +424,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) c->lhead_offs += len; if (c->lhead_offs == c->leb_size) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -551,7 +511,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) } mutex_lock(&c->log_mutex); for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; - lnum = next_log_lnum(c, lnum)) { + lnum = ubifs_next_log_lnum(c, lnum)) { dbg_log("unmap log LEB %d", lnum); err = ubifs_leb_unmap(c, lnum); if (err) @@ -614,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum) */ static void destroy_done_tree(struct rb_root *done_tree) { - struct rb_node *this = done_tree->rb_node; - struct done_ref *dr; + struct done_ref *dr, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - dr = rb_entry(this, struct done_ref, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &dr->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) kfree(dr); - } } /** @@ -657,10 +600,10 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, int sz = ALIGN(*offs, c->min_io_size), err; ubifs_pad(c, buf + *offs, sz - *offs); - err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, *lnum, buf, sz); if (err) return err; - *lnum = next_log_lnum(c, *lnum); + *lnum = ubifs_next_log_lnum(c, *lnum); *offs = 0; } memcpy(buf + *offs, node, len); @@ -694,7 +637,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) lnum = c->ltail_lnum; write_lnum = lnum; while (1) { - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); goto out_free; @@ -730,13 +673,13 @@ int ubifs_consolidate_log(struct ubifs_info *c) ubifs_scan_destroy(sleb); if (lnum == c->lhead_lnum) break; - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); } if (offs) { int sz = ALIGN(offs, c->min_io_size); ubifs_pad(c, buf + offs, sz - offs); - err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, write_lnum, buf, sz); if (err) goto out_free; offs = ALIGN(offs, c->min_io_size); @@ -750,7 +693,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) /* Unmap remaining LEBs */ lnum = write_lnum; do { - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); err = ubifs_leb_unmap(c, lnum); if (err) return err; @@ -768,8 +711,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_check_bud_bytes - make sure bud bytes calculation are all right. * @c: UBIFS file-system description object @@ -784,7 +725,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) struct ubifs_bud *bud; long long bud_bytes = 0; - if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + if (!dbg_is_chk_gen(c)) return 0; spin_lock(&c->buds_lock); @@ -801,5 +742,3 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 2ba93da71b6..46190a7c42a 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, } } } + /* Not greater than parent, so compare to children */ while (1) { /* Compare to left child */ @@ -280,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, case LPROPS_FREE: if (add_to_lpt_heap(c, lprops, cat)) break; - /* No more room on heap so make it uncategorized */ + /* No more room on heap so make it un-categorized */ cat = LPROPS_UNCAT; /* Fall through */ case LPROPS_UNCAT: @@ -299,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, default: ubifs_assert(0); } + lprops->flags &= ~LPROPS_CAT_MASK; lprops->flags |= cat; + c->in_a_category_cnt += 1; + ubifs_assert(c->in_a_category_cnt <= c->main_lebs); } /** @@ -333,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, default: ubifs_assert(0); } + + c->in_a_category_cnt -= 1; + ubifs_assert(c->in_a_category_cnt >= 0); } /** @@ -374,8 +381,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, * @lprops: LEB properties * * A LEB may have fallen off of the bottom of a heap, and ended up as - * uncategorized even though it has enough space for us now. If that is the case - * this function will put the LEB back onto a heap. + * un-categorized even though it has enough space for us now. If that is the + * case this function will put the LEB back onto a heap. */ void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) { @@ -435,10 +442,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c, /** * change_category - change LEB properties category. * @c: UBIFS file-system description object - * @lprops: LEB properties to recategorize + * @lprops: LEB properties to re-categorize * * LEB properties are categorized to enable fast find operations. When the LEB - * properties change they must be recategorized. + * properties change they must be re-categorized. */ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) { @@ -446,7 +453,7 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) int new_cat = ubifs_categorize_lprops(c, lprops); if (old_cat == new_cat) { - struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + struct ubifs_lpt_heap *heap; /* lprops on a heap now must be moved up or down */ if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) @@ -460,33 +467,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) } /** - * ubifs_get_lprops - get reference to LEB properties. - * @c: the UBIFS file-system description object - * - * This function locks lprops. Lprops have to be unlocked by - * 'ubifs_release_lprops()'. - */ -void ubifs_get_lprops(struct ubifs_info *c) -{ - mutex_lock(&c->lp_mutex); -} - -/** - * calc_dark - calculate LEB dark space size. + * ubifs_calc_dark - calculate LEB dark space size. * @c: the UBIFS file-system description object * @spc: amount of free and dirty space in the LEB * - * This function calculates amount of dark space in an LEB which has @spc bytes - * of free and dirty space. Returns the calculations result. + * This function calculates and returns amount of dark space in an LEB which + * has @spc bytes of free and dirty space. * - * Dark space is the space which is not always usable - it depends on which - * nodes are written in which order. E.g., if an LEB has only 512 free bytes, - * it is dark space, because it cannot fit a large data node. So UBIFS cannot - * count on this LEB and treat these 512 bytes as usable because it is not true - * if, for example, only big chunks of uncompressible data will be written to - * the FS. + * UBIFS is trying to account the space which might not be usable, and this + * space is called "dark space". For example, if an LEB has only %512 free + * bytes, it is dark space, because it cannot fit a large data node. */ -static int calc_dark(struct ubifs_info *c, int spc) +int ubifs_calc_dark(const struct ubifs_info *c, int spc) { ubifs_assert(!(spc & 7)); @@ -518,7 +510,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) pnode = (struct ubifs_pnode *)container_of(lprops - pos, struct ubifs_pnode, lprops[0]); - return !test_bit(COW_ZNODE, &pnode->flags) && + return !test_bit(COW_CNODE, &pnode->flags) && test_bit(DIRTY_CNODE, &pnode->flags); } @@ -529,15 +521,15 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) * @free: new free space amount * @dirty: new dirty space amount * @flags: new flags - * @idx_gc_cnt: change to the count of idx_gc list + * @idx_gc_cnt: change to the count of @idx_gc list * - * This function changes LEB properties. This function does not change a LEB - * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. + * This function changes LEB properties (@free, @dirty or @flag). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and a negative error code on failure. * - * This function returns a pointer to the updated LEB properties on success - * and a negative error code on failure. N.B. the LEB properties may have had to - * be copied (due to COW) and consequently the pointer returned may not be the - * same as the pointer passed. + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. */ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops *lp, @@ -546,7 +538,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, { /* * This is the only function that is allowed to change lprops, so we - * discard the const qualifier. + * discard the "const" qualifier. */ struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; @@ -576,7 +568,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); spin_lock(&c->space_lock); - if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) c->lst.taken_empty_lebs -= 1; @@ -587,7 +578,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, if (old_spc < c->dead_wm) c->lst.total_dead -= old_spc; else - c->lst.total_dark -= calc_dark(c, old_spc); + c->lst.total_dark -= ubifs_calc_dark(c, old_spc); c->lst.total_used -= c->leb_size - old_spc; } @@ -628,7 +619,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, if (new_spc < c->dead_wm) c->lst.total_dead += new_spc; else - c->lst.total_dark += calc_dark(c, new_spc); + c->lst.total_dark += ubifs_calc_dark(c, new_spc); c->lst.total_used += c->leb_size - new_spc; } @@ -637,39 +628,20 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, c->lst.taken_empty_lebs += 1; change_category(c, lprops); - c->idx_gc_cnt += idx_gc_cnt; - spin_unlock(&c->space_lock); - return lprops; } /** - * ubifs_release_lprops - release lprops lock. - * @c: the UBIFS file-system description object - * - * This function has to be called after each 'ubifs_get_lprops()' call to - * unlock lprops. - */ -void ubifs_release_lprops(struct ubifs_info *c) -{ - ubifs_assert(mutex_is_locked(&c->lp_mutex)); - ubifs_assert(c->lst.empty_lebs >= 0 && - c->lst.empty_lebs <= c->main_lebs); - - mutex_unlock(&c->lp_mutex); -} - -/** * ubifs_get_lp_stats - get lprops statistics. * @c: UBIFS file-system description object * @st: return statistics */ -void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) { spin_lock(&c->space_lock); - memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); spin_unlock(&c->space_lock); } @@ -709,6 +681,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); + if (err) + ubifs_err("cannot change properties of LEB %d, error %d", + lnum, err); return err; } @@ -745,6 +720,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); + if (err) + ubifs_err("cannot update properties of LEB %d, error %d", + lnum, err); return err; } @@ -768,6 +746,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) lpp = ubifs_lpt_lookup(c, lnum); if (IS_ERR(lpp)) { err = PTR_ERR(lpp); + ubifs_err("cannot read properties of LEB %d, error %d", + lnum, err); goto out; } @@ -872,7 +852,9 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) return lprops; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * dbg_check_cats - check category heaps and lists. @@ -886,20 +868,20 @@ int dbg_check_cats(struct ubifs_info *c) struct list_head *pos; int i, cat; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return 0; list_for_each_entry(lprops, &c->empty_list, list) { if (lprops->free != c->leb_size) { - ubifs_err("non-empty LEB %d on empty list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on empty list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } } @@ -907,15 +889,15 @@ int dbg_check_cats(struct ubifs_info *c) i = 0; list_for_each_entry(lprops, &c->freeable_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on freeable list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on freeable list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } i += 1; @@ -937,21 +919,21 @@ int dbg_check_cats(struct ubifs_info *c) list_for_each_entry(lprops, &c->frdi_idx_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } if (!(lprops->flags & LPROPS_INDEX)) { - ubifs_err("non-index LEB %d on frdi_idx list " - "(free %d dirty %d flags %d)", lprops->lnum, - lprops->free, lprops->dirty, lprops->flags); + ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)", + lprops->lnum, lprops->free, lprops->dirty, + lprops->flags); return -EINVAL; } } @@ -984,7 +966,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, { int i = 0, j, err = 0; - if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + if (!dbg_is_chk_gen(c) && !dbg_is_chk_lprops(c)) return; for (i = 0; i < heap->cnt; i++) { @@ -1006,9 +988,9 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, goto out; } if (lprops != lp) { - dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", - (size_t)lprops, (size_t)lp, lprops->lnum, - lp->lnum); + ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); err = 4; goto out; } @@ -1026,28 +1008,18 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, } out: if (err) { - dbg_msg("failed cat %d hpos %d err %d", cat, i, err); - dbg_dump_stack(); - dbg_dump_heap(c, heap, cat); + ubifs_err("failed cat %d hpos %d err %d", cat, i, err); + dump_stack(); + ubifs_dump_heap(c, heap, cat); } } /** - * struct scan_check_data - data provided to scan callback function. - * @lst: LEB properties statistics - * @err: error code - */ -struct scan_check_data { - struct ubifs_lp_stats lst; - int err; -}; - -/** * scan_check_cb - scan callback. * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @lst: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -1056,12 +1028,12 @@ struct scan_check_data { */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct scan_check_data *data) + struct ubifs_lp_stats *lst) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_lp_stats *lst = &data->lst; - int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; cat = lp->flags & LPROPS_CAT_MASK; if (cat != LPROPS_UNCAT) { @@ -1069,7 +1041,7 @@ static int scan_check_cb(struct ubifs_info *c, if (cat != (lp->flags & LPROPS_CAT_MASK)) { ubifs_err("bad LEB category %d expected %d", (lp->flags & LPROPS_CAT_MASK), cat); - goto out; + return -EINVAL; } } @@ -1103,7 +1075,7 @@ static int scan_check_cb(struct ubifs_info *c, } if (!found) { ubifs_err("bad LPT list (category %d)", cat); - goto out; + return -EINVAL; } } } @@ -1115,36 +1087,40 @@ static int scan_check_cb(struct ubifs_info *c, if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || lp != heap->arr[lp->hpos]) { ubifs_err("bad LPT heap (category %d)", cat); - goto out; + return -EINVAL; } } - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); - if (IS_ERR(sleb)) { - /* - * After an unclean unmount, empty and freeable LEBs - * may contain garbage. - */ - if (lp->free == c->leb_size) { - ubifs_err("scan errors were in empty LEB " - "- continuing checking"); - lst->empty_lebs += 1; - lst->total_free += c->leb_size; - lst->total_dark += calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; - } + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } - if (lp->free + lp->dirty == c->leb_size && - !(lp->flags & LPROPS_INDEX)) { - ubifs_err("scan errors were in freeable LEB " - "- continuing checking"); - lst->total_free += lp->free; - lst->total_dirty += lp->dirty; - lst->total_dark += calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + sleb = ubifs_scan(c, lnum, 0, buf, 0); + if (IS_ERR(sleb)) { + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); } - data->err = PTR_ERR(sleb); - return LPT_SCAN_STOP; + goto out; } is_idx = -1; @@ -1183,8 +1159,8 @@ static int scan_check_cb(struct ubifs_info *c, if (free > c->leb_size || free < 0 || dirty > c->leb_size || dirty < 0) { - ubifs_err("bad calculated accounting for LEB %d: " - "free %d, dirty %d", lnum, free, dirty); + ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d", + lnum, free, dirty); goto out_destroy; } @@ -1230,8 +1206,7 @@ static int scan_check_cb(struct ubifs_info *c, /* Free but not unmapped LEB, it's fine */ is_idx = 0; else { - ubifs_err("indexing node without indexing " - "flag"); + ubifs_err("indexing node without indexing flag"); goto out_print; } } @@ -1258,23 +1233,23 @@ static int scan_check_cb(struct ubifs_info *c, if (spc < c->dead_wm) lst->total_dead += spc; else - lst->total_dark += calc_dark(c, spc); + lst->total_dark += ubifs_calc_dark(c, spc); } ubifs_scan_destroy(sleb); - + vfree(buf); return LPT_SCAN_CONTINUE; out_print: - ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " - "should be free %d, dirty %d", + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d", lnum, lp->free, lp->dirty, lp->flags, free, dirty); - dbg_dump_leb(c, lnum); + ubifs_dump_leb(c, lnum); out_destroy: ubifs_scan_destroy(sleb); + ret = -EINVAL; out: - data->err = -EINVAL; - return LPT_SCAN_STOP; + vfree(buf); + return ret; } /** @@ -1291,10 +1266,9 @@ out: int dbg_check_lprops(struct ubifs_info *c) { int i, err; - struct scan_check_data data; - struct ubifs_lp_stats *lst = &data.lst; + struct ubifs_lp_stats lst; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* @@ -1307,42 +1281,34 @@ int dbg_check_lprops(struct ubifs_info *c) return err; } - memset(lst, 0, sizeof(struct ubifs_lp_stats)); - - data.err = 0; + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, (ubifs_lpt_scan_callback)scan_check_cb, - &data); + &lst); if (err && err != -ENOSPC) goto out; - if (data.err) { - err = data.err; - goto out; - } - if (lst->empty_lebs != c->lst.empty_lebs || - lst->idx_lebs != c->lst.idx_lebs || - lst->total_free != c->lst.total_free || - lst->total_dirty != c->lst.total_dirty || - lst->total_used != c->lst.total_used) { + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { ubifs_err("bad overall accounting"); - ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " - "total_free %lld, total_dirty %lld, total_used %lld", - lst->empty_lebs, lst->idx_lebs, lst->total_free, - lst->total_dirty, lst->total_used); - ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " - "total_free %lld, total_dirty %lld, total_used %lld", + ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); + ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, c->lst.total_dirty, c->lst.total_used); err = -EINVAL; goto out; } - if (lst->total_dead != c->lst.total_dead || - lst->total_dark != c->lst.total_dark) { + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { ubifs_err("bad dead/dark space accounting"); ubifs_err("calculated: total_dead %lld, total_dark %lld", - lst->total_dead, lst->total_dark); + lst.total_dead, lst.total_dark); ubifs_err("read from lprops: total_dead %lld, total_dark %lld", c->lst.total_dead, c->lst.total_dark); err = -EINVAL; @@ -1353,5 +1319,3 @@ int dbg_check_lprops(struct ubifs_info *c) out: return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 9ff2463177e..d46b19ec181 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -36,15 +36,17 @@ * can be written into a single eraseblock. In that case, garbage collection * consists of just writing the whole table, which therefore makes all other * eraseblocks reusable. In the case of the big model, dirty eraseblocks are - * selected for garbage collection, which consists are marking the nodes in + * selected for garbage collection, which consists of marking the clean nodes in * that LEB as dirty, and then only the dirty nodes are written out. Also, in * the case of the big model, a table of LEB numbers is saved so that the entire * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first * mounted. */ -#include <linux/crc16.h> #include "ubifs.h" +#include <linux/crc16.h> +#include <linux/math64.h> +#include <linux/slab.h> /** * do_calc_lpt_geom - calculate sizes for the LPT area. @@ -109,7 +111,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c) c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; c->lpt_sz += c->ltab_sz; - c->lpt_sz += c->lsave_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; /* Add wastage */ sz = c->lpt_sz; @@ -134,15 +137,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c) int ubifs_calc_lpt_geom(struct ubifs_info *c) { int lebs_needed; - uint64_t sz; + long long sz; do_calc_lpt_geom(c); /* Verify that lpt_lebs is big enough */ sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ - sz += c->leb_size - 1; - do_div(sz, c->leb_size); - lebs_needed = sz; + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); if (lebs_needed > c->lpt_lebs) { ubifs_err("too few LPT LEBs"); return -EINVAL; @@ -155,7 +156,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c) } c->check_lpt_free = c->big_lpt; - return 0; } @@ -175,7 +175,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, int *big_lpt) { int i, lebs_needed; - uint64_t sz; + long long sz; /* Start by assuming the minimum number of LPT LEBs */ c->lpt_lebs = UBIFS_MIN_LPT_LEBS; @@ -202,9 +202,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, /* Now check there are enough LPT LEBs */ for (i = 0; i < 64 ; i++) { sz = c->lpt_sz * 4; /* Allow 4 times the size */ - sz += c->leb_size - 1; - do_div(sz, c->leb_size); - lebs_needed = sz; + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); if (lebs_needed > c->lpt_lebs) { /* Not enough LPT LEBs so try again with more */ c->lpt_lebs = lebs_needed; @@ -287,25 +285,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) const int k = 32 - nrbits; uint8_t *p = *addr; int b = *pos; - uint32_t val; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; ubifs_assert(nrbits > 0); ubifs_assert(nrbits <= 32); ubifs_assert(*pos >= 0); ubifs_assert(*pos < 8); if (b) { - val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | - ((uint32_t)p[4] << 24); + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } val <<= (8 - b); val |= *p >> b; nrbits += b; - } else - val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | - ((uint32_t)p[3] << 24); + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } val <<= k; val >>= k; b = nrbits & 7; - p += nrbits / 8; + p += nrbits >> 3; *addr = p; *pos = b; ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); @@ -526,7 +555,7 @@ static int calc_nnode_num(int row, int col) * This function calculates and returns the nnode number based on the parent's * nnode number and the index in parent. */ -static int calc_nnode_num_from_parent(struct ubifs_info *c, +static int calc_nnode_num_from_parent(const struct ubifs_info *c, struct ubifs_nnode *parent, int iip) { int num, shft; @@ -551,7 +580,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c, * This function calculates and returns the pnode number based on the parent's * nnode number and the index in parent. */ -static int calc_pnode_num_from_parent(struct ubifs_info *c, +static int calc_pnode_num_from_parent(const struct ubifs_info *c, struct ubifs_nnode *parent, int iip) { int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; @@ -672,8 +701,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -703,8 +731,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -751,8 +778,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -777,7 +803,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -797,7 +823,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Write remaining buffer */ memset(p, 0xff, alen - len); - err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, buf, alen); if (err) goto out; @@ -897,7 +923,7 @@ static int check_lpt_crc(void *buf, int len) if (crc != calc_crc) { ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, calc_crc); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -920,7 +946,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type) if (node_type != type) { ubifs_err("invalid type (%d) in LPT node type %d", node_type, type); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -934,7 +960,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type) * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_pnode(struct ubifs_info *c, void *buf, +static int unpack_pnode(const struct ubifs_info *c, void *buf, struct ubifs_pnode *pnode) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; @@ -964,15 +990,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf, } /** - * unpack_nnode - unpack a nnode. + * ubifs_unpack_nnode - unpack a nnode. * @c: UBIFS file-system description object * @buf: buffer containing packed nnode to unpack * @nnode: nnode structure to fill * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_nnode(struct ubifs_info *c, void *buf, - struct ubifs_nnode *nnode) +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1004,7 +1030,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf, * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_ltab(struct ubifs_info *c, void *buf) +static int unpack_ltab(const struct ubifs_info *c, void *buf) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1036,7 +1062,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf) * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_lsave(struct ubifs_info *c, void *buf) +static int unpack_lsave(const struct ubifs_info *c, void *buf) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1064,7 +1090,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf) * * This function returns %0 on success and a negative error code on failure. */ -static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, +static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode, struct ubifs_nnode *parent, int iip) { int i, lvl, max_offs; @@ -1108,7 +1134,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, * * This function returns %0 on success and a negative error code on failure. */ -static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, +static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode, struct ubifs_nnode *parent, int iip) { int i; @@ -1142,7 +1168,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, * This function calculates the LEB numbers for the LEB properties it contains * based on the pnode number. */ -static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) +static void set_pnode_lnum(const struct ubifs_info *c, + struct ubifs_pnode *pnode) { int i, lnum; @@ -1192,10 +1219,10 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->nnode_sz, 1); if (err) goto out; - err = unpack_nnode(c, buf, nnode); + err = ubifs_unpack_nnode(c, buf, nnode); if (err) goto out; } @@ -1217,6 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + dump_stack(); kfree(nnode); return err; } @@ -1240,10 +1268,9 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) lnum = branch->lnum; offs = branch->offs; pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); - if (!pnode) { - err = -ENOMEM; - goto out; - } + if (!pnode) + return -ENOMEM; + if (lnum == 0) { /* * This pnode was not written which just means that the LEB @@ -1261,7 +1288,7 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) lprops->flags = ubifs_categorize_lprops(c, lprops); } } else { - err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz); + err = ubifs_leb_read(c, lnum, buf, offs, c->pnode_sz, 1); if (err) goto out; err = unpack_pnode(c, buf, pnode); @@ -1282,8 +1309,9 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); - dbg_dump_pnode(c, pnode, parent, iip); - dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + ubifs_dump_pnode(c, pnode, parent, iip); + dump_stack(); + ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; } @@ -1302,7 +1330,7 @@ static int read_ltab(struct ubifs_info *c) buf = vmalloc(c->ltab_sz); if (!buf) return -ENOMEM; - err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz); + err = ubifs_leb_read(c, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz, 1); if (err) goto out; err = unpack_ltab(c, buf); @@ -1325,7 +1353,8 @@ static int read_lsave(struct ubifs_info *c) buf = vmalloc(c->lsave_sz); if (!buf) return -ENOMEM; - err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz); + err = ubifs_leb_read(c, c->lsave_lnum, buf, c->lsave_offs, + c->lsave_sz, 1); if (err) goto out; err = unpack_lsave(c, buf); @@ -1333,6 +1362,7 @@ static int read_lsave(struct ubifs_info *c) goto out; for (i = 0; i < c->lsave_cnt; i++) { int lnum = c->lsave[i]; + struct ubifs_lprops *lprops; /* * Due to automatic resizing, the values in the lsave table @@ -1340,7 +1370,11 @@ static int read_lsave(struct ubifs_info *c) */ if (lnum >= c->leb_cnt) continue; - ubifs_lpt_lookup(c, lnum); + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } } out: vfree(buf); @@ -1427,13 +1461,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, @@ -1556,7 +1590,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) nnode = c->nroot; nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); i = lnum - c->main_first; shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; for (h = 1; h < c->lpt_hght; h++) { @@ -1564,19 +1598,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); pnode = dirty_cow_pnode(c, pnode); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, @@ -1703,16 +1737,23 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) if (rd) { err = lpt_init_rd(c); if (err) - return err; + goto out_err; } if (wr) { err = lpt_init_wr(c); if (err) - return err; + goto out_err; } return 0; + +out_err: + if (wr) + ubifs_lpt_free(c, 1); + if (rd) + ubifs_lpt_free(c, 0); + return err; } /** @@ -1780,11 +1821,11 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, if (c->big_lpt) nnode->num = calc_nnode_num_from_parent(c, parent, iip); } else { - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->nnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->nnode_sz, 1); if (err) return ERR_PTR(err); - err = unpack_nnode(c, buf, nnode); + err = ubifs_unpack_nnode(c, buf, nnode); if (err) return ERR_PTR(err); } @@ -1849,8 +1890,8 @@ static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c, ubifs_assert(branch->lnum >= c->lpt_first && branch->lnum <= c->lpt_last); ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size); - err = ubi_read(c->ubi, branch->lnum, buf, branch->offs, - c->pnode_sz); + err = ubifs_leb_read(c, branch->lnum, buf, branch->offs, + c->pnode_sz, 1); if (err) return ERR_PTR(err); err = unpack_pnode(c, buf, pnode); @@ -1949,12 +1990,11 @@ again: if (path[h].in_tree) continue; - nnode = kmalloc(sz, GFP_NOFS); + nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS); if (!nnode) { err = -ENOMEM; goto out; } - memcpy(nnode, &path[h].nnode, sz); parent = nnode->parent; parent->nbranch[nnode->iip].nnode = nnode; path[h].ptr.nnode = nnode; @@ -1967,12 +2007,11 @@ again: const size_t sz = sizeof(struct ubifs_pnode); struct ubifs_nnode *parent; - pnode = kmalloc(sz, GFP_NOFS); + pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS); if (!pnode) { err = -ENOMEM; goto out; } - memcpy(pnode, &path[h].pnode, sz); parent = pnode->parent; parent->nbranch[pnode->iip].pnode = pnode; path[h].ptr.pnode = pnode; @@ -2045,8 +2084,6 @@ out: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_chk_pnode - check a pnode. * @c: the UBIFS file-system description object @@ -2061,8 +2098,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, int i; if (pnode->num != col) { - dbg_err("pnode num %d expected %d parent num %d iip %d", - pnode->num, col, pnode->parent->num, pnode->iip); + ubifs_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); return -EINVAL; } for (i = 0; i < UBIFS_LPT_FANOUT; i++) { @@ -2076,14 +2113,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, if (lnum >= c->leb_cnt) continue; if (lprops->lnum != lnum) { - dbg_err("bad LEB number %d expected %d", - lprops->lnum, lnum); + ubifs_err("bad LEB number %d expected %d", + lprops->lnum, lnum); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { if (cat != LPROPS_UNCAT) { - dbg_err("LEB %d taken but not uncat %d", - lprops->lnum, cat); + ubifs_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); return -EINVAL; } continue; @@ -2095,8 +2132,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FRDI_IDX: break; default: - dbg_err("LEB %d index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d index but cat %d", + lprops->lnum, cat); return -EINVAL; } } else { @@ -2108,8 +2145,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FREEABLE: break; default: - dbg_err("LEB %d not index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d not index but cat %d", + lprops->lnum, cat); return -EINVAL; } } @@ -2149,24 +2186,24 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, break; } if (!found) { - dbg_err("LEB %d cat %d not found in cat heap/list", - lprops->lnum, cat); + ubifs_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); return -EINVAL; } switch (cat) { case LPROPS_EMPTY: if (lprops->free != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } } @@ -2190,7 +2227,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, struct ubifs_cnode *cn; int num, iip = 0, err; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; while (cnode) { @@ -2200,9 +2237,9 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, /* cnode is a nnode */ num = calc_nnode_num(row, col); if (cnode->num != num) { - dbg_err("nnode num %d expected %d " - "parent num %d iip %d", cnode->num, num, - (nnode ? nnode->num : 0), cnode->iip); + ubifs_err("nnode num %d expected %d parent num %d iip %d", + cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); return -EINVAL; } nn = (struct ubifs_nnode *)cnode; @@ -2239,5 +2276,3 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, } return 0; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5f0b83e20af..45d4e96a6ba 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -26,8 +26,12 @@ */ #include <linux/crc16.h> +#include <linux/slab.h> +#include <linux/random.h> #include "ubifs.h" +static int dbg_populate_lsave(struct ubifs_info *c); + /** * first_dirty_cnode - find first dirty cnode. * @c: UBIFS file-system description object @@ -109,8 +113,8 @@ static int get_cnodes_to_commit(struct ubifs_info *c) return 0; cnt += 1; while (1) { - ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags)); - __set_bit(COW_ZNODE, &cnode->flags); + ubifs_assert(!test_bit(COW_CNODE, &cnode->flags)); + __set_bit(COW_CNODE, &cnode->flags); cnext = next_dirty_cnode(cnode); if (!cnext) { cnode->cnext = c->lpt_cnext; @@ -177,8 +181,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) return 0; } } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -193,6 +195,9 @@ static int layout_cnodes(struct ubifs_info *c) int lnum, offs, len, alen, done_lsave, done_ltab, err; struct ubifs_cnode *cnode; + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; cnode = c->lpt_cnext; if (!cnode) return 0; @@ -206,6 +211,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { @@ -213,6 +219,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } do { @@ -226,9 +233,10 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -238,6 +246,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { @@ -245,6 +254,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -257,6 +267,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lpt_offs = offs; } offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -265,9 +276,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -276,6 +288,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -283,9 +296,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -294,11 +308,24 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; return 0; + +no_space: + ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return err; } /** @@ -333,8 +360,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) *lnum = i + c->lpt_first; return 0; } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -369,12 +394,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Loop for each cnode */ @@ -389,15 +416,15 @@ static int write_cnodes(struct ubifs_info *c) alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); err = ubifs_leb_write(c, lnum, buf + from, from, - alen, UBI_SHORTTERM); + alen); if (err) return err; } + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) - return err; - offs = 0; - from = 0; + goto no_space; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -408,12 +435,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -431,10 +460,11 @@ static int write_cnodes(struct ubifs_info *c) * important. */ clear_bit(DIRTY_CNODE, &cnode->flags); - smp_mb__before_clear_bit(); - clear_bit(COW_ZNODE, &cnode->flags); - smp_mb__after_clear_bit(); + smp_mb__before_atomic(); + clear_bit(COW_CNODE, &cnode->flags); + smp_mb__after_atomic(); offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -444,14 +474,14 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) - return err; - offs = 0; + goto no_space; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -461,6 +491,7 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -469,14 +500,14 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) - return err; - offs = 0; + goto no_space; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -486,15 +517,22 @@ static int write_cnodes(struct ubifs_info *c) done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Write remaining data in buffer */ wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + c->nhead_lnum = lnum; c->nhead_offs = ALIGN(offs, c->min_io_size); @@ -503,31 +541,38 @@ static int write_cnodes(struct ubifs_info *c) dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; + +no_space: + ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return err; } /** - * next_pnode - find next pnode. + * next_pnode_to_dirty - find next pnode to dirty. * @c: UBIFS file-system description object * @pnode: pnode * - * This function returns the next pnode or %NULL if there are no more pnodes. + * This function returns the next pnode to dirty or %NULL if there are no more + * pnodes. Note that pnodes that have never been written (lnum == 0) are + * skipped. */ -static struct ubifs_pnode *next_pnode(struct ubifs_info *c, - struct ubifs_pnode *pnode) +static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, + struct ubifs_pnode *pnode) { struct ubifs_nnode *nnode; int iip; /* Try to go right */ nnode = pnode->parent; - iip = pnode->iip + 1; - if (iip < UBIFS_LPT_FANOUT) { - /* We assume here that LEB zero is never an LPT LEB */ + for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { if (nnode->nbranch[iip].lnum) return ubifs_get_pnode(c, nnode, iip); - else - return NULL; } /* Go up while can't go right */ @@ -536,8 +581,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c, nnode = nnode->parent; if (!nnode) return NULL; - /* We assume here that LEB zero is never an LPT LEB */ - } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); + for (; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + } while (iip >= UBIFS_LPT_FANOUT); /* Go right */ nnode = ubifs_get_nnode(c, nnode, iip); @@ -546,12 +594,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c, /* Go down to level 1 */ while (nnode->level > 1) { - nnode = ubifs_get_nnode(c, nnode, 0); + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) { + if (nnode->nbranch[iip].lnum) + break; + } + if (iip >= UBIFS_LPT_FANOUT) { + /* + * Should not happen, but we need to keep going + * if it does. + */ + iip = 0; + } + nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) return (void *)nnode; } - return ubifs_get_pnode(c, nnode, 0); + for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) + if (nnode->nbranch[iip].lnum) + break; + if (iip >= UBIFS_LPT_FANOUT) + /* Should not happen, but we need to keep going if it does */ + iip = 0; + return ubifs_get_pnode(c, nnode, iip); } /** @@ -580,7 +645,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); return ubifs_get_pnode(c, nnode, iip); @@ -639,9 +704,12 @@ static int make_tree_dirty(struct ubifs_info *c) struct ubifs_pnode *pnode; pnode = pnode_lookup(c, 0); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + while (pnode) { do_make_pnode_dirty(c, pnode); - pnode = next_pnode(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); if (IS_ERR(pnode)) return PTR_ERR(pnode); } @@ -706,7 +774,7 @@ static void lpt_tgc_start(struct ubifs_info *c) * LPT trivial garbage collection is where a LPT LEB contains only dirty and * free space and so may be reused as soon as the next commit is completed. * This function is called after the commit is completed (master node has been - * written) and unmaps LPT LEBs that were marked for trivial GC. + * written) and un-maps LPT LEBs that were marked for trivial GC. */ static int lpt_tgc_end(struct ubifs_info *c) { @@ -746,6 +814,10 @@ static void populate_lsave(struct ubifs_info *c) c->lpt_drty_flgs |= LSAVE_DIRTY; ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); } + + if (dbg_populate_lsave(c)) + return; + list_for_each_entry(lprops, &c->empty_list, list) { c->lsave[cnt++] = lprops->lnum; if (cnt >= c->lsave_cnt) @@ -982,7 +1054,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, * @c: UBIFS file-system description object * @node_type: LPT node type */ -static int get_lpt_node_len(struct ubifs_info *c, int node_type) +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) { switch (node_type) { case UBIFS_LPT_NNODE: @@ -1003,7 +1075,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type) * @buf: buffer * @len: length of buffer */ -static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) { int offs, pad_len; @@ -1020,7 +1092,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) * @buf: buffer * @node_num: node number is returned here */ -static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int pos = 0, node_type; @@ -1038,12 +1111,14 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) * * This function returns %1 if the buffer contains a node or %0 if it does not. */ -static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int pos = 0, node_type, node_len; uint16_t crc, calc_crc; + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); if (node_type == UBIFS_LPT_NOT_A_NODE) return 0; @@ -1060,7 +1135,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) return 1; } - /** * lpt_gc_lnum - garbage collect a LPT LEB. * @c: UBIFS file-system description object @@ -1079,11 +1153,11 @@ static int lpt_gc_lnum(struct ubifs_info *c, int lnum) void *buf = c->lpt_buf; dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - ubifs_err("cannot read LEB %d, error %d", lnum, err); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) return err; - } + while (1) { if (!is_a_node(c, buf, len)) { int pad_len; @@ -1156,6 +1230,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c) dbg_lp(""); mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; err = dbg_check_ltab(c); if (err) goto out; @@ -1412,10 +1489,12 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only) kfree(c->lpt_nod_buf); } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** - * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. * @buf: buffer * @len: buffer length */ @@ -1440,7 +1519,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) struct ubifs_nnode *nnode; int hght; - /* Entire tree is in memory so first_nnode / next_nnode are ok */ + /* Entire tree is in memory so first_nnode / next_nnode are OK */ nnode = first_nnode(c, &hght); for (; nnode; nnode = next_nnode(c, nnode, &hght)) { struct ubifs_nbranch *branch; @@ -1554,53 +1633,65 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) { int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; int ret; - void *buf = c->dbg_buf; + void *buf, *p; - dbg_lp("LEB %d", lnum); - err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); - if (err) { - dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); - return err; + if (!dbg_is_chk_lprops(c)) + return 0; + + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; } + + dbg_lp("LEB %d", lnum); + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) + goto out; + while (1) { - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int i, pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { - buf += pad_len; + p += pad_len; len -= pad_len; dirty += pad_len; continue; } - if (!dbg_is_all_ff(buf, len)) { - dbg_msg("invalid empty space in LEB %d at %d", - lnum, c->leb_size - len); + if (!dbg_is_all_ff(p, len)) { + ubifs_err("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); err = -EINVAL; } i = lnum - c->lpt_first; if (len != c->ltab[i].free) { - dbg_msg("invalid free space in LEB %d " - "(free %d, expected %d)", - lnum, len, c->ltab[i].free); + ubifs_err("invalid free space in LEB %d (free %d, expected %d)", + lnum, len, c->ltab[i].free); err = -EINVAL; } if (dirty != c->ltab[i].dirty) { - dbg_msg("invalid dirty space in LEB %d " - "(dirty %d, expected %d)", - lnum, dirty, c->ltab[i].dirty); + ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } - return err; + goto out; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); node_len = get_lpt_node_len(c, node_type); ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); if (ret == 1) dirty += node_len; - buf += node_len; + p += node_len; len -= node_len; } + + err = 0; +out: + vfree(buf); + return err; } /** @@ -1613,7 +1704,7 @@ int dbg_check_ltab(struct ubifs_info *c) { int lnum, err, i, cnt; - if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + if (!dbg_is_chk_lprops(c)) return 0; /* Bring the entire tree into memory */ @@ -1636,7 +1727,7 @@ int dbg_check_ltab(struct ubifs_info *c) for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { err = dbg_check_ltab_lnum(c, lnum); if (err) { - dbg_err("failed at LEB %d", lnum); + ubifs_err("failed at LEB %d", lnum); return err; } } @@ -1645,4 +1736,299 @@ int dbg_check_ltab(struct ubifs_info *c) return 0; } -#endif /* CONFIG_UBIFS_FS_DEBUG */ +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + if (!dbg_is_chk_lprops(c)) + return 0; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + ubifs_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: what to do + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + struct ubifs_debug_info *d = c->dbg; + long long chk_lpt_sz, lpt_sz; + int err = 0; + + if (!dbg_is_chk_lprops(c)) + return 0; + + switch (action) { + case 0: + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + ubifs_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + ubifs_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + d->chk_lpt_sz += len; + return 0; + case 2: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= d->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (d->chk_lpt_sz != chk_lpt_sz) { + ubifs_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz > c->lpt_sz) { + ubifs_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { + ubifs_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { + ubifs_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { + ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) { + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; + return err; + case 4: + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + +/** + * ubifs_dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. + */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf, *p; + + pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + + err = ubifs_leb_read(c, lnum, buf, 0, c->leb_size, 1); + if (err) + goto out; + + while (1) { + offs = c->leb_size - len; + if (!is_a_node(c, p, len)) { + int pad_len; + + pad_len = get_pad_len(c, p, len); + if (pad_len) { + pr_err("LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + p += pad_len; + len -= pad_len; + continue; + } + if (len) + pr_err("LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, p, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + pr_err("LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + pr_err("LEB %d:%d, pnode\n", lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { + int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + pr_err("LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + pr_err("LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, p, &nnode); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + pr_cont("%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + pr_cont(", "); + } + pr_cont("\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + pr_err("LEB %d:%d, ltab\n", lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + pr_err("LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err("LPT node type %d not recognized", node_type); + goto out; + } + + p += node_len; + len -= node_len; + } + + pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum); +out: + vfree(buf); + return; +} + +/** + * ubifs_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void ubifs_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid); +} + +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. + */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!dbg_is_chk_gen(c)) + return 0; + if (prandom_u32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 71d5493bf56..ab83ace9910 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -29,7 +29,8 @@ * @c: UBIFS file-system description object * * This function scans the master node LEBs and search for the latest master - * node. Returns zero in case of success and a negative error code in case of + * node. Returns zero in case of success, %-EUCLEAN if there master area is + * corrupted and requires recovery, and a negative error code in case of * failure. */ static int scan_for_master(struct ubifs_info *c) @@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c) lnum = UBIFS_MST_LNUM; - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) return PTR_ERR(sleb); nodes_cnt = sleb->nodes_cnt; @@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c) snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); if (snod->type != UBIFS_MST_NODE) - goto out; + goto out_dump; memcpy(c->mst_node, snod->node, snod->len); offs = snod->offs; } @@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c) lnum += 1; - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) return PTR_ERR(sleb); if (sleb->nodes_cnt != nodes_cnt) @@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c) goto out; snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); if (snod->type != UBIFS_MST_NODE) - goto out; + goto out_dump; if (snod->offs != offs) goto out; if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, @@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c) out: ubifs_scan_destroy(sleb); + return -EUCLEAN; + +out_dump: + ubifs_err("unexpected node type %d master LEB %d:%d", + snod->type, lnum, snod->offs); + ubifs_scan_destroy(sleb); return -EINVAL; } @@ -141,7 +148,7 @@ static int validate_master(const struct ubifs_info *c) } main_sz = (long long)c->main_lebs * c->leb_size; - if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { err = 9; goto out; } @@ -211,7 +218,7 @@ static int validate_master(const struct ubifs_info *c) } if (c->lst.total_dead + c->lst.total_dark + - c->lst.total_used + c->old_idx_sz > main_sz) { + c->lst.total_used + c->bi.old_idx_sz > main_sz) { err = 21; goto out; } @@ -234,7 +241,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c) err = scan_for_master(c); if (err) { - err = ubifs_recover_master_node(c); + if (err == -EUCLEAN) + err = ubifs_recover_master_node(c); if (err) /* * Note, we do not free 'c->mst_node' here because the @@ -278,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c) c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); - c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); @@ -297,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c) c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); - c->calc_idx_sz = c->old_idx_sz; + c->calc_idx_sz = c->bi.old_idx_sz; if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) c->no_orphs = 1; @@ -309,7 +317,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err("bad leb_cnt on master node"); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -353,8 +361,9 @@ int ubifs_write_master(struct ubifs_info *c) { int err, lnum, offs, len; - if (c->ro_media) - return -EINVAL; + ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->ro_error) + return -EROFS; lnum = UBIFS_MST_LNUM; offs = c->mst_offs + c->mst_node_alsz; @@ -370,7 +379,7 @@ int ubifs_write_master(struct ubifs_info *c) c->mst_offs = offs; c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); if (err) return err; @@ -381,7 +390,7 @@ int ubifs_write_master(struct ubifs_info *c) if (err) return err; } - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); return err; } diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4beccfc256d..ee7cb5ebb6e 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -39,6 +39,29 @@ static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) } /** + * ubifs_zn_obsolete - check if znode is obsolete. + * @znode: znode to check + * + * This helper function returns %1 if @znode is obsolete and %0 otherwise. + */ +static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode) +{ + return !!test_bit(OBSOLETE_ZNODE, &znode->flags); +} + +/** + * ubifs_zn_cow - check if znode has to be copied on write. + * @znode: znode to check + * + * This helper function returns %1 if @znode is has COW flag set and %0 + * otherwise. + */ +static inline int ubifs_zn_cow(const struct ubifs_znode *znode) +{ + return !!test_bit(COW_ZNODE, &znode->flags); +} + +/** * ubifs_wake_up_bgt - wake up background thread. * @c: UBIFS file-system description object */ @@ -80,20 +103,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) } /** - * ubifs_ro_mode - switch UBIFS to read read-only mode. - * @c: UBIFS file-system description object - * @err: error code which is the reason of switching to R/O mode - */ -static inline void ubifs_ro_mode(struct ubifs_info *c, int err) -{ - if (!c->ro_media) { - c->ro_media = 1; - ubifs_warn("switched to read-only mode, error %d", err); - dbg_dump_stack(); - } -} - -/** * ubifs_compr_present - check if compressor was compiled in. * @compr_type: compressor type to check * @@ -136,83 +145,6 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) } /** - * ubifs_leb_unmap - unmap an LEB. - * @c: UBIFS file-system description object - * @lnum: LEB number to unmap - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) -{ - int err; - - if (c->ro_media) - return -EROFS; - err = ubi_leb_unmap(c->ubi, lnum); - if (err) { - ubifs_err("unmap LEB %d failed, error %d", lnum, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_write - write to a LEB. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @offs: offset within LEB to write to - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, - const void *buf, int offs, int len, int dtype) -{ - int err; - - if (c->ro_media) - return -EROFS; - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); - if (err) { - ubifs_err("writing %d bytes at %d:%d, error %d", - len, lnum, offs, err); - return err; - } - - return 0; -} - -/** - * ubifs_leb_change - atomic LEB change. - * @c: UBIFS file-system description object - * @lnum: LEB number to write - * @buf: buffer to write from - * @len: length to write - * @dtype: data type - * - * This function returns %0 on success and a negative error code on failure. - */ -static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, - const void *buf, int len, int dtype) -{ - int err; - - if (c->ro_media) - return -EROFS; - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); - if (err) { - ubifs_err("changing %d bytes in LEB %d, error %d", - len, lnum, err); - return err; - } - - return 0; -} - -/** * ubifs_encode_dev - encode device node IDs. * @dev: UBIFS device node information * @rdev: device IDs to encode @@ -298,45 +230,74 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). + * ubifs_current_time - round current time to time granularity. + * @inode: inode + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, - uint64_t free) +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) { - int divisor, factor; + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); - factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; - do_div(free, divisor); +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} - return free * factor; +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. + */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); } /** - * ubifs_current_time - round current time to time granularity. - * @inode: inode + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. */ -static inline struct timespec ubifs_current_time(struct inode *inode) +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) { - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; } #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3afeb9242c6..f1c3e5a1b31 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -46,17 +46,13 @@ * Orphans are accumulated in a rb-tree. When an inode's link count drops to * zero, the inode number is added to the rb-tree. It is removed from the tree * when the inode is deleted. Any new orphans that are in the orphan tree when - * the commit is run, are written to the orphan area in 1 or more orph nodes. + * the commit is run, are written to the orphan area in 1 or more orphan nodes. * If the orphan area is full, it is consolidated to make space. There is * always enough space because validation prevents the user from creating more * than the maximum number of orphans allowed. */ -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_orphans(struct ubifs_info *c); -#else -#define dbg_check_orphans(c) 0 -#endif /** * ubifs_add_orphan - add an orphan. @@ -92,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = &(*p)->rb_right; else { - dbg_err("orphaned twice"); + ubifs_err("orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); return 0; @@ -105,7 +101,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) list_add_tail(&orphan->list, &c->orph_list); list_add_tail(&orphan->new_list, &c->orph_new); spin_unlock(&c->orphan_lock); - dbg_gen("ino %lu", inum); + dbg_gen("ino %lu", (unsigned long)inum); return 0; } @@ -130,16 +126,19 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = p->rb_right; else { - if (o->dnext) { + if (o->del) { spin_unlock(&c->orphan_lock); - dbg_gen("deleted twice ino %lu", inum); + dbg_gen("deleted twice ino %lu", + (unsigned long)inum); return; } - if (o->cnext) { + if (o->cmt) { + o->del = 1; o->dnext = c->orph_dnext; c->orph_dnext = o; spin_unlock(&c->orphan_lock); - dbg_gen("delete later ino %lu", inum); + dbg_gen("delete later ino %lu", + (unsigned long)inum); return; } rb_erase(p, &c->orph_tree); @@ -151,13 +150,13 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) } spin_unlock(&c->orphan_lock); kfree(o); - dbg_gen("inum %lu", inum); + dbg_gen("inum %lu", (unsigned long)inum); return; } } spin_unlock(&c->orphan_lock); - dbg_err("missing orphan ino %lu", inum); - dbg_dump_stack(); + ubifs_err("missing orphan ino %lu", (unsigned long)inum); + dump_stack(); } /** @@ -174,11 +173,13 @@ int ubifs_orphan_start_commit(struct ubifs_info *c) last = &c->orph_cnext; list_for_each_entry(orphan, &c->orph_new, new_list) { ubifs_assert(orphan->new); + ubifs_assert(!orphan->cmt); orphan->new = 0; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; } - *last = orphan->cnext; + *last = NULL; c->cmt_orphans = c->new_orphans; c->new_orphans = 0; dbg_cmt("%d orphans to commit", c->cmt_orphans); @@ -229,7 +230,7 @@ static int tot_avail_orphs(struct ubifs_info *c) } /** - * do_write_orph_node - write a node + * do_write_orph_node - write a node to the orphan head. * @c: UBIFS file-system description object * @len: length of node * @atomic: write atomically @@ -246,8 +247,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) ubifs_assert(c->ohead_offs == 0); ubifs_prepare_node(c, c->orph_buf, len, 1); len = ALIGN(len, c->min_io_size); - err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len); } else { if (c->ohead_offs == 0) { /* Ensure LEB has been unmapped */ @@ -256,17 +256,17 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) return err; } err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, - c->ohead_offs, UBI_SHORTTERM); + c->ohead_offs); } return err; } /** - * write_orph_node - write an orph node + * write_orph_node - write an orphan node. * @c: UBIFS file-system description object * @atomic: write atomically * - * This function builds an orph node from the cnext list and writes it to the + * This function builds an orphan node from the cnext list and writes it to the * orphan head. On success, %0 is returned, otherwise a negative error code * is returned. */ @@ -302,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic) cnext = c->orph_cnext; for (i = 0; i < cnt; i++) { orphan = cnext; + ubifs_assert(orphan->cmt); orph->inos[i] = cpu_to_le64(orphan->inum); + orphan->cmt = 0; cnext = orphan->cnext; orphan->cnext = NULL; } @@ -310,10 +312,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic) c->cmt_orphans -= cnt; spin_unlock(&c->orphan_lock); if (c->cmt_orphans) - orph->cmt_no = cpu_to_le64(c->cmt_no + 1); + orph->cmt_no = cpu_to_le64(c->cmt_no); else /* Mark the last node of the commit */ - orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); ubifs_assert(c->ohead_offs + len <= c->leb_size); ubifs_assert(c->ohead_lnum >= c->orph_first); ubifs_assert(c->ohead_lnum <= c->orph_last); @@ -324,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic) } /** - * write_orph_nodes - write orph nodes until there are no more to commit + * write_orph_nodes - write orphan nodes until there are no more to commit. * @c: UBIFS file-system description object * @atomic: write atomically * - * This function writes orph nodes for all the orphans to commit. On success, + * This function writes orphan nodes for all the orphans to commit. On success, * %0 is returned, otherwise a negative error code is returned. */ static int write_orph_nodes(struct ubifs_info *c, int atomic) @@ -381,11 +383,12 @@ static int consolidate(struct ubifs_info *c) list_for_each_entry(orphan, &c->orph_list, list) { if (orphan->new) continue; + orphan->cmt = 1; *last = orphan; last = &orphan->cnext; cnt += 1; } - *last = orphan->cnext; + *last = NULL; ubifs_assert(cnt == c->tot_orphans - c->new_orphans); c->cmt_orphans = cnt; c->ohead_lnum = c->orph_first; @@ -445,10 +448,11 @@ static void erase_deleted(struct ubifs_info *c) orphan = dnext; dnext = orphan->dnext; ubifs_assert(!orphan->new); + ubifs_assert(orphan->del); rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; - dbg_gen("deleting orphan ino %lu", orphan->inum); + dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); kfree(orphan); } c->orph_dnext = NULL; @@ -476,14 +480,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c) } /** - * clear_orphans - erase all LEBs used for orphans. + * ubifs_clear_orphans - erase all LEBs used for orphans. * @c: UBIFS file-system description object * * If recovery is not required, then the orphans from the previous session * are not needed. This function locates the LEBs used to record * orphans, and un-maps them. */ -static int clear_orphans(struct ubifs_info *c) +int ubifs_clear_orphans(struct ubifs_info *c) { int lnum, err; @@ -534,10 +538,11 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) rb_link_node(&orphan->rb, parent, p); rb_insert_color(&orphan->rb, &c->orph_tree); list_add_tail(&orphan->list, &c->orph_list); + orphan->del = 1; orphan->dnext = c->orph_dnext; c->orph_dnext = orphan; - dbg_mnt("ino %lu, new %d, tot %d", - inum, c->new_orphans, c->tot_orphans); + dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, + c->new_orphans, c->tot_orphans); return 0; } @@ -545,9 +550,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) * do_kill_orphans - remove orphan inodes from the index. * @c: UBIFS file-system description object * @sleb: scanned LEB - * @last_cmt_no: cmt_no of last orph node read is passed and returned here + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here * @outofdate: whether the LEB is out of date is returned here - * @last_flagged: whether the end orph node is encountered + * @last_flagged: whether the end orphan node is encountered * * This function is a helper to the 'kill_orphans()' function. It goes through * every orphan node in a LEB and for every inode number recorded, removes @@ -565,9 +570,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, list_for_each_entry(snod, &sleb->nodes, list) { if (snod->type != UBIFS_ORPH_NODE) { - ubifs_err("invalid node type %d in orphan area at " - "%d:%d", snod->type, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_err("invalid node type %d in orphan area at %d:%d", + snod->type, sleb->lnum, snod->offs); + ubifs_dump_node(c, snod->node); return -EINVAL; } @@ -578,8 +583,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, /* * The commit number on the master node may be less, because * of a failed commit. If there are several failed commits in a - * row, the commit number written on orph nodes will continue to - * increase (because the commit number is adjusted here) even + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even * though the commit number on the master node stays the same * because the master node has not been re-written. */ @@ -587,15 +592,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, c->cmt_no = cmt_no; if (cmt_no < *last_cmt_no && *last_flagged) { /* - * The last orph node had a higher commit number and was - * flagged as the last written for that commit number. - * That makes this orph node, out of date. + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node, out of date. */ if (!first) { - ubifs_err("out of order commit number %llu in " - "orphan node at %d:%d", + ubifs_err("out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } dbg_rcvry("out of date LEB %d", sleb->lnum); @@ -609,7 +613,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { inum = le64_to_cpu(orph->inos[i]); - dbg_rcvry("deleting orphaned inode %lu", inum); + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); err = ubifs_tnc_remove_ino(c, inum); if (err) return err; @@ -655,10 +660,10 @@ static int kill_orphans(struct ubifs_info *c) /* * Orph nodes always start at c->orph_first and are written to each * successive LEB in turn. Generally unused LEBs will have been unmapped - * but may contain out of date orph nodes if the unmap didn't go - * through. In addition, the last orph node written for each commit is + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is * marked (top bit of orph->cmt_no is set to 1). It is possible that - * there are orph nodes from the next commit (i.e. the commit did not + * there are orphan nodes from the next commit (i.e. the commit did not * complete successfully). In that case, no orphans will have been lost * due to the way that orphans are written, and any orphans added will * be valid orphans anyway and so can be deleted. @@ -667,9 +672,11 @@ static int kill_orphans(struct ubifs_info *c) struct ubifs_scan_leb *sleb; dbg_rcvry("LEB %d", lnum); - sleb = ubifs_scan(c, lnum, 0, c->sbuf); + sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) { - sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + if (PTR_ERR(sleb) == -EUCLEAN) + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, -1); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -715,12 +722,14 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) if (unclean) err = kill_orphans(c); else if (!read_only) - err = clear_orphans(c); + err = ubifs_clear_orphans(c); return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ struct check_orphan { struct rb_node rb; @@ -806,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) static void dbg_free_check_tree(struct rb_root *root) { - struct rb_node *this = root->rb_node; - struct check_orphan *o; + struct check_orphan *o, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - o = rb_entry(this, struct check_orphan, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &o->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(o, n, root, rb) kfree(o); - } } static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, @@ -840,8 +832,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, if (inum != ci->last_ino) { /* Lowest node type is the inode node, so it comes first */ if (key_type(c, &zbr->key) != UBIFS_INO_KEY) - ubifs_err("found orphan node ino %lu, type %d", inum, - key_type(c, &zbr->key)); + ubifs_err("found orphan node ino %lu, type %d", + (unsigned long)inum, key_type(c, &zbr->key)); ci->last_ino = inum; ci->tot_inos += 1; err = ubifs_tnc_read_node(c, zbr, ci->node); @@ -853,7 +845,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, /* Must be recorded as an orphan */ if (!dbg_find_check_orphan(&ci->root, inum) && !dbg_find_orphan(c, inum)) { - ubifs_err("missing orphan, ino %lu", inum); + ubifs_err("missing orphan, ino %lu", + (unsigned long)inum); ci->missing += 1; } } @@ -887,15 +880,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) { int lnum, err = 0; + void *buf; /* Check no-orphans flag and skip this if no orphans */ if (c->no_orphs) return 0; + buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -907,6 +907,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) break; } + vfree(buf); return err; } @@ -915,7 +916,7 @@ static int dbg_check_orphans(struct ubifs_info *c) struct check_info ci; int err; - if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + if (!dbg_is_chk_orph(c)) return 0; ci.last_ino = 0; @@ -954,5 +955,3 @@ out: kfree(ci.node); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 77d26c141cf..c14adb2f420 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -23,14 +23,32 @@ /* * This file implements functions needed to recover from unclean un-mounts. * When UBIFS is mounted, it checks a flag on the master node to determine if - * an un-mount was completed sucessfully. If not, the process of mounting - * incorparates additional checking and fixing of on-flash data structures. + * an un-mount was completed successfully. If not, the process of mounting + * incorporates additional checking and fixing of on-flash data structures. * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. */ #include <linux/crc32.h> +#include <linux/slab.h> #include "ubifs.h" /** @@ -53,6 +71,25 @@ static int is_empty(void *buf, int len) } /** + * first_non_ff - find offset of the first non-0xff byte. + * @buf: buffer to search in + * @len: length of buffer + * + * This function returns offset of the first non-0xff byte in @buf or %-1 if + * the buffer contains only 0xff bytes. + */ +static int first_non_ff(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return i; + return -1; +} + +/** * get_master_node - get the last valid master node allowing for corruption. * @c: UBIFS file-system description object * @lnum: LEB number @@ -80,7 +117,7 @@ static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, if (!sbuf) return -ENOMEM; - err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, sbuf, 0, c->leb_size, 0); if (err && err != -EBADMSG) goto out_free; @@ -168,18 +205,18 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, struct ubifs_mst_node *mst) { int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; - uint32_t save_flags; + __le32 save_flags; dbg_rcvry("recovery"); save_flags = mst->flags; - mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY); + mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); - err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, mst, sz); if (err) goto out; - err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum + 1, mst, sz); if (err) goto out; out: @@ -237,7 +274,8 @@ int ubifs_recover_master_node(struct ubifs_info *c) if (cor1) goto out_err; mst = mst1; - } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + } else if (offs1 == 0 && + c->leb_size - offs2 - sz < sz) { /* 1st LEB was unmapped and written, 2nd not */ if (cor1) goto out_err; @@ -267,12 +305,12 @@ int ubifs_recover_master_node(struct ubifs_info *c) mst = mst2; } - dbg_rcvry("recovered master node from LEB %d", + ubifs_msg("recovered master node from LEB %d", (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); - if ((c->vfs_sb->s_flags & MS_RDONLY)) { + if (c->ro_mount) { /* Read-only mode. Keep a copy for switching to rw mode */ c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); if (!c->rcvrd_mst_node) { @@ -280,6 +318,32 @@ int ubifs_recover_master_node(struct ubifs_info *c) goto out_free; } memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since the step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounter R/W later. + * + * Thus, to force the recovery by marking the master node as + * dirty. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); } else { /* Write the recovered master node */ c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; @@ -298,12 +362,12 @@ out_err: out_free: ubifs_err("failed to recover master node"); if (mst1) { - dbg_err("dumping first master node"); - dbg_dump_node(c, mst1); + ubifs_err("dumping first master node"); + ubifs_dump_node(c, mst1); } if (mst2) { - dbg_err("dumping second master node"); - dbg_dump_node(c, mst2); + ubifs_err("dumping second master node"); + ubifs_dump_node(c, mst2); } vfree(buf2); vfree(buf1); @@ -342,44 +406,23 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * @offs: offset to check * * This function returns %1 if @offs was in the last write to the LEB whose data - * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next min_io_size boundary (or a - * bit less than the common header size if min_io_size is one). + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { - int empty_offs; - int check_len; + int empty_offs, check_len; uint8_t *p; - if (c->min_io_size == 1) { - check_len = c->leb_size - offs; - p = buf + check_len; - for (; check_len > 0; check_len--) - if (*--p != 0xff) - break; - /* - * 'check_len' is the size of the corruption which cannot be - * more than the size of 1 node if it was caused by an unclean - * unmount. - */ - if (check_len > UBIFS_MAX_NODE_SZ) - return 0; - return 1; - } - /* - * Round up to the next c->min_io_size boundary i.e. 'offs' is in the - * last wbuf written. After that should be empty space. + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. */ - empty_offs = ALIGN(offs + 1, c->min_io_size); + empty_offs = ALIGN(offs + 1, c->max_write_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; - - for (; check_len > 0; check_len--) - if (*p++ != 0xff) - return 0; - return 1; + return is_empty(p, check_len); } /** @@ -392,7 +435,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) * * This function pads up to the next min_io_size boundary (if there is one) and * sets empty space to all 0xff. @buf, @offs and @len are updated to the next - * min_io_size boundary (if there is one). + * @c->min_io_size boundary. */ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, int *offs, int *len) @@ -402,11 +445,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); - if (c->min_io_size == 1) { - memset(*buf, 0xff, c->leb_size - *offs); - return; - } - ubifs_assert(!(*offs & 7)); empty_offs = ALIGN(*offs, c->min_io_size); pad_len = empty_offs - *offs; @@ -425,59 +463,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, * @lnum: LEB number of the LEB from which @buf was read * @offs: offset from which @buf was read * - * This function scans @buf for more nodes and returns %0 is a node is found and - * %1 if no more nodes are found. + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. */ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int lnum, int offs) { - int skip, next_offs = 0; + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); - if (len > UBIFS_DATA_NODE_SZ) { - struct ubifs_ch *ch = buf; - int dlen = le32_to_cpu(ch->len); - - if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && - dlen <= UBIFS_MAX_DATA_NODE_SZ) - /* The corrupt node looks like a data node */ - next_offs = ALIGN(offs + dlen, 8); - } - - if (c->min_io_size == 1) - skip = 8; - else - skip = ALIGN(offs + 1, c->min_io_size) - offs; - - offs += skip; - buf += skip; - len -= skip; - while (len > 8) { - struct ubifs_ch *ch = buf; - uint32_t magic = le32_to_cpu(ch->magic); - int ret; - - if (magic == UBIFS_NODE_MAGIC) { - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); - if (ret == SCANNED_A_NODE || ret > 0) { - /* - * There is a small chance this is just data in - * a data node, so check that possibility. e.g. - * this is part of a file that itself contains - * a UBIFS image. - */ - if (next_offs && offs + le32_to_cpu(ch->len) <= - next_offs) - continue; - dbg_rcvry("unexpected node at %d:%d", lnum, - offs); - return 0; - } - } - offs += 8; - buf += 8; - len -= 8; + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; } - return 1; + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->max_write_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; } /** @@ -500,7 +514,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, endpt = snod->offs + snod->len; } - if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { + if (c->ro_mount && !c->remounting_rw) { /* Add to recovery list */ struct ubifs_unclean_leb *ucleb; @@ -526,8 +540,8 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, int len = ALIGN(endpt, c->min_io_size); if (start) { - err = ubi_read(c->ubi, lnum, sleb->buf, 0, - start); + err = ubifs_leb_read(c, lnum, sleb->buf, 0, + start, 1); if (err) return err; } @@ -541,8 +555,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_pad(c, buf, pad_len); } } - err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, - UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sleb->buf, len); if (err) return err; } @@ -551,16 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, } /** - * drop_incomplete_group - drop nodes from an incomplete group. + * drop_last_group - drop the last group of nodes. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here * - * This function returns %1 if nodes are dropped and %0 otherwise. + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * group of nodes of the scanned LEB. */ -static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) { - int dropped = 0; - while (!list_empty(&sleb->nodes)) { struct ubifs_scan_node *snod; struct ubifs_ch *ch; @@ -569,15 +581,41 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) list); ch = snod->node; if (ch->group_type != UBIFS_IN_NODE_GROUP) - return dropped; - dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); + break; + + dbg_rcvry("dropping grouped node at %d:%d", + sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + } +} + +/** + * drop_last_node - drop the last node. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped + * + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB. + */ +static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs) +{ + struct ubifs_scan_node *snod; + + if (!list_empty(&sleb->nodes)) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); + + dbg_rcvry("dropping last node at %d:%d", + sleb->lnum, snod->offs); *offs = snod->offs; list_del(&snod->list); kfree(snod); sleb->nodes_cnt -= 1; - dropped = 1; } - return dropped; } /** @@ -586,33 +624,30 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) * @lnum: LEB number * @offs: offset * @sbuf: LEB-sized buffer to use - * @grouped: nodes may be grouped for recovery + * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not + * belong to any journal head) * * This function does a scan of a LEB, but caters for errors that might have * been caused by the unclean unmount from which we are attempting to recover. - * - * This function returns %0 on success and a negative error code on failure. + * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is + * found, and a negative error code in case of failure. */ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, - int offs, void *sbuf, int grouped) + int offs, void *sbuf, int jhead) { - int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; - int empty_chkd = 0, start = offs; + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; + int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped; struct ubifs_scan_leb *sleb; void *buf = sbuf + offs; - dbg_rcvry("%d:%d", lnum, offs); + dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped); sleb = ubifs_start_scan(c, lnum, offs, sbuf); if (IS_ERR(sleb)) return sleb; - if (sleb->ecc) - need_clean = 1; - + ubifs_assert(len >= 8); while (len >= 8) { - int ret; - dbg_scan("look at LEB %d:%d (%d bytes left)", lnum, offs, len); @@ -622,8 +657,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * Scan quietly until there is an error from which we cannot * recover */ - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); - + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); if (ret == SCANNED_A_NODE) { /* A valid node, and not a padding node */ struct ubifs_ch *ch = buf; @@ -636,98 +670,127 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, offs += node_len; buf += node_len; len -= node_len; - continue; - } - - if (ret > 0) { + } else if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; buf += ret; len -= ret; - continue; - } - - if (ret == SCANNED_EMPTY_SPACE) { - if (!is_empty(buf, len)) { - if (!is_last_write(c, buf, offs)) - break; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } - empty_chkd = 1; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption (%d) at %d:%d", + ret, lnum, offs); break; + } else { + ubifs_err("unexpected return value %d", ret); + err = -EINVAL; + goto error; } + } - if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { + int corruption = first_non_ff(buf, len); - if (ret == SCANNED_A_CORRUPT_NODE) - if (no_more_nodes(c, buf, len, lnum, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } - - if (quiet) { - /* Redo the last scan but noisily */ - quiet = 0; - continue; - } - - switch (ret) { - case SCANNED_GARBAGE: - dbg_err("garbage"); - goto corrupted; - case SCANNED_A_CORRUPT_NODE: - case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); - goto corrupted; - default: - dbg_err("unknown"); + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ + ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d", + lnum, offs, corruption); + /* Make sure we dump interesting non-0xFF data */ + offs += corruption; + buf += corruption; goto corrupted; } } - if (!empty_chkd && !is_empty(buf, len)) { - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } else { - ubifs_err("corrupt empty space at LEB %d:%d", - lnum, offs); - goto corrupted; - } - } + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. + */ + drop_last_group(sleb, &offs); - /* Drop nodes from incomplete group */ - if (grouped && drop_incomplete_group(sleb, &offs)) { - buf = sbuf + offs; - len = c->leb_size - offs; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; + if (jhead == GCHD) { + /* + * If this LEB belongs to the GC head then while we are in the + * middle of the same min. I/O unit keep dropping nodes. So + * basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely + * with all the uncorrupted nodes which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the + * corruption starts B, and the previous min. I/O unit A. The + * below code tries to deal with a situation when half of B + * contains valid nodes or the end of a valid node, and the + * second half of B contains corrupted data or garbage. This + * means that UBIFS had been writing to B just before the power + * cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully + * and the other half not, but this is possible in our 'failure + * mode emulation' infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Why + * can't we just clean-up the second half of B by putting a + * padding node there? We can, and this works fine with one + * exception which was reproduced with power cut emulation + * testing and happens extremely rarely. + * + * Imagine the file-system is full, we run GC which starts + * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is + * the current GC head LEB). The @c->gc_lnum is -1, which means + * that GC will retain LEB X and will try to continue. Imagine + * that LEB X is currently the dirtiest LEB, and the amount of + * used space in LEB Y is exactly the same as amount of free + * space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to + * LEB Y. We are here trying to recover LEB Y which is the GC + * head LEB. We find the min. I/O unit B as described above. + * Then we clean-up LEB Y by padding min. I/O unit. And later + * 'ubifs_rcvry_gc_commit()' function fails, because it cannot + * find a dirty LEB which could be GC'd into LEB Y! Even LEB X + * does not match because the amount of valid nodes there does + * not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and + * analysed is that we cannot mount the file-system with + * -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we + * should free min. I/O unit B in LEB Y completely and the last + * used min. I/O unit in LEB Y should be A. This is basically + * what the below code tries to do. + */ + while (offs > min_io_unit) + drop_last_node(sleb, &offs); } - if (offs % c->min_io_size) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); ubifs_end_scan(c, sleb, lnum, offs); - if (need_clean) { - err = fix_unclean_leb(c, sleb, start); - if (err) - goto error; - } + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; return sleb; +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + ubifs_err("corruption %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); err = -EUCLEAN; @@ -758,22 +821,23 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, return -ENOMEM; if (c->leb_size - offs < UBIFS_CS_NODE_SZ) goto out_err; - err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + err = ubifs_leb_read(c, lnum, (void *)cs_node, offs, + UBIFS_CS_NODE_SZ, 0); if (err && err != -EBADMSG) goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); if (ret != SCANNED_A_NODE) { - dbg_err("Not a valid node"); + ubifs_err("Not a valid node"); goto out_err; } if (cs_node->ch.node_type != UBIFS_CS_NODE) { - dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type); goto out_err; } if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { - dbg_err("CS node cmt_no %llu != current cmt_no %llu", - (unsigned long long)le64_to_cpu(cs_node->cmt_no), - c->cmt_no); + ubifs_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); goto out_err; } *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); @@ -797,7 +861,8 @@ out_free: * @sbuf: LEB-sized buffer to use * * This function does a scan of a LEB, but caters for errors that might have - * been caused by the unclean unmount from which we are attempting to recover. + * been caused by unclean reboots from which we are attempting to recover + * (assume that only the last log LEB can be corrupted by an unclean reboot). * * This function returns %0 on success and a negative error code on failure. */ @@ -816,7 +881,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, * We can only recover at the end of the log, so check that the * next log LEB is empty or out of date. */ - sleb = ubifs_scan(c, next_lnum, 0, sbuf); + sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0); if (IS_ERR(sleb)) return sleb; if (sleb->nodes_cnt) { @@ -835,15 +900,15 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, } } if (snod->sqnum > cs_sqnum) { - ubifs_err("unrecoverable log corruption " - "in LEB %d", lnum); + ubifs_err("unrecoverable log corruption in LEB %d", + lnum); ubifs_scan_destroy(sleb); return ERR_PTR(-EUCLEAN); } } ubifs_scan_destroy(sleb); } - return ubifs_recover_leb(c, lnum, offs, sbuf, 0); + return ubifs_recover_leb(c, lnum, offs, sbuf, -1); } /** @@ -857,15 +922,10 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, * * This function returns %0 on success and a negative error code on failure. */ -static int recover_head(const struct ubifs_info *c, int lnum, int offs, - void *sbuf) +static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err, need_clean = 0; + int len = c->max_write_size, err; - if (c->min_io_size > 1) - len = c->min_io_size; - else - len = 512; if (offs + len > c->leb_size) len = c->leb_size - offs; @@ -873,27 +933,15 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, return 0; /* Read at the head location and check it is empty flash */ - err = ubi_read(c->ubi, lnum, sbuf, offs, len); - if (err) - need_clean = 1; - else { - uint8_t *p = sbuf; - - while (len--) - if (*p++ != 0xff) { - need_clean = 1; - break; - } - } - - if (need_clean) { + err = ubifs_leb_read(c, lnum, sbuf, offs, len, 1); + if (err || !is_empty(sbuf, len)) { dbg_rcvry("cleaning head at %d:%d", lnum, offs); if (offs == 0) return ubifs_leb_unmap(c, lnum); - err = ubi_read(c->ubi, lnum, sbuf, 0, offs); + err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); if (err) return err; - return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, sbuf, offs); } return 0; @@ -916,11 +964,11 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, * * This function returns %0 on success and a negative error code on failure. */ -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) { int err; - ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); + ubifs_assert(!c->ro_mount || c->remounting_rw); dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); @@ -936,7 +984,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption. + * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use @@ -947,7 +995,7 @@ int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) * * This function returns %0 on success and a negative error code on failure. */ -static int clean_an_unclean_leb(const struct ubifs_info *c, +static int clean_an_unclean_leb(struct ubifs_info *c, struct ubifs_unclean_leb *ucleb, void *sbuf) { int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; @@ -963,7 +1011,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, return 0; } - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 0); if (err && err != -EBADMSG) return err; @@ -1023,7 +1071,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, } /* Write back the LEB atomically */ - err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sbuf, len); if (err) return err; @@ -1043,7 +1091,7 @@ static int clean_an_unclean_leb(const struct ubifs_info *c, * * This function returns %0 on success and a negative error code on failure. */ -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf) { dbg_rcvry("recovery"); while (!list_empty(&c->unclean_leb_list)) { @@ -1062,6 +1110,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) } /** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + ubifs_err("could not find an empty LEB"); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. * @c: UBIFS file-system description object * @@ -1083,58 +1178,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) { struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; struct ubifs_lprops lp; - int lnum, err; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); c->gc_lnum = -1; - if (wbuf->lnum == -1) { - dbg_rcvry("no GC head LEB"); - goto find_free; - } - /* - * See whether the used space in the dirtiest LEB fits in the GC head - * LEB. - */ - if (wbuf->offs == c->leb_size) { - dbg_rcvry("no room in GC head LEB"); - goto find_free; - } + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); if (err) { - if (err == -ENOSPC) - dbg_err("could not find a dirty LEB"); - return err; - } - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - lnum = lp.lnum; - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - if (lp.free != c->leb_size) { - err = ubifs_change_one_lp(c, lnum, c->leb_size, - 0, 0, 0, 0); - if (err) - return err; - } - err = ubifs_leb_unmap(c, lnum); - if (err) - return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); - } - /* - * There was no empty LEB so the used space in the dirtiest LEB must fit - * in the GC head LEB. - */ - if (lp.free + lp.dirty < wbuf->offs) { - dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", - lnum, wbuf->lnum, wbuf->offs); - err = ubifs_return_leb(c, lnum); - if (err) + if (err != -ENOSPC) return err; - goto find_free; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + /* * We run the commit before garbage collection otherwise subsequent * mounts will see the GC and orphan deletion in a different order. @@ -1143,11 +1206,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) err = ubifs_run_commit(c); if (err) return err; - /* - * The data in the dirtiest LEB fits in the GC head LEB, so do the GC - * - use locking to keep 'ubifs_assert()' happy. - */ - dbg_rcvry("GC'ing LEB %d", lnum); + + dbg_rcvry("GC'ing LEB %d", lp.lnum); mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); err = ubifs_garbage_collect_leb(c, &lp); if (err >= 0) { @@ -1158,42 +1218,22 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } mutex_unlock(&wbuf->io_mutex); if (err < 0) { - dbg_err("GC failed, error %d", err); + ubifs_err("GC failed, error %d", err); if (err == -EAGAIN) err = -EINVAL; return err; } - if (err != LEB_RETAINED) { - dbg_err("GC returned %d", err); + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) return -EINVAL; - } + err = ubifs_leb_unmap(c, c->gc_lnum); if (err) return err; - dbg_rcvry("allocated LEB %d for GC", lnum); - return 0; -find_free: - /* - * There is no GC head LEB or the free space in the GC head LEB is too - * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so - * GC is not run. - */ - lnum = ubifs_find_free_leb_for_idx(c); - if (lnum < 0) { - dbg_err("could not find an empty LEB"); - return lnum; - } - /* And reset the index flag */ - err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, - LPROPS_INDEX, 0); - if (err) - return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; } /** @@ -1295,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum) */ void ubifs_destroy_size_tree(struct ubifs_info *c) { - struct rb_node *this = c->size_tree.rb_node; - struct size_entry *e; + struct size_entry *e, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - e = rb_entry(this, struct size_entry, rb); + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { if (e->inode) iput(e->inode); - this = rb_parent(this); - if (this) { - if (this->rb_left == &e->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } kfree(e); } + c->size_tree = RB_ROOT; } @@ -1416,7 +1441,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) if (i_size >= e->d_size) return 0; /* Read the LEB */ - err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, c->leb_size, 1); if (err) goto out; /* Change the size field and recalculate the CRC */ @@ -1432,16 +1457,16 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) len -= 1; len = ALIGN(len + 1, c->min_io_size); /* Atomically write the fixed LEB back again */ - err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, c->sbuf, len); if (err) goto out; - dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs, - i_size, e->d_size); + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", + (unsigned long)e->inum, lnum, offs, i_size, e->d_size); return 0; out: ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", - e->inum, e->i_size, e->d_size, err); + (unsigned long)e->inum, e->i_size, e->d_size, err); return err; } @@ -1472,7 +1497,8 @@ int ubifs_recover_size(struct ubifs_info *c) return err; if (err == -ENOENT) { /* Remove data nodes that have no inode */ - dbg_rcvry("removing ino %lu", e->inum); + dbg_rcvry("removing ino %lu", + (unsigned long)e->inum); err = ubifs_tnc_remove_ino(c, e->inum); if (err) return err; @@ -1483,20 +1509,27 @@ int ubifs_recover_size(struct ubifs_info *c) e->i_size = le64_to_cpu(ino->size); } } + if (e->exists && e->i_size < e->d_size) { - if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { + if (c->ro_mount) { /* Fix the inode size and pin it in memory */ struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); inode = ubifs_iget(c->vfs_sb, e->inum); if (IS_ERR(inode)) return PTR_ERR(inode); + + ui = ubifs_inode(inode); if (inode->i_size < e->d_size) { dbg_rcvry("ino %lu size %lld -> %lld", - e->inum, e->d_size, - inode->i_size); + (unsigned long)e->inum, + inode->i_size, e->d_size); inode->i_size = e->d_size; - ubifs_inode(inode)->ui_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; e->inode = inode; this = rb_next(this); continue; @@ -1511,9 +1544,11 @@ int ubifs_recover_size(struct ubifs_info *c) iput(e->inode); } } + this = rb_next(this); rb_erase(&e->rb, &c->size_tree); kfree(e); } + return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 7399692af85..3187925e987 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -33,43 +33,32 @@ */ #include "ubifs.h" - -/* - * Replay flags. - * - * REPLAY_DELETION: node was deleted - * REPLAY_REF: node is a reference node - */ -enum { - REPLAY_DELETION = 1, - REPLAY_REF = 2, -}; +#include <linux/list_sort.h> /** - * struct replay_entry - replay tree entry. + * struct replay_entry - replay list entry. * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number - * @flags: replay flags - * @rb: links the replay tree + * @list: links the replay list * @key: node key * @nm: directory entry name * @old_size: truncation old size * @new_size: truncation new size - * @free: amount of free space in a bud - * @dirty: amount of dirty space in a bud from padding and deletion nodes * - * UBIFS journal replay must compare node sequence numbers, which means it must - * build a tree of node information to insert into the TNC. + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. */ struct replay_entry { int lnum; int offs; int len; + unsigned int deletion:1; unsigned long long sqnum; - int flags; - struct rb_node rb; + struct list_head list; union ubifs_key key; union { struct qstr nm; @@ -77,10 +66,6 @@ struct replay_entry { loff_t old_size; loff_t new_size; }; - struct { - int free; - int dirty; - }; }; }; @@ -88,83 +73,116 @@ struct replay_entry { * struct bud_entry - entry in the list of buds to replay. * @list: next bud in the list * @bud: bud description object - * @free: free bytes in the bud * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud */ struct bud_entry { struct list_head list; struct ubifs_bud *bud; - int free; unsigned long long sqnum; + int free; + int dirty; }; /** * set_bud_lprops - set free and dirty space used by a bud. * @c: UBIFS file-system description object - * @r: replay entry of bud + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. */ -static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) { const struct ubifs_lprops *lp; int err = 0, dirty; ubifs_get_lprops(c); - lp = ubifs_lpt_lookup_dirty(c, r->lnum); + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } dirty = lp->dirty; - if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { /* * The LEB was added to the journal with a starting offset of * zero which means the LEB must have been empty. The LEB - * property values should be lp->free == c->leb_size and - * lp->dirty == 0, but that is not the case. The reason is that - * the LEB was garbage collected. The garbage collector resets - * the free and dirty space without recording it anywhere except - * lprops, so if there is not a commit then lprops does not have - * that information next time the file system is mounted. + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. * * We do not need to adjust free space because the scan has told * us the exact value which is recorded in the replay entry as - * r->free. + * @b->free. * * However we do need to subtract from the dirty space the * amount of space that the garbage collector reclaimed, which * is the whole LEB minus the amount of space that was free. */ - dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); - dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); dirty -= c->leb_size - lp->free; /* * If the replay order was perfect the dirty space would now be - * zero. The order is not perfect because the the journal heads - * race with eachother. This is not a problem but is does mean + * zero. The order is not perfect because the journal heads + * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. */ if (dirty != 0) - dbg_msg("LEB %d lp: %d free %d dirty " - "replay: %d free %d dirty", r->lnum, lp->free, - lp->dirty, r->free, r->dirty); + dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty", + b->bud->lnum, lp->free, lp->dirty, b->free, + b->dirty); } - lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, lp->flags | LPROPS_TAKEN, 0); if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free); + out: ubifs_release_lprops(c); return err; } /** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** * trun_remove_range - apply a replay entry for a truncation to the TNC. * @c: UBIFS file-system description object * @r: replay entry of truncation @@ -200,24 +218,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) */ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { - int err, deletion = ((r->flags & REPLAY_DELETION) != 0); + int err; - dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, - r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); + dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ", + r->lnum, r->offs, r->len, r->deletion, r->sqnum); /* Set c->replay_sqnum to help deal with dangling branches. */ c->replay_sqnum = r->sqnum; - if (r->flags & REPLAY_REF) - err = set_bud_lprops(c, r); - else if (is_hash_key(c, &r->key)) { - if (deletion) + if (is_hash_key(c, &r->key)) { + if (r->deletion) err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); else err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, r->len, &r->nm); } else { - if (deletion) + if (r->deletion) switch (key_type(c, &r->key)) { case UBIFS_INO_KEY: { @@ -240,7 +256,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) return err; if (c->need_recovery) - err = ubifs_recover_size_accum(c, &r->key, deletion, + err = ubifs_recover_size_accum(c, &r->key, r->deletion, r->new_size); } @@ -248,68 +264,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) } /** - * destroy_replay_tree - destroy the replay. - * @c: UBIFS file-system description object + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @a: second replay entry * - * Destroy the replay tree. + * This is a comparios function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numer. Returns %1 if @a has + * greater sequence number and %-1 otherwise. */ -static void destroy_replay_tree(struct ubifs_info *c) +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) { - struct rb_node *this = c->replay_tree.rb_node; - struct replay_entry *r; - - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - r = rb_entry(this, struct replay_entry, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &r->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - if (is_hash_key(c, &r->key)) - kfree(r->nm.name); - kfree(r); - } - c->replay_tree = RB_ROOT; + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; } /** - * apply_replay_tree - apply the replay tree to the TNC. + * apply_replay_list - apply the replay list to the TNC. * @c: UBIFS file-system description object * - * Apply the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. */ -static int apply_replay_tree(struct ubifs_info *c) +static int apply_replay_list(struct ubifs_info *c) { - struct rb_node *this = rb_first(&c->replay_tree); + struct replay_entry *r; + int err; - while (this) { - struct replay_entry *r; - int err; + list_sort(c, &c->replay_list, &replay_entries_cmp); + list_for_each_entry(r, &c->replay_list, list) { cond_resched(); - r = rb_entry(this, struct replay_entry, rb); err = apply_replay_entry(c, r); if (err) return err; - this = rb_next(this); } + return 0; } /** - * insert_node - insert a node to the replay tree. + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -321,39 +346,25 @@ static int apply_replay_tree(struct ubifs_info *c) * @old_size: truncation old size * @new_size: truncation new size * - * This function inserts a scanned non-direntry node to the replay tree. The - * replay tree is an RB-tree containing @struct replay_entry elements which are - * indexed by the sequence number. The replay tree is applied at the very end - * of the replay process. Since the tree is sorted in sequence number order, - * the older modifications are applied first. This function returns zero in - * case of success and a negative error code in case of failure. + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. */ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, unsigned long long sqnum, int deletion, int *used, loff_t old_size, loff_t new_size) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); + if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; @@ -363,19 +374,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; - r->flags = (deletion ? REPLAY_DELETION : 0); + key_copy(c, key, &r->key); r->old_size = old_size; r->new_size = new_size; - key_copy(c, key, &r->key); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } /** - * insert_dent - insert a directory entry node into the replay tree. + * insert_dent - insert a directory entry node into the replay list. * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -387,43 +397,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * @deletion: non-zero if this is a deletion * @used: number of bytes in use in a LEB * - * This function inserts a scanned directory entry node to the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. - * - * This function is also used for extended attribute entries because they are - * implemented as directory entry nodes. + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, const char *name, int nlen, unsigned long long sqnum, int deletion, int *used) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; char *nbuf; + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } - if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; + nbuf = kmalloc(nlen + 1, GFP_KERNEL); if (!nbuf) { kfree(r); @@ -435,17 +427,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; + key_copy(c, key, &r->key); r->nm.len = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; r->nm.name = nbuf; - r->flags = (deletion ? REPLAY_DELETION : 0); - key_copy(c, key, &r->key); - ubifs_assert(!*p); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } @@ -482,31 +472,92 @@ int ubifs_validate_entry(struct ubifs_info *c, } /** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubifs_leb_read(c, next->lnum, (char *)&data, next->start, 4, 1); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** * replay_bud - replay a bud logical eraseblock. * @c: UBIFS file-system description object - * @lnum: bud logical eraseblock number to replay - * @offs: bud start offset - * @jhead: journal head to which this bud belongs - * @free: amount of free space in the bud is returned here - * @dirty: amount of dirty space from padding and deletion nodes is returned - * here + * @b: bud entry which describes the bud * - * This function returns zero in case of success and a negative error code in - * case of failure. + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ -static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, - int *free, int *dirty) +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) { - int err = 0, used = 0; + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_bud *bud; - dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); - if (c->need_recovery) - sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead); else - sleb = ubifs_scan(c, lnum, offs, c->sbuf); + sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); if (IS_ERR(sleb)) return PTR_ERR(sleb); @@ -620,20 +671,14 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, goto out; } - bud = ubifs_search_bud(c, lnum); - if (!bud) - BUG(); - + ubifs_assert(ubifs_search_bud(c, lnum)); ubifs_assert(sleb->endpt - offs >= used); ubifs_assert(sleb->endpt % c->min_io_size == 0); - if (sleb->endpt + c->min_io_size <= c->leb_size && - !(c->vfs_sb->s_flags & MS_RDONLY)) - err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, - sleb->endpt, UBI_SHORTTERM); - - *dirty = sleb->endpt - offs - used; - *free = c->leb_size - sleb->endpt; + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", + lnum, b->dirty, b->free); out: ubifs_scan_destroy(sleb); @@ -641,61 +686,12 @@ out: out_dump: ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } /** - * insert_ref_node - insert a reference node to the replay tree. - * @c: UBIFS file-system description object - * @lnum: node logical eraseblock number - * @offs: node offset - * @sqnum: sequence number - * @free: amount of free space in bud - * @dirty: amount of dirty space from padding and deletion nodes - * - * This function inserts a reference node to the replay tree and returns zero - * in case of success ort a negative error code in case of failure. - */ -static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, - unsigned long long sqnum, int free, int dirty) -{ - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; - struct replay_entry *r; - - dbg_mnt("add ref LEB %d:%d", lnum, offs); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay tree"); - return -EINVAL; - } - - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->lnum = lnum; - r->offs = offs; - r->sqnum = sqnum; - r->flags = REPLAY_REF; - r->free = free; - r->dirty = dirty; - - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); - return 0; -} - -/** * replay_buds - replay all buds. * @c: UBIFS file-system description object * @@ -705,17 +701,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, static int replay_buds(struct ubifs_info *c) { struct bud_entry *b; - int err, uninitialized_var(free), uninitialized_var(dirty); + int err; + unsigned long long prev_sqnum = 0; list_for_each_entry(b, &c->replay_buds, list) { - err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, - &free, &dirty); - if (err) - return err; - err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, - free, dirty); + err = replay_bud(c, b); if (err) return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; } return 0; @@ -836,10 +831,16 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) const struct ubifs_cs_node *node; dbg_mnt("replay log LEB %d:%d", lnum, offs); - sleb = ubifs_scan(c, lnum, offs, sbuf); + sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery); if (IS_ERR(sleb)) { - if (c->need_recovery) - sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) + return PTR_ERR(sleb); + /* + * Note, the below function will recover this log LEB only if + * it is the last, because unclean reboots can possibly corrupt + * only the tail of the log. + */ + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); if (IS_ERR(sleb)) return PTR_ERR(sleb); } @@ -850,7 +851,6 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) } node = sleb->buf; - snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); if (c->cs_sqnum == 0) { /* @@ -861,16 +861,15 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * numbers. */ if (snod->type != UBIFS_CS_NODE) { - dbg_err("first log node at LEB %d:%d is not CS node", - lnum, offs); + ubifs_err("first log node at LEB %d:%d is not CS node", + lnum, offs); goto out_dump; } if (le64_to_cpu(node->cmt_no) != c->cmt_no) { - dbg_err("first CS node at LEB %d:%d has wrong " - "commit number %llu expected %llu", - lnum, offs, - (unsigned long long)le64_to_cpu(node->cmt_no), - c->cmt_no); + ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); goto out_dump; } @@ -883,7 +882,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * This means that we reached end of log and now * look to the older log data, which was already * committed but the eraseblock was not erased (UBIFS - * only unmaps it). So this basically means we have to + * only un-maps it). So this basically means we have to * exit with "end of log" code. */ err = 1; @@ -892,12 +891,11 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) /* Make sure the first node sits at offset zero of the LEB */ if (snod->offs != 0) { - dbg_err("first node is not at zero offset"); + ubifs_err("first node is not at zero offset"); goto out_dump; } list_for_each_entry(snod, &sleb->nodes, list) { - cond_resched(); if (snod->sqnum >= SQNUM_WATERMARK) { @@ -906,8 +904,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) } if (snod->sqnum < c->cs_sqnum) { - dbg_err("bad sqnum %llu, commit sqnum %llu", - snod->sqnum, c->cs_sqnum); + ubifs_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); goto out_dump; } @@ -957,9 +955,9 @@ out: return err; out_dump: - ubifs_err("log error detected while replying the log at LEB %d:%d", + ubifs_err("log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -1009,8 +1007,7 @@ out: */ int ubifs_replay_journal(struct ubifs_info *c) { - int err, i, lnum, offs, free; - void *sbuf = NULL; + int err, lnum, free; BUILD_BUG_ON(UBIFS_TRUN_KEY > 5); @@ -1025,51 +1022,48 @@ int ubifs_replay_journal(struct ubifs_info *c) return -EINVAL; } - sbuf = vmalloc(c->leb_size); - if (!sbuf) - return -ENOMEM; - dbg_mnt("start replaying the journal"); - c->replaying = 1; - lnum = c->ltail_lnum = c->lhead_lnum; - offs = c->lhead_offs; - for (i = 0; i < c->log_lebs; i++, lnum++) { - if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) { - /* - * The log is logically circular, we reached the last - * LEB, switch to the first one. - */ - lnum = UBIFS_LOG_LNUM; - offs = 0; - } - err = replay_log_leb(c, lnum, offs, sbuf); + do { + err = replay_log_leb(c, lnum, 0, c->sbuf); if (err == 1) /* We hit the end of the log */ break; if (err) goto out; - offs = 0; - } + lnum = ubifs_next_log_lnum(c, lnum); + } while (lnum != c->ltail_lnum); err = replay_buds(c); if (err) goto out; - err = apply_replay_tree(c); + err = apply_replay_list(c); if (err) goto out; + err = set_buds_lprops(c); + if (err) + goto out; + + /* + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs + * depend on it. This means we have to initialize it to make sure + * budgeting works properly. + */ + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); - dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " - "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, - c->highest_inum); + dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu", + c->lhead_lnum, c->lhead_offs, c->max_sqnum, + (unsigned long)c->highest_inum); out: - destroy_replay_tree(c); + destroy_replay_list(c); destroy_bud_list(c); - vfree(sbuf); c->replaying = 0; return err; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 2bf753b3888..4c37607a958 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -27,7 +27,9 @@ */ #include "ubifs.h" +#include <linux/slab.h> #include <linux/random.h> +#include <linux/math64.h> /* * Default journal size in logical eraseblocks as a percent of total @@ -80,7 +82,8 @@ static int create_default_filesystem(struct ubifs_info *c) int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; int min_leb_cnt = UBIFS_MIN_LEB_CNT; - uint64_t tmp64, main_bytes; + long long tmp64, main_bytes; + __le64 tmp_le64; /* Some functions called from here depend on the @c->key_len filed */ c->key_len = UBIFS_SK_LEN; @@ -127,7 +130,6 @@ static int create_default_filesystem(struct ubifs_info *c) * orphan node. */ orph_lebs = UBIFS_MIN_ORPH_LEBS; -#ifdef CONFIG_UBIFS_FS_DEBUG if (c->leb_cnt - min_leb_cnt > 1) /* * For debugging purposes it is better to have at least 2 @@ -135,7 +137,6 @@ static int create_default_filesystem(struct ubifs_info *c) * consolidations and would be stressed more. */ orph_lebs += 1; -#endif main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; main_lebs -= orph_lebs; @@ -159,7 +160,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (!sup) return -ENOMEM; - tmp64 = (uint64_t)max_buds * c->leb_size; + tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; @@ -178,19 +179,22 @@ static int create_default_filesystem(struct ubifs_info *c) sup->fanout = cpu_to_le32(DEFAULT_FANOUT); sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); - sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); generate_random_uuid(sup->uuid); - main_bytes = (uint64_t)main_lebs * c->leb_size; - tmp64 = main_bytes * DEFAULT_RP_PERCENT; - do_div(tmp64, 100); + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); - err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0); kfree(sup); if (err) return err; @@ -241,19 +245,18 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0); if (err) { kfree(mst); return err; } - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, + 0); kfree(mst); if (err) return err; @@ -276,8 +279,7 @@ static int create_default_filesystem(struct ubifs_info *c) key_write_idx(c, &key, &br->key); br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0); kfree(idx); if (err) return err; @@ -295,10 +297,10 @@ static int create_default_filesystem(struct ubifs_info *c) ino->ch.node_type = UBIFS_INO_NODE; ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); - ino->atime_sec = tmp; - ino->ctime_sec = tmp; - ino->mtime_sec = tmp; + tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp_le64; + ino->ctime_sec = tmp_le64; + ino->mtime_sec = tmp_le64; ino->atime_nsec = 0; ino->ctime_nsec = 0; ino->mtime_nsec = 0; @@ -309,8 +311,7 @@ static int create_default_filesystem(struct ubifs_info *c) ino->flags = cpu_to_le32(UBIFS_COMPR_FL); err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, - main_first + DEFAULT_DATA_LEB, 0, - UBI_UNKNOWN); + main_first + DEFAULT_DATA_LEB, 0); kfree(ino); if (err) return err; @@ -329,8 +330,7 @@ static int create_default_filesystem(struct ubifs_info *c) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, - 0, UBI_UNKNOWN); + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); kfree(cs); ubifs_msg("default file-system created"); @@ -391,9 +391,8 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { - ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " - "%d minimum required", c->leb_cnt, c->vi.size, - min_leb_cnt); + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required", + c->leb_cnt, c->vi.size, min_leb_cnt); goto failed; } @@ -404,13 +403,22 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) } if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { - err = 7; + ubifs_err("too few main LEBs count %d, must be at least %d", + c->main_lebs, UBIFS_MIN_MAIN_LEBS); + goto failed; + } + + max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; + if (c->max_bud_bytes < max_bytes) { + ubifs_err("too small journal (%lld bytes), must be at least %lld bytes", + c->max_bud_bytes, max_bytes); goto failed; } - if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || - c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { - err = 8; + max_bytes = (long long)c->leb_size * c->main_lebs; + if (c->max_bud_bytes > max_bytes) { + ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area", + c->max_bud_bytes, max_bytes); goto failed; } @@ -444,7 +452,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } - max_bytes = c->main_lebs * (long long)c->leb_size; if (c->rp_size < 0 || max_bytes < c->rp_size) { err = 14; goto failed; @@ -460,7 +467,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err("bad superblock, error %d", err); - dbg_dump_node(c, sup); + ubifs_dump_node(c, sup); return -EINVAL; } @@ -469,7 +476,8 @@ failed: * @c: UBIFS file-system description object * * This function returns a pointer to the superblock node or a negative error - * code. + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. */ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) { @@ -502,7 +510,7 @@ int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); - return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len); } /** @@ -528,17 +536,35 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + /* * The software supports all previous versions but not future versions, * due to the unavailability of time-travelling equipment. */ - c->fmt_version = le32_to_cpu(sup->fmt_version); if (c->fmt_version > UBIFS_FORMAT_VERSION) { - ubifs_err("on-flash format version is %d, but software only " - "supports up to version %d", c->fmt_version, - UBIFS_FORMAT_VERSION); - err = -EINVAL; - goto out; + ubifs_assert(!c->ro_media || c->ro_mount); + if (!c->ro_mount || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; } if (c->fmt_version < 3) { @@ -581,23 +607,23 @@ int ubifs_read_superblock(struct ubifs_info *c) c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; c->fanout = le32_to_cpu(sup->fanout); c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); - c->default_compr = le16_to_cpu(sup->default_compr); c->rp_size = le64_to_cpu(sup->rp_size); - c->rp_uid = le32_to_cpu(sup->rp_uid); - c->rp_gid = le32_to_cpu(sup->rp_gid); + c->rp_uid = make_kuid(&init_user_ns, le32_to_cpu(sup->rp_uid)); + c->rp_gid = make_kgid(&init_user_ns, le32_to_cpu(sup->rp_gid)); sup_flags = le32_to_cpu(sup->flags); + if (!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); - memcpy(&c->uuid, &sup->uuid, 16); - c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); - if (c->vfs_sb->s_flags & MS_RDONLY) + if (c->ro_mount) dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", c->old_leb_cnt, c->leb_cnt); else { @@ -620,10 +646,162 @@ int ubifs_read_superblock(struct ubifs_info *c) c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; c->main_first = c->leb_cnt - c->main_lebs; - c->report_rp_size = ubifs_reported_space(c, c->rp_size); err = validate_sb(c, sup); out: kfree(sup); return err; } + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. + */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubifs_leb_unmap(c, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubifs_leb_read(c, lnum, c->sbuf, 0, len, 1); + if (err) + return err; + + return ubifs_leb_change(c, lnum, c->sbuf, len); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* + * Fixup the log head which contains the only a CS node at the + * beginning. + */ + err = fixup_leb(c, c->lhead_lnum, + ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size)); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. + */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg("start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg("free space fixup complete"); + return err; +} diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index acf5c5fffc6..58aa05df2bb 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -75,7 +75,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, magic = le32_to_cpu(ch->magic); if (magic == 0xFFFFFFFF) { - dbg_scan("hit empty space"); + dbg_scan("hit empty space at LEB %d:%d", lnum, offs); return SCANNED_EMPTY_SPACE; } @@ -85,9 +85,10 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (len < UBIFS_CH_SZ) return SCANNED_GARBAGE; - dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + dbg_scan("scanning %s at LEB %d:%d", + dbg_ntype(ch->node_type), lnum, offs); - if (ubifs_check_node(c, buf, lnum, offs, quiet)) + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { @@ -101,22 +102,21 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err("bad pad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, pad); + ubifs_dump_node(c, pad); } return SCANNED_A_BAD_PAD_NODE; } /* Make the node pads to 8-byte boundary */ if ((node_len + pad_len) & 7) { - if (!quiet) { - dbg_err("bad padding length %d - %d", - offs, offs + node_len + pad_len); - } + if (!quiet) + ubifs_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); return SCANNED_A_BAD_PAD_NODE; } - dbg_scan("%d bytes padded, offset now %d", - pad_len, ALIGN(offs + node_len + pad_len, 8)); + dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len, + lnum, offs, ALIGN(offs + node_len + pad_len, 8)); return node_len + pad_len; } @@ -149,10 +149,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, INIT_LIST_HEAD(&sleb->nodes); sleb->buf = sbuf; - err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); if (err && err != -EBADMSG) { - ubifs_err("cannot read %d bytes from LEB %d:%d," - " error %d", c->leb_size - offs, lnum, offs, err); + ubifs_err("cannot read %d bytes from LEB %d:%d, error %d", + c->leb_size - offs, lnum, offs, err); kfree(sleb); return ERR_PTR(err); } @@ -198,7 +198,7 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, struct ubifs_ino_node *ino = buf; struct ubifs_scan_node *snod; - snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + snod = kmalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); if (!snod) return -ENOMEM; @@ -213,13 +213,15 @@ int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, case UBIFS_DENT_NODE: case UBIFS_XENT_NODE: case UBIFS_DATA_NODE: - case UBIFS_TRUN_NODE: /* * The key is in the same place in all keyed * nodes. */ key_read(c, &ino->key, &snod->key); break; + default: + invalid_key_init(c, &snod->key); + break; } list_add_tail(&snod->list, &sleb->nodes); sleb->nodes_cnt += 1; @@ -238,13 +240,11 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, { int len; - ubifs_err("corrupted data at LEB %d:%d", lnum, offs); - if (dbg_failure_mode) - return; + ubifs_err("corruption at LEB %d:%d", lnum, offs); len = c->leb_size - offs; - if (len > 4096) - len = 4096; - dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + if (len > 8192) + len = 8192; + ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); } @@ -253,13 +253,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, * @c: UBIFS file-system description object * @lnum: logical eraseblock number * @offs: offset to start at (usually zero) - * @sbuf: scan buffer (must be c->leb_size) + * @sbuf: scan buffer (must be of @c->leb_size bytes in size) + * @quiet: print no messages * * This function scans LEB number @lnum and returns complete information about - * its contents. Returns an error code in case of failure. + * its contents. Returns the scaned information in case of success and, + * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case + * of failure. + * + * If @quiet is non-zero, this function does not print large and scary + * error messages and flash dumps in case of errors. */ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, - int offs, void *sbuf) + int offs, void *sbuf, int quiet) { void *buf = sbuf + offs; int err, len = c->leb_size - offs; @@ -278,8 +284,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, cond_resched(); - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); - + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; @@ -294,17 +299,18 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, switch (ret) { case SCANNED_GARBAGE: - dbg_err("garbage"); + ubifs_err("garbage"); goto corrupted; case SCANNED_A_NODE: break; case SCANNED_A_CORRUPT_NODE: case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); + ubifs_err("bad node"); goto corrupted; default: - dbg_err("unknown"); - goto corrupted; + ubifs_err("unknown"); + err = -EINVAL; + goto error; } err = ubifs_add_snod(c, sleb, buf, offs); @@ -317,8 +323,12 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, len -= node_len; } - if (offs % c->min_io_size) + if (offs % c->min_io_size) { + if (!quiet) + ubifs_err("empty space starts at non-aligned offset %d", + offs); goto corrupted; + } ubifs_end_scan(c, sleb, lnum, offs); @@ -327,18 +337,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, break; for (; len; offs++, buf++, len--) if (*(uint8_t *)buf != 0xff) { - ubifs_err("corrupt empty space at LEB %d:%d", - lnum, offs); + if (!quiet) + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); goto corrupted; } return sleb; corrupted: - ubifs_scanned_corruption(c, lnum, offs, buf); + if (!quiet) { + ubifs_scanned_corruption(c, lnum, offs, buf); + ubifs_err("LEB %d scanning failed", lnum); + } err = -EUCLEAN; + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + error: - ubifs_err("LEB %d scanning failed", lnum); + ubifs_err("LEB %d scanning failed, error %d", lnum, err); ubifs_scan_destroy(sleb); return ERR_PTR(err); } diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a..9a9fb94a41c 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) freed = ubifs_destroy_tnc_subtree(znode); atomic_long_sub(freed, &ubifs_clean_zn_cnt); atomic_long_sub(freed, &c->clean_zn_cnt); - ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); total_freed += freed; znode = zprev; } @@ -151,7 +150,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean - * znodes which are older then @age, until at least @nr znodes are freed. + * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) @@ -206,8 +205,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention) * Move this one to the end of the list to provide some * fairness. */ - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; @@ -251,7 +249,7 @@ static int kick_a_thread(void) dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || - c->ro_media) { + c->ro_mount || c->ro_error) { mutex_unlock(&c->umount_mutex); continue; } @@ -263,8 +261,7 @@ static int kick_a_thread(void) } if (i == 1) { - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); @@ -279,13 +276,25 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { - int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); - if (nr == 0) - return clean_zn_cnt; + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; +} + +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + int contention = 0; + unsigned long freed; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (!clean_zn_cnt) { /* @@ -313,10 +322,10 @@ int ubifs_shrinker(int nr, gfp_t gfp_mask) if (!freed && contention) { dbg_tnc("freed nothing, but contention"); - return -1; + return SHRINK_STOP; } out: - dbg_tnc("%d znodes were freed, requested %d", freed, nr); + dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); return freed; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ca1e2d4e03c..3904c8574ef 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -30,19 +30,27 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> -#include <linux/random.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/math64.h> +#include <linux/writeback.h> #include "ubifs.h" +/* + * Maximum amount of memory we may 'kmalloc()' without worrying that we are + * allocating too much. + */ +#define UBIFS_KMALLOC_OK (128*1024) + /* Slab cache for UBIFS inodes */ struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker ubifs_shrinker_info = { - .shrink = ubifs_shrinker, + .scan_objects = ubifs_shrink_scan, + .count_objects = ubifs_shrink_count, .seeks = DEFAULT_SEEKS, }; @@ -78,16 +86,15 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; - if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(ui->compr_type)) { - ubifs_warn("inode %lu uses '%s' compression, but it was not " - "compiled in", inode->i_ino, - ubifs_compr_name(ui->compr_type)); + ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in", + inode->i_ino, ubifs_compr_name(ui->compr_type)); } - err = dbg_check_dir_size(c, inode); + err = dbg_check_dir(c, inode); return err; } @@ -122,9 +129,9 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) goto out_ino; inode->i_flags |= (S_NOCMTIME | S_NOATIME); - inode->i_nlink = le32_to_cpu(ino->nlink); - inode->i_uid = le32_to_cpu(ino->uid); - inode->i_gid = le32_to_cpu(ino->gid); + set_nlink(inode, le32_to_cpu(ino->nlink)); + i_uid_write(inode, le32_to_cpu(ino->uid)); + i_gid_write(inode, le32_to_cpu(ino->gid)); inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); @@ -149,7 +156,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable readahead */ + /* Disable read-ahead */ inode->i_mapping->backing_dev_info = &c->bdi; switch (inode->i_mode & S_IFMT) { @@ -239,8 +246,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); - dbg_dump_node(c, ino); - dbg_dump_inode(c, inode); + ubifs_dump_node(c, ino); + ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); @@ -265,20 +272,27 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) return &ui->vfs_inode; }; +static void ubifs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ubifs_inode *ui = ubifs_inode(inode); + kmem_cache_free(ubifs_inode_slab, ui); +} + static void ubifs_destroy_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); - kmem_cache_free(ubifs_inode_slab, inode); + call_rcu(&inode->i_rcu, ubifs_i_callback); } /* * Note, Linux write-back code calls this without 'i_mutex'. */ -static int ubifs_write_inode(struct inode *inode, int wait) +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int err; + int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -289,7 +303,7 @@ static int ubifs_write_inode(struct inode *inode, int wait) mutex_lock(&ui->ui_mutex); /* * Due to races between write-back forced by budgeting - * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * (see 'sync_some_inodes()') and background write-back, the inode may * have already been synchronized, do not do this again. This might * also happen if it was synchronized in an VFS operation, e.g. * 'ubifs_link()'. @@ -299,10 +313,20 @@ static int ubifs_write_inode(struct inode *inode, int wait) return 0; } - dbg_gen("inode %lu", inode->i_ino); - err = ubifs_jnl_write_inode(c, inode, 0); - if (err) - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + else + err = dbg_check_inode_size(c, inode, ui->ui_size); + } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); @@ -310,12 +334,13 @@ static int ubifs_write_inode(struct inode *inode, int wait) return err; } -static void ubifs_delete_inode(struct inode *inode) +static void ubifs_evict_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); - if (ubifs_inode(inode)->xattr) + if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have @@ -323,27 +348,40 @@ static void ubifs_delete_inode(struct inode *inode) */ goto out; - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); - ubifs_assert(inode->i_nlink == 0); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); + + if (inode->i_nlink) + goto done; + if (is_bad_inode(inode)) goto out; - ubifs_inode(inode)->ui_size = inode->i_size = 0; - err = ubifs_jnl_write_inode(c, inode, 1); + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a - * simple error message is ok here. + * simple error message is OK here. */ - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->bi.nospace = c->bi.nospace_rp = 0; + smp_wmb(); + } +done: clear_inode(inode); } -static void ubifs_dirty_inode(struct inode *inode) +static void ubifs_dirty_inode(struct inode *inode, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); @@ -358,8 +396,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; - free = ubifs_budg_get_free_space(c); + free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); @@ -374,39 +413,74 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; - + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); return 0; } -static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +static int ubifs_show_options(struct seq_file *s, struct dentry *root) { - struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_printf(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) seq_printf(s, ",norm_unmount"); + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); + } + return 0; } static int ubifs_sync_fs(struct super_block *sb, int wait) { + int i, err; struct ubifs_info *c = sb->s_fs_info; - int i, ret = 0, err; - if (c->jheads) - for (i = 0; i < c->jhead_cnt; i++) { - err = ubifs_wbuf_sync(&c->jheads[i].wbuf); - if (err && !ret) - ret = err; - } /* - * We ought to call sync for c->ubi but it does not have one. If it had - * it would in turn call mtd->sync, however mtd operations are - * synchronous anyway, so we don't lose any sleep here. + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (!wait) + return 0; + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + /* + * Strictly speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. */ - return ret; + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); } /** @@ -437,9 +511,12 @@ static int init_constants_early(struct ubifs_info *c) c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_err("too small LEBs (%d bytes), min. is %d bytes", @@ -459,6 +536,18 @@ static int init_constants_early(struct ubifs_info *c) } /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. @@ -466,6 +555,10 @@ static int init_constants_early(struct ubifs_info *c) if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); @@ -505,19 +598,23 @@ static int init_constants_early(struct ubifs_info *c) c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; /* - * Initialize dead and dark LEB space watermarks. - * - * Dead space is the space which cannot be used. Its watermark is - * equivalent to min. I/O unit or minimum node size if it is greater - * then min. I/O unit. - * - * Dark space is the space which might be used, or might not, depending - * on which node should be written to the LEB. Its watermark is - * equivalent to maximum UBIFS node size. + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. */ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + + /* Buffer size for bulk-reads */ + c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->max_bu_buf_len > c->leb_size) + c->max_bu_buf_len = c->leb_size; return 0; } @@ -542,7 +639,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) } /* - * init_constants_late - initialize UBIFS constants. + * init_constants_sb - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after @@ -550,10 +647,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) * makes sure they are all right. Returns zero in case of success and a * negative error code in case of failure. */ -static int init_constants_late(struct ubifs_info *c) +static int init_constants_sb(struct ubifs_info *c) { int tmp, err; - uint64_t tmp64; + long long tmp64; c->main_bytes = (long long)c->main_lebs * c->leb_size; c->max_znode_sz = sizeof(struct ubifs_znode) + @@ -571,8 +668,8 @@ static int init_constants_late(struct ubifs_info *c) tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { - dbg_err("too small LEB size %d, at least %d needed", - c->leb_size, tmp); + ubifs_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); return -EINVAL; } @@ -580,15 +677,14 @@ static int init_constants_late(struct ubifs_info *c) * Make sure that the log is large enough to fit reference nodes for * all buds plus one reserved LEB. */ - tmp64 = c->max_bud_bytes; - tmp = do_div(tmp64, c->leb_size); - c->max_bud_cnt = tmp64 + !!tmp; + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { - dbg_err("too small log %d LEBs, required min. %d LEBs", - c->log_lebs, tmp); + ubifs_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); return -EINVAL; } @@ -597,11 +693,11 @@ static int init_constants_late(struct ubifs_info *c) * be compressed and direntries are of the maximum size. * * Note, data, which may be stored in inodes is budgeted separately, so - * it is not included into 'c->inode_budget'. + * it is not included into 'c->bi.inode_budget'. */ - c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; - c->inode_budget = UBIFS_INO_NODE_SZ; - c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; /* * When the amount of flash space used by buds becomes @@ -618,7 +714,7 @@ static int init_constants_late(struct ubifs_info *c) * Consequently, if the journal is too small, UBIFS will treat it as * always full. */ - tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; if (c->bg_bud_bytes < tmp64) c->bg_bud_bytes = tmp64; if (c->max_bud_bytes < tmp64 + c->leb_size) @@ -628,36 +724,51 @@ static int init_constants_late(struct ubifs_info *c) if (err) return err; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; + return 0; +} + +/* + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. It also checks various UBIFS parameters and + * makes sure they are all right. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * - * Subtract the LEB reserved for GC and the LEB which is reserved for - * deletions. - * - * Review 'ubifs_calc_available()' if changing this calculation. + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. */ - tmp64 = c->main_lebs - 2; - tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; - - return 0; } /** * take_gc_lnum - reserve GC LEB. * @c: UBIFS file-system description object * - * This function ensures that the LEB reserved for garbage collection is - * unmapped and is marked as "taken" in lprops. We also have to set free space - * to LEB size and dirty space to zero, because lprops may contain out-of-date - * information if the file-system was un-mounted before it has been committed. - * This function returns zero in case of success and a negative error code in - * case of failure. + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops. We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. */ static int take_gc_lnum(struct ubifs_info *c) { @@ -668,10 +779,6 @@ static int take_gc_lnum(struct ubifs_info *c) return -EINVAL; } - err = ubifs_leb_unmap(c, c->gc_lnum); - if (err) - return err; - /* And we have to tell lprops that this LEB is taken */ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, LPROPS_TAKEN, 0, 0); @@ -703,15 +810,15 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; + c->jheads[i].grouped = 1; } - c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; /* - * Garbage Collector head likely contains long-term data and - * does not need to be synchronized by timer. + * Garbage Collector head does not need to be synchronized by timer. + * Also GC head nodes are not grouped. */ - c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; - c->jheads[GCHD].wbuf.timeout = 0; + c->jheads[GCHD].wbuf.no_timer = 1; + c->jheads[GCHD].grouped = 0; return 0; } @@ -753,7 +860,7 @@ static void free_orphans(struct ubifs_info *c) orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); - dbg_err("orphan list not empty at unmount"); + ubifs_err("orphan list not empty at unmount"); } vfree(c->orph_buf); @@ -766,26 +873,10 @@ static void free_orphans(struct ubifs_info *c) */ static void free_buds(struct ubifs_info *c) { - struct rb_node *this = c->buds.rb_node; - struct ubifs_bud *bud; - - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - bud = rb_entry(this, struct ubifs_bud, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &bud->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(bud); - } - } + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); } /** @@ -803,7 +894,7 @@ static int check_volume_empty(struct ubifs_info *c) c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { - err = ubi_is_mapped(c->ubi, lnum); + err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { @@ -822,21 +913,57 @@ static int check_volume_empty(struct ubifs_info *c) * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, + Opt_override_compr, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, {Opt_err, NULL}, }; /** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + ubifs_msg("parse %s", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + +/** * ubifs_parse_options - parse mount parameters. * @c: UBIFS file-system description object * @options: parameters to parse @@ -862,18 +989,69 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, token = match_token(p, tokens, args); switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accept them in order to be backward-compatible. But this + * should be removed at some point. + */ case Opt_fast_unmount: c->mount_opts.unmount_mode = 2; - c->fast_unmount = 1; break; case Opt_norm_unmount: c->mount_opts.unmount_mode = 1; - c->fast_unmount = 0; break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err("unknown compressor \"%s\"", name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } default: - ubifs_err("unrecognized mount option \"%s\" " - "or missing value", p); - return -EINVAL; + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err("unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } } } @@ -911,37 +1089,82 @@ static void destroy_journal(struct ubifs_info *c) } /** + * bu_init - initialize bulk-read information. + * @c: UBIFS file-system description object + */ +static void bu_init(struct ubifs_info *c) +{ + ubifs_assert(c->bulk_read == 1); + + if (c->bu.buf) + return; /* Already initialized */ + +again: + c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); + if (!c->bu.buf) { + if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { + c->max_bu_buf_len = UBIFS_KMALLOC_OK; + goto again; + } + + /* Just disable bulk-read */ + ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it", + c->max_bu_buf_len); + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + return; + } +} + +/** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err("insufficient free space to mount in R/W mode"); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + return -ENOSPC; + } + return 0; +} + +/** * mount_ubifs - mount UBIFS file-system. * @c: UBIFS file-system description object * * This function mounts UBIFS file system. Returns zero in case of success and * a negative error code in case of failure. - * - * Note, the function does not de-allocate resources it it fails half way - * through, and the caller has to do this instead. */ static int mount_ubifs(struct ubifs_info *c) { - struct super_block *sb = c->vfs_sb; - int err, mounted_read_only = (sb->s_flags & MS_RDONLY); - long long x; + int err; + long long x, y; size_t sz; + c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + /* Suppress error messages while probing if MS_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + err = init_constants_early(c); if (err) return err; -#ifdef CONFIG_UBIFS_FS_DEBUG - c->dbg_buf = vmalloc(c->leb_size); - if (!c->dbg_buf) - return -ENOMEM; -#endif + err = ubifs_debugging_init(c); + if (err) + return err; err = check_volume_empty(c); if (err) goto out_free; - if (c->empty && (mounted_read_only || c->ro_media)) { + if (c->empty && (c->ro_mount || c->ro_media)) { /* * This UBI volume is empty, and read-only, or the file system * is mounted read-only - we cannot format it. @@ -952,7 +1175,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } - if (c->ro_media && !mounted_read_only) { + if (c->ro_media && !c->ro_mount) { ubifs_err("cannot mount read-write - read-only media"); err = -EROFS; goto out_free; @@ -972,51 +1195,61 @@ static int mount_ubifs(struct ubifs_info *c) if (!c->sbuf) goto out_free; - if (!mounted_read_only) { + if (!c->ro_mount) { c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) goto out_free; } + if (c->bulk_read == 1) + bu_init(c); + + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + + c->mounting = 1; + err = ubifs_read_superblock(c); if (err) goto out_free; + c->probing = 0; + /* - * Make sure the compressor which is set as the default on in the - * superblock was actually compiled in. + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c->default_compr)) { - ubifs_warn("'%s' compressor is set by superblock, but not " - "compiled in", ubifs_compr_name(c->default_compr)); - c->default_compr = UBIFS_COMPR_NONE; + ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; + goto out_free; } - dbg_failure_mode_registration(c); - - err = init_constants_late(c); + err = init_constants_sb(c); if (err) - goto out_dereg; + goto out_free; sz = ALIGN(c->max_idx_node_sz, c->min_io_size); sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; - goto out_dereg; + goto out_free; } - if (!mounted_read_only) { - err = alloc_wbufs(c); - if (err) - goto out_cbuf; + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!c->ro_mount) { /* Create background thread */ - sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, - c->vi.vol_id); - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1031,15 +1264,30 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_master; + init_constants_master(c); + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg("recovery needed"); c->need_recovery = 1; - if (!mounted_read_only) { - err = ubifs_recover_inl_heads(c, c->sbuf); - if (err) - goto out_master; - } - } else if (!mounted_read_only) { + } + + if (c->need_recovery && !c->ro_mount) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !c->ro_mount); + if (err) + goto out_master; + + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_lpt; + } + + if (!c->ro_mount) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. @@ -1047,14 +1295,10 @@ static int mount_ubifs(struct ubifs_info *c) c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) - goto out_master; + goto out_lpt; } - err = ubifs_lpt_init(c, 1, !mounted_read_only); - if (err) - goto out_lpt; - - err = dbg_check_idx_size(c, c->old_idx_sz); + err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; @@ -1062,19 +1306,19 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_journal; - err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); + /* Calculate 'min_idx_lebs' after journal replay */ + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) goto out_orphans; - if (!mounted_read_only) { + if (!c->ro_mount) { int lnum; - /* Check for enough free space */ - if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { - ubifs_err("insufficient available space"); - err = -EINVAL; + err = check_free_space(c); + if (err) goto out_orphans; - } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1091,10 +1335,21 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_orphans; err = ubifs_rcvry_gc_commit(c); - } else + if (err) + goto out_orphans; + } else { err = take_gc_lnum(c); - if (err) - goto out_orphans; + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out_orphans; + } err = dbg_check_lprops(c); if (err) @@ -1103,6 +1358,16 @@ static int mount_ubifs(struct ubifs_info *c) err = ubifs_recover_size(c); if (err) goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; } spin_lock(&ubifs_infos_lock); @@ -1110,76 +1375,94 @@ static int mount_ubifs(struct ubifs_info *c) spin_unlock(&ubifs_infos_lock); if (c->need_recovery) { - if (mounted_read_only) + if (c->ro_mount) ubifs_msg("recovery deferred"); else { c->need_recovery = 0; ubifs_msg("recovery completed"); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); } - } + } else + ubifs_assert(c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) goto out_infos; - ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, - c->vi.vol_id); - if (mounted_read_only) - ubifs_msg("mounted read-only"); + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + + c->mounting = 0; + + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", + c->vi.ubi_num, c->vi.vol_id, c->vi.name, + c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->main_lebs); - x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("media format %d, latest format %d", - c->fmt_version, UBIFS_FORMAT_VERSION); - - dbg_msg("compiled on: " __DATE__ " at " __TIME__); - dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); - dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size / 1024); - dbg_msg("data journal heads: %d", + y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", + c->leb_size, c->leb_size >> 10, c->min_io_size, + c->max_write_size); + ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", + x, x >> 20, c->main_lebs, + y, y >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, + c->big_lpt ? ", big LPT model" : ", small LPT model"); + + dbg_gen("default compressor: %s", ubifs_compr_name(c->default_compr)); + dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); - dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", - c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], - c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], - c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], - c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); - dbg_msg("fast unmount: %d", c->fast_unmount); - dbg_msg("big_lpt %d", c->big_lpt); - dbg_msg("log LEBs: %d (%d - %d)", + dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); - dbg_msg("LPT area LEBs: %d (%d - %d)", + dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); - dbg_msg("orphan area LEBs: %d (%d - %d)", + dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); - dbg_msg("main area LEBs: %d (%d - %d)", + dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); - dbg_msg("index LEBs: %d", c->lst.idx_lebs); - dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", - c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); - dbg_msg("key hash type: %d", c->key_hash_type); - dbg_msg("tree fanout: %d", c->fanout); - dbg_msg("reserved GC LEB: %d", c->gc_lnum); - dbg_msg("first main LEB: %d", c->main_first); - dbg_msg("dead watermark: %d", c->dead_wm); - dbg_msg("dark watermark: %d", c->dark_wm); + dbg_gen("index LEBs: %d", c->lst.idx_lebs); + dbg_gen("total index bytes: %lld (%lld KiB, %lld MiB)", + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); + dbg_gen("key hash type: %d", c->key_hash_type); + dbg_gen("tree fanout: %d", c->fanout); + dbg_gen("reserved GC LEB: %d", c->gc_lnum); + dbg_gen("max. znode size %d", c->max_znode_sz); + dbg_gen("max. index node size %d", c->max_idx_node_sz); + dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_gen("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); + dbg_gen("dead watermark: %d", c->dead_wm); + dbg_gen("dark watermark: %d", c->dark_wm); + dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; - dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); - dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); - dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); - dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); - dbg_msg("max. seq. number: %llu", c->max_sqnum); - dbg_msg("commit number: %llu", c->cmt_no); + dbg_gen("max. seq. number: %llu", c->max_sqnum); + dbg_gen("commit number: %llu", c->cmt_no); return 0; @@ -1202,13 +1485,13 @@ out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); -out_dereg: - dbg_failure_mode_deregistration(c); out_free: + kfree(c->write_reserve_buf); + kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); - UBIFS_DBG(vfree(c->dbg_buf)); + ubifs_debugging_exit(c); return err; } @@ -1226,6 +1509,7 @@ static void ubifs_umount(struct ubifs_info *c) dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); + dbg_debugfs_exit_fs(c); spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); @@ -1241,11 +1525,12 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); + kfree(c->write_reserve_buf); + kfree(c->bu.buf); + vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); - UBIFS_DBG(vfree(c->dbg_buf)); - vfree(c->ileb_buf); - dbg_failure_mode_deregistration(c); + ubifs_debugging_exit(c); } /** @@ -1260,19 +1545,29 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; - if (c->ro_media) - return -EINVAL; + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); c->remounting_rw = 1; + c->ro_mount = 0; - /* Check for enough free space */ - if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { - ubifs_err("insufficient available space"); - err = -EINVAL; - goto out; + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; } + err = check_free_space(c); + if (err) + goto out; + if (c->old_leb_cnt != c->leb_cnt) { struct ubifs_sb_node *sup; @@ -1283,6 +1578,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) } sup->leb_cnt = cpu_to_le32(c->leb_cnt); err = ubifs_write_sb_node(c, sup); + kfree(sup); if (err) goto out; } @@ -1301,6 +1597,12 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out; + } else { + /* A readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; } if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { @@ -1316,32 +1618,32 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } - err = ubifs_lpt_init(c, 0, 1); - if (err) + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) { + err = -ENOMEM; goto out; + } - err = alloc_wbufs(c); + err = ubifs_lpt_init(c, 0, 1); if (err) goto out; - ubifs_create_buds_lists(c); - /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err("cannot spawn \"%s\", error %d", c->bgt_name, err); - return err; + goto out; } wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); - if (!c->orph_buf) - return -ENOMEM; + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1356,22 +1658,35 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->need_recovery) err = ubifs_rcvry_gc_commit(c); else - err = take_gc_lnum(c); + err = ubifs_leb_unmap(c, c->gc_lnum); if (err) goto out; + dbg_gen("re-mounted read-write"); + c->remounting_rw = 0; + if (c->need_recovery) { c->need_recovery = 0; ubifs_msg("deferred recovery completed"); + } else { + /* + * Do not run the debugging space check if the were doing + * recovery, because when we saved the information we had the + * file-system in a state where the TNC and lprops has been + * modified in memory, but all the I/O operations (including a + * commit) were deferred. So the file-system was in + * "non-committed" state. Now the file-system is in committed + * state, and of course the amount of free space will change + * because, for example, the old index size was imprecise. + */ + err = dbg_check_space_info(c); } - dbg_gen("re-mounted read-write"); - c->vfs_sb->s_flags &= ~MS_RDONLY; - c->remounting_rw = 0; mutex_unlock(&c->umount_mutex); - return 0; + return err; out: + c->ro_mount = 1; vfree(c->orph_buf); c->orph_buf = NULL; if (c->bgt) { @@ -1379,6 +1694,8 @@ out: c->bgt = NULL; } free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); @@ -1388,42 +1705,18 @@ out: } /** - * commit_on_unmount - commit the journal when un-mounting. - * @c: UBIFS file-system description object - * - * This function is called during un-mounting and it commits the journal unless - * the "fast unmount" mode is enabled. It also avoids committing the journal if - * it contains too few data. - * - * Sometimes recovery requires the journal to be committed at least once, and - * this function takes care about this. - */ -static void commit_on_unmount(struct ubifs_info *c) -{ - if (!c->fast_unmount) { - long long bud_bytes; - - spin_lock(&c->buds_lock); - bud_bytes = c->bud_bytes; - spin_unlock(&c->buds_lock); - if (bud_bytes > c->leb_size) - ubifs_run_commit(c); - } -} - -/** * ubifs_remount_ro - re-mount in read-only mode. * @c: UBIFS file-system description object * - * We rely on VFS to have stopped writing. Possibly the background thread could - * be running a commit, however kthread_stop will wait in that case. + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. */ static void ubifs_remount_ro(struct ubifs_info *c) { int i, err; ubifs_assert(!c->need_recovery); - commit_on_unmount(c); + ubifs_assert(!c->ro_mount); mutex_lock(&c->umount_mutex); if (c->bgt) { @@ -1431,27 +1724,29 @@ static void ubifs_remount_ro(struct ubifs_info *c) c->bgt = NULL; } - for (i = 0; i < c->jhead_cnt; i++) { + dbg_save_space_info(c); + + for (i = 0; i < c->jhead_cnt; i++) ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); - } - if (!c->ro_media) { - c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); - c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); - c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); - err = ubifs_write_master(c); - if (err) - ubifs_ro_mode(c, err); - } + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); - ubifs_destroy_idx_gc(c); - free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); + c->ro_mount = 1; + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); mutex_unlock(&c->umount_mutex); } @@ -1462,14 +1757,17 @@ static void ubifs_put_super(struct super_block *sb) ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); + /* * The following asserts are only valid if there has not been a failure * of the media. For example, there will be dirty inodes if we failed * to write them back because of I/O errors. */ - ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); - ubifs_assert(c->budg_idx_growth == 0); - ubifs_assert(c->budg_data_growth == 0); + if (!c->ro_error) { + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); + } /* * The 'c->umount_lock' prevents races between UBIFS memory shrinker @@ -1478,7 +1776,7 @@ static void ubifs_put_super(struct super_block *sb) * the mutex is locked. */ mutex_lock(&c->umount_mutex); - if (!(c->vfs_sb->s_flags & MS_RDONLY)) { + if (!c->ro_mount) { /* * First of all kill the background thread to make sure it does * not interfere with un-mounting and freeing resources. @@ -1488,25 +1786,22 @@ static void ubifs_put_super(struct super_block *sb) c->bgt = NULL; } - /* Synchronize write-buffers */ - if (c->jheads) - for (i = 0; i < c->jhead_cnt; i++) { - ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); - } - /* - * On fatal errors c->ro_media is set to 1, in which case we do + * On fatal errors c->ro_error is set to 1, in which case we do * not write the master node. */ - if (!c->ro_media) { + if (!c->ro_error) { + int err; + + /* Synchronize write-buffers */ + for (i = 0; i < c->jhead_cnt; i++) + ubifs_wbuf_sync(&c->jheads[i].wbuf); + /* * We are being cleanly unmounted which means the * orphans were killed - indicate this in the master * node. Also save the reserved GC LEB number. */ - int err; - c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); @@ -1517,8 +1812,12 @@ static void ubifs_put_super(struct super_block *sb) * next mount, so we just print a message and * continue to unmount normally. */ - ubifs_err("failed to write master node, " - "error %d", err); + ubifs_err("failed to write master node, error %d", + err); + } else { + for (i = 0; i < c->jhead_cnt; i++) + /* Make sure write-buffer timers are canceled */ + hrtimer_cancel(&c->jheads[i].wbuf.timer); } } @@ -1526,7 +1825,6 @@ static void ubifs_put_super(struct super_block *sb) bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); - kfree(c); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -1534,6 +1832,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) int err; struct ubifs_info *c = sb->s_fs_info; + sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); err = ubifs_parse_options(c, data, 1); @@ -1541,22 +1840,45 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) ubifs_err("invalid or unknown remount parameter"); return err; } - if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + + if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/W due to prior errors"); + return -EROFS; + } + if (c->ro_media) { + ubifs_msg("cannot re-mount R/W - UBI volume is R/O"); + return -EROFS; + } err = ubifs_remount_rw(c); if (err) return err; - } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + if (c->ro_error) { + ubifs_msg("cannot re-mount R/O due to prior errors"); + return -EROFS; + } ubifs_remount_ro(c); + } + if (c->bulk_read == 1) + bu_init(c); + else { + dbg_gen("disable bulk-read"); + kfree(c->bu.buf); + c->bu.buf = NULL; + } + + ubifs_assert(c->lst.taken_empty_lebs > 0); return 0; } -struct super_operations ubifs_super_operations = { +const struct super_operations ubifs_super_operations = { .alloc_inode = ubifs_alloc_inode, .destroy_inode = ubifs_destroy_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, - .delete_inode = ubifs_delete_inode, + .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, .remount_fs = ubifs_remount_fs, @@ -1569,22 +1891,32 @@ struct super_operations ubifs_super_operations = { * @name: UBI volume name * @mode: UBI volume open mode * - * There are several ways to specify UBI volumes when mounting UBIFS: - * o ubiX_Y - UBI device number X, volume Y; - * o ubiY - UBI device number 0, volume Y; + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; * o ubiX:NAME - mount UBI device X, volume with name NAME; * o ubi:NAME - mount UBI device 0, volume with name NAME. * * Alternative '!' separator may be used instead of ':' (because some shells * like busybox may interpret ':' as an NFS host name separator). This function - * returns ubi volume object in case of success and a negative error code in - * case of failure. + * returns UBI volume description object in case of success and a negative + * error code in case of failure. */ static struct ubi_volume_desc *open_ubi(const char *name, int mode) { + struct ubi_volume_desc *ubi; int dev, vol; char *endptr; + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') return ERR_PTR(-EINVAL); @@ -1616,85 +1948,94 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) return ERR_PTR(-EINVAL); } -static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) { - struct ubi_volume_desc *ubi = sb->s_fs_info; struct ubifs_info *c; - struct inode *root; - int err; c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); - if (!c) - return -ENOMEM; + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} - spin_lock_init(&c->cnt_lock); - spin_lock_init(&c->cs_lock); - spin_lock_init(&c->buds_lock); - spin_lock_init(&c->space_lock); - spin_lock_init(&c->orphan_lock); - init_rwsem(&c->commit_sem); - mutex_init(&c->lp_mutex); - mutex_init(&c->tnc_mutex); - mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); - mutex_init(&c->umount_mutex); - init_waitqueue_head(&c->cmt_wq); - c->buds = RB_ROOT; - c->old_idx = RB_ROOT; - c->size_tree = RB_ROOT; - c->orph_tree = RB_ROOT; - INIT_LIST_HEAD(&c->infos_list); - INIT_LIST_HEAD(&c->idx_gc); - INIT_LIST_HEAD(&c->replay_list); - INIT_LIST_HEAD(&c->replay_buds); - INIT_LIST_HEAD(&c->uncat_list); - INIT_LIST_HEAD(&c->empty_list); - INIT_LIST_HEAD(&c->freeable_list); - INIT_LIST_HEAD(&c->frdi_idx_list); - INIT_LIST_HEAD(&c->unclean_leb_list); - INIT_LIST_HEAD(&c->old_buds); - INIT_LIST_HEAD(&c->orph_list); - INIT_LIST_HEAD(&c->orph_new); - - c->highest_inum = UBIFS_FIRST_INO; - get_random_bytes(&c->vfs_gen, sizeof(int)); - c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; - - ubi_get_volume_info(ubi, &c->vi); - ubi_get_device_info(c->vi.ubi_num, &c->di); +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubifs_info *c = sb->s_fs_info; + struct inode *root; + int err; + c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); - goto out_free; + goto out; } /* - * UBIFS provids 'backing_dev_info' in order to disable readahead. For + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O - * but the readahead I/O as well i.e. completely pointless. + * but the read-ahead I/O as well i.e. completely pointless. * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; - c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); if (err) goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", + c->vi.ubi_num, c->vi.vol_id); + if (err) + goto out_bdi; err = ubifs_parse_options(c, data, 0); if (err) goto out_bdi; - c->vfs_sb = sb; - + sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; - sb->s_dev = c->vi.cdev; sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; @@ -1714,16 +2055,15 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } - sb->s_root = d_alloc_root(root); - if (!sb->s_root) - goto out_iput; + sb->s_root = d_make_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_umount; + } mutex_unlock(&c->umount_mutex); - return 0; -out_iput: - iput(root); out_umount: ubifs_umount(c); out_unlock: @@ -1732,31 +2072,29 @@ out_bdi: bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); -out_free: - kfree(c); +out: return err; } static int sb_test(struct super_block *sb, void *data) { - dev_t *dev = data; + struct ubifs_info *c1 = data; + struct ubifs_info *c = sb->s_fs_info; - return sb->s_dev == *dev; + return c->vi.cdev == c1->vi.cdev; } static int sb_set(struct super_block *sb, void *data) { - dev_t *dev = data; - - sb->s_dev = *dev; - return 0; + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } -static int ubifs_get_sb(struct file_system_type *fs_type, int flags, - const char *name, void *data, struct vfsmount *mnt) +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) { struct ubi_volume_desc *ubi; - struct ubi_volume_info vi; + struct ubifs_info *c; struct super_block *sb; int err; @@ -1771,32 +2109,34 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(ubi)) { ubifs_err("cannot open \"%s\", error %d", name, (int)PTR_ERR(ubi)); - return PTR_ERR(ubi); + return ERR_CAST(ubi); } - ubi_get_volume_info(ubi, &vi); - dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } - sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + + sb = sget(fs_type, sb_test, sb_set, flags, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); + kfree(c); goto out_close; } if (sb->s_root) { + struct ubifs_info *c1 = sb->s_fs_info; + kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if ((flags ^ sb->s_flags) & MS_RDONLY) { + if (!!(flags & MS_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - sb->s_flags = flags; - /* - * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is - * replaced by 'c'. - */ - sb->s_fs_info = ubi; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; @@ -1807,36 +2147,29 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return simple_set_mnt(mnt, sb); + return dget(sb->s_root); out_deact: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return err; + return ERR_PTR(err); } -static void ubifs_kill_sb(struct super_block *sb) +static void kill_ubifs_super(struct super_block *s) { - struct ubifs_info *c = sb->s_fs_info; - - /* - * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' - * in order to be outside BKL. - */ - if (sb->s_root && !(sb->s_flags & MS_RDONLY)) - commit_on_unmount(c); - /* The un-mount routine is actually done in put_super() */ - generic_shutdown_super(sb); + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); } static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .get_sb = ubifs_get_sb, - .kill_sb = ubifs_kill_sb + .mount = ubifs_mount, + .kill_sb = kill_ubifs_super, }; +MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. @@ -1891,43 +2224,54 @@ static int __init ubifs_init(void) BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. + */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { - ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" - " at least 4096 bytes", + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", (unsigned int)PAGE_CACHE_SIZE); return -EINVAL; } - err = register_filesystem(&ubifs_fs_type); - if (err) { - ubifs_err("cannot register file system, error %d", err); - return err; - } - - err = -ENOMEM; ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) - goto out_reg; + return -ENOMEM; register_shrinker(&ubifs_shrinker_info); err = ubifs_compressors_init(); if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) goto out_compr; + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + goto out_dbg; + } return 0; +out_dbg: + dbg_debugfs_exit(); out_compr: + ubifs_compressors_exit(); +out_shrinker: unregister_shrinker(&ubifs_shrinker_info); kmem_cache_destroy(ubifs_inode_slab); -out_reg: - unregister_filesystem(&ubifs_fs_type); return err; } /* late_initcall to let compressors initialize first */ @@ -1938,8 +2282,15 @@ static void __exit ubifs_exit(void) ubifs_assert(list_empty(&ubifs_infos)); ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + dbg_debugfs_exit(); ubifs_compressors_exit(); unregister_shrinker(&ubifs_shrinker_info); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(ubifs_inode_slab); unregister_filesystem(&ubifs_fs_type); } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a9644..8a40cf9c02d 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -31,6 +31,7 @@ */ #include <linux/crc32.h> +#include <linux/slab.h> #include "ubifs.h" /* @@ -177,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c, */ void destroy_old_idx(struct ubifs_info *c) { - struct rb_node *this = c->old_idx.rb_node; - struct ubifs_old_idx *old_idx; + struct ubifs_old_idx *old_idx, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - old_idx = rb_entry(this, struct ubifs_old_idx, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &old_idx->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) kfree(old_idx); - } + c->old_idx = RB_ROOT; } @@ -222,7 +207,7 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); __set_bit(OBSOLETE_ZNODE, &znode->flags); if (znode->level != 0) { @@ -270,7 +255,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, struct ubifs_znode *zn; int err; - if (!test_bit(COW_ZNODE, &znode->flags)) { + if (!ubifs_zn_cow(znode)) { /* znode is not being committed */ if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { atomic_long_inc(&c->dirty_zn_cnt); @@ -284,7 +269,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, } zn = copy_znode(c, znode); - if (unlikely(IS_ERR(zn))) + if (IS_ERR(zn)) return zn; if (zbr->len) { @@ -338,17 +323,16 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, dent); + dump_stack(); + ubifs_dump_node(c, dent); return err; } - lnc_node = kmalloc(zbr->len, GFP_NOFS); + lnc_node = kmemdup(node, zbr->len, GFP_NOFS); if (!lnc_node) /* We don't have to have the cache, so no error */ return 0; - memcpy(lnc_node, node, zbr->len); zbr->leaf = lnc_node; return 0; } @@ -372,8 +356,8 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, node); + dump_stack(); + ubifs_dump_node(c, node); return err; } @@ -443,6 +427,14 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * This function performs that same function as ubifs_read_node except that * it does not require that there is actually a node present and instead * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -453,7 +445,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); - err = ubi_read(c->ubi, lnum, buf, offs, len); + err = ubifs_leb_read(c, lnum, buf, offs, len, 1); if (err) { ubifs_err("cannot read node type %d from LEB %d:%d, error %d", type, lnum, offs, err); @@ -470,6 +462,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) + return 1; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) @@ -493,7 +489,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, { int ret; - dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs); ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, zbr->offs); @@ -506,9 +502,9 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, if (keys_cmp(c, key, &node_key) != 0) ret = 0; } - if (ret == 0) - dbg_mnt("dangling branch LEB %d:%d len %d, key %s", - zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + if (ret == 0 && c->replaying) + dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ", + zbr->lnum, zbr->offs, zbr->len); return ret; } @@ -983,9 +979,9 @@ static int fallible_resolve_collision(struct ubifs_info *c, if (adding || !o_znode) return 0; - dbg_mnt("dangling match LEB %d:%d len %d %s", + dbg_mntk(key, "dangling match LEB %d:%d len %d key ", o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, - o_znode->zbranch[o_n].len, DBGKEY(key)); + o_znode->zbranch[o_n].len); *zn = o_znode; *n = o_n; return 1; @@ -1128,7 +1124,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, ubifs_assert(znode == c->zroot.znode); znode = dirty_cow_znode(c, &c->zroot); } - if (unlikely(IS_ERR(znode)) || !p) + if (IS_ERR(znode) || !p) break; ubifs_assert(path[p - 1] >= 0); ubifs_assert(path[p - 1] < znode->child_cnt); @@ -1151,8 +1147,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, * o exact match, i.e. the found zero-level znode contains key @key, then %1 * is returned and slot number of the matched branch is stored in @n; * o not exact match, which means that zero-level znode does not contain - * @key, then %0 is returned and slot number of the closed branch is stored - * in @n; + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; * o @key is so small that it is even less than the lowest key of the * leftmost zero-level node, then %0 is returned and %0 is stored in @n. * @@ -1167,7 +1163,8 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search key %s", DBGKEY(key)); + dbg_tnck(key, "search key "); + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); znode = c->zroot.znode; if (unlikely(!znode)) { @@ -1244,7 +1241,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, * splitting in the middle of the colliding sequence. Also, when * removing the leftmost key, we would have to correct the key of the * parent node, which would introduce additional complications. Namely, - * if we changed the the leftmost key of the parent znode, the garbage + * if we changed the leftmost key of the parent znode, the garbage * collector would be unable to find it (GC is doing this when GC'ing * indexing LEBs). Although we already have an additional RB-tree where * we save such changed znodes (see 'ins_clr_old_idx_znode()') until @@ -1302,7 +1299,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search and dirty key %s", DBGKEY(key)); + dbg_tnck(key, "search and dirty key "); znode = c->zroot.znode; if (unlikely(!znode)) { @@ -1382,23 +1379,62 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, } /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. + */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. * @c: UBIFS file-system description object * @key: node key to lookup * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here * - * This function look up and reads node with key @key. The caller has to make + * This function looks up and reads node with key @key. The caller has to make * sure the @node buffer is large enough to fit the node. Returns zero in case * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * case of failure. The node location can be returned in @lnum and @offs. */ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node) +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) { - int found, n, err; + int found, n, err, safely = 0, gc_seq1; struct ubifs_znode *znode; struct ubifs_zbranch zbr, *zt; +again: mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1409,6 +1445,10 @@ int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, goto out; } zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } if (is_hash_key(c, key)) { /* * In this case the leaf node cache gets used, so we pass the @@ -1417,11 +1457,31 @@ int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, err = tnc_read_node_nm(c, zt, node); goto out; } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; mutex_unlock(&c->tnc_mutex); - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; out: mutex_unlock(&c->tnc_mutex); @@ -1429,58 +1489,294 @@ out: } /** - * ubifs_tnc_locate - look up a file-system node and return it and its location. + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here - * @lnum: LEB number is returned here - * @offs: offset is returned here + * @bu: bulk-read parameters and results * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. This function returns zero in case of success + * and a negative error code in case of failure. + * + * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function + * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares + * maximum possible amount of nodes for bulk-read. */ -int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, - void *node, int *lnum, int *offs) +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) { - int found, n, err; + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); struct ubifs_znode *znode; - struct ubifs_zbranch zbr, *zt; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; mutex_lock(&c->tnc_mutex); - found = ubifs_lookup_level0(c, key, &znode, &n); - if (!found) { - err = -ENOENT; - goto out; - } else if (found < 0) { - err = found; + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); } - zt = &znode->zbranch[n]; - if (is_hash_key(c, key)) { - /* - * In this case the leaf node cache gets used, so we pass the - * address of the zbranch and keep the mutex locked - */ - *lnum = zt->lnum; - *offs = zt->offs; - err = tnc_read_node_nm(c, zt, node); - goto out; + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. + */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; } - zbr = znode->zbranch[n]; +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} - *lnum = zbr.lnum; - *offs = zbr.offs; +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_leb_read(c, lnum, buf, offs, len, 0); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubifs_leb_read(c, lnum, buf, offs, rlen, 0); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnck(&zbr->key, "looked for key "); + dbg_tnck(&key1, "found node's key "); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; out: - mutex_unlock(&c->tnc_mutex); + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_dump_node(c, buf); + dump_stack(); return err; } /** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. + */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubifs_leb_read(c, lnum, bu->buf, offs, len, 0); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dump_stack(); + dbg_tnck(&bu->key, "key "); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** * do_lookup_nm- look up a "hashed" node. * @c: UBIFS file-system description object * @key: node key to lookup @@ -1498,9 +1794,8 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, { int found, n, err; struct ubifs_znode *znode; - struct ubifs_zbranch zbr; - dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1522,11 +1817,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = tnc_read_node_nm(c, &zbr, node); - return err; + err = tnc_read_node_nm(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -1669,7 +1960,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, { struct ubifs_znode *zn, *zi, *zp; int i, keep, move, appending = 0; - union ubifs_key *key = &zbr->key; + union ubifs_key *key = &zbr->key, *key1; ubifs_assert(n >= 0 && n <= c->fanout); @@ -1678,8 +1969,7 @@ again: zp = znode->parent; if (znode->child_cnt < c->fanout) { ubifs_assert(n != c->fanout); - dbg_tnc("inserted at %d level %d, key %s", n, znode->level, - DBGKEY(key)); + dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level); insert_zbranch(znode, zbr, n); @@ -1694,7 +1984,7 @@ again: * Unfortunately, @znode does not have more empty slots and we have to * split it. */ - dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + dbg_tnck(key, "splitting level %d, key ", znode->level); if (znode->alt) /* @@ -1710,20 +2000,33 @@ again: zn->level = znode->level; /* Decide where to split */ - if (znode->level == 0 && n == c->fanout && - key_type(c, key) == UBIFS_DATA_KEY) { - union ubifs_key *key1; - - /* - * If this is an inode which is being appended - do not split - * it because no other zbranches can be inserted between - * zbranches of consecutive data nodes anyway. - */ - key1 = &znode->zbranch[n - 1].key; - if (key_inum(c, key1) == key_inum(c, key) && - key_type(c, key1) == UBIFS_DATA_KEY && - key_block(c, key1) == key_block(c, key) - 1) - appending = 1; + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } } if (appending) { @@ -1753,6 +2056,8 @@ again: zbr->znode->parent = zn; } +do_split: + __set_bit(DIRTY_ZNODE, &zn->flags); atomic_long_inc(&c->dirty_zn_cnt); @@ -1773,20 +2078,17 @@ again: } /* Insert new key and branch */ - dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level); insert_zbranch(zi, zbr, n); /* Insert new znode (produced by spitting) into the parent */ if (zp) { - i = n; + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + /* Locate insertion point */ n = znode->iip + 1; - if (appending && n != c->fanout) - appending = 0; - - if (i == 0 && zi == znode && znode->iip == 0) - correct_parent_keys(c, znode); /* Tail recursion */ zbr->key = zn->zbranch[0].key; @@ -1852,7 +2154,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (!found) { struct ubifs_zbranch zbr; @@ -1901,8 +2203,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, - old_offs, lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum, + old_offs, lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -1935,12 +2237,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, if (found) { /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, - znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } zbr = &znode->zbranch[n]; lnc_free(zbr); @@ -1985,8 +2286,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, - DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2007,11 +2308,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } if (found == 1) { @@ -2044,7 +2345,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, * by passing 'ubifs_tnc_remove_nm()' the same key but * an unmatchable name. */ - struct qstr noname = { .len = 0, .name = "" }; + struct qstr noname = { .name = "" }; err = dbg_check_tnc(c, 0); mutex_unlock(&c->tnc_mutex); @@ -2079,14 +2380,14 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) /* Delete without merge for now */ ubifs_assert(znode->level == 0); ubifs_assert(n >= 0 && n < c->fanout); - dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + dbg_tnck(&znode->zbranch[n].key, "deleting key "); zbr = &znode->zbranch[n]; lnc_free(zbr); err = ubifs_add_dirt(c, zbr->lnum, zbr->len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return err; } @@ -2104,7 +2405,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) */ do { - ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_obsolete(znode)); ubifs_assert(ubifs_zn_dirty(znode)); zp = znode->parent; @@ -2160,9 +2461,8 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) c->zroot.offs = zbr->offs; c->zroot.len = zbr->len; c->zroot.znode = znode; - ubifs_assert(!test_bit(OBSOLETE_ZNODE, - &zp->flags)); - ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + ubifs_assert(!ubifs_zn_obsolete(zp)); + ubifs_assert(ubifs_zn_dirty(zp)); atomic_long_dec(&c->dirty_zn_cnt); if (zp->cnext) { @@ -2190,7 +2490,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2221,7 +2521,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2238,11 +2538,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, if (err) { /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } err = tnc_delete(c, znode, n); } @@ -2317,11 +2617,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } /* Remove all keys in range except the first */ @@ -2333,10 +2633,10 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, err = ubifs_add_dirt(c, znode->zbranch[i].lnum, znode->zbranch[i].len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); goto out_unlock; } - dbg_tnc("removing %s", DBGKEY(key)); + dbg_tnck(key, "removing key "); } if (k) { for (i = n + 1 + k; i < znode->child_cnt; i++) @@ -2372,7 +2672,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) struct ubifs_dent_node *xent, *pxent = NULL; struct qstr nm = { .name = NULL }; - dbg_tnc("ino %lu", inum); + dbg_tnc("ino %lu", (unsigned long)inum); /* * Walk all extended attribute entries and remove them together with @@ -2392,7 +2692,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) } xattr_inum = le64_to_cpu(xent->inum); - dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum); + dbg_tnc("xent '%s', ino %lu", xent->name, + (unsigned long)xattr_inum); nm.name = xent->name; nm.len = le16_to_cpu(xent->nlen); @@ -2455,7 +2756,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -2545,7 +2846,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) struct ubifs_znode *znode = cnext; cnext = cnext->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); } while (cnext && cnext != c->cnext); } @@ -2556,12 +2857,14 @@ static void tnc_destroy_cnext(struct ubifs_info *c) */ void ubifs_tnc_close(struct ubifs_info *c) { - long clean_freed; - tnc_destroy_cnext(c); if (c->zroot.znode) { - clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); - atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); + long n, freed; + + n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + ubifs_assert(freed == n); + atomic_long_sub(n, &ubifs_clean_zn_cnt); } kfree(c->gap_lebs); kfree(c->ilebs); @@ -2651,7 +2954,7 @@ static struct ubifs_znode *right_znode(struct ubifs_info *c, * * This function searches an indexing node by its first key @key and its * address @lnum:@offs. It looks up the indexing tree by pulling all indexing - * nodes it traverses to TNC. This function is called fro indexing nodes which + * nodes it traverses to TNC. This function is called for indexing nodes which * were found on the media by scanning, for example when garbage-collecting or * when doing in-the-gaps commit. This means that the indexing node which is * looked for does not have to have exactly the same leftmost key @key, because @@ -2673,6 +2976,8 @@ static struct ubifs_znode *lookup_znode(struct ubifs_info *c, struct ubifs_znode *znode, *zn; int n, nn; + ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); + /* * The arguments have probably been read off flash, so don't assume * they are valid. @@ -2954,3 +3259,70 @@ out_unlock: mutex_unlock(&c->tnc_mutex); return err; } + +/** + * dbg_check_inode_size - check if inode size is correct. + * @c: UBIFS file-system description object + * @inum: inode number + * @size: inode size + * + * This function makes sure that the inode size (@size) is correct and it does + * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL + * if it has a data page beyond @size, and other negative error code in case of + * other errors. + */ +int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, + loff_t size) +{ + int err, n; + union ubifs_key from_key, to_key, *key; + struct ubifs_znode *znode; + unsigned int block; + + if (!S_ISREG(inode->i_mode)) + return 0; + if (!dbg_is_chk_gen(c)) + return 0; + + block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + data_key_init(c, &from_key, inode->i_ino, block); + highest_data_key(c, &to_key, inode->i_ino); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &from_key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + err = -EINVAL; + key = &from_key; + goto out_dump; + } + + err = tnc_next(c, &znode, &n); + if (err == -ENOENT) { + err = 0; + goto out_unlock; + } + if (err < 0) + goto out_unlock; + + ubifs_assert(err == 0); + key = &znode->zbranch[n].key; + if (!key_in_range(c, key, &from_key, &to_key)) + goto out_unlock; + +out_dump: + block = key_block(c, key); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld", + (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT); + mutex_unlock(&c->tnc_mutex); + ubifs_dump_inode(c, inode); + dump_stack(); + return -EINVAL; + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8117e65ba2e..3600994f841 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -22,6 +22,7 @@ /* This file implements TNC functions for committing */ +#include <linux/random.h> #include "ubifs.h" /** @@ -53,18 +54,16 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } ubifs_prepare_node(c, idx, len, 0); -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif err = insert_old_idx_znode(c, znode); @@ -87,8 +86,12 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, atomic_long_dec(&c->dirty_zn_cnt); ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); + /* + * Note, unlike 'write_index()' we do not add memory barriers here + * because this function is called with @c->tnc_mutex locked. + */ __clear_bit(DIRTY_ZNODE, &znode->flags); __clear_bit(COW_ZNODE, &znode->flags); @@ -245,7 +248,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) * it is more comprehensive and less efficient than is needed for this * purpose. */ - sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); + sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0); c->ileb_len = 0; if (IS_ERR(sleb)) return PTR_ERR(sleb); @@ -317,8 +320,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) 0, 0, 0); if (err) return err; - err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len); if (err) return err; dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); @@ -372,26 +374,23 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; - if (err == -ENOSPC) { - if (!dbg_force_in_the_gaps_enabled) { - /* - * Do not print scary warnings if the - * debugging option which forces - * in-the-gaps is enabled. - */ - ubifs_err("out of space"); - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - dbg_dump_lprops(c); - } - /* Try to commit anyway */ - err = 0; - break; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; } - kfree(c->gap_lebs); - c->gap_lebs = NULL; - return err; + if (!dbg_is_chk_index(c)) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_warn("out of space"); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; } p++; cnt -= written; @@ -454,11 +453,9 @@ static int layout_in_empty_space(struct ubifs_info *c) offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif /* Update the parent */ zp = znode->parent; @@ -494,25 +491,6 @@ static int layout_in_empty_space(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - buf_offs += ALIGN(len, 8); - if (next_len) { - if (buf_offs + next_len <= c->leb_size) - continue; - err = ubifs_update_one_lp(c, lnum, 0, - c->leb_size - buf_offs, 0, 0); - if (err) - return err; - lnum = -1; - continue; - } - err = ubifs_update_one_lp(c, lnum, - c->leb_size - buf_offs, 0, 0, 0); - if (err) - return err; - break; - } - /* Update buffer positions */ wlen = used + len; used += ALIGN(len, 8); @@ -553,10 +531,8 @@ static int layout_in_empty_space(struct ubifs_info *c) break; } -#ifdef CONFIG_UBIFS_FS_DEBUG - c->new_ihead_lnum = lnum; - c->new_ihead_offs = buf_offs; -#endif + c->dbg->new_ihead_lnum = lnum; + c->dbg->new_ihead_offs = buf_offs; return 0; } @@ -661,7 +637,7 @@ static int get_znodes_to_commit(struct ubifs_info *c) } cnt += 1; while (1) { - ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(!ubifs_zn_cow(znode)); __set_bit(COW_ZNODE, &znode->flags); znode->alt = 0; cnext = find_next_dirty(znode); @@ -707,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_force_in_the_gaps()) + if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) return -ENOSPC; return 0; } @@ -797,14 +773,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) spin_lock(&c->space_lock); /* * Although we have not finished committing yet, update size of the - * committed index ('c->old_idx_sz') and zero out the index growth + * committed index ('c->bi.old_idx_sz') and zero out the index growth * budget. It is OK to do this now, because we've reserved all the * space which is needed to commit the index, and it is save for the * budgeting subsystem to assume the index is already committed, * even though it is not. */ - c->old_idx_sz = c->calc_idx_sz; - c->budg_uncommitted_idx = 0; + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); mutex_unlock(&c->tnc_mutex); @@ -831,7 +809,7 @@ static int write_index(struct ubifs_info *c) struct ubifs_idx_node *idx; struct ubifs_znode *znode, *cnext; int i, lnum, offs, len, next_len, buf_len, buf_offs, used; - int avail, wlen, err, lnum_pos = 0; + int avail, wlen, err, lnum_pos = 0, blen, nxt_offs; cnext = c->enext; if (!cnext) @@ -879,9 +857,9 @@ static int write_index(struct ubifs_info *c) br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } len = ubifs_idx_node_sz(c, znode->child_cnt); @@ -896,19 +874,17 @@ static int write_index(struct ubifs_info *c) } offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG if (lnum != znode->lnum || offs != znode->offs || len != znode->len) { ubifs_err("inconsistent znode posn"); return -EINVAL; } -#endif /* Grab some stuff from znode while we still can */ cnext = znode->cnext; ubifs_assert(ubifs_zn_dirty(znode)); - ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_cow(znode)); /* * It is important that other threads should see %DIRTY_ZNODE @@ -919,9 +895,31 @@ static int write_index(struct ubifs_info *c) * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_ZNODE, &znode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); + + /* + * We have marked the znode as clean but have not updated the + * @c->clean_zn_cnt counter. If this znode becomes dirty again + * before 'free_obsolete_znodes()' is called, then + * @c->clean_zn_cnt will be decremented before it gets + * incremented (resulting in 2 decrements for the same znode). + * This means that @c->clean_zn_cnt may become negative for a + * while. + * + * Q: why we cannot increment @c->clean_zn_cnt? + * A: because we do not have the @c->tnc_mutex locked, and the + * following code would be racy and buggy: + * + * if (!ubifs_zn_obsolete(znode)) { + * atomic_long_inc(&c->clean_zn_cnt); + * atomic_long_inc(&ubifs_clean_zn_cnt); + * } + * + * Thus, we just delay the @c->clean_zn_cnt update until we + * have the mutex locked. + */ /* Do not access znode from this point on */ @@ -939,75 +937,46 @@ static int write_index(struct ubifs_info *c) else next_len = ubifs_idx_node_sz(c, cnext->child_cnt); - if (c->min_io_size == 1) { - /* - * Write the prepared index node immediately if there is - * no minimum IO size - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - wlen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += ALIGN(wlen, 8); - if (next_len) { - used = 0; - avail = buf_len; - if (buf_offs + next_len > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } + nxt_offs = buf_offs + used + next_len; + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) continue; - } + else + blen = buf_len; } else { - int blen, nxt_offs = buf_offs + used + next_len; - - if (next_len && nxt_offs <= c->leb_size) { - if (avail > 0) - continue; - else - blen = buf_len; - } else { - wlen = ALIGN(wlen, 8); - blen = ALIGN(wlen, c->min_io_size); - ubifs_pad(c, c->cbuf + wlen, blen - wlen); - } - /* - * The buffer is full or there are no more znodes - * to do - */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, - blen, UBI_SHORTTERM); - if (err) - return err; - buf_offs += blen; - if (next_len) { - if (nxt_offs > c->leb_size) { - err = ubifs_update_one_lp(c, lnum, - LPROPS_NC, 0, 0, LPROPS_TAKEN); - if (err) - return err; - lnum = -1; - } - used -= blen; - if (used < 0) - used = 0; - avail = buf_len - used; - memmove(c->cbuf, c->cbuf + blen, used); - continue; + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + + /* The buffer is full or there are no more znodes to do */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, + 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; } break; } -#ifdef CONFIG_UBIFS_FS_DEBUG - if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != c->dbg->new_ihead_offs) { ubifs_err("inconsistent ihead"); return -EINVAL; } -#endif c->ihead_lnum = lnum; c->ihead_offs = buf_offs; @@ -1029,7 +998,7 @@ static void free_obsolete_znodes(struct ubifs_info *c) do { znode = cnext; cnext = znode->cnext; - if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + if (ubifs_zn_obsolete(znode)) kfree(znode); else { znode->cnext = NULL; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index a25c1cc1f8d..f6bf8995c7b 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -293,10 +293,10 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, lnum, offs, znode->level, znode->child_cnt); if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { - dbg_err("current fanout %d, branch count %d", - c->fanout, znode->child_cnt); - dbg_err("max levels %d, znode level %d", - UBIFS_MAX_LEVELS, znode->level); + ubifs_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + ubifs_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); err = 1; goto out_dump; } @@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, if (zbr->lnum < c->main_first || zbr->lnum >= c->leb_cnt || zbr->offs < 0 || zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { - dbg_err("bad branch %d", i); + ubifs_err("bad branch %d", i); err = 2; goto out_dump; } @@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, case UBIFS_XENT_KEY: break; default: - dbg_msg("bad key type at slot %d: %s", i, - DBGKEY(&zbr->key)); + ubifs_err("bad key type at slot %d: %d", + i, key_type(c, &zbr->key)); err = 3; goto out_dump; } @@ -340,19 +340,19 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, type = key_type(c, &zbr->key); if (c->ranges[type].max_len == 0) { if (zbr->len != c->ranges[type].len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be %d", c->ranges[type].len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be %d", c->ranges[type].len); err = 4; goto out_dump; } } else if (zbr->len < c->ranges[type].min_len || zbr->len > c->ranges[type].max_len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be in range of %d-%d", - c->ranges[type].min_len, - c->ranges[type].max_len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); err = 5; goto out_dump; } @@ -370,13 +370,13 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, cmp = keys_cmp(c, key1, key2); if (cmp > 0) { - dbg_err("bad key order (keys %d and %d)", i, i + 1); + ubifs_err("bad key order (keys %d and %d)", i, i + 1); err = 6; goto out_dump; } else if (cmp == 0 && !is_hash_key(c, key1)) { /* These can only be keys with colliding hash */ - dbg_err("keys %d and %d are not hashed but equivalent", - i, i + 1); + ubifs_err("keys %d and %d are not hashed but equivalent", + i, i + 1); err = 7; goto out_dump; } @@ -387,7 +387,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, out_dump: ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - dbg_dump_node(c, idx); + ubifs_dump_node(c, idx); kfree(idx); return -EINVAL; } @@ -475,18 +475,18 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->offs); if (err) { - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); return err; } /* Make sure the key of the read node is correct */ - key_read(c, key, &key1); - if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_tnc("looked for key %s found node's key %s", - DBGKEY(key), DBGKEY1(&key1)); - dbg_dump_node(c, node); + dbg_tnck(key, "looked for key "); + dbg_tnck(&key1, "but found node's key "); + ubifs_dump_node(c, node); return -EINVAL; } diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0cc7da9bed4..e24380cf46e 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -36,9 +36,31 @@ /* UBIFS node magic number (must not have the padding byte first or last) */ #define UBIFS_NODE_MAGIC 0x06101831 -/* UBIFS on-flash format version */ +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ #define UBIFS_FORMAT_VERSION 4 +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + /* Minimum logical eraseblock size in bytes */ #define UBIFS_MIN_LEB_SZ (15*1024) @@ -51,6 +73,13 @@ */ #define UBIFS_MIN_COMPR_LEN 128 +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS prefers to leave this data + * node uncompress, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + /* Root inode number */ #define UBIFS_ROOT_INO 1 @@ -75,7 +104,6 @@ */ #define UBIFS_BLOCK_SIZE 4096 #define UBIFS_BLOCK_SHIFT 12 -#define UBIFS_BLOCK_MASK 0x00000FFF /* UBIFS padding byte pattern (must not be first or last byte of node magic) */ #define UBIFS_PADDING_BYTE 0xCE @@ -87,7 +115,7 @@ #define UBIFS_SK_LEN 8 /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3 /* Maximum number of levels in UBIFS indexing B-tree */ #define UBIFS_MAX_LEVELS 512 @@ -107,6 +135,13 @@ /* The key is always at the same position in all keyed nodes */ #define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) +/* Garbage collector journal head number */ +#define UBIFS_GC_HEAD 0 +/* Base journal head number */ +#define UBIFS_BASE_HEAD 1 +/* Data journal head number */ +#define UBIFS_DATA_HEAD 2 + /* * LEB Properties Tree node types. * @@ -228,10 +263,10 @@ enum { /* Minimum number of orphan area logical eraseblocks */ #define UBIFS_MIN_ORPH_LEBS 1 /* - * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 * for GC, 1 for deletions, and at least 1 for committed data). */ -#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) /* Minimum number of logical eraseblocks */ #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ @@ -373,9 +408,11 @@ enum { * Superblock flags. * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed */ enum { UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, }; /** @@ -399,7 +436,7 @@ struct ubifs_ch { __u8 node_type; __u8 group_type; __u8 padding[2]; -} __attribute__ ((packed)); +} __packed; /** * union ubifs_dev_desc - device node descriptor. @@ -413,7 +450,7 @@ struct ubifs_ch { union ubifs_dev_desc { __le32 new; __le64 huge; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ino_node - inode node. @@ -474,7 +511,7 @@ struct ubifs_ino_node { __le16 compr_type; __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_dent_node - directory entry node. @@ -499,7 +536,7 @@ struct ubifs_dent_node { __le16 nlen; __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ __u8 name[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_data_node - data node. @@ -520,7 +557,7 @@ struct ubifs_data_node { __le16 compr_type; __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_trun_node - truncation node. @@ -540,7 +577,7 @@ struct ubifs_trun_node { __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ __le64 old_size; __le64 new_size; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_pad_node - padding node. @@ -551,7 +588,7 @@ struct ubifs_trun_node { struct ubifs_pad_node { struct ubifs_ch ch; __le32 pad_len; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_sb_node - superblock node. @@ -580,6 +617,7 @@ struct ubifs_pad_node { * @padding2: reserved for future, zeroes * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -606,8 +644,9 @@ struct ubifs_sb_node { __le64 rp_size; __le32 time_gran; __u8 uuid[16]; - __u8 padding2[3972]; -} __attribute__ ((packed)); + __le32 ro_compat_version; + __u8 padding2[3968]; +} __packed; /** * struct ubifs_mst_node - master node. @@ -674,7 +713,7 @@ struct ubifs_mst_node { __le32 idx_lebs; __le32 leb_cnt; __u8 padding[344]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ref_node - logical eraseblock reference node. @@ -690,7 +729,7 @@ struct ubifs_ref_node { __le32 offs; __le32 jhead; __u8 padding[28]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_branch - key/reference/length branch @@ -704,7 +743,7 @@ struct ubifs_branch { __le32 offs; __le32 len; __u8 key[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_idx_node - indexing node. @@ -718,7 +757,7 @@ struct ubifs_idx_node { __le16 child_cnt; __le16 level; __u8 branches[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_cs_node - commit start node. @@ -728,7 +767,7 @@ struct ubifs_idx_node { struct ubifs_cs_node { struct ubifs_ch ch; __le64 cmt_no; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_orph_node - orphan node. @@ -740,6 +779,6 @@ struct ubifs_orph_node { struct ubifs_ch ch; __le64 cmt_no; __le64 inos[]; -} __attribute__ ((packed)); +} __packed; #endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e4f89f27182..c1f71fe17cc 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -20,8 +20,6 @@ * Adrian Hunter */ -/* Implementation version 0.7 */ - #ifndef __UBIFS_H__ #define __UBIFS_H__ @@ -30,6 +28,7 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/spinlock.h> #include <linux/mutex.h> @@ -43,16 +42,24 @@ #define UBIFS_VERSION 1 /* Normal UBIFS messages */ -#define ubifs_msg(fmt, ...) \ - printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__) /* UBIFS error messages */ -#define ubifs_err(fmt, ...) \ - printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ +#define ubifs_err(fmt, ...) \ + pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ __func__, ##__VA_ARGS__) /* UBIFS warning messages */ -#define ubifs_warn(fmt, ...) \ - printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) +#define ubifs_warn(fmt, ...) \ + pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) +/* + * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description + * object as an argument. + */ +#define ubifs_errc(c, fmt, ...) \ + do { \ + if (!(c)->probing) \ + ubifs_err(fmt, ##__VA_ARGS__); \ + } while (0) /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@ -65,6 +72,14 @@ #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL #define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + /* Minimum amount of data UBIFS writes to the flash */ #define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) @@ -77,9 +92,6 @@ #define INUM_WARN_WATERMARK 0xFFF00000 #define INUM_WATERMARK 0xFFFFFF00 -/* Largest key size supported in this implementation */ -#define CUR_MAX_KEY_LEN UBIFS_SK_LEN - /* Maximum number of entries in each LPT (LEB category) heap */ #define LPT_HEAP_SZ 256 @@ -89,8 +101,9 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Default write-buffer synchronization timeout (5 secs) */ -#define DEFAULT_WBUF_TIMEOUT (5 * HZ) +/* Write-buffer synchronization timeout interval in seconds */ +#define WBUF_TIMEOUT_SOFTLIMIT 3 +#define WBUF_TIMEOUT_HARDLIMIT 5 /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -98,12 +111,10 @@ /* Number of non-data journal heads */ #define NONDATA_JHEADS_CNT 2 -/* Garbage collector head */ -#define GCHD 0 -/* Base journal head number */ -#define BASEHD 1 -/* First "general purpose" journal head */ -#define DATAHD 2 +/* Shorter names for journal head numbers for internal usage */ +#define GCHD UBIFS_GC_HEAD +#define BASEHD UBIFS_BASE_HEAD +#define DATAHD UBIFS_DATA_HEAD /* 'No change' value for 'ubifs_change_lp()' */ #define LPROPS_NC 0x80000001 @@ -113,8 +124,12 @@ * in TNC. However, when replaying, it is handy to introduce fake "truncation" * keys for truncation nodes because the code becomes simpler. So we define * %UBIFS_TRUN_KEY type. + * + * But otherwise, out of the journal reply scope, the truncation keys are + * invalid. */ -#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT +#define UBIFS_INVALID_KEY UBIFS_KEY_TYPES_CNT /* * How much a directory entry/extended attribute entry adds to the parent/host @@ -141,9 +156,18 @@ */ #define WORST_COMPR_FACTOR 2 +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -211,14 +235,14 @@ enum { * LPT cnode flag bits. * * DIRTY_CNODE: cnode is dirty - * COW_CNODE: cnode is being committed and must be copied before writing * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), - * so it can (and must) be freed when the commit is finished + * so it can (and must) be freed when the commit is finished + * COW_CNODE: cnode is being committed and must be copied before writing */ enum { DIRTY_CNODE = 0, - COW_CNODE = 1, - OBSOLETE_CNODE = 2, + OBSOLETE_CNODE = 1, + COW_CNODE = 2, }; /* @@ -258,10 +282,10 @@ struct ubifs_old_idx { /* The below union makes it easier to deal with keys */ union ubifs_key { - uint8_t u8[CUR_MAX_KEY_LEN]; - uint32_t u32[CUR_MAX_KEY_LEN/4]; - uint64_t u64[CUR_MAX_KEY_LEN/8]; - __le32 j32[CUR_MAX_KEY_LEN/4]; + uint8_t u8[UBIFS_SK_LEN]; + uint32_t u32[UBIFS_SK_LEN/4]; + uint64_t u64[UBIFS_SK_LEN/8]; + __le32 j32[UBIFS_SK_LEN/4]; }; /** @@ -322,15 +346,18 @@ struct ubifs_gced_idx_leb { * struct ubifs_inode - UBIFS in-memory inode description. * @vfs_inode: VFS inode description object * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; * @xattr_size: summarized size of all extended attributes in bytes * @xattr_cnt: count of extended attributes this inode has * @xattr_names: sum of lengths of all extended attribute names belonging to * this inode * @dirty: non-zero if the inode is dirty * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used * @ui_mutex: serializes inode write-back with the rest of VFS operations, - * serializes "clean <-> dirty" state changes, protects @dirty, - * @ui_size, and @xattr_size + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size * @ui_lock: protects @synced_i_size * @synced_i_size: synchronized size of inode, i.e. the value of inode size * currently stored on the flash; used only for regular file @@ -338,6 +365,8 @@ struct ubifs_gced_idx_leb { * @ui_size: inode size used by UBIFS when writing to flash * @flags: inode flags (@UBIFS_COMPR_FL, etc) * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data * @@ -365,25 +394,29 @@ struct ubifs_gced_idx_leb { * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot * make sure @inode->i_size is always changed under @ui_mutex, because it - * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock - * with 'ubifs_writepage()' (see file.c). All the other inode fields are - * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one * could consider to rework locking and base it on "shadow" fields. */ struct ubifs_inode { struct inode vfs_inode; unsigned long long creat_sqnum; + unsigned long long del_cmtno; unsigned int xattr_size; unsigned int xattr_cnt; unsigned int xattr_names; unsigned int dirty:1; unsigned int xattr:1; + unsigned int bulk_read:1; + unsigned int compr_type:2; struct mutex ui_mutex; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; int flags; - int compr_type; + pgoff_t last_page_read; + pgoff_t read_in_a_row; int data_len; void *data; }; @@ -408,9 +441,9 @@ struct ubifs_unclean_leb { * LEB properties flags. * * LPROPS_UNCAT: not categorized - * LPROPS_DIRTY: dirty > 0, not index - * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index - * LPROPS_FREE: free > 0, not empty, not index + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs * LPROPS_EMPTY: LEB is empty, not taken * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken @@ -463,8 +496,8 @@ struct ubifs_lprops { struct ubifs_lpt_lprops { int free; int dirty; - unsigned tgc : 1; - unsigned cmt : 1; + unsigned tgc:1; + unsigned cmt:1; }; /** @@ -472,24 +505,26 @@ struct ubifs_lpt_lprops { * @empty_lebs: number of empty LEBs * @taken_empty_lebs: number of taken LEBs * @idx_lebs: number of indexing LEBs - * @total_free: total free space in bytes - * @total_dirty: total dirty space in bytes - * @total_used: total used space in bytes (includes only data LEBs) - * @total_dead: total dead space in bytes (includes only data LEBs) - * @total_dark: total dark space in bytes (includes only data LEBs) + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. * - * N.B. total_dirty and total_used are different to other total_* fields, - * because they account _all_ LEBs, not just data LEBs. + * @empty_lebs includes @taken_empty_lebs. * - * 'taken_empty_lebs' counts the LEBs that are in the transient state of having - * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed - * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used - * by itself (in which case 'unused_lebs' would be a better name). In the case - * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC, - * but unlike other empty LEBs that are 'taken', it may not be written straight - * away (i.e. before the next commit start or unmount), so either gc_lnum must - * be specially accounted for, or the current approach followed i.e. count it - * under 'taken_empty_lebs'. + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. */ struct ubifs_lp_stats { int empty_lebs; @@ -622,17 +657,19 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @offs: write-buffer offset in this logical eraseblock * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer - * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, - * %UBI_UNKNOWN) + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep * up by 'mutex_lock_nested()). * @sync_callback: write-buffer synchronization callback * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * and @softlimit + @delta) * @timer: write-buffer timer - * @timeout: timer expire interval in jiffies - * @need_sync: it is set if its timer expired and needs sync + * @no_timer: non-zero if this write-buffer does not have a timer + * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing * @next_ino: points to the next position of the following inode number * @inodes: stores the inode numbers of the nodes which are in wbuf * @@ -653,14 +690,16 @@ struct ubifs_wbuf { int offs; int avail; int used; - int dtype; + int size; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - struct timer_list timer; - int timeout; - int need_sync; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; + unsigned int no_timer:1; + unsigned int need_sync:1; int next_ino; ino_t *inodes; }; @@ -685,20 +724,22 @@ struct ubifs_bud { * struct ubifs_jhead - journal head. * @wbuf: head's write-buffer * @buds_list: list of bud LEBs belonging to this journal head + * @grouped: non-zero if UBIFS groups nodes when writing to this journal head * * Note, the @buds list is protected by the @c->buds_lock. */ struct ubifs_jhead { struct ubifs_wbuf wbuf; struct list_head buds_list; + unsigned int grouped:1; }; /** * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. * @key: key * @znode: znode address in memory - * @lnum: LEB number of the indexing node - * @offs: offset of the indexing node within @lnum + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum * @len: target node length */ struct ubifs_zbranch { @@ -726,6 +767,9 @@ struct ubifs_zbranch { * @offs: offset of the corresponding indexing node * @len: length of the corresponding indexing node * @zbranch: array of znode branches (@c->fanout elements) + * + * Note! The @lnum, @offs, and @len fields are not really needed - we have them + * only for internal consistency check. They could be removed to save some RAM. */ struct ubifs_znode { struct ubifs_znode *parent; @@ -736,13 +780,35 @@ struct ubifs_znode { int child_cnt; int iip; int alt; -#ifdef CONFIG_UBIFS_FS_DEBUG - int lnum, offs, len; -#endif + int lnum; + int offs; + int len; struct ubifs_zbranch zbranch[]; }; /** + * struct bu_info - bulk-read information. + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @oef: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** * struct ubifs_node_range - node length range description data structure. * @len: fixed node length * @min_len: minimum possible node length @@ -779,7 +845,7 @@ struct ubifs_compressor { /** * struct ubifs_budget_req - budget requirements of an operation. * - * @fast: non-zero if the budgeting should try to aquire budget quickly and + * @fast: non-zero if the budgeting should try to acquire budget quickly and * should not try to call write-back * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields * have to be re-calculated @@ -805,21 +871,31 @@ struct ubifs_compressor { * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. */ struct ubifs_budget_req { unsigned int fast:1; unsigned int recalculate:1; +#ifndef UBIFS_DEBUG unsigned int new_page:1; unsigned int dirtied_page:1; unsigned int new_dent:1; unsigned int mod_dent:1; unsigned int new_ino:1; unsigned int new_ino_d:13; -#ifndef UBIFS_DEBUG unsigned int dirtied_ino:4; unsigned int dirtied_ino_d:15; #else /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; unsigned int dirtied_ino; unsigned int dirtied_ino_d; #endif @@ -837,6 +913,8 @@ struct ubifs_budget_req { * @dnext: next orphan to delete * @inum: inode number * @new: %1 => added since the last commit, otherwise %0 + * @cmt: %1 => commit pending, otherwise %0 + * @del: %1 => delete pending, otherwise %0 */ struct ubifs_orphan { struct rb_node rb; @@ -845,29 +923,80 @@ struct ubifs_orphan { struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; - int new; + unsigned new:1; + unsigned cmt:1; + unsigned del:1; }; /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disabe, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) */ struct ubifs_mount_opts { unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; +}; + +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, nenver changed after mount) + * @inode_budget: budget for an inode (constant, nenver changed after mount) + * @dent_budget: budget for a directory entry (constant, nenver changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; }; +struct ubifs_debug_info; + /** * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable readahead + * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number - * @vfs_gen: VFS inode generation counter * @max_sqnum: current global sequence number - * @cmt_no: commit number (last successfully completed commit) - * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version * @uuid: UUID from super block * * @lhead_lnum: log head logical eraseblock number @@ -894,13 +1023,14 @@ struct ubifs_mount_opts { * @cmt_state: commit state * @cs_lock: commit state lock * @cmt_wq: wait queue to sleep on if the log is full and a commit is running - * @fast_unmount: do not run journal commit before un-mounting + * * @big_lpt: flag that LPT is too big to write whole during commit - * @check_lpt_free: flag that indicates LPT GC may be needed - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -918,13 +1048,20 @@ struct ubifs_mount_opts { * @ileb_nxt: next pre-allocated index LEBs * @old_idx: tree of index nodes obsoleted since the last commit start * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c - * @new_ihead_lnum: used by debugging to check ihead_lnum - * @new_ihead_offs: used by debugging to check ihead_offs * * @mst_node: master node * @mst_offs: offset of valid master node * @mst_mutex: protects the master node area, @mst_node, and @mst_offs * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu + * @bu: pre-allocated bulk-read information + * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes * @log_last: last LEB of the log @@ -937,7 +1074,6 @@ struct ubifs_mount_opts { * @main_lebs: count of LEBs in the main area * @main_first: first LEB of the main area * @main_bytes: main area size in bytes - * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) * * @key_hash_type: type of the key hash * @key_hash: direntry key hash function @@ -947,43 +1083,42 @@ struct ubifs_mount_opts { * * @min_io_size: minimal input/output unit size * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks * @old_leb_cnt: count of logical eraseblocks before re-size * @ro_media: the underlying UBI volume is read-only + * @ro_mount: the file-system was mounted as read-only + * @ro_error: UBIFS switched to R/O mode because an error happened * * @dirty_pg_cnt: number of dirty pages (not used) * @dirty_zn_cnt: number of dirty znodes * @clean_zn_cnt: number of clean znodes * - * @budg_idx_growth: amount of bytes budgeted for index growth - * @budg_data_growth: amount of bytes budgeted for cached data - * @budg_dd_growth: amount of bytes budgeted for cached data that will make - * other data dirty - * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, - * but which still have to be taken into account because - * the index has not been committed so far - * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; - * @min_idx_lebs: minimum number of LEBs required for the index - * @old_idx_sz: size of index on flash + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) - * @lst: lprops statistics - * - * @page_budget: budget for a page - * @inode_budget: budget for an inode - * @dent_budget: budget for a directory entry * * @ref_node_alsz: size of the LEB reference node aligned to the min. flash - * I/O unit + * I/O unit * @mst_node_alsz: master node aligned size * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary * @max_inode_sz: maximum possible inode size in bytes * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting * @dead_wm: LEB dead space watermark * @dark_wm: LEB dark space watermark * @block_cnt: count of 4KiB blocks on the FS @@ -1017,6 +1152,8 @@ struct ubifs_mount_opts { * @sbuf: a buffer of LEB size used by GC and replay for scanning * @idx_gc: list of index LEBs that have been garbage collected * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected * * @infos_list: links all 'ubifs_info' objects * @umount_mutex: serializes shrinker and un-mount @@ -1045,6 +1182,7 @@ struct ubifs_mount_opts { * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab * @dirty_nn_cnt: number of dirty nnodes * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed * @lpt_sz: LPT size * @lpt_nod_buf: buffer for an on-flash nnode or pnode * @lpt_buf: buffer of LEB size used by LPT @@ -1055,9 +1193,11 @@ struct ubifs_mount_opts { * previous commit start * @uncat_list: list of un-categorized LEBs * @empty_list: list of empty LEBs - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list + * @in_a_category_cnt: count of lprops which are in a certain category, which + * basically meants that they were loaded from the flash * * @ltab_lnum: LEB number of LPT's own lprops table * @ltab_offs: offset of LPT's own lprops table @@ -1074,40 +1214,35 @@ struct ubifs_mount_opts { * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * - * @empty: if the UBI device is empty - * @replay_tree: temporary tree used during journal replay + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) * @replay_sqnum: sequence number of node currently being replayed - * @need_recovery: file-system needs recovery - * @replaying: set to %1 during journal replay - * @unclean_leb_list: LEBs to recover when mounting ro to rw - * @rcvrd_mst_node: recovered master node to write when mounting ro to rw + * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W + * mode + * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted + * FS to R/W mode * @size_tree: inode size information for recovery - * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) * @mount_opts: UBIFS-specific mount options * - * @dbg_buf: a buffer of LEB size used for debugging purposes - * @old_zroot: old index root - used by 'dbg_check_old_index()' - * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' - * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' - * @failure_mode: failure mode for recovery testing - * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls - * @fail_timeout: time in jiffies when delay of failure mode expires - * @fail_cnt: current number of calls to failure mode I/O functions - * @fail_cnt_max: number of calls by which to delay failure mode + * @dbg: debugging-related information */ struct ubifs_info { struct super_block *vfs_sb; struct backing_dev_info bdi; ino_t highest_inum; - unsigned int vfs_gen; unsigned long long max_sqnum; unsigned long long cmt_no; spinlock_t cnt_lock; int fmt_version; + int ro_compat_version; unsigned char uuid[16]; int lhead_lnum; @@ -1131,11 +1266,13 @@ struct ubifs_info { int cmt_state; spinlock_t cs_lock; wait_queue_head_t cmt_wq; - unsigned int fast_unmount:1; + unsigned int big_lpt:1; - unsigned int check_lpt_free:1; - unsigned int nospace:1; - unsigned int nospace_rp:1; + unsigned int space_fixup:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int rw_incompat:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1152,15 +1289,18 @@ struct ubifs_info { int ileb_nxt; struct rb_root old_idx; int *bottom_up_buf; -#ifdef CONFIG_UBIFS_FS_DEBUG - int new_ihead_lnum; - int new_ihead_offs; -#endif struct ubifs_mst_node *mst_node; int mst_offs; struct mutex mst_mutex; + int max_bu_buf_len; + struct mutex bu_mutex; + struct bu_info bu; + + struct mutex write_reserve_mutex; + void *write_reserve_buf; + int log_lebs; long long log_bytes; int log_last; @@ -1173,7 +1313,6 @@ struct ubifs_info { int main_lebs; int main_first; long long main_bytes; - int default_compr; uint8_t key_hash_type; uint32_t (*key_hash)(const char *str, int len); @@ -1183,30 +1322,27 @@ struct ubifs_info { int min_io_size; int min_io_shift; + int max_write_size; + int max_write_shift; int leb_size; + int leb_start; int half_leb_size; + int idx_leb_size; int leb_cnt; int max_leb_cnt; int old_leb_cnt; - int ro_media; + unsigned int ro_media:1; + unsigned int ro_mount:1; + unsigned int ro_error:1; atomic_long_t dirty_pg_cnt; atomic_long_t dirty_zn_cnt; atomic_long_t clean_zn_cnt; - long long budg_idx_growth; - long long budg_data_growth; - long long budg_dd_growth; - long long budg_uncommitted_idx; spinlock_t space_lock; - int min_idx_lebs; - unsigned long long old_idx_sz; - unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; - - int page_budget; - int inode_budget; - int dent_budget; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; int ref_node_alsz; int mst_node_alsz; @@ -1214,6 +1350,8 @@ struct ubifs_info { int max_idx_node_sz; long long max_inode_sz; int max_znode_sz; + + int leb_overhead; int dead_wm; int dark_wm; int block_cnt; @@ -1247,6 +1385,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; + int gc_seq; + int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1275,6 +1415,7 @@ struct ubifs_info { int lpt_drty_flgs; int dirty_nn_cnt; int dirty_pn_cnt; + int check_lpt_free; long long lpt_sz; void *lpt_nod_buf; void *lpt_buf; @@ -1287,6 +1428,7 @@ struct ubifs_info { struct list_head freeable_list; struct list_head frdi_idx_list; int freeable_cnt; + int in_a_category_cnt; int ltab_lnum; int ltab_offs; @@ -1300,64 +1442,63 @@ struct ubifs_info { long long rp_size; long long report_rp_size; - uid_t rp_uid; - gid_t rp_gid; + kuid_t rp_uid; + kgid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ - int empty; - struct rb_root replay_tree; + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; + unsigned int probing:1; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; unsigned long long replay_sqnum; - int need_recovery; - int replaying; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; - int remounting_rw; struct ubifs_mount_opts mount_opts; -#ifdef CONFIG_UBIFS_FS_DEBUG - void *dbg_buf; - struct ubifs_zbranch old_zroot; - int old_zroot_level; - unsigned long long old_zroot_sqnum; - int failure_mode; - int fail_delay; - unsigned long fail_timeout; - unsigned int fail_cnt; - unsigned int fail_cnt_max; -#endif + struct ubifs_debug_info *dbg; }; extern struct list_head ubifs_infos; extern spinlock_t ubifs_infos_lock; extern atomic_long_t ubifs_clean_zn_cnt; extern struct kmem_cache *ubifs_inode_slab; -extern struct super_operations ubifs_super_operations; -extern struct address_space_operations ubifs_file_address_operations; -extern struct file_operations ubifs_file_operations; -extern struct inode_operations ubifs_file_inode_operations; -extern struct file_operations ubifs_dir_operations; -extern struct inode_operations ubifs_dir_inode_operations; -extern struct inode_operations ubifs_symlink_inode_operations; +extern const struct super_operations ubifs_super_operations; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, + int len, int even_ebadmsg); +int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, + int len); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); +int ubifs_leb_unmap(struct ubifs_info *c, int lnum); +int ubifs_leb_map(struct ubifs_info *c, int lnum); +int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs); int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs); int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int lnum, int offs); int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, - int offs, int dtype); + int offs); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet); + int offs, int quiet, int must_chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); @@ -1369,7 +1510,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); /* scan.c */ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, - int offs, void *sbuf); + int offs, void *sbuf, int quiet); void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, int offs, int quiet); @@ -1399,8 +1540,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int last_reference); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, @@ -1423,13 +1564,15 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze); int ubifs_find_free_leb_for_idx(struct ubifs_info *c); int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, @@ -1440,8 +1583,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); /* tnc.c */ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, void *node, const struct qstr *nm); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, @@ -1471,6 +1612,8 @@ void destroy_old_idx(struct ubifs_info *c); int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs); int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); /* tnc_misc.c */ struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, @@ -1492,7 +1635,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); /* commit.c */ int ubifs_bg_thread(void *info); @@ -1511,6 +1657,7 @@ int ubifs_write_master(struct ubifs_info *c); int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, @@ -1531,6 +1678,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); int ubifs_orphan_start_commit(struct ubifs_info *c); int ubifs_orphan_end_commit(struct ubifs_info *c); int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); /* lpt.c */ int ubifs_calc_lpt_geom(struct ubifs_info *c); @@ -1559,6 +1707,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); /* lpt_commit.c */ int ubifs_lpt_start_commit(struct ubifs_info *c); @@ -1567,13 +1718,11 @@ int ubifs_lpt_post_commit(struct ubifs_info *c); void ubifs_lpt_free(struct ubifs_info *c, int wr_only); /* lprops.c */ -void ubifs_get_lprops(struct ubifs_info *c); const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops *lp, int free, int dirty, int flags, int idx_gc_cnt); -void ubifs_release_lprops(struct ubifs_info *c); -void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, int cat); void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, @@ -1590,14 +1739,15 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); +int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode); + umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -1616,11 +1766,11 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); int ubifs_recover_master_node(struct ubifs_info *c); int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, - int offs, void *sbuf, int grouped); + int offs, void *sbuf, int jhead); struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf); -int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); -int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(struct ubifs_info *c, void *sbuf); int ubifs_rcvry_gc_commit(struct ubifs_info *c); int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, int deletion, loff_t new_size); @@ -1636,7 +1786,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* compressor.c */ int __init ubifs_compressors_init(void); -void __exit ubifs_compressors_exit(void); +void ubifs_compressors_exit(void); void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type); int ubifs_decompress(const void *buf, int len, void *out, int *out_len, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 1388a078e1a..5e0a63b1b0d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -55,13 +55,15 @@ * ACL support is not implemented. */ +#include "ubifs.h" +#include <linux/fs.h> +#include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> -#include "ubifs.h" /* * Limit the number of extended attributes per inode so that the total size - * (xattr_size) is guaranteeded to fit in an 'unsigned int'. + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'. */ #define MAX_XATTRS_PER_INODE 65535 @@ -78,9 +80,8 @@ enum { SECURITY_XATTR, }; -static struct inode_operations none_inode_operations; -static struct address_space_operations none_address_operations; -static struct file_operations none_file_operations; +static const struct inode_operations empty_iops; +static const struct file_operations empty_fops; /** * create_xattr - create an extended attribute. @@ -103,14 +104,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, struct inode *inode; struct ubifs_inode *ui, *host_ui = ubifs_inode(host); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = size, .dirtied_ino = 1, - .dirtied_ino_d = host_ui->data_len}; + .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) return -ENOSPC; /* * Linux limits the maximum size of the extended attribute names list - * to %XATTR_LIST_MAX. This means we should not allow creating more* + * to %XATTR_LIST_MAX. This means we should not allow creating more * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ @@ -128,36 +129,30 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, goto out_budg; } - mutex_lock(&host_ui->ui_mutex); /* Re-define all operations to be "nothing" */ - inode->i_mapping->a_ops = &none_address_operations; - inode->i_op = &none_inode_operations; - inode->i_fop = &none_file_operations; + inode->i_mapping->a_ops = &empty_aops; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; ui = ubifs_inode(inode); ui->xattr = 1; ui->flags |= UBIFS_XATTR_FL; - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } + inode->i_size = ui->ui_size = size; + ui->data_len = size; - memcpy(ui->data, value, size); + mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(size); host_ui->xattr_names += nm->len; - /* - * We do not use i_size_write() because nobody can race with us as we - * are holding host @host->i_mutex - every xattr operation for this - * inode is serialized by it. - */ - inode->i_size = ui->ui_size = size; - ui->data_len = size; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; @@ -172,8 +167,8 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); -out_unlock: mutex_unlock(&host_ui->ui_mutex); +out_free: make_bad_inode(inode); iput(inode); out_budg: @@ -200,29 +195,27 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 2, - .dirtied_ino_d = size + host_ui->data_len }; + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); err = ubifs_budget_space(c, &req); if (err) return err; - mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); - host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_size += CALC_XATTR_BYTES(size); - kfree(ui->data); - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } - - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + /* * It is important to write the host inode after the xattr inode * because if the host inode gets synchronized (via 'fsync()'), then @@ -240,9 +233,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); - make_bad_inode(inode); -out_unlock: mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: ubifs_release_budget(c, &req); return err; } @@ -305,13 +298,14 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err, type; - dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, - host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, + host->i_ino, dentry, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -367,14 +361,14 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; struct ubifs_dent_node *xent; union ubifs_key key; int err; - dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name, - host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, + host->i_ino, dentry, size); err = check_namespace(&nm); if (err < 0) @@ -384,7 +378,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (!xent) return -ENOMEM; - mutex_lock(&host->i_mutex); xent_key_init(c, &key, host->i_ino, &nm); err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); if (err) { @@ -406,8 +399,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { - dbg_err("buffer size %zd, xattr len %d", - size, ui->data_len); + ubifs_err("buffer size %zd, xattr len %d", + size, ui->data_len); err = -ERANGE; goto out_iput; } @@ -419,7 +412,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, out_iput: iput(inode); out_unlock: - mutex_unlock(&host->i_mutex); kfree(xent); return err; } @@ -434,8 +426,8 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int err, len, written = 0; struct qstr nm = { .name = NULL }; - dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino, - dentry->d_name.len, dentry->d_name.name, size); + dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, + dentry, size); len = host_ui->xattr_names + host_ui->xattr_cnt; if (!buffer) @@ -449,13 +441,11 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) return -ERANGE; lowest_xent_key(c, &key, host->i_ino); - - mutex_lock(&host->i_mutex); while (1) { int type; xent = ubifs_tnc_next_ent(c, &key, &nm); - if (unlikely(IS_ERR(xent))) { + if (IS_ERR(xent)) { err = PTR_ERR(xent); break; } @@ -479,7 +469,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) pxent = xent; key_read(c, &xent->key, &key); } - mutex_unlock(&host->i_mutex); kfree(pxent); if (err != -ENOENT) { @@ -497,8 +486,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, int err; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); - struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, - .dirtied_ino_d = host_ui->data_len }; + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); @@ -535,13 +524,13 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err; - dbg_gen("xattr '%s', ino %lu ('%.*s')", name, - host->i_ino, dentry->d_name.len, dentry->d_name.name); + dbg_gen("xattr '%s', ino %lu ('%pd')", name, + host->i_ino, dentry); ubifs_assert(mutex_is_locked(&host->i_mutex)); err = check_namespace(&nm); @@ -567,10 +556,10 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) } ubifs_assert(inode->i_nlink == 1); - inode->i_nlink = 0; + clear_nlink(inode); err = remove_xattr(c, host, inode, &nm); if (err) - inode->i_nlink = 1; + set_nlink(inode, 1); /* If @i_nlink is 0, 'iput()' will delete the inode */ iput(inode); |
