diff options
64 files changed, 9922 insertions, 5846 deletions
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt new file mode 100644 index 00000000000..05aa455163e --- /dev/null +++ b/Documentation/filesystems/xfs-self-describing-metadata.txt @@ -0,0 +1,350 @@ +XFS Self Describing Metadata +---------------------------- + +Introduction +------------ + +The largest scalability problem facing XFS is not one of algorithmic +scalability, but of verification of the filesystem structure. Scalability of the +structures and indexes on disk and the algorithms for iterating them are +adequate for supporting PB scale filesystems with billions of inodes, however it +is this very scalability that causes the verification problem. + +Almost all metadata on XFS is dynamically allocated. The only fixed location +metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all +other metadata structures need to be discovered by walking the filesystem +structure in different ways. While this is already done by userspace tools for +validating and repairing the structure, there are limits to what they can +verify, and this in turn limits the supportable size of an XFS filesystem. + +For example, it is entirely possible to manually use xfs_db and a bit of +scripting to analyse the structure of a 100TB filesystem when trying to +determine the root cause of a corruption problem, but it is still mainly a +manual task of verifying that things like single bit errors or misplaced writes +weren't the ultimate cause of a corruption event. It may take a few hours to a +few days to perform such forensic analysis, so at this scale root cause +analysis is entirely possible. + +However, if we scale the filesystem up to 1PB, we now have 10x as much metadata +to analyse and so that analysis blows out towards weeks/months of forensic work. 
+Most of the analysis work is slow and tedious, so as the amount of analysis goes +up, the more likely that the cause will be lost in the noise. Hence the primary +concern for supporting PB scale filesystems is minimising the time and effort +required for basic forensic analysis of the filesystem structure. + + +Self Describing Metadata +------------------------ + +One of the problems with the current metadata format is that apart from the +magic number in the metadata block, we have no other way of identifying what it +is supposed to be. We can't even identify if it is the right place. Put simply, +you can't look at a single metadata block in isolation and say "yes, it is +supposed to be there and the contents are valid". + +Hence most of the time spent on forensic analysis is spent doing basic +verification of metadata values, looking for values that are in range (and hence +not detected by automated verification checks) but are not correct. Finding and +understanding how things like cross linked block lists (e.g. sibling +pointers in a btree end up with loops in them) are the key to understanding what +went wrong, but it is impossible to tell what order the blocks were linked into +each other or written to disk after the fact. + +Hence we need to record more information into the metadata to allow us to +quickly determine if the metadata is intact and can be ignored for the purpose +of analysis. We can't protect against every possible type of error, but we can +ensure that common types of errors are easily detectable. Hence the concept of +self describing metadata. + +The first, fundamental requirement of self describing metadata is that the +metadata object contains some form of unique identifier in a well known +location. This allows us to identify the expected contents of the block and +hence parse and verify the metadata object. If we can't independently identify +the type of metadata in the object, then the metadata doesn't describe itself +very well at all! 
+ +Luckily, almost all XFS metadata has magic numbers embedded already - only the +AGFL, remote symlinks and remote attribute blocks do not contain identifying +magic numbers. Hence we can change the on-disk format of all these objects to +add more identifying information and detect this simply by changing the magic +numbers in the metadata objects. That is, if it has the current magic number, +the metadata isn't self identifying. If it contains a new magic number, it is +self identifying and we can do much more expansive automated verification of the +metadata object at runtime, during forensic analysis or repair. + +As a primary concern, self describing metadata needs some form of overall +integrity checking. We cannot trust the metadata if we cannot verify that it has +not been changed as a result of external influences. Hence we need some form of +integrity check, and this is done by adding CRC32c validation to the metadata +block. If we can verify the block contains the metadata it was intended to +contain, a large amount of the manual verification work can be skipped. + +CRC32c was selected as metadata cannot be more than 64k in length in XFS and +hence a 32 bit CRC is more than sufficient to detect multi-bit errors in +metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is +fast. So while CRC32c is not the strongest of possible integrity checks that +could be used, it is more than sufficient for our needs and has relatively +little overhead. Adding support for larger integrity fields and/or algorithms +does not really provide any extra value over CRC32c, but it does add a lot of +complexity and so there is no provision for changing the integrity checking +mechanism. + +Self describing metadata needs to contain enough information so that the +metadata block can be verified as being in the correct place without needing to +look at any other metadata. This means it needs to contain location information. 
+Just adding a block number to the metadata is not sufficient to protect against +mis-directed writes - a write might be misdirected to the wrong LUN and so be +written to the "correct block" of the wrong filesystem. Hence location +information must contain a filesystem identifier as well as a block number. + +Another key information point in forensic analysis is knowing who the metadata +block belongs to. We already know the type, the location, that it is valid +and/or corrupted, and how long ago that it was last modified. Knowing the owner +of the block is important as it allows us to find other related metadata to +determine the scope of the corruption. For example, if we have an extent btree +object, we don't know what inode it belongs to and hence have to walk the entire +filesystem to find the owner of the block. Worse, the corruption could mean that +no owner can be found (i.e. it's an orphan block), and so without an owner field +in the metadata we have no idea of the scope of the corruption. If we have an +owner field in the metadata object, we can immediately do top down validation to +determine the scope of the problem. + +Different types of metadata have different owner identifiers. For example, +directory, attribute and extent tree blocks are all owned by an inode, whilst +freespace btree blocks are owned by an allocation group. Hence the size and +contents of the owner field are determined by the type of metadata object we are +looking at. The owner information can also identify misplaced writes (e.g. +freespace btree block written to the wrong AG). + +Self describing metadata also needs to contain some indication of when it was +written to the filesystem. One of the key information points when doing forensic +analysis is how recently the block was modified.
Correlation of a set of corrupted +metadata blocks based on modification times is important as it can indicate +whether the corruptions are related, whether there have been multiple corruption +events that led to the eventual failure, and even whether there are corruptions +present that the run-time verification is not detecting. + +For example, we can determine whether a metadata object is supposed to be free +space or still allocated if it is still referenced by its owner by looking at +when the free space btree block that contains the block was last written +compared to when the metadata object itself was last written. If the free space +block is more recent than the object and the object's owner, then there is a +very good chance that the block should have been removed from the owner. + +To provide this "written timestamp", each metadata block gets the Log Sequence +Number (LSN) of the most recent transaction it was modified on written into it. +This number will always increase over the life of the filesystem, and the only +thing that resets it is running xfs_repair on the filesystem. Further, by use of +the LSN we can tell if the corrupted metadata all belonged to the same log +checkpoint and hence have some idea of how much modification occurred between +the first and last instance of corrupt metadata on disk and, further, how much +modification occurred between the corruption being written and when it was +detected. + +Runtime Validation +------------------ + +Validation of self-describing metadata takes place at runtime in two places: + + - immediately after a successful read from disk + - immediately prior to write IO submission + +The verification is completely stateless - it is done independently of the +modification process, and seeks only to check that the metadata is what it says +it is and that the metadata fields are within bounds and internally consistent. 
+As such, we cannot catch all types of corruption that can occur within a block +as there may be certain limitations that operational state enforces on the +metadata, or there may be corruption of interblock relationships (e.g. corrupted +sibling pointer lists). Hence we still need stateful checking in the main code +body, but in general most of the per-field validation is handled by the +verifiers. + +For read verification, the caller needs to specify the expected type of metadata +that it should see, and the IO completion process verifies that the metadata +object matches what was expected. If the verification process fails, then it +marks the object being read as EFSCORRUPTED. The caller needs to catch this +error (same as for IO errors), and if it needs to take special action due to a +verification error it can do so by catching the EFSCORRUPTED error value. If we +need more discrimination of error type at higher levels, we can define new +error numbers for different errors as necessary. + +The first step in read verification is checking the magic number and determining +whether CRC validation is necessary. If it is, the CRC32c is calculated and +compared against the value stored in the object itself. Once this is validated, +further checks are made against the location information, followed by extensive +object specific metadata validation. If any of these checks fail, then the +buffer is considered corrupt and the EFSCORRUPTED error is set appropriately. + +Write verification is the opposite of the read verification - first the object +is extensively verified and if it is OK we then update the LSN from the last +modification made to the object. After this, we calculate the CRC and insert it +into the object. Once this is done the write IO is allowed to continue. If any +error occurs during this process, the buffer is again marked with an EFSCORRUPTED +error for the higher layers to catch. 
+ +Structures +---------- + +A typical on-disk structure needs to contain the following information: + +struct xfs_ondisk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC, not logged */ + uuid_t uuid; /* filesystem identifier */ + __be64 owner; /* parent object */ + __be64 blkno; /* location on disk */ + __be64 lsn; /* last modification in log, not logged */ +}; + +Depending on the metadata, this information may be part of a header structure +separate to the metadata contents, or may be distributed through an existing +structure. The latter occurs with metadata that already contains some of this +information, such as the superblock and AG headers. + +Other metadata may have different formats for the information, but the same +level of information is generally provided. For example: + + - short btree blocks have a 32 bit owner (ag number) and a 32 bit block + number for location. The two of these combined provide the same + information as @owner and @blkno in the above structure, but using 8 + bytes less space on disk. + + - directory/attribute node blocks have a 16 bit magic number, and the + header that contains the magic number has other information in it as + well. Hence the additional metadata headers change the overall format + of the metadata. 
+ +A typical buffer read verifier is structured as follows: + +#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) + +static void +xfs_foo_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_FOO_CRC_OFF)) || + !xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +The code ensures that the CRC is only checked if the filesystem has CRCs enabled +by checking the superblock for the feature bit, and then if the CRC verifies OK +(or is not needed) it verifies the actual contents of the block. + +The verifier function will take a couple of different forms, depending on +whether the magic number can be used to determine the format of the block. In +the case it can't, the code is structured as follows: + +static bool +xfs_foo_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } + + /* object specific verification checks here */ + + return true; +} + +If there are different magic numbers for the different formats, the verifier +will look like: + +static bool +xfs_foo_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + /* object specific 
verification checks here */ + + return true; +} + +Write verifiers are very similar to the read verifiers, they just do things in +the opposite order to the read verifiers. A typical write verifier: + +static void +xfs_foo_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + + if (bip) { + struct xfs_ondisk_hdr *hdr = bp->b_addr; + hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); +} + +This will verify the internal structure of the metadata before we go any +further, detecting corruptions that have occurred as the metadata has been +modified in memory. If the metadata verifies OK, and CRCs are enabled, we then +update the LSN field (when it was last modified) and calculate the CRC on the +metadata. Once this is done, we can issue the IO. + +Inodes and Dquots +----------------- + +Inodes and dquots are special snowflakes. They have per-object CRC and +self-identifiers, but they are packed so that there are multiple objects per +buffer. Hence we do not use per-buffer verifiers to do the work of per-object +verification and CRC calculations. The per-buffer verifiers simply perform basic +identification of the buffer - that they contain inodes or dquots, and that +there are magic numbers in all the expected spots. All further CRC and +verification checks are done when each inode is read from or written back to the +buffer. + +The structure of the verifiers and the identifiers checks is very similar to the +buffer code described above. The only difference is where they are called. 
For +example, inode read verification is done in xfs_iread() when the inode is first +read out of the buffer and the struct xfs_inode is instantiated. The inode is +already extensively verified during writeback in xfs_iflush_int, so the only +addition here is to add the LSN and CRC to the inode as it is copied back into +the buffer. + +XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of +the unlinked list modifications check or update CRCs, neither during unlink nor +log recovery. So, it's gone unnoticed until now. This won't matter immediately - +repair will probably complain about it - but it needs to be fixed. + diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d02201df855..6313b69b664 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -45,11 +45,11 @@ xfs-y += xfs_aops.o \ xfs_itable.o \ xfs_message.o \ xfs_mru_cache.o \ - xfs_super.o \ - xfs_xattr.o \ xfs_rename.o \ + xfs_super.o \ xfs_utils.o \ xfs_vnodeops.o \ + xfs_xattr.o \ kmem.o \ uuid.o @@ -58,6 +58,7 @@ xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ + xfs_attr_remote.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -73,6 +74,7 @@ xfs-y += xfs_alloc.o \ xfs_inode.o \ xfs_log_recover.o \ xfs_mount.o \ + xfs_symlink.o \ xfs_trans.o # low-level transaction/log code diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f2aeedb6a57..317aa86d96e 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -30,6 +30,7 @@ struct xfs_trans; #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 @@ -63,12 +64,29 @@ typedef struct xfs_agf { __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist 
*/ __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ } xfs_agf_t; #define XFS_AGF_MAGICNUM 0x00000001 @@ -83,7 +101,8 @@ typedef struct xfs_agf { #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -98,7 +117,8 @@ typedef struct xfs_agf { { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) @@ -132,6 +152,7 @@ typedef struct xfs_agi { __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* @@ -139,6 +160,13 @@ typedef struct xfs_agi { * still being referenced. 
*/ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_MAGICNUM 0x00000001 @@ -171,11 +199,31 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? 
\ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + typedef struct xfs_agfl { - __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0ad23253e8b..5673bcfda2f 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -33,7 +33,9 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" struct workqueue_struct *xfs_alloc_wq; @@ -430,53 +432,84 @@ xfs_alloc_fixup_trees( return 0; } -static void +static bool xfs_agfl_verify( struct xfs_buf *bp) { -#ifdef WHEN_CRCS_COME_ALONG - /* - * we cannot actually do any verification of the AGFL because mkfs does - * not initialise the AGFL to zero or NULL. Hence the only valid part of - * the AGFL is what the AGF says is active. We can't get to the AGF, so - * we can't verify just those entries are valid. - * - * This problem goes away when the CRC format change comes along as that - * requires the AGFL to be initialised by mkfs. At that point, we can - * verify the blocks in the agfl -active or not- lie within the bounds - * of the AG. Until then, just leave this check ifdef'd out. - */ struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); - int agfl_ok = 1; - int i; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. 
+ */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - agfl_ok = 0; + return false; } + return true; +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int agfl_ok = 1; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); + + agfl_ok = agfl_ok && xfs_agfl_verify(bp); if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } -#endif } static void xfs_agfl_write_verify( struct xfs_buf *bp) { - xfs_agfl_verify(bp); -} + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; -static void -xfs_agfl_read_verify( - struct xfs_buf *bp) -{ - xfs_agfl_verify(bp); + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -842,7 +875,7 @@ xfs_alloc_ag_vextent_near( */ int dofirst; /* set to do first algorithm */ - dofirst = 
random32() & 1; + dofirst = prandom_u32() & 1; #endif restart: @@ -1982,18 +2015,18 @@ xfs_alloc_get_freelist( int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; int error; int logflags; - xfs_mount_t *mp; /* mount structure */ + xfs_mount_t *mp = tp->t_mountp; xfs_perag_t *pag; /* per allocation group data */ - agf = XFS_BUF_TO_AGF(agbp); /* * Freelist is empty, give up. */ + agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2001,15 +2034,17 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - mp = tp->t_mountp; - if ((error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); + + /* * Get the block number and update the data structures. */ - bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) @@ -2058,11 +2093,14 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), sizeof(xfs_agf_t) }; trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2099,12 +2137,13 @@ xfs_alloc_put_freelist( int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. 
free block array */ __be32 *blockp;/* pointer to array entry */ int error; int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; agf = XFS_BUF_TO_AGF(agbp); mp = tp->t_mountp; @@ -2112,7 +2151,6 @@ xfs_alloc_put_freelist( if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; @@ -2133,32 +2171,38 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); - blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + xfs_alloc_log_agf(tp, agbp, logflags); - xfs_trans_log_buf(tp, agflbp, - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + - sizeof(xfs_agblock_t) - 1)); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); return 0; } -static void +static bool xfs_agf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_agf *agf; - int agf_ok; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - agf = XFS_BUF_TO_AGF(bp); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; - agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= 
XFS_AGFL_SIZE(mp); + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -2166,33 +2210,58 @@ xfs_agf_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } } static void xfs_agf_read_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agf_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); + + agf_ok = agf_ok && xfs_agf_verify(mp, bp); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agf_write_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = 
bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); } const struct xfs_buf_ops xfs_agf_buf_ops = { diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b1ddef6b268..30c4c1434fa 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC struct xfs_btree_cur * @@ -272,7 +273,7 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -static void +static bool xfs_allocbt_verify( struct xfs_buf *bp) { @@ -280,66 +281,103 @@ xfs_allocbt_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ /* * magic number and level verification * - * During growfs operations, we can't verify the exact level as the - * perag is not fully initialised and hence not attached to the buffer. - * In this case, check against the maximum tree depth. + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. 
*/ level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; default: - sblock_ok = 0; - break; + return false; } /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - 
block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_allocbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_allocbt_buf_ops = { @@ -444,6 +482,9 @@ xfs_allocbt_init_cursor( cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 7e89a2b429d..e3a3f742419 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -31,8 +31,10 @@ struct xfs_mount; * by blockcount and blockno. All blocks look the same to make the code * simpler; if we have time later, we'll make the optimizations. 
*/ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ /* * Data record/key structure @@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f707e53717..3244c988d37 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -953,13 +953,13 @@ xfs_vm_writepage( unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* - * Just skip the page if it is fully outside i_size, e.g. due - * to a truncate operation that is in progress. + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. */ - if (page->index >= end_index + 1 || offset_into_page == 0) { - unlock_page(page); - return 0; - } + if (page->index >= end_index + 1 || offset_into_page == 0) + goto redirty; /* * The page straddles i_size. 
It must be zeroed out on each diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 888683844d9..20fe3fe9d34 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,7 +15,6 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -35,6 +34,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" @@ -74,13 +74,6 @@ STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -/* - * Routines to manipulate out-of-line attribute values. - */ -STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); - -#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ STATIC int xfs_attr_name_to_xname( @@ -820,7 +813,7 @@ xfs_attr_inactive(xfs_inode_t *dp) error = 0; goto out; } - error = xfs_attr_root_inactive(&trans, dp); + error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out; @@ -906,7 +899,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; @@ -914,14 +907,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. 
*/ - retval = xfs_attr_leaf_lookup_int(bp, args); + retval = xfs_attr3_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } trace_xfs_attr_leaf_replace(args); @@ -937,7 +930,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Add the attribute to the leaf block, transitioning to a Btree * if required. */ - retval = xfs_attr_leaf_add(bp, args); + retval = xfs_attr3_leaf_add(bp, args); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -945,7 +938,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * can manage its own transactions. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1010,7 +1003,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return(error); @@ -1032,19 +1025,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. 
*/ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1076,9 +1069,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); } - return(error); + return error; } /* @@ -1101,24 +1094,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. 
*/ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1128,7 +1121,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -1138,7 +1131,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } - return(0); + return 0; } /* @@ -1156,21 +1149,21 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error != EEXIST) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - error = xfs_attr_leaf_getvalue(bp, args); + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } - return(error); + return error; } /* @@ -1185,11 +1178,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1236,7 +1229,7 @@ restart: * Search to see if name already exists, and get back a pointer * to where it should go. 
*/ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; blk = &state->path.blk[ state->path.active-1 ]; @@ -1258,7 +1251,7 @@ restart: args->rmtblkcnt = 0; } - retval = xfs_attr_leaf_add(blk->bp, state->args); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == ENOSPC) { if (state->path.active == 1) { /* @@ -1268,7 +1261,7 @@ restart: */ xfs_da_state_free(state); xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1307,7 +1300,7 @@ restart: * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_split(state); + error = xfs_da3_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1329,7 +1322,7 @@ restart: /* * Addition succeeded, update Btree hashvals. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } /* @@ -1370,7 +1363,7 @@ restart: * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) goto out; @@ -1400,7 +1393,7 @@ restart: state->blocksize = state->mp->m_sb.sb_blocksize; state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1409,15 +1402,15 @@ restart: */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. 
*/ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1450,7 +1443,7 @@ restart: /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); if (error) goto out; } @@ -1495,7 +1488,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error || (retval != EEXIST)) { if (error == 0) error = retval; @@ -1524,7 +1517,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Mark the attribute as INCOMPLETE, then bunmapi() the * remote value. */ - error = xfs_attr_leaf_setflag(args); + error = xfs_attr3_leaf_setflag(args); if (error) goto out; error = xfs_attr_rmtval_remove(args); @@ -1545,15 +1538,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. 
*/ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1591,13 +1584,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1699,7 +1692,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1718,7 +1711,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1758,7 +1751,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. 
*/ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; } else if (retval == EEXIST) { @@ -1769,7 +1762,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Get the value, local or "remote" */ - retval = xfs_attr_leaf_getvalue(blk->bp, args); + retval = xfs_attr3_leaf_getvalue(blk->bp, args); if (!retval && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { retval = xfs_attr_rmtval_get(args); @@ -1794,7 +1787,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; int error, i; struct xfs_buf *bp; @@ -1810,27 +1805,33 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { + struct xfs_attr_leaf_entry *entries; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - if (cursor->hashval > be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)) { + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; - } else if (cursor->hashval <= - be32_to_cpu(leaf->entries[0].hashval)) { + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); 
xfs_trans_brelse(NULL, bp); bp = NULL; @@ -1852,27 +1853,31 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_node_read(NULL, context->dp, + __uint16_t magic; + + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) return(error); node = bp->b_addr; - if (node->hdr.info.magic == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) break; - if (unlikely(node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC))) { + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", XFS_ERRLEVEL_LOW, context->dp->i_mount, node); xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); + return XFS_ERROR(EFSCORRUPTED); } - btree = node->btree; - for (i = 0; i < be16_to_cpu(node->hdr.count); - btree++, i++) { + + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); @@ -1881,9 +1886,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; } } - if (i == be16_to_cpu(node->hdr.count)) { + if (i == nodehdr.count) { xfs_trans_brelse(NULL, bp); - return(0); + return 0; } xfs_trans_brelse(NULL, bp); } @@ -1897,310 +1902,21 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); return error; } - if (context->seen_enough || leaf->hdr.info.forw == 0) + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) break; - cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); + cursor->blkno = leafhdr.forw; xfs_trans_brelse(NULL, bp); - error = 
xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, &bp); if (error) return error; } xfs_trans_brelse(NULL, bp); - return(0); -} - - -/*======================================================================== - * External routines for manipulating out-of-line attribute values. - *========================================================================*/ - -/* - * Read the value associated with an attribute from the out-of-line buffer - * that we stored it in. - */ -int -xfs_attr_rmtval_get(xfs_da_args_t *args) -{ - xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; - xfs_mount_t *mp; - xfs_daddr_t dblkno; - void *dst; - xfs_buf_t *bp; - int nmap, error, tmp, valuelen, blkcnt, i; - xfs_dablk_t lblkno; - - trace_xfs_attr_rmtval_get(args); - - ASSERT(!(args->flags & ATTR_KERNOVAL)); - - mp = args->dp->i_mount; - dst = args->value; - valuelen = args->valuelen; - lblkno = args->rmtblkno; - while (valuelen > 0) { - nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap >= 1); - - for (i = 0; (i < nmap) && (valuelen > 0); i++) { - ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && - (map[i].br_startblock != HOLESTARTBLOCK)); - dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, NULL); - if (error) - return(error); - - tmp = min_t(int, valuelen, BBTOB(bp->b_length)); - xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); - xfs_buf_relse(bp); - dst += tmp; - valuelen -= tmp; - - lblkno += map[i].br_blockcount; - } - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. 
- */ -STATIC int -xfs_attr_rmtval_set(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_fileoff_t lfileoff; - xfs_inode_t *dp; - xfs_bmbt_irec_t map; - xfs_daddr_t dblkno; - void *src; - xfs_buf_t *bp; - xfs_dablk_t lblkno; - int blkcnt, valuelen, nmap, error, tmp, committed; - - trace_xfs_attr_rmtval_set(args); - - dp = args->dp; - mp = dp->i_mount; - src = args->value; - - /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. - */ - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - lfileoff = 0; - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, - XFS_ATTR_FORK); - if (error) { - return(error); - } - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; - args->rmtblkcnt = blkcnt; - - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll(&args->trans, dp); - if (error) - return (error); - } - - /* - * Roll through the "value", copying the attribute value to the - * already-allocated blocks. 
Blocks are written synchronously - * so that we can know they are all on disk before we turn off - * the INCOMPLETE flag. - */ - lblkno = args->rmtblkno; - valuelen = args->valuelen; - while (valuelen > 0) { - int buflen; - - /* - * Try to remember where we decided to put the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); - if (!bp) - return ENOMEM; - - buflen = BBTOB(bp->b_length); - tmp = min_t(int, valuelen, buflen); - xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < buflen) - xfs_buf_zero(bp, tmp, buflen - tmp); - - error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ - xfs_buf_relse(bp); - if (error) - return error; - src += tmp; - valuelen -= tmp; - - lblkno += map.br_blockcount; - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -STATIC int -xfs_attr_rmtval_remove(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; - - trace_xfs_attr_rmtval_remove(args); - - mp = args->dp->i_mount; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. 
- */ - nmap = 1; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } - - valuelen -= map.br_blockcount; - - lblkno += map.br_blockcount; - } - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - xfs_bmap_init(args->flist, args->firstblock); - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll(&args->trans, args->dp); - if (error) - return (error); - } - return(0); + return 0; } diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e920d68ef50..de8dd58da46 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -140,7 +140,6 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. 
*/ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ee24993c7d1..08d5457c948 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -31,6 +32,7 @@ #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -39,6 +41,9 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + /* * xfs_attr_leaf.c @@ -53,85 +58,226 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - struct xfs_buf **bpp); -STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, - xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *leaf_buffer); -STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *leaf_blk_1, - xfs_da_state_blk_t *leaf_blk_2, - int *number_entries_in_blk1, - int 
*number_usedbytes_in_blk1); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp, int level); -STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp); -STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); /* * Utility routines. */ -STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, - int src_start, - xfs_attr_leafblock_t *dst_leaf, - int dst_start, int move_count, - xfs_mount_t *mp); +STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count, struct xfs_mount *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void -xfs_attr_leaf_verify( +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + 
to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + to->firstused = be16_to_cpu(hdr3->firstused); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + to->firstused = be16_to_cpu(from->hdr.firstused); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + hdr3->firstused = cpu_to_be16(from->firstused); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + to->hdr.firstused = 
cpu_to_be16(from->firstused); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr3_icleaf_hdr ichdr; - block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? 
*/ + + return true; } static void -xfs_attr_leaf_read_verify( +xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void -xfs_attr_leaf_write_verify( - struct xfs_buf *bp) +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_ATTR3_LEAF_CRC_OFF)) || + !xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } -const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { - .verify_read = xfs_attr_leaf_read_verify, - .verify_write = xfs_attr_leaf_write_verify, +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, }; int -xfs_attr_leaf_read( +xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, 
bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; } /*======================================================================== @@ -172,7 +318,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int dsize; xfs_mount_t *mp = dp->i_mount; - offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; switch (dp->i_d.di_format) { case XFS_DINODE_FMT_DEV: @@ -231,7 +378,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; return dp->i_d.di_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; } @@ -243,7 +390,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -557,7 +705,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) } ASSERT(blkno == 0); - error = xfs_attr_leaf_create(args, blkno, &bp); + error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { error = xfs_da_shrink_inode(args, 0, bp); bp = NULL; @@ -586,9 +734,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); - error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); - error = xfs_attr_leaf_add(bp, &nargs); + error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != ENOSPC); if (error) goto out; @@ -801,7 +949,7 @@ xfs_attr_shortform_allfit( continue; /* don't copy partial entries */ 
if (!(entry->flags & XFS_ATTR_LOCAL)) return(0); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) @@ -821,29 +969,34 @@ xfs_attr_shortform_allfit( * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform( - struct xfs_buf *bp, - xfs_da_args_t *args, - int forkoff) +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_da_args_t nargs; - xfs_inode_t *dp; - char *tmpbuffer; - int error, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; trace_xfs_attr_leaf_to_sf(args); - dp = args->dp; tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); - ASSERT(tmpbuffer != NULL); + if (!tmpbuffer) + return ENOMEM; - ASSERT(bp != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? 
*/ memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* @@ -873,14 +1026,14 @@ xfs_attr_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + + for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; @@ -893,61 +1046,75 @@ xfs_attr_leaf_to_shortform( out: kmem_free(tmpbuffer); - return(error); + return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int -xfs_attr_leaf_to_node(xfs_da_args_t *args) +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_inode_t *dp; - struct xfs_buf *bp1, *bp2; - xfs_dablk_t blkno; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_to_node(args); - dp = args->dp; - bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); if (error) goto out; - bp2 = NULL; - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, - XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; + + /* 
copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); - bp1 = NULL; - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1); /* * Set up the new root node. */ - error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; node = bp1->b_addr; + xfs_da3_node_hdr_from_disk(&icnodehdr, node); + btree = xfs_da3_node_tree_p(node); + leaf = bp2->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + /* both on-disk, don't endian-flip twice */ - node->btree[0].hashval = - leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; - node->btree[0].before = cpu_to_be32(blkno); - node->hdr.count = cpu_to_be16(1); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + xfs_da3_node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1); error = 0; out: - return(error); + return error; } @@ -960,52 +1127,63 @@ out: * or a leaf in a node attribute list. 
*/ STATIC int -xfs_attr_leaf_create( - xfs_da_args_t *args, - xfs_dablk_t blkno, - struct xfs_buf **bpp) +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_create(args); - dp = args->dp; - ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - bp->b_ops = &xfs_attr_leaf_buf_ops; + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); - hdr = &leaf->hdr; - hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); - } + memset(leaf, 0, XFS_LBSIZE(mp)); - hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = XFS_LBSIZE(mp); - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + 
xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1); *bpp = bp; - return(0); + return 0; } /* * Split the leaf node, rebalance, then add the new entry. */ int -xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; @@ -1019,7 +1197,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, error = xfs_da_grow_inode(state->args, &blkno); if (error) return(error); - error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return(error); newblk->blkno = blkno; @@ -1029,8 +1207,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ - xfs_attr_leaf_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); @@ -1043,10 +1221,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr_leaf_add(oldblk->bp, state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr_leaf_add(newblk->bp, state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1061,22 +1239,23 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. 
*/ int -xfs_attr_leaf_add( +xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - int tablesize, entsize, sum, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT((args->index >= 0) - && (args->index <= be16_to_cpu(leaf->hdr.count))); - hdr = &leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, args->trans->t_mountp->m_sb.sb_blocksize, NULL); @@ -1084,25 +1263,23 @@ xfs_attr_leaf_add( * Search through freemap for first-fit on new name length. * (may need to figure in size of entry struct too) */ - tablesize = (be16_to_cpu(hdr->count) + 1) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; - for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { - if (tablesize > be16_to_cpu(hdr->firstused)) { - sum += be16_to_cpu(map->size); + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; continue; } - if (!map->size) + if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; - if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); - if (be16_to_cpu(map->size) >= tmp) { - tmp = xfs_attr_leaf_add_work(bp, args, i); - return(tmp); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; } - sum += be16_to_cpu(map->size); + sum += 
ichdr.freemap[i].size; } /* @@ -1110,82 +1287,89 @@ xfs_attr_leaf_add( * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ - if (!hdr->holes && (sum < entsize)) - return(XFS_ERROR(ENOSPC)); + if (!ichdr.holes && sum < entsize) + return XFS_ERROR(ENOSPC); /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args, bp); + xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ - if (be16_to_cpu(hdr->freemap[0].size) - < (entsize + sizeof(xfs_attr_leaf_entry_t))) - return(XFS_ERROR(ENOSPC)); + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = ENOSPC; + goto out_log_hdr; + } + + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); - return(xfs_attr_leaf_add_work(bp, args, 0)); +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; } /* * Add a name to a leaf attribute list structure. 
*/ STATIC int -xfs_attr_leaf_add_work( - struct xfs_buf *bp, - xfs_da_args_t *args, - int mapindex) +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_leaf_map_t *map; - xfs_mount_t *mp; - int tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; trace_xfs_attr_leaf_add_work(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); - ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. */ - entry = &leaf->entries[args->index]; - if (args->index < be16_to_cpu(hdr->count)) { - tmp = be16_to_cpu(hdr->count) - args->index; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); - memmove((char *)(entry+1), (char *)entry, tmp); + memmove(entry + 1, entry, tmp); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add_cpu(&hdr->count, 1); + ichdr->count++; /* * Allocate space for the new string (at the end of the run). 
*/ - map = &hdr->freemap[mapindex]; mp = args->trans->t_mountp; - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->base) & 0x3) == 0); - ASSERT(be16_to_cpu(map->size) >= + ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= xfs_attr_leaf_newentsize(args->namelen, args->valuelen, mp->m_sb.sb_blocksize, NULL)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add_cpu(&map->size, - -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp)); - entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + - be16_to_cpu(map->size)); + ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= + xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); @@ -1200,7 +1384,7 @@ xfs_attr_leaf_add_work( XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); - ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* @@ -1211,14 +1395,14 @@ xfs_attr_leaf_add_work( * as part of this transaction (a split operation for example). 
*/ if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; @@ -1229,44 +1413,41 @@ xfs_attr_leaf_add_work( args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ - if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { - /* both on-disk, don't endian-flip twice */ - hdr->firstused = entry->nameidx; - } - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - if (be16_to_cpu(map->base) == tmp) { - be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add_cpu(&map->size, - -((int)sizeof(xfs_attr_leaf_entry_t))); + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if 
(ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); } } - be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); - return(0); + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact( +xfs_attr3_leaf_compact( struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_d, struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_attr3_icleaf_hdr ichdr_s; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1283,34 +1464,69 @@ xfs_attr_leaf_compact( */ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; leaf_d = bp->b_addr; - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - hdr_d->info = hdr_s->info; /* struct copy */ - hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); - /* handle truncation gracefully */ - if (!hdr_d->firstused) { - hdr_d->firstused = cpu_to_be16( - XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); - } - hdr_d->usedbytes = 0; - hdr_d->count = 0; - hdr_d->holes = 0; - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + ichdr_s = *ichdr_d; /* struct copy */ + ichdr_d->firstused = XFS_LBSIZE(mp); + ichdr_d->usedbytes = 0; + ichdr_d->count = 0; + ichdr_d->holes = 0; + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. 
*/ - xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, - be16_to_cpu(hdr_s->count), mp); + xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, + ichdr_s.count, mp); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } /* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * @@ -1323,14 +1539,23 @@ xfs_attr_leaf_compact( * the "new" and "old" values can end up in different blocks. 
*/ STATIC void -xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_args_t *args; - xfs_da_state_blk_t *tmp_blk; - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - int count, totallen, max, space, swap; + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; /* * Set up environment. @@ -1339,9 +1564,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.count == 0); + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(ichdr2.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1353,16 +1578,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * second block, this code should never set "swap". 
*/ swap = 0; - if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; swap = 1; } - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; /* * Examine entries until we reduce the absolute difference in @@ -1372,41 +1604,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ - state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, - &count, &totallen); + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ - if (count < be16_to_cpu(hdr1->count)) { + if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count = be16_to_cpu(hdr1->count) - count; - space = be16_to_cpu(hdr1->usedbytes) - totallen; + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr2->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk2->bp); + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. 
*/ - xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, - leaf2, 0, count, state->mp); + xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count, + leaf2, &ichdr2, 0, count, state->mp); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); - } else if (count > be16_to_cpu(hdr1->count)) { + } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. @@ -1417,36 +1647,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count -= be16_to_cpu(hdr1->count); - space = totallen - be16_to_cpu(hdr1->usedbytes); + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr1->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk1->bp); + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. 
*/ - xfs_attr_leaf_moveents(leaf2, 0, leaf1, - be16_to_cpu(hdr1->count), count, state->mp); - - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count, state->mp); } + xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + /* * Copy out last hashval in each block for B-tree code. */ - blk1->hashval = be32_to_cpu( - leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu( - leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. @@ -1460,12 +1691,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ - if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; - } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; @@ -1477,8 +1708,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * is already stored in blkno2/index2, so don't * overwrite it overwise we corrupt the tree. 
*/ - blk2->index = blk1->index - - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = blk2->index; args->blkno = blk2->blkno; if (!state->extravalid) { @@ -1506,42 +1736,40 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * GROT: Do a double-split for this case? */ STATIC int -xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2, - int *countarg, int *usedbytesarg) +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - xfs_attr_leaf_entry_t *entry; - int count, max, index, totallen, half; - int lastdelta, foundit, tmp; - - /* - * Set up environment. - */ - leaf1 = blk1->bp->b_addr; - leaf2 = blk2->bp->b_addr; - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; - foundit = 0; - totallen = 0; + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. 
*/ - max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); - half = (max+1) * sizeof(*entry); - half += be16_to_cpu(hdr1->usedbytes) + - be16_to_cpu(hdr2->usedbytes) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); half /= 2; lastdelta = state->blocksize; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) @@ -1564,9 +1792,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Wrap around into the second block if necessary. */ - if (count == be16_to_cpu(hdr1->count)) { + if (count == ichdr1->count) { leaf1 = leaf2; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } @@ -1597,7 +1825,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, *countarg = count; *usedbytesarg = totallen; - return(foundit); + return foundit; } /*======================================================================== @@ -1616,14 +1844,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * GROT: allow for INCOMPLETE entries in calculation. 
*/ int -xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_attr_leafblock_t *leaf; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, bytes, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; trace_xfs_attr_leaf_toosmall(state->args); @@ -1633,13 +1867,11 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = sizeof(xfs_attr_leaf_hdr_t) + - count * sizeof(xfs_attr_leaf_entry_t) + - be16_to_cpu(leaf->hdr.usedbytes); + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; if (bytes > (state->blocksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); @@ -1651,14 +1883,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). 
*/ - forward = (info->forw != 0); + forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -1667,7 +1899,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) } else { *action = 2; } - return(0); + return 0; } /* @@ -1678,28 +1910,28 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to shrink an attribute list over time. */ /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = ichdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = ichdr.back; if (blkno == 0) continue; - error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return(error); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = state->blocksize - (state->blocksize>>2); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->b_addr; - count += be16_to_cpu(leaf->hdr.count); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - bytes -= count * sizeof(xfs_attr_leaf_entry_t); - bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + + bytes = state->blocksize - (state->blocksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ @@ -1715,10 +1947,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < 
blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) @@ -1738,32 +1970,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - xfs_attr_leaf_entry_t *entry; - int before, after, smallest, entsize; - int tablesize, tmp, i; - xfs_mount_t *mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = args->trans->t_mountp; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - mp = args->trans->t_mountp; - ASSERT((be16_to_cpu(hdr->count) > 0) - && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); - ASSERT((args->index >= 0) - && (args->index < be16_to_cpu(hdr->count))); - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - entry = &leaf->entries[args->index]; - ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); 
ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); /* @@ -1772,30 +2007,28 @@ xfs_attr_leaf_remove( * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ - tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - tmp = be16_to_cpu(map->size); + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - if (be16_to_cpu(map->base) == tablesize) { - be16_add_cpu(&map->base, - -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp)); + ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp)); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } - if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) - == be16_to_cpu(entry->nameidx)) { + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { before = i; - } else if (be16_to_cpu(map->base) - == (be16_to_cpu(entry->nameidx) + entsize)) { + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { after = i; - } else if (be16_to_cpu(map->size) < tmp) { - tmp = be16_to_cpu(map->size); + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; smallest = i; } } @@ -1806,36 +2039,30 @@ xfs_attr_leaf_remove( */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); - 
be16_add_cpu(&map->size, - be16_to_cpu(hdr->freemap[after].size)); - hdr->freemap[after].base = 0; - hdr->freemap[after].size = 0; + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; } else if (before >= 0) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[before].size += entsize; } else { - map = &hdr->freemap[after]; - /* both on-disk, don't endian flip twice */ - map->base = entry->nameidx; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ - map = &hdr->freemap[smallest]; - if (be16_to_cpu(map->size) < entsize) { - map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); - map->size = cpu_to_be16(entsize); + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ - if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; @@ -1843,20 +2070,20 @@ xfs_attr_leaf_remove( /* * Compress the remaining entries and zero out the removed stuff. 
*/ - memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); - be16_add_cpu(&hdr->usedbytes, -entsize); + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); - tmp = (be16_to_cpu(hdr->count) - args->index) - * sizeof(xfs_attr_leaf_entry_t); - memmove((char *)entry, (char *)(entry+1), tmp); - be16_add_cpu(&hdr->count, -1); + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); - entry = &leaf->entries[be16_to_cpu(hdr->count)]; - memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte @@ -1866,130 +2093,130 @@ xfs_attr_leaf_remove( */ if (smallest) { tmp = XFS_LBSIZE(mp); - entry = &leaf->entries[0]; - for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { - ASSERT(be16_to_cpu(entry->nameidx) >= - be16_to_cpu(hdr->firstused)); + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } - hdr->firstused = cpu_to_be16(tmp); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - tmp - XFS_ATTR_LEAF_NAME_ALIGN); - } + ichdr.firstused = tmp; + if (!ichdr.firstused) + ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; } else { - hdr->holes = 1; /* mark as needing compaction */ + ichdr.holes = 1; /* mark as needing compaction */ } + 
xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); /* * Check if leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ - tmp = sizeof(xfs_attr_leaf_hdr_t); - tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); - tmp += be16_to_cpu(leaf->hdr.usedbytes); - return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void -xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; - xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = state->mp; trace_xfs_attr_leaf_unbalance(state->args); - /* - * Set up environment. 
- */ - mp = state->mp; - ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - drop_hdr = &drop_leaf->hdr; - save_hdr = &save_leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ - drop_blk->hashval = be32_to_cpu( - drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ - if (save_hdr->holes == 0) { + if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count, mp); } else { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, - be16_to_cpu(save_hdr->count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count, mp); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. 
*/ - tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memset(tmpbuffer, 0, state->blocksize); - tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; - tmp_hdr = &tmp_leaf->hdr; - tmp_hdr->info = save_hdr->info; /* struct copy */ - tmp_hdr->count = 0; - tmp_hdr->firstused = cpu_to_be16(state->blocksize); - if (!tmp_hdr->firstused) { - tmp_hdr->firstused = cpu_to_be16( - state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); - } - tmp_hdr->usedbytes = 0; - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(save_hdr->count), mp); + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); + memset(tmp_leaf, 0, state->blocksize); + memset(&tmphdr, 0, sizeof(tmphdr)); + + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->blocksize; + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count, mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count, mp); } else { - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, - be16_to_cpu(save_hdr->count), mp); - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count, mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count, mp); } - memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer); + memcpy(save_leaf, tmp_leaf, state->blocksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); } + 
xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* * Copy out last hashval in each block for B-tree code. */ - save_blk->hashval = be32_to_cpu( - save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== @@ -2010,31 +2237,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int probe, span; - xfs_dahash_t hashval; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); /* * Binary search. 
(note: small blocks will skip this loop) */ hashval = args->hashval; - probe = span = be16_to_cpu(leaf->hdr.count) / 2; - for (entry = &leaf->entries[probe]; span > 4; - entry = &leaf->entries[probe]) { + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; @@ -2043,35 +2272,31 @@ xfs_attr_leaf_lookup_int( else break; } - ASSERT((probe >= 0) && - (!leaf->hdr.count - || (probe < be16_to_cpu(leaf->hdr.count)))); - ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ - while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } - while ((probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) < hashval)) { + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } - if ((probe == be16_to_cpu(leaf->hdr.count)) || - (be32_to_cpu(entry->hashval) != hashval)) { + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* * Duplicate keys may be present, so search all of them for a match. */ - for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) == hashval); + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. 
@@ -2085,21 +2310,22 @@ xfs_attr_leaf_lookup_int( continue; } if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, probe); + name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_rmt->name, - args->namelen) != 0) + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; @@ -2107,11 +2333,11 @@ xfs_attr_leaf_lookup_int( args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, be32_to_cpu(name_rmt->valuelen)); - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } } args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* @@ -2119,40 +2345,40 @@ xfs_attr_leaf_lookup_int( * list structure. 
*/ int -xfs_attr_leaf_getvalue( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) { - int valuelen; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(args->index < ichdr.count); - entry = &leaf->entries[args->index]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); valuelen = be16_to_cpu(name_loc->valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); @@ -2160,15 +2386,15 @@ xfs_attr_leaf_getvalue( args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); if (args->flags & 
ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; } - return(0); + return 0; } /*======================================================================== @@ -2181,13 +2407,21 @@ xfs_attr_leaf_getvalue( */ /*ARGSUSED*/ STATIC void -xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, - xfs_attr_leafblock_t *leaf_d, int start_d, - int count, xfs_mount_t *mp) +xfs_attr3_leaf_moveents( + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count, + struct xfs_mount *mp) { - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_attr_leaf_entry_t *entry_s, *entry_d; - int desti, tmp, i; + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; /* * Check for nothing to do. @@ -2198,45 +2432,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. 
*/ - ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - ASSERT((be16_to_cpu(hdr_s->count) > 0) && - (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); - ASSERT(be16_to_cpu(hdr_s->firstused) >= - ((be16_to_cpu(hdr_s->count) - * sizeof(*entry_s))+sizeof(*hdr_s))); - ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); - ASSERT(be16_to_cpu(hdr_d->firstused) >= - ((be16_to_cpu(hdr_d->count) - * sizeof(*entry_d))+sizeof(*hdr_d))); - - ASSERT(start_s < be16_to_cpu(hdr_s->count)); - ASSERT(start_d <= be16_to_cpu(hdr_d->count)); - ASSERT(count <= be16_to_cpu(hdr_s->count)); + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + /* * Move the entries in the destination leaf up to make a hole? */ - if (start_d < be16_to_cpu(hdr_d->count)) { - tmp = be16_to_cpu(hdr_d->count) - start_d; + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_d->entries[start_d]; - entry_d = &leaf_d->entries[start_d + count]; - memmove((char *)entry_d, (char *)entry_s, tmp); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); } /* * Copy all entry's in the same (sorted) order, * but allocate attribute info packed and in sequence. 
*/ - entry_s = &leaf_s->entries[start_s]; - entry_d = &leaf_d->entries[start_d]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { - ASSERT(be16_to_cpu(entry_s->nameidx) - >= be16_to_cpu(hdr_s->firstused)); + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* @@ -2245,36 +2475,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_s->count, -1); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add_cpu(&hdr_d->firstused, -tmp); + ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; - /* both on-disk, don't endian flip twice */ - entry_d->nameidx = hdr_d->firstused; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= XFS_LBSIZE(mp)); - memmove(xfs_attr_leaf_name(leaf_d, desti), - xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp)); - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_d->usedbytes, tmp); - be16_add_cpu(&hdr_s->count, -1); - be16_add_cpu(&hdr_d->count, 1); - tmp = be16_to_cpu(hdr_d->count) - * sizeof(xfs_attr_leaf_entry_t) - + 
sizeof(xfs_attr_leaf_hdr_t); - ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ @@ -2283,71 +2511,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Zero out the entries we just copied. */ - if (start_s == be16_to_cpu(hdr_s->count)) { + if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. */ - tmp = be16_to_cpu(hdr_s->count) - count; - tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s + count]; - entry_d = &leaf_s->entries[start_s]; - memmove((char *)entry_d, (char *)entry_s, tmp); + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * - sizeof(xfs_attr_leaf_entry_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) 
- - be16_to_cpu(hdr_d->freemap[0].base)); - hdr_d->freemap[1].base = 0; - hdr_d->freemap[2].base = 0; - hdr_d->freemap[1].size = 0; - hdr_d->freemap[2].size = 0; - hdr_s->holes = 1; /* leaf may not be compact */ -} - -/* - * Compare two leaf blocks "order". - * Return 0 unless leaf2 should go before leaf1. - */ -int -xfs_attr_leaf_order( - struct xfs_buf *leaf1_bp, - struct xfs_buf *leaf2_bp) -{ - xfs_attr_leafblock_t *leaf1, *leaf2; - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && - (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); - if ((be16_to_cpu(leaf1->hdr.count) > 0) && - (be16_to_cpu(leaf2->hdr.count) > 0) && - ((be32_to_cpu(leaf2->entries[0].hashval) < - be32_to_cpu(leaf1->entries[0].hashval)) || - (be32_to_cpu(leaf2->entries[ - be16_to_cpu(leaf2->hdr.count)-1].hashval) < - be32_to_cpu(leaf1->entries[ - be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ } /* @@ -2358,15 +2555,16 @@ xfs_attr_leaf_lasthash( struct xfs_buf *bp, int *count) { - xfs_attr_leafblock_t *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) - return(0); - return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); + *count = ichdr.count; + if (!ichdr.count) + return 0; + 
return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* @@ -2376,20 +2574,21 @@ xfs_attr_leaf_lasthash( STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { + struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, index); + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } - return(size); + return size; } /* @@ -2414,35 +2613,40 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) *local = 0; } } - return(size); + return size; } /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int( - struct xfs_buf *bp, - xfs_attr_list_context_t *context) +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) { - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - int retval, i; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); - ASSERT(bp != NULL); leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + cursor = context->cursor; cursor->initted = 1; - trace_xfs_attr_list_leaf(context); - /* * Re-find our place in the leaf block if this is a new syscall. 
*/ if (context->resynch) { - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) == cursor->hashval) { if (cursor->offset == context->dupcnt) { context->dupcnt = 0; @@ -2455,12 +2659,12 @@ xfs_attr_leaf_list_int( break; } } - if (i == be16_to_cpu(leaf->hdr.count)) { + if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return(0); + return 0; } } else { - entry = &leaf->entries[0]; + entry = &entries[0]; i = 0; } context->resynch = 0; @@ -2469,7 +2673,7 @@ xfs_attr_leaf_list_int( * We have found our place, start copying out the new attributes. */ retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + for (; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -2480,7 +2684,7 @@ xfs_attr_leaf_list_int( if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc = - xfs_attr_leaf_name_local(leaf, i); + xfs_attr3_leaf_name_local(leaf, i); retval = context->put_listent(context, entry->flags, @@ -2492,7 +2696,7 @@ xfs_attr_leaf_list_int( return retval; } else { xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr_leaf_name_remote(leaf, i); + xfs_attr3_leaf_name_remote(leaf, i); int valuelen = be32_to_cpu(name_rmt->valuelen); @@ -2532,7 +2736,7 @@ xfs_attr_leaf_list_int( cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return(retval); + return retval; } @@ -2544,14 +2748,16 @@ xfs_attr_leaf_list_int( * Clear the INCOMPLETE flag on an entry in a leaf block. 
*/ int -xfs_attr_leaf_clearflag(xfs_da_args_t *args) +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; @@ -2561,23 +2767,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } @@ -2592,7 +2800,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp, @@ -2609,34 +2817,41 @@ 
xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_setflag(xfs_da_args_t *args) +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp, @@ -2657,14 +2872,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Note that they could be in different blocks, or in the same block. 
*/ int -xfs_attr_leaf_flipflags(xfs_da_args_t *args) +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_entry_t *entry1, *entry2; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp1, *bp2; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; @@ -2675,7 +2896,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); if (error) return error; @@ -2683,7 +2904,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, -1, &bp2); if (error) return error; @@ -2692,31 +2913,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) } leaf1 = bp1->b_addr; - ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); - ASSERT(args->index >= 0); - entry1 = &leaf1->entries[ args->index ]; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; leaf2 = bp2->b_addr; - ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); - ASSERT(args->index2 >= 0); - entry2 = &leaf2->entries[ args->index2 ]; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); + ASSERT(args->index >= 0); + + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(args->index2 < 
ichdr2.count); + ASSERT(args->index2 >= 0); + if (entry1->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf2, args->index2); + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } @@ -2733,7 +2958,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp1, @@ -2744,7 +2969,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp2, @@ -2756,7 +2981,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, args->dp); - return(error); + return error; } /*======================================================================== @@ -2768,12 +2993,14 @@ 
xfs_attr_leaf_flipflags(xfs_da_args_t *args) * We're doing a depth-first traversal in order to invalidate everything. */ int -xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) { - xfs_da_blkinfo_t *info; - xfs_daddr_t blkno; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; /* * Read block 0 to see what we have to work with. @@ -2781,40 +3008,46 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - blkno = XFS_BUF_ADDR(bp); + return error; + blkno = bp->b_bn; /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ info = bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, bp); + break; } if (error) - return(error); + return error; /* * Invalidate the incore copy of the root block. */ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. 
*/ error = xfs_trans_roll(trans, dp); - return (error); + return error; } /* @@ -2822,7 +3055,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive( +xfs_attr3_node_inactive( struct xfs_trans **trans, struct xfs_inode *dp, struct xfs_buf *bp, @@ -2832,26 +3065,28 @@ xfs_attr_node_inactive( xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; - int error, count, i; + int error, i; struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return(XFS_ERROR(EIO)); + return XFS_ERROR(EIO); } node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ - count = be16_to_cpu(node->hdr.count); - if (!count) { + xfs_da3_node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } - child_fsb = be32_to_cpu(node->btree[0].before); + btree = xfs_da3_node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* @@ -2859,14 +3094,14 @@ xfs_attr_node_inactive( * over the leaves removing all of them. If this is higher up * in the tree, recurse downward. */ - for (i = 0; i < count; i++) { + for (i = 0; i < ichdr.count; i++) { /* * Read the subsidiary block to see what we have to work with. * Don't do this in a transaction. This is a depth-first * traversal of the tree so we may deal with many blocks * before we come back to this one. 
*/ - error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) return(error); @@ -2878,18 +3113,24 @@ xfs_attr_node_inactive( * Invalidate the subtree, however we have to. */ info = child_bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, - child_bp, level+1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, - child_bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, child_bp); + break; } if (error) - return(error); + return error; /* * Remove the subsidiary block from the cache @@ -2898,7 +3139,7 @@ xfs_attr_node_inactive( error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, child_bp); } @@ -2906,12 +3147,12 @@ xfs_attr_node_inactive( * If we're not done, re-read the parent to get the next * child block number. 
*/ - if ((i+1) < count) { - error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); - child_fsb = be32_to_cpu(node->btree[i+1].before); + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } /* @@ -2919,10 +3160,10 @@ xfs_attr_node_inactive( */ error = xfs_trans_roll(trans, dp); if (error) - return (error); + return error; } - return(0); + return 0; } /* @@ -2932,29 +3173,35 @@ xfs_attr_node_inactive( * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_inactive_list_t *list, *lp; - int error, count, size, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); /* * Count the number of "remote" value extents. 
*/ count = 0; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) count++; } @@ -2965,24 +3212,24 @@ xfs_attr_leaf_inactive( */ if (count == 0) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } /* * Allocate storage for a list of all the "remote" value extents. */ size = count * sizeof(xfs_attr_inactive_list_t); - list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + list = kmem_alloc(size, KM_SLEEP); /* * Identify each of the "remote" value extents. */ lp = list; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); lp->valuelen = XFS_B_TO_FSB(dp->i_mount, @@ -2998,15 +3245,15 @@ xfs_attr_leaf_inactive( */ error = 0; for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr_leaf_freextent(trans, dp, + tmp = xfs_attr3_leaf_freextent(trans, dp, lp->valueblk, lp->valuelen); if (error == 0) error = tmp; /* save only the 1st errno */ } - kmem_free((xfs_caddr_t)list); - return(error); + kmem_free(list); + return error; } /* @@ -3014,14 +3261,20 @@ xfs_attr_leaf_inactive( * invalidate any buffers that are incore/in transactions. 
*/ STATIC int -xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt) +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) { - xfs_bmbt_irec_t map; - xfs_dablk_t tblkno; - int tblkcnt, dblkcnt, nmap, error; - xfs_daddr_t dblkno; - xfs_buf_t *bp; + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; /* * Roll through the "value", invalidating the attribute value's diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 77de139a58f..f9d7846097e 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -89,7 +90,7 @@ typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ + __be16 nameidx; /* index into buffer of name/value */ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ __u8 pad2; /* unused pad byte */ } xfs_attr_leaf_entry_t; @@ -115,6 +116,54 @@ typedef struct xfs_attr_leafblock { } xfs_attr_leafblock_t; /* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. 
+ */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. @@ -147,26 +196,43 @@ typedef struct xfs_attr_leafblock { */ #define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + /* * Cast typed pointers for "local" and "remote" name/value structs. 
*/ -static inline xfs_attr_leaf_name_remote_t * -xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_remote_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; } -static inline xfs_attr_leaf_name_local_t * -xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_local_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); } -static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) { - return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); } /* @@ -221,37 +287,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ -int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); -int xfs_attr_leaf_clearflag(struct xfs_da_args *args); -int xfs_attr_leaf_setflag(struct xfs_da_args *args); -int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); /* * Routines used for growing the Btree. 
*/ -int xfs_attr_leaf_split(struct xfs_da_state *state, +int xfs_attr3_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_buf *bp, +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. */ -int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); -void xfs_attr_leaf_unbalance(struct xfs_da_state *state, +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. 
@@ -261,10 +327,12 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); -extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c new file mode 100644 index 00000000000..dee84466dcc --- /dev/null +++ b/fs/xfs/xfs_attr_remote.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +static int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, + mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if 
(!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_ATTR3_RMT_CRC_OFF) || + !xfs_attr3_rmt_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_attr3_rmt_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) { + struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_ATTR3_RMT_CRC_OFF); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +static int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking; the unpacking function checks the + * attribute parameters and owner. 
+ */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get( + struct xfs_da_args *args) +{ + struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno = args->rmtblkno; + void *dst = args->value; + int valuelen = args->valuelen; + int nmap; + int error; + int blkcnt; + int i; + int offset = 0; + + trace_xfs_attr_rmtval_get(args); + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + int byte_cnt; + char *src; + + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, blkcnt, 0, &bp, + &xfs_attr3_rmt_buf_ops); + if (error) + return error; + + byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); + + src = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, + offset, byte_cnt, bp)) { + xfs_alert(mp, +"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, args->dp->i_ino); + 
xfs_buf_relse(bp); + return EFSCORRUPTED; + + } + + src += sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(dst, src, byte_cnt); + xfs_buf_relse(bp); + + offset += byte_cnt; + dst += byte_cnt; + valuelen -= byte_cnt; + + lblkno += map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + void *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int hdrcnt = 0; + bool crcs = xfs_sb_version_hascrc(&mp->m_sb); + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion. We calculate the worst case block count in this case + * and we may not need that many, so we have to handle this when + * allocating the blocks below. + */ + if (!crcs) + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + else + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + /* Start with the attribute data. We'll allocate the rest afterwards. */ + if (crcs) + blkcnt = XFS_B_TO_FSB(mp, args->valuelen); + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. 
+ */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + hdrcnt++; + + /* + * If we have enough blocks for the attribute data, calculate + * how many extra blocks we need for headers. We might run + * through this multiple times in the case that the additional + * headers in the blocks needed for the data fragments spills + * into requiring more blocks. e.g. for 512 byte blocks, we'll + * spill for another block every 9 headers we require in this + * loop. + */ + if (crcs && blkcnt == 0) { + int total_len; + + total_len = args->valuelen + + hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); + blkcnt = XFS_B_TO_FSB(mp, total_len); + blkcnt -= args->rmtblkcnt; + args->rmtblkcnt += blkcnt; + } + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + int byte_cnt; + char *buf; + + /* + * Try to remember where we decided to put the value. 
+ */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + if (!bp) + return ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + byte_cnt = BBTOB(bp->b_length); + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); + if (valuelen < byte_cnt) + byte_cnt = valuelen; + + buf = bp->b_addr; + buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, + byte_cnt, bp); + memcpy(buf, src, byte_cnt); + + if (byte_cnt < BBTOB(bp->b_length)) + xfs_buf_zero(bp, byte_cnt, + BBTOB(bp->b_length) - byte_cnt); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + src += byte_cnt; + valuelen -= byte_cnt; + offset += byte_cnt; + hdrcnt--; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + ASSERT(hdrcnt == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + trace_xfs_attr_rmtval_remove(args); + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. 
+ */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} + diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h new file mode 100644 index 00000000000..c7cca60a062 --- /dev/null +++ b/fs/xfs/xfs_attr_remote.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ + +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; + +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b44af9211bd..89042848f9e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -25,6 +25,7 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" +#include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" @@ -47,180 +48,78 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_symlink.h" kmem_zone_t *xfs_bmap_free_item_zone; /* - * Prototypes for internal bmap routines. 
- */ - -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents( - struct xfs_btree_cur *cur, - struct xfs_inode *ip, - int whichfork); -#else -#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#endif - - -/* - * Called from xfs_bmap_add_attrfork to handle extents format files. - */ -STATIC int /* error */ -xfs_bmap_add_attrfork_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ - -/* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Miscellaneous helper functions */ -STATIC int /* error */ -xfs_bmap_add_attrfork_local( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ /* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int /* error */ -xfs_bmap_alloc( - xfs_bmalloca_t *ap); /* bmap alloc argument struct */ - -/* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. 
- */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free); /* list item to be freed */ - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Convert a local file to an extents file. - * This code is sort of bogus, since the file data needs to get - * logged so it won't be lost. The bmap-level manipulations are ok, though. - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)); - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. 
- */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len); /* delayed extent length */ - -#ifdef DEBUG -/* - * Perform various validation checks on the values being returned - * from xfs_bmapi(). + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap); -#else -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -#endif /* DEBUG */ - -STATIC int -xfs_bmap_count_tree( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ifork_t *ifp, - xfs_fsblock_t blockno, - int levelin, - int *count); - -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count); +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - 
int numrecs, - int *count); + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} /* - * Bmap internal routines. + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} STATIC int /* error */ xfs_bmbt_lookup_eq( @@ -290,6 +189,1070 @@ xfs_bmbt_update( } /* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". 
+ */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} + +/* + * Calculate the default attribute fork offset for newly created inodes. + */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp, ip->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + return offset; +} + +/* + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. + */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_mount_t *mp, + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) + ip->i_d.di_forkoff = dfl_forkoff; + } +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. 
+ */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks in use. + */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if 
(unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
+ */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Debug/sanity checking code + */ + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + + return 1; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; 
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. + */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
+ */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. 
+ */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} + +/* + * Add bmap trace insert entries for all the contents of the extent records. + */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the callers original parameters. 
Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +/* + * bmap free list manipulation functions + */ + +/* + * Add the extent to the list of extents to be freed at transaction end. + * The list is maintained sorted (by block number).
+ */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. 
We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. 
+ */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. 
+ */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Convert an extents-format file into a btree-format 
file.
+ * The new file will have a root block (in the inode) and a single child block.
+ *
+ * On success the new cursor is handed back through *curp, *firstblock is
+ * updated to the allocated child block and *logflagsp holds the inode
+ * logging flags.  If the block allocation returns an error, the incore root
+ * is released, the fork format is put back to extents, the cursor is torn
+ * down and the error is returned with *logflagsp set to 0.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	struct xfs_btree_block	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i, cnt;		/* extent record index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.firstblock = *firstblock;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	error = xfs_alloc_vextent(&args);
+	if (error) {
+		/*
+		 * Undo the conversion started above: release the incore
+		 * root and put the fork format back to extents so that the
+		 * format no longer claims a btree whose root has just been
+		 * given up.
+		 */
+		xfs_iroot_realloc(ip, -1, whichfork);
+		XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	abp->b_ops = &xfs_bmbt_buf_ops;
+	ablock = XFS_BUF_TO_BLOCK(abp);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Copy every incore extent that does not have a null start block
+	 * into the child, converting to on-disk byte order.
+	 */
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (cnt = i = 0; i < nextents; i++) {
+		ep = xfs_iext_get_ext(ifp, i);
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+			arp->l0 = cpu_to_be64(ep->l0);
+			arp->l1 = cpu_to_be64(ep->l1);
+			arp++; cnt++;
+		}
+	}
+	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_btree_set_numrecs(ablock, cnt);
+
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
+	*pp = cpu_to_be64(args.fsbno);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+	return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */ +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error; /* error return value */ + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + flags = 0; + error = 0; + if (ifp->if_bytes) { + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; + + /* Can't fail, the space was reserved. 
*/ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + + /* initialise the block and copy the data */ + init_fn(tp, bp, ip, ifp); + + /* account for the change in fork size and log everything */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_bmap_forkoff_reset(args.mp, ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + } else { + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + } + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + flags |= XFS_ILOG_CORE; +done: + *logflagsp = flags; + return error; +} + +/* * Called from xfs_bmap_add_attrfork to handle btree format files. */ STATIC int /* error */ @@ -360,29 +1323,22 @@ xfs_bmap_add_attrfork_extents( } /* - * Block initialisation functions for local to extent format conversion. - * As these get more complex, they will be moved to the relevant files, - * but for now they are too simple to worry about. + * Block initialisation function for local to extent format conversion. + * + * This shouldn't actually be called by anyone, so make sure debug kernels cause + * a noticeable failure.
*/ STATIC void xfs_bmap_local_to_extents_init_fn( + struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp) { + ASSERT(0); bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); -} - -STATIC void -xfs_symlink_local_to_remote( - struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp) -{ - /* remote symlink blocks are not verifiable until CRCs come along */ - bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); } /* @@ -394,8 +1350,7 @@ xfs_symlink_local_to_remote( * * XXX (dgc): investigate whether directory conversion can use the generic * formatting callout. It should be possible - it's just a very complex - * formatter. it would also require passing the transaction through to the init - * function. + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -432,6 +1387,640 @@ xfs_bmap_add_attrfork_local( } /* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. 
+ *
+ * The inode is locked exclusively in here and is unlocked again on every
+ * return path that is reached after the lock was taken.
+ * Returns 0 or an error code.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			size,		/* space new attribute needs */
+	int			rsvd)		/* xact may use reserved blks */
+{
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent records */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			blks;		/* space reservation */
+	int			version = 1;	/* superblock attr version */
+	int			committed;	/* xaction was committed */
+	int			logflags;	/* logging flags */
+	int			error;		/* error return value */
+
+	ASSERT(XFS_IFORK_Q(ip) == 0);
+
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT);
+	if (error)
+		goto error0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+			XFS_QMOPT_RES_REGBLKS);
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+	/* error is 0 here, so an already present attr fork returns 0 */
+	if (XFS_IFORK_Q(ip))
+		goto error1;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	/*
+	 * Join with no lock flags so that this function keeps ownership of
+	 * the ILOCK and drops it exactly once, after the commit or cancel.
+	 * Joining with XFS_ILOCK_EXCL while the error paths below also call
+	 * xfs_iunlock() would release the lock twice.
+	 * NOTE(review): assumes lock flags handed to xfs_trans_ijoin() are
+	 * released by the transaction at commit/cancel time - confirm.
+	 */
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+		if (!ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+		else if (mp->m_flags & XFS_MOUNT_ATTR2)
+			version = 2;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+		goto error1;
+	}
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	xfs_bmap_init(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto error2;
+	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+		__int64_t sbfields = 0;
+
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+			xfs_sb_version_addattr(&mp->m_sb);
+			sbfields |= XFS_SB_VERSIONNUM;
+		}
+		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+		}
+		if (sbfields) {
+			spin_unlock(&mp->m_sb_lock);
+			xfs_mod_sb(tp, sbfields);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+
+	error = xfs_bmap_finish(&tp, &flist, &committed);
+	if (error)
+		goto error2;
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+error2:
+	xfs_bmap_cancel(&flist);
+error1:
+	/* inode is locked by us: cancel first, then drop the lock once */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+error0:
+	/* reservation failed before the inode was locked */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	return error;
+}
+
+/*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+	xfs_extnum_t		i, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, level),
+			error0);
+		if (level == 0)
+			break;
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent records.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+		xfs_extnum_t	start;
+
+		num_recs = xfs_btree_get_numrecs(block);
+		if (unlikely(i + num_recs > room)) {
+			ASSERT(i + num_recs <= room);
+			xfs_warn(ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).",
+				(unsigned long long) ip->i_ino);
+			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+				XFS_ERRLEVEL_LOW, ip->i_mount, block);
+			goto error0;
+		}
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, 0),
+			error0);
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1,
+					     &xfs_bmbt_buf_ops);
+		/*
+		 * Copy records into the extent records.
+		 */
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		start = i;
+		for (j = 0; j < num_recs; j++, i++, frp++) {
+			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+			trp->l0 = be64_to_cpu(frp->l0);
+			trp->l1 = be64_to_cpu(frp->l1);
+		}
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (unlikely(xfs_check_nostate_extents(ifp,
+					start, num_recs))) {
+				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+						 XFS_ERRLEVEL_LOW,
+						 ip->i_mount);
+				goto error0;
+			}
+		}
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none).  Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. 
+ *
+ * The lookup itself is done by xfs_bmap_search_multi_extents().  On top of
+ * that, a result whose br_startblock is zero is rejected unless this is the
+ * data fork of a realtime inode: an alert tagged XFS_PTAG_FSBLOCK_ZERO is
+ * logged, *lastxp is set to NULLEXTNUM, *eofp is set to 1 and NULL is
+ * returned.
+ */
+STATIC xfs_bmbt_rec_host_t *			/* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		fork,		/* data or attr fork */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+
+	XFS_STATS_INC(xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, fork);
+
+	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+				"Access to block zero in inode %llu "
+				"start_block: %llx start_off: %llx "
+				"blkcnt: %llx extent-state: %x lastx: %x\n",
+			(unsigned long long)ip->i_ino,
+			(unsigned long long)gotp->br_startblock,
+			(unsigned long long)gotp->br_startoff,
+			(unsigned long long)gotp->br_blockcount,
+			gotp->br_state, *lastxp);
+		*lastxp = NULLEXTNUM;	/* report the bad record as "nothing found" */
+		*eofp = 1;
+		return NULL;
+	}
+	return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ *
+ * *first_unused is an in/out parameter: on entry it holds the lowest offset
+ * the caller will accept, and the offset stored on return is never below
+ * that value (except for local format forks, where 0 is stored).
+ * The extent list is read in via xfs_iread_extents() first if the fork does
+ * not have XFS_IFEXTENTS set; an error from that read is returned as is.
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	xfs_extlen_t	len,		/* size of hole to find */
+	xfs_fileoff_t	*first_unused,	/* unused block */
+	int		whichfork)	/* data or attr fork */
+{
+	int		error;		/* error return value */
+	int		idx;		/* extent record index */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;	/* last block number seen */
+	xfs_fileoff_t	lowest;		/* lowest useful block */
+	xfs_fileoff_t	max;		/* starting useful block */
+	xfs_fileoff_t	off;		/* offset for this block */
+	xfs_extnum_t	nextents;	/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 * The hole runs from max up to off, and the extent must also
+		 * start at least len blocks above the caller's lower bound.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);	/* next candidate start */
+	}
+	*first_unused = max;	/* no hole fit: end of last extent or lower bound */
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ *
+ * *last_block is an in/out parameter.  EIO is returned for any fork format
+ * other than btree, extents or local.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	xfs_fileoff_t	*last_block,	/* last block */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_fileoff_t	bno;		/* input file offset */
+	int		eof;		/* hit end of file */
+	xfs_bmbt_rec_host_t *ep;	/* pointer to last extent */
+	int		error;		/* error return value */
+	xfs_bmbt_irec_t	got;		/* current extent value */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	lastx;		/* last extent used */
+	xfs_bmbt_irec_t	prev;		/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		/* bno is past eof or in a hole: answer is the end of prev */
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+/*
+ * Fetch the last extent record of the given fork into *rec, reading the
+ * extent list in first if XFS_IFEXTENTS is not set on the fork.
+ *
+ * On success *is_empty is 1 when the fork has no extent records (*rec is
+ * then left untouched) and 0 otherwise.  If reading the extent list fails
+ * the error is returned and neither output is written.
+ */
+STATIC int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.  bma->aeof is left at 0 if looking up the last extent
+ * fails, in which case the error is returned.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error)
+		return error;
+
+	/*
+	 * An empty fork has no last extent to compare against; every offset
+	 * is at or past EOF, matching what xfs_bmap_eof() reports.
+	 */
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
+	/*
+	 * Check if we are allocating at or past the last extent, or at least
+	 * into the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary. All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ *
+ * *eof is only written when the last-extent lookup succeeds; on failure the
+ * error is returned and *eof is left as the caller passed it.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	rec;
+	int			error;
+
+	/* *eof doubles as the is_empty output of the lookup */
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+	if (error || *eof)
+		return error;
+
+	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ *
+ * *last_block is set to 0 before anything else, so it is also 0 when EIO is
+ * returned for an unsupported fork format, when the lookup fails, or when
+ * the fork has no extents.  The tp argument is not referenced here; NULL is
+ * what gets passed to xfs_bmap_last_extent().
+ */
+int
+xfs_bmap_last_offset(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+	       return XFS_ERROR(EIO);
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
+		return error;
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ *
+ * In non-DEBUG builds the data fork case is answered from the inode size
+ * alone (XFS_ISIZE(ip) equal to the superblock block size) without looking
+ * at the extent list.  Otherwise the fork must be in extents format with
+ * exactly one extent, and that extent must start at offset 0 and be one
+ * block long.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t	s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)	/* DEBUG cross-check against the size shortcut */
+		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/* + * Extent tree manipulation functions used during allocation. + */ + +/* * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ @@ -1894,6 +3483,10 @@ done: } /* + * Functions used in the extent read, allocate and remove paths + */ + +/* * Adjust the size of the new extent based on di_extsize and rt extsize. */ STATIC int @@ -2666,1628 +4259,6 @@ xfs_bmap_alloc( } /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block.
- */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - /* REFERENCED */ - struct xfs_btree_block *cblock;/* child btree block */ - xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ - __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; - ASSERT(be16_to_cpu(rblock->bb_level) == 1); - ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); - cbno = be64_to_cpu(*pp); - *logflagsp = 0; -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; -#endif - error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - cblock = XFS_BUF_TO_BLOCK(cbp); - if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) - return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); - ip->i_d.di_nblocks--; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; - xfs_iroot_realloc(ip, -1, whichfork); - ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - return 0; -} - -/* - * Called by xfs_bmapi to update file extent records 
and the btree - * after removing space (or undoing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *del, /* data to remove from extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ - xfs_fsblock_t del_endblock=0; /* first block past del */ - xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ - int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ - int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ - xfs_fileoff_t got_endoff; /* first offset past got */ - int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t nblks; /* quota/sb block count */ - xfs_bmbt_irec_t new; /* new record to be inserted */ - /* REFERENCED */ - uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; - - XFS_STATS_INC(xs_del_exlist); - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = 
got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - do_fx = 0; - } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? 
BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - flags |= XFS_ILOG_CORE; - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_btree_delete(cur, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 0: - /* - * Deleting the middle of the extent. 
- */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != ENOSPC) - goto done; - /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. - */ - if (error == ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. - */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. 
- */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = XFS_ERROR(ENOSPC); - goto done; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; - break; - } - /* - * If we need to, add to list of extents to delete. - */ - if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); - /* - * Adjust inode # blocks in the file. - */ - if (nblks) - ip->i_d.di_nblocks -= nblks; - /* - * Adjust quota data. - */ - if (qfield) - xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } -done: - *logflagsp = flags; - return error; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. 
- */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *ablock; /* allocated (child) bt block */ - xfs_buf_t *abp; /* buffer for ablock */ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_rec_t *arp; /* child record pointer */ - struct xfs_btree_block *block; /* btree root block */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_key_t *kp; /* root block key pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_ptr_t *pp; /* root block address pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - - /* - * Make space in the inode incore. - */ - xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; - - /* - * Fill in the root. 
- */ - block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - - /* - * Need a cursor. Can't allocate until bb_level is filled in. - */ - mp = ip->i_mount; - cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - /* - * Convert to a btree with two levels, one record in root. - */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = mp; - args.firstblock = *firstblock; - if (*firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = *firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = *firstblock; - } - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = wasdel; - *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - xfs_iroot_realloc(ip, -1, whichfork); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); - *firstblock = cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); - /* - * Fill in the child block. 
- */ - abp->b_ops = &xfs_bmbt_buf_ops; - ablock = XFS_BUF_TO_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } - } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_btree_set_numrecs(ablock, cnt); - - /* - * Fill in the root key and pointer. - */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, - be16_to_cpu(block->bb_level))); - *pp = cpu_to_be64(args.fsbno); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); - ASSERT(*curp == NULL); - *curp = cur; - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); - return 0; -} - -/* - * Calculate the default attribute fork offset for newly created inodes. - */ -uint -xfs_default_attroffset( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - - ASSERT(offset < XFS_LITINO(mp)); - return offset; -} - -/* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. 
- */ -STATIC void -xfs_bmap_forkoff_reset( - xfs_mount_t *mp, - xfs_inode_t *ip, - int whichfork) -{ - if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { - uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; - } -} - -/* - * Convert a local file to an extents file. - * This code is out of bounds for data forks of regular files, - * since the file data needs to get logged so things will stay consistent. - * (The bmap-level manipulations are ok, though). - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)) -{ - int error; /* error return value */ - int flags; /* logging flags returned */ - xfs_ifork_t *ifp; /* inode fork pointer */ - - /* - * We don't want to deal with the case of keeping inode data inline yet. - * So sending the data fork of a regular inode is invalid. - */ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - flags = 0; - error = 0; - if (ifp->if_bytes) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep;/* extent record pointer */ - - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = ip->i_mount; - args.firstblock = *firstblock; - /* - * Allocate a block. 
We know we need only one, since the - * file currently fits in an inode. - */ - if (*firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = *firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.total = total; - args.minlen = args.maxlen = args.prod = 1; - error = xfs_alloc_vextent(&args); - if (error) - goto done; - - /* Can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(args.len == 1); - *firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - - /* initialise the block and copy the data */ - init_fn(bp, ip, ifp); - - /* account for the change in fork size and log everything */ - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); - xfs_bmap_forkoff_reset(args.mp, ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); - flags |= xfs_ilog_fext(whichfork); - } else { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); - } - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - flags |= XFS_ILOG_CORE; -done: - *logflagsp = flags; - return error; -} - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. 
- */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. 
- */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x\n", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". 
- */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ -{ - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ - - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; - for (level = 0, rval = 0; - level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); - level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - - level - 1; - if (level == 0) - maxrecs = mp->m_bmap_dmxr[1]; - } - return rval; -} - -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ - int size, /* space new attribute needs */ - int rsvd) /* xact may use reserved blks */ -{ - xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ - int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ - int logflags; /* logging flags */ - int error; /* error return value */ - - ASSERT(XFS_IFORK_Q(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); - blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) - goto error0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } - if (XFS_IFORK_Q(ip)) - goto error1; - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - ASSERT(ip->i_d.di_anextents == 0); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = XFS_ERROR(EINVAL); - goto error1; - } - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_flags = XFS_IFEXTENTS; - logflags = 0; - xfs_bmap_init(&flist, &firstblock); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, - &logflags); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, - &logflags); - break; - default: - error = 0; - break; - } - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); - if (error) - goto error2; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; - - 
spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; - } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); - } - if (sbfields) { - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); - } else - spin_unlock(&mp->m_sb_lock); - } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: - xfs_bmap_cancel(&flist); -error1: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - return error; -} - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -/* ARGSUSED */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ -{ - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - 
break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - xfs_mount_t *mp, /* file system mount structure */ - int whichfork) /* data or attr fork */ -{ - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ - int maxrootrecs; /* max records in root block */ - int minleafrecs; /* min records in leaf block */ - int minnoderecs; /* min records in node block */ - int sz; /* root block size */ - - /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). - * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. 
- */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); - minleafrecs = mp->m_bmap_dmnr[0]; - minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) { - if (maxblocks <= maxrootrecs) - maxblocks = 1; - else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - } - mp->m_bm_maxlevels[whichfork] = level; -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = 
xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) - return error; - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); -} - -/* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. 
- * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). - */ -int /* error */ -xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ - xfs_extnum_t nextents; /* number of extent entries */ - - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *first_unused = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); - /* - * See if the hole before this extent will work. - */ - if (off >= lowest + len && off - max >= len) { - *first_unused = max; - return 0; - } - lastaddr = off + xfs_bmbt_get_blockcount(ep); - max = XFS_FILEOFF_MAX(lastaddr, lowest); - } - *first_unused = max; - return 0; -} - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent records. 
- * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; - } - /* - * Otherwise *last_block is already the right answer. 
- */ - return 0; -} - -STATIC int -xfs_bmap_last_extent( - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *rec, - int *is_empty) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int error; - int nextents; - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } - - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; - return 0; -} - -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be - * at, or past the EOF. - */ -STATIC int -xfs_bmap_isaeof( - struct xfs_bmalloca *bma, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - bma->aeof = 0; - error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, - &is_empty); - if (error || is_empty) - return error; - - /* - * Check if we are allocation or past the last extent, or at least into - * the last delayed allocated extent. - */ - bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || - (bma->offset >= rec.br_startoff && - isnullstartblock(rec.br_startblock)); - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. 
- */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int -xfs_bmap_last_offset( - struct xfs_trans *tp, - struct xfs_inode *ip, - xfs_fileoff_t *last_block, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - *last_block = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) - return 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return XFS_ERROR(EIO); - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); - if (error || is_empty) - return error; - - *last_block = rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. 
- */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || - be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - return 1; -} - -/* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. 
- */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - xfs_trans_brelse(tp, bp); - } - /* - * Here with bp and block set to the leftmost leaf node in the tree. 
- */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - i = 0; - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - xfs_extnum_t start; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", - XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. - */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. 
- */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); - return 0; -error0: - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); -} - -#ifdef DEBUG -/* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); -} - -/* - * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond - * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 
- */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap) -{ - int i; /* index to map values */ - - ASSERT(ret_nmap <= nmap); - - for (i = 0; i < ret_nmap; i++) { - ASSERT(mval[i].br_blockcount > 0); - if (!(flags & XFS_BMAPI_ENTIRE)) { - ASSERT(mval[i].br_startoff >= bno); - ASSERT(mval[i].br_blockcount <= len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= - bno + len); - } else { - ASSERT(mval[i].br_startoff < bno + len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount > - bno); - } - ASSERT(i == 0 || - mval[i - 1].br_startoff + mval[i - 1].br_blockcount == - mval[i].br_startoff); - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); - ASSERT(mval[i].br_state == XFS_EXT_NORM || - mval[i].br_state == XFS_EXT_UNWRITTEN); - } -} -#endif /* DEBUG */ - - -/* * Trim the returned map to the required bounds */ STATIC void @@ -5151,6 +5122,328 @@ error0: } /* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). 
+ */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + 
delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? 
BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. 
+ */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != ENOSPC) + goto done; + /* + * If get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. 
+ */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = XFS_ERROR(ENOSPC); + goto done; + } + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); + } +done: + *logflagsp = flags; + return error; +} + +/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. 
If not all extents in the block range can be removed then @@ -5811,416 +6104,6 @@ xfs_getbmap( return error; } -#ifdef DEBUG -STATIC struct xfs_buf * -xfs_bmap_get_bp( - struct xfs_btree_cur *cur, - xfs_fsblock_t bno) -{ - struct xfs_log_item_desc *lidp; - int i; - - if (!cur) - return NULL; - - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) - break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; - } - - /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; - if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) - return bip->bli_buf; - } - - return NULL; -} - -STATIC void -xfs_check_block( - struct xfs_btree_block *block, - xfs_mount_t *mp, - int root, - short sz) -{ - int i, j, dmxr; - __be64 *pp, *thispa; /* pointer to block address */ - xfs_bmbt_key_t *prevp, *keyp; - - ASSERT(be16_to_cpu(block->bb_level) > 0); - - prevp = NULL; - for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { - dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); - - if (prevp) { - ASSERT(be64_to_cpu(prevp->br_startoff) < - be64_to_cpu(keyp->br_startoff)); - } - prevp = keyp; - - /* - * Compare the block numbers to see if there are dups. - */ - if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); - else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); - - for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); - else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); - if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", - __func__, j, i, - (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", - __func__); - } - } - } -} - -/* - * Check that the extents for the inode ip are in the right order in all - * btree leaves. 
- */ - -STATIC void -xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i=0, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ - xfs_bmbt_rec_t *nextp; /* pointer to next extent */ - int bp_release = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { - return; - } - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. 
- */ - while (level-- > 0) { - /* See if buf is in cur first */ - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - - /* - * Check this block for basic sanity (increasing keys and - * no duplicate blocks). - */ - - xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - } - - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - - /* - * Loop over all leaf nodes checking that all extents are in the right order. - */ - for (;;) { - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - - num_recs = xfs_btree_get_numrecs(block); - - /* - * Read-ahead the next leaf block, if any. - */ - - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - - /* - * Check all the extents to make sure they are OK. - * If we had a previous block, the last entry should - * conform with the first entry in this one. - */ - - ep = XFS_BMBT_REC_ADDR(mp, block, 1); - if (i) { - ASSERT(xfs_bmbt_disk_get_startoff(&last) + - xfs_bmbt_disk_get_blockcount(&last) <= - xfs_bmbt_disk_get_startoff(ep)); - } - for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); - ASSERT(xfs_bmbt_disk_get_startoff(ep) + - xfs_bmbt_disk_get_blockcount(ep) <= - xfs_bmbt_disk_get_startoff(nextp)); - ep = nextp; - } - - last = *ep; - i += num_recs; - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - bno = nextbno; - /* - * If we've reached the end, stop. 
- */ - if (bno == NULLFSBLOCK) - break; - - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - return; - -error0: - xfs_warn(mp, "%s: at error0", __func__); - if (bp_release) - xfs_trans_brelse(NULL, bp); -error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", - __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); - return; -} -#endif - -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
- */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - - return 0; -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. - */ -STATIC int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - 
XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* - * Count leaf blocks given a range of extent records. - */ -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); - } -} - -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - /* * dead simple method of punching delalyed allocation blocks from a range in * the inode. Walks a block at a time so will be slow, but is only executed in @@ -6295,16 +6178,3 @@ next_block: return error; } - -/* - * Convert the given file system block to a disk block. We have to treat it - * differently based on whether the file is a real time file or not, because the - * bmap code does. - */ -xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? 
\ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 061b45cbe61..3a86c3fa6de 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -37,6 +37,7 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Determine the extent state. @@ -59,24 +60,31 @@ xfs_extent_state( */ void xfs_bmdr_to_bmbt( - struct xfs_mount *mp, + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; __be64 *fpp; xfs_bmbt_key_t *tkp; __be64 *tpp; - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); @@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_u.l.bb_rightsib == 
cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_level != 0); @@ -708,59 +722,89 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static void +static int xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); unsigned int level; - int lblock_ok; /* block passes checks */ - /* magic number and level verification. + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. * - * We don't know waht fork we belong to, so just verify that the level + * We don't know what fork we belong to, so just verify that the level * is less than the maximum of the two. Later checks will be more * precise. 
*/ level = be16_to_cpu(block->bb_level); - lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && - level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); - - /* numrecs verification */ - lblock_ok = lblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; /* sibling pointer verification */ - lblock_ok = lblock_ok && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (!lblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; + } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!(xfs_btree_lblock_verify_crc(bp) && + xfs_bmbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_bmbt_verify(bp)) { + xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + 
bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_btree_lblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -838,6 +882,8 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 88469ca0869..70c43d9f72c 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,7 +18,8 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ struct xfs_btree_cur; struct xfs_btree_block; @@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? 
\ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ ((xfs_bmbt_rec_t *) \ @@ -186,12 +187,12 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(XFS_BTREE_LBLOCK_LEN + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) @@ -204,7 +205,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index db010408d70..8804b8a3c31 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -30,9 +30,11 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Cursor allocation zone. @@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. 
*/ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] STATIC int /* error (0 or EFSCORRUPTED) */ @@ -54,30 +60,38 @@ xfs_btree_check_lblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer for block, if any */ { - int lblock_ok; /* block passes checks */ + int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && + be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -90,16 +104,26 @@ xfs_btree_check_sblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block */ { + struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for ag. freespace struct */ struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + int sblock_ok = 1; /* block passes checks */ + mp = cur->bc_mp; agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -109,13 +133,13 @@ xfs_btree_check_sblock( (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", - XFS_ERRLEVEL_LOW, cur->bc_mp, block); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -194,6 +218,72 @@ xfs_btree_check_ptr( #endif /* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculating the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modification was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. 
+ * + * Prior to calculating the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modification was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); + return true; +} + +/* * Delete the btree cursor. */ void @@ -277,10 +367,8 @@ xfs_btree_dup_cursor( *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(!xfs_buf_geterror(bp)); - } else - new->bc_bufs[i] = NULL; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; @@ -321,9 +409,14 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 
- XFS_BTREE_LBLOCK_LEN : - XFS_BTREE_SBLOCK_LEN; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; } /* @@ -863,43 +956,85 @@ xfs_btree_set_sibling( } void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + } + } +} + +void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags) { - struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); - - new->bb_magic = cpu_to_be32(magic); - new->bb_level = cpu_to_be16(level); - new->bb_numrecs = cpu_to_be16(numrecs); - - if (flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - } else { - new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_u.s.bb_rightsib = 
cpu_to_be32(NULLAGBLOCK); - } + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); } STATIC void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, - int numrecs, - struct xfs_buf *bp) + int numrecs) { - xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], - level, numrecs, cur->bc_flags); + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); } /* * Return true if ptr is the last record in the btree and - * we need to track updateѕ to this record. The decision + * we need to track updates to this record. The decision * will be further refined in the update_lastrec method. 
*/ STATIC int @@ -1147,6 +1282,7 @@ xfs_btree_log_keys( XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); @@ -1171,6 +1307,7 @@ xfs_btree_log_recs( XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); @@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); @@ -1223,7 +1361,12 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), - XFS_BTREE_SBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), @@ -1231,17 +1374,40 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), - XFS_BTREE_LBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, 
bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN }; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBI(cur, bp, fields); if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } xfs_btree_offsets(fields, (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? loffsets : soffsets, - XFS_BB_NUM_BITS, &first, &last); + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, @@ -2204,7 +2370,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. @@ -2513,7 +2679,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index f932897194e..6e6c915673f 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -42,11 +42,15 @@ extern kmem_zone_t *xfs_btree_cur_zone; * Generic btree header. * * This is a combination of the actual format used on disk for short and long - * format btrees. 
The first three fields are shared by both format, but - * the pointers are different and should be used with care. + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. * - * To get the size of the actual short or long form headers please use - * the size macros below. Never use sizeof(xfs_btree_block). + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. */ struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ @@ -56,10 +60,23 @@ struct xfs_btree_block { struct { __be32 bb_leftsib; __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; } s; /* short form pointers */ struct { __be64 bb_leftsib; __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ } l; /* long form pointers */ } bb_u; /* rest */ }; @@ -67,6 +84,16 @@ struct xfs_btree_block { #define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ #define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + /* * Generic key, ptr and record wrapper structures. 
@@ -101,13 +128,11 @@ union xfs_btree_rec { #define XFS_BB_NUMRECS 0x04 #define XFS_BB_LEFTSIB 0x08 #define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_BLKNO 0x20 #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) - -/* - * Magic numbers for btree blocks. - */ -extern const __uint32_t xfs_magics[]; +#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -256,6 +281,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_NOERROR 0 @@ -393,8 +419,20 @@ xfs_btree_init_block( __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags); +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* * Common btree core entry points. */ @@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); /* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* * Internal btree helpers also used by xfs_bmap.c. 
*/ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8459b5d8cb7..82b70bda9f4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1022,7 +1022,9 @@ xfs_buf_iodone_work( bool read = !!(bp->b_flags & XBF_READ); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - if (read && bp->b_ops) + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) bp->b_ops->verify_read(bp); if (bp->b_iodone) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index ee36c88ecfd..2573d2a75fc 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -24,19 +24,20 @@ extern kmem_zone_t *xfs_buf_item_zone; * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLF_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF (1<<0) /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLF_CANCEL 0x2 +#define XFS_BLF_CANCEL (1<<1) + /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLF_UDQUOT_BUF 0x4 -#define XFS_BLF_PDQUOT_BUF 0x8 -#define XFS_BLF_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 @@ -61,6 +62,55 @@ typedef struct xfs_buf_log_format { } xfs_buf_log_format_t; /* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. 
+ */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* * buf log item flags */ #define XFS_BLI_HOLD 0x01 @@ -113,6 +163,10 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); + #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4d7696a0241..9b26a99ebfe 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. 
* * This program is free software; you can redistribute it and/or @@ -38,6 +39,8 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* * xfs_da_btree.c @@ -52,69 +55,195 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_da_root_split(xfs_da_state_t *state, +STATIC int xfs_da3_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_root, xfs_da_state_blk_t *new_child); -STATIC int xfs_da_node_split(xfs_da_state_t *state, +STATIC int xfs_da3_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_blk, xfs_da_state_blk_t *split_blk, xfs_da_state_blk_t *blk_to_add, int treelevel, int *result); -STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *node_blk_1, xfs_da_state_blk_t *node_blk_2); -STATIC void xfs_da_node_add(xfs_da_state_t *state, +STATIC void xfs_da3_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *old_node_blk, xfs_da_state_blk_t *new_node_blk); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_da_root_join(xfs_da_state_t *state, +STATIC int xfs_da3_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk); -STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); -STATIC void xfs_da_node_remove(xfs_da_state_t *state, +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk); -STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *src_node_blk, xfs_da_state_blk_t *dst_node_blk); /* * Utility routines. 
*/ -STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); -STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp); -STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); -STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); -static void -xfs_da_node_verify( + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = 
be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC || + from->magic == XFS_DA3_NODE_MAGIC); + + if (from->magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static bool +xfs_da3_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_da_node_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); - block_ok = block_ok && - be16_to_cpu(hdr->level) > 0 && - be16_to_cpu(hdr->count) > 0 ; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + + xfs_da3_node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + /* + * we don't know if the node is for an attribute or directory
tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_node_ents && + ichdr.count > mp->m_attr_node_ents) + return false; + + /* XXX: hash order check? */ + + return true; } static void -xfs_da_node_write_verify( +xfs_da3_node_write_verify( struct xfs_buf *bp) { - xfs_da_node_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); } /* @@ -124,40 +253,47 @@ xfs_da_node_write_verify( * format of the block being read. */ static void -xfs_da_node_read_verify( +xfs_da3_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DA3_NODE_CRC_OFF)) + break; + /* fall through */ case XFS_DA_NODE_MAGIC: - xfs_da_node_verify(bp); - break; + if (!xfs_da3_node_verify(bp)) + break; + return; case XFS_ATTR_LEAF_MAGIC: - bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - bp->b_ops = &xfs_dir2_leafn_buf_ops; + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; bp->b_ops->verify_read(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); break; } + + /* corrupt block */ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); } -const struct xfs_buf_ops xfs_da_node_buf_ops = { - .verify_read = 
xfs_da_node_read_verify, - .verify_write = xfs_da_node_write_verify, +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, }; - int -xfs_da_node_read( +xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, @@ -165,8 +301,35 @@ xfs_da_node_read( struct xfs_buf **bpp, int which_fork) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da_node_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; } /*======================================================================== @@ -177,33 +340,46 @@ xfs_da_node_read( * Create the initial contents of an intermediate node. 
*/ int -xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork) +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) { - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int error; - xfs_trans_t *tp; + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - tp = args->trans; error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); if (error) return(error); - ASSERT(bp != NULL); + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - node->hdr.info.forw = 0; - node->hdr.info.back = 0; - node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); - node->hdr.info.pad = 0; - node->hdr.count = 0; - node->hdr.level = cpu_to_be16(level); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + xfs_da3_node_hdr_to_disk(node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); - bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -213,12 +389,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, * intermediate nodes, rebalance, etc. 
*/ int /* error */ -xfs_da_split(xfs_da_state_t *state) +xfs_da3_split( + struct xfs_da_state *state) { - xfs_da_state_blk_t *oldblk, *newblk, *addblk; - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int max, action, error, i; + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action; + int error; + int i; trace_xfs_da_split(state->args); @@ -246,7 +428,7 @@ xfs_da_split(xfs_da_state_t *state) */ switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_split(state, oldblk, newblk); + error = xfs_attr3_leaf_split(state, oldblk, newblk); if ((error != 0) && (error != ENOSPC)) { return(error); /* GROT: attr is inconsistent */ } @@ -261,12 +443,12 @@ xfs_da_split(xfs_da_state_t *state) if (state->inleaf) { state->extraafter = 0; /* before newblk */ trace_xfs_attr_leaf_split_before(state->args); - error = xfs_attr_leaf_split(state, oldblk, + error = xfs_attr3_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ trace_xfs_attr_leaf_split_after(state->args); - error = xfs_attr_leaf_split(state, newblk, + error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } if (error) @@ -280,7 +462,7 @@ xfs_da_split(xfs_da_state_t *state) addblk = newblk; break; case XFS_DA_NODE_MAGIC: - error = xfs_da_node_split(state, oldblk, newblk, addblk, + error = xfs_da3_node_split(state, oldblk, newblk, addblk, max - i, &action); addblk->bp = NULL; if (error) @@ -298,7 +480,7 @@ xfs_da_split(xfs_da_state_t *state) /* * Update the btree to show the new hashval for this child. 
*/ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } if (!addblk) return(0); @@ -308,7 +490,7 @@ xfs_da_split(xfs_da_state_t *state) */ ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; - error = xfs_da_root_split(state, oldblk, addblk); + error = xfs_da3_root_split(state, oldblk, addblk); if (error) { addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ @@ -319,8 +501,12 @@ xfs_da_split(xfs_da_state_t *state) * just got bumped because of the addition of a new root node. * There might be three blocks involved if a double split occurred, * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. */ - node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { @@ -359,18 +545,25 @@ xfs_da_split(xfs_da_state_t *state) * the EOF, extending the inode in process. 
*/ STATIC int /* error */ -xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node, *oldroot; - xfs_da_args_t *args; - xfs_dablk_t blkno; - struct xfs_buf *bp; - int error, size; - xfs_inode_t *dp; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dir2_leaf_t *leaf; + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; trace_xfs_da_root_split(state->args); @@ -379,29 +572,65 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * to a free space somewhere. */ args = state->args; - ASSERT(args != NULL); error = xfs_da_grow_inode(args, &blkno); if (error) - return(error); + return error; + dp = args->dp; tp = args->trans; mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); + return error; node = bp->b_addr; oldroot = blk1->bp->b_addr; - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - - (char *)oldroot); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr nodehdr; + + xfs_da3_node_hdr_from_disk(&nodehdr, oldroot); + btree = xfs_da3_node_tree_p(oldroot); + size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); + level = nodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. 
+ */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + leaf = (xfs_dir2_leaf_t *)oldroot; - size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - - (char *)leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; @@ -411,20 +640,25 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Set up the new root node. */ - error = xfs_da_node_create(args, + error = xfs_da3_node_create(args, (args->whichfork == XFS_DATA_FORK) ? 
mp->m_dirleafblk : 0, - be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + level + 1, &bp, args->whichfork); if (error) - return(error); + return error; + node = bp->b_addr; - node->btree[0].hashval = cpu_to_be32(blk1->hashval); - node->btree[0].before = cpu_to_be32(blk1->blkno); - node->btree[1].hashval = cpu_to_be32(blk2->hashval); - node->btree[1].before = cpu_to_be32(blk2->blkno); - node->hdr.count = cpu_to_be16(2); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + xfs_da3_node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { ASSERT(blk1->blkno >= mp->m_dirleafblk && blk1->blkno < mp->m_dirfreeblk); ASSERT(blk2->blkno >= mp->m_dirleafblk && @@ -434,30 +668,34 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* Header is already logged by xfs_da_node_create */ xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, node->btree, - sizeof(xfs_da_node_entry_t) * 2)); + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); - return(0); + return 0; } /* * Split the node, rebalance, then add the new entry. 
*/ STATIC int /* error */ -xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk, - xfs_da_state_blk_t *addblk, - int treelevel, int *result) +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) { - xfs_da_intnode_t *node; - xfs_dablk_t blkno; - int newcount, error; - int useextra; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -467,7 +705,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, /* * Do we have to split the node? */ - if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + if (nodehdr.count + newcount > state->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. 
@@ -476,14 +714,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, if (error) return(error); /* GROT: dir is inconsistent */ - error = xfs_da_node_create(state->args, blkno, treelevel, + error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) return(error); /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; - xfs_da_node_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); *result = 1; @@ -495,7 +733,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Insert the new entry(s) into the correct block * (updating last hashval in the process). * - * xfs_da_node_add() inserts BEFORE the given index, + * xfs_da3_node_add() inserts BEFORE the given index, * and as a result of using node_lookup_int() we always * point to a valid entry (not after one), but a split * operation always results in a new block whose hashvals @@ -504,22 +742,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * If we had double-split op below us, then add the extra block too. 
*/ node = oldblk->bp->b_addr; - if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { oldblk->index++; - xfs_da_node_add(state, oldblk, addblk); + xfs_da3_node_add(state, oldblk, addblk); if (useextra) { if (state->extraafter) oldblk->index++; - xfs_da_node_add(state, oldblk, &state->extrablk); + xfs_da3_node_add(state, oldblk, &state->extrablk); state->extravalid = 0; } } else { newblk->index++; - xfs_da_node_add(state, newblk, addblk); + xfs_da3_node_add(state, newblk, addblk); if (useextra) { if (state->extraafter) newblk->index++; - xfs_da_node_add(state, newblk, &state->extrablk); + xfs_da3_node_add(state, newblk, &state->extrablk); state->extravalid = 0; } } @@ -534,33 +773,53 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * NOTE: if blk2 is empty, then it will get the upper half of blk1. */ STATIC void -xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node1, *node2, *tmpnode; - xfs_da_node_entry_t *btree_s, *btree_d; - int count, tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; trace_xfs_da_node_rebalance(state->args); node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + /* * Figure out how many entries need to move, and in which 
direction. * Swap the nodes around if that makes it simpler. */ - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { tmpnode = node1; node1 = node2; node2 = tmpnode; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + swap = 1; } - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + + count = (nodehdr1.count - nodehdr2.count) / 2; if (count == 0) return; tp = state->args->trans; @@ -571,10 +830,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Move elements in node2 up to make a hole. */ - if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp = nodehdr2.count; + if (tmp > 0) { tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node2->btree[count]; + btree_s = &btree2[0]; + btree_d = &btree2[count]; memmove(btree_d, btree_s, tmp); } @@ -582,12 +842,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. 
*/ - be16_add_cpu(&node2->hdr.count, count); + nodehdr2.count += count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; - btree_d = &node2->btree[0]; + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, -count); + nodehdr1.count -= count; } else { /* * Move the req'd B-tree elements from low in node2 to @@ -595,49 +855,60 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ count = -count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, count); + nodehdr1.count += count; + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* * Move elements in node2 down to fill the hole. */ - tmp = be16_to_cpu(node2->hdr.count) - count; + tmp = nodehdr2.count - count; tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[count]; - btree_d = &node2->btree[0]; + btree_s = &btree2[count]; + btree_d = &btree2[0]; memmove(btree_d, btree_s, tmp); - be16_add_cpu(&node2->hdr.count, -count); + nodehdr2.count -= count; } /* * Log header of node 1 and all current bits of node 2. */ + xfs_da3_node_hdr_to_disk(node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + XFS_DA_LOGRANGE(node1, &node1->hdr, + xfs_da3_node_hdr_size(node1))); + + xfs_da3_node_hdr_to_disk(node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - sizeof(node2->hdr) + - sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + xfs_da3_node_hdr_size(node2) + + (sizeof(btree2[0]) * nodehdr2.count))); /* * Record the last hashval from each block for upward propagation. 
* (note: don't use the swapped node pointers) */ - node1 = blk1->bp->b_addr; - node2 = blk2->bp->b_addr; - blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); /* * Adjust the expected index for insertion. */ - if (blk1->index >= be16_to_cpu(node1->hdr.count)) { - blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); - blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ } } @@ -645,18 +916,23 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Add a new entry to an intermediate node. 
*/ STATIC void -xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) ASSERT(newblk->blkno >= state->mp->m_dirleafblk && @@ -666,23 +942,25 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * We may need to make some room before we insert the new node. 
*/ tmp = 0; - btree = &node->btree[ oldblk->index ]; - if (oldblk->index < be16_to_cpu(node->hdr.count)) { - tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); - memmove(btree + 1, btree, tmp); + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); } - btree->hashval = cpu_to_be32(newblk->hashval); - btree->before = cpu_to_be32(newblk->blkno); + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add_cpu(&node->hdr.count, 1); + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the oldblk to propagate upwards. */ - oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); } /*======================================================================== @@ -694,14 +972,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * possibly deallocating that block, etc... 
*/ int -xfs_da_join(xfs_da_state_t *state) +xfs_da3_join( + struct xfs_da_state *state) { - xfs_da_state_blk_t *drop_blk, *save_blk; - int action, error; + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; trace_xfs_da_join(state->args); - action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); @@ -722,12 +1002,12 @@ xfs_da_join(xfs_da_state_t *state) */ switch (drop_blk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_toosmall(state, &action); + error = xfs_attr3_leaf_toosmall(state, &action); if (error) return(error); if (action == 0) return(0); - xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: error = xfs_dir2_leafn_toosmall(state, &action); @@ -742,18 +1022,18 @@ xfs_da_join(xfs_da_state_t *state) * Remove the offending node, fixup hashvals, * check for a toosmall neighbor. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_node_toosmall(state, &action); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); if (error) return(error); if (action == 0) return 0; - xfs_da_node_unbalance(state, drop_blk, save_blk); + xfs_da3_node_unbalance(state, drop_blk, save_blk); break; } - xfs_da_fixhashpath(state, &state->altpath); - error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) return(error); @@ -768,9 +1048,9 @@ xfs_da_join(xfs_da_state_t *state) * we only have one entry in the root, make the child block * the new root. 
*/ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_root_join(state, &state->path.blk[0]); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); return(error); } @@ -782,9 +1062,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) if (level == 1) { ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else - ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } ASSERT(!blkinfo->forw); ASSERT(!blkinfo->back); } @@ -797,52 +1081,61 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) * the old root to block 0 as the new root node. 
*/ STATIC int -xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) { - xfs_da_intnode_t *oldroot; - xfs_da_args_t *args; - xfs_dablk_t child; - struct xfs_buf *bp; - int error; + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; trace_xfs_da_root_join(state->args); - args = state->args; - ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; oldroot = root_blk->bp->b_addr; - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(!oldroot->hdr.info.forw); - ASSERT(!oldroot->hdr.info.back); + xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); /* * If the root has more than one child, then don't do anything. */ - if (be16_to_cpu(oldroot->hdr.count) > 1) - return(0); + if (oldroothdr.count > 1) + return 0; /* * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - child = be32_to_cpu(oldroot->btree[0].before); + btree = xfs_da3_node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->b_addr, - be16_to_cpu(oldroot->hdr.level)); + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have * to update the b_ops pointer as well to match the buffer type change - * that could occur. + * that could occur. 
For dir3 blocks we also need to update the block + * number in the buffer header. */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -858,14 +1151,21 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) * If nothing can be done, return 0. */ STATIC int -xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_da_intnode_t *node; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; trace_xfs_da_node_toosmall(state->args); @@ -876,10 +1176,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; - count = be16_to_cpu(node->hdr.count); - if (count > (state->node_ents >> 1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -890,14 +1189,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. 
*/
-	if (count == 0) {
+	if (nodehdr.count == 0) {
 		/*
 		 * Make altpath point to the block we want to keep and
 		 * path point to the block we want to drop (this one).
 		 */
 		forward = (info->forw != 0);
 		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 		if (error)
 			return(error);
@@ -916,35 +1215,34 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 * We prefer coalescing with the lower numbered sibling so as
 	 * to shrink a directory over time.
 	 */
+	count = state->node_ents;
+	count -= state->node_ents >> 2;
+	count -= nodehdr.count;
+
 	/* start with smaller blk num */
-	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+	forward = nodehdr.forw < nodehdr.back;
 	for (i = 0; i < 2; forward = !forward, i++) {
 		if (forward)
 			blkno = be32_to_cpu(info->forw);
 		else
 			blkno = be32_to_cpu(info->back);
 		if (blkno == 0)
 			continue;
-		error = xfs_da_node_read(state->args->trans, state->args->dp,
+		error = xfs_da3_node_read(state->args->trans, state->args->dp,
 					blkno, -1, &bp, state->args->whichfork);
 		if (error)
 			return(error);
-		ASSERT(bp != NULL);
 
-		node = (xfs_da_intnode_t *)info;
-		count = state->node_ents;
-		count -= state->node_ents >> 2;
-		count -= be16_to_cpu(node->hdr.count);
 		node = bp->b_addr;
-		ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-		count -= be16_to_cpu(node->hdr.count);
+		xfs_da3_node_hdr_from_disk(&nodehdr, node); /* sibling's hdr from here on; own links are read via info */
 		xfs_trans_brelse(state->args->trans, bp);
-		if (count >= 0)
+
+		if (count - nodehdr.count >= 0)
 			break;	/* fits with at least 25% to spare */
 	}
 	if (i >= 2) {
 		*action = 0;
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -953,28 +1251,42 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 */
 	memcpy(&state->altpath, &state->path, sizeof(state->path));
 	if (blkno < blk->blkno) {
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error =
xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; } *action = 1; - return(0); + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = xfs_da3_node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); } /* @@ -982,13 +1294,16 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * when we stop making changes, return. 
*/
 void
-xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+xfs_da3_fixhashpath(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path)
 {
-	xfs_da_state_blk_t *blk;
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	xfs_dahash_t lasthash=0;
-	int level, count;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	xfs_dahash_t		lasthash=0;
+	int			level;
+	int			count;
 
 	trace_xfs_da_fixhashpath(state->args);
 
@@ -1006,23 +1321,26 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
 			return;
 		break;
 	case XFS_DA_NODE_MAGIC:
-		lasthash = xfs_da_node_lasthash(blk->bp, &count);
+		lasthash = xfs_da3_node_lasthash(blk->bp, &count);
 		if (count == 0)
 			return;
 		break;
 	}
 	for (blk--, level--; level >= 0; blk--, level--) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
 		node = blk->bp->b_addr;
-		ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-		btree = &node->btree[ blk->index ];
-		if (be32_to_cpu(btree->hashval) == lasthash)
+		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		btree = xfs_da3_node_tree_p(node); /* entry 0; always index by blk->index below */
+		if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
 			break;
 		blk->hashval = lasthash;
-		btree->hashval = cpu_to_be32(lasthash);
+		btree[blk->index].hashval = cpu_to_be32(lasthash);
 		xfs_trans_log_buf(state->args->trans, blk->bp,
-				  XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+				  XFS_DA_LOGRANGE(node, &btree[blk->index],
+						  sizeof(*btree)));
 
-		lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+		lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 	}
 }
 
@@ -1030,104 +1348,120 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
  * Remove an entry from an intermediate node.
*/ STATIC void -xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); /* * Copy over the offending entry, or just zero it out. */ - btree = &node->btree[drop_blk->index]; - if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { - tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + index = drop_blk->index; + btree = xfs_da3_node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, btree + 1, tmp); + memmove(&btree[index], &btree[index + 1], tmp); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, tmp)); - btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; } - memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add_cpu(&node->hdr.count, -1); + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the block to propagate upwards. 
*/ - btree--; - drop_blk->hashval = be32_to_cpu(btree->hashval); + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } /* - * Unbalance the btree elements between two intermediate nodes, + * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. */ STATIC void -xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_intnode_t *drop_node, *save_node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; trace_xfs_da_node_unbalance(state->args); drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node); + xfs_da3_node_hdr_from_disk(&save_hdr, save_node); + drop_btree = xfs_da3_node_tree_p(drop_node); + save_btree = xfs_da3_node_tree_p(save_node); tp = state->args->trans; /* * If the dying block has lower hashvals, then move all the * elements in the remaining block up to make a hole. 
*/ - if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || - (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < - be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) - { - btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; - tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, &save_node->btree[0], tmp); - btree = &save_node->btree[0]; + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); } else { - btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + sindex = save_hdr.count; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - be16_to_cpu(drop_node->hdr.count) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); } /* * Move all the B-tree elements from drop_blk to save_blk. 
*/ - tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memcpy(btree, &drop_node->btree[0], tmp); - be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + xfs_da3_node_hdr_to_disk(save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - sizeof(save_node->hdr))); + xfs_da3_node_hdr_size(save_node))); /* * Save the last hashval in the remaining block for upward propagation. */ - save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); } /*======================================================================== @@ -1146,16 +1480,24 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * pruned depth-first tree search. */ int /* error */ -xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *curr; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dablk_t blkno; - int probe, span, max, error, retval; - xfs_dahash_t hashval, btreehashval; - xfs_da_args_t *args; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; args = state->args; @@ -1171,7 +1513,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. 
*/ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; @@ -1180,66 +1522,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); - ASSERT(blk->magic == XFS_DA_NODE_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC || - blk->magic == XFS_ATTR_LEAF_MAGIC); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + /* * Search an intermediate node for a match. */ - if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->b_addr; - max = be16_to_cpu(node->hdr.count); - blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + node = blk->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); - /* - * Binary search. (note: small blocks will skip loop) - */ - probe = span = max / 2; - hashval = args->hashval; - for (btree = &node->btree[probe]; span > 4; - btree = &node->btree[probe]) { - span /= 2; - btreehashval = be32_to_cpu(btree->hashval); - if (btreehashval < hashval) - probe += span; - else if (btreehashval > hashval) - probe -= span; - else - break; - } - ASSERT((probe >= 0) && (probe < max)); - ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); - /* - * Since we may have duplicate hashval's, find the first - * matching hashval in the node. 
- */ - while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { - btree--; - probe--; - } - while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { - btree++; - probe++; - } + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); - /* - * Pick the right block to descend on. - */ - if (probe == max) { - blk->index = max-1; - blkno = be32_to_cpu(node->btree[max-1].before); - } else { - blk->index = probe; - blkno = be32_to_cpu(btree->before); - } - } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); - break; - } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); - break; + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. 
+ */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); } } @@ -1254,7 +1605,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - retval = xfs_attr_leaf_lookup_int(blk->bp, args); + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; } else { @@ -1263,7 +1614,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { - error = xfs_da_path_shift(state, &state->path, 1, 1, + error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) return(error); @@ -1285,16 +1636,52 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) *========================================================================*/ /* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + xfs_da3_node_hdr_from_disk(&node1hdr, node1); + xfs_da3_node_hdr_from_disk(&node2hdr, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* * Link a new block into a doubly linked list of blocks (of whatever type). 
*/ int /* error */ -xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, - xfs_da_state_blk_t *new_blk) +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) { - xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; - xfs_da_args_t *args; - int before=0, error; - struct xfs_buf *bp; + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; /* * Set up environment. @@ -1306,9 +1693,6 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); - ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); - ASSERT(old_blk->magic == new_blk->magic); switch (old_blk->magic) { case XFS_ATTR_LEAF_MAGIC: @@ -1318,7 +1702,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(old_blk->bp, new_blk->bp); break; } @@ -1333,14 +1717,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); tmp_info = bp->b_addr; - ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); xfs_trans_log_buf(args->trans, bp, 0, 
sizeof(*tmp_info)-1); @@ -1354,7 +1738,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1375,59 +1759,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, } /* - * Compare two intermediate nodes for "order". - */ -STATIC int -xfs_da_node_order( - struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp) -{ - xfs_da_intnode_t *node1, *node2; - - node1 = node1_bp->b_addr; - node2 = node2_bp->b_addr; - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && - node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < - be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); -} - -/* - * Pick up the last hashvalue from an intermediate node. - */ -STATIC uint -xfs_da_node_lasthash( - struct xfs_buf *bp, - int *count) -{ - xfs_da_intnode_t *node; - - node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (count) - *count = be16_to_cpu(node->hdr.count); - if (!node->hdr.count) - return(0); - return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); -} - -/* * Unlink a block from a doubly linked list of blocks. 
*/ STATIC int /* error */ -xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; - xfs_da_args_t *args; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; /* * Set up environment. @@ -1439,8 +1784,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); - ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); ASSERT(save_blk->magic == drop_blk->magic); ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || (be32_to_cpu(save_info->back) == drop_blk->blkno)); @@ -1454,7 +1797,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) @@ -1471,7 +1814,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) @@ -1499,15 +1842,22 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * the new bottom and the root. 
*/ int /* error */ -xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, - int forward, int release, int *result) +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_da_args_t *args; - xfs_dablk_t blkno=0; - int level, error; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; trace_xfs_da_path_shift(state->args); @@ -1522,16 +1872,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - ASSERT(blk->bp != NULL); node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } } @@ -1557,45 +1908,60 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. 
*/ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) return(error); - ASSERT(blk->bp != NULL); info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - blk->magic = be16_to_cpu(info->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. + */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; else - blk->index = be16_to_cpu(node->hdr.count)-1; - blkno = be32_to_cpu(node->btree[blk->index].before); - } else { + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - switch(blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); - break; - case XFS_DIR2_LEAFN_MAGIC: - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); - break; - default: - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC); - break; - } + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + 
break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(0); + break; } } *result = 0; - return(0); + return 0; } @@ -1782,22 +2148,36 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock( - xfs_da_args_t *args, - xfs_dablk_t *dead_blknop, - struct xfs_buf **dead_bufp) +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { - xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; - xfs_fileoff_t lastoff; - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error, w, entno, level, dead_level; - xfs_da_blkinfo_t *dead_info, *sib_info; - xfs_da_intnode_t *par_node, *dead_node; - xfs_dir2_leaf_t *dead_leaf2; - xfs_dahash_t dead_hash; + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *ip; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; trace_xfs_da_swap_lastblock(args); @@ -1821,7 +2201,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. 
*/ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1833,22 +2213,31 @@ xfs_da_swap_lastblock( /* * Get values from the moved block. */ - if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = xfs_dir3_leaf_ents_p(dead_leaf2); dead_level = 0; - dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { - ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + struct xfs_da3_icnode_hdr deadhdr; + dead_node = (xfs_da_intnode_t *)dead_info; - dead_level = be16_to_cpu(dead_node->hdr.level); - dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + xfs_da3_node_hdr_from_disk(&deadhdr, dead_node); + btree = xfs_da3_node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } sib_buf = par_buf = NULL; /* * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1870,7 +2259,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. 
*/ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1894,31 +2283,31 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely(par_node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC) || - (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - level = be16_to_cpu(par_node->hdr.level); + level = par_hdr.level; + btree = xfs_da3_node_tree_p(par_node); for (entno = 0; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - par_blkno = be32_to_cpu(par_node->btree[entno].before); + par_blkno = be32_to_cpu(btree[entno].before); if (level == dead_level + 1) break; xfs_trans_brelse(tp, par_buf); @@ -1930,13 +2319,13 @@ xfs_da_swap_lastblock( */ for (;;) { for (; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; entno++) continue; - if (entno < be16_to_cpu(par_node->hdr.count)) + if (entno < par_hdr.count) break; - par_blkno = 
be32_to_cpu(par_node->hdr.info.forw); + par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { @@ -1945,27 +2334,27 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely( - be16_to_cpu(par_node->hdr.level) != level || - par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } + btree = xfs_da3_node_tree_p(par_node); entno = 0; } /* * Update the parent entry pointing to the moved block. */ - par_node->btree[entno].before = cpu_to_be32(dead_blkno); + btree[entno].before = cpu_to_be32(dead_blkno); xfs_trans_log_buf(tp, par_buf, - XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, - sizeof(par_node->btree[entno].before))); + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; @@ -2007,14 +2396,15 @@ xfs_da_shrink_inode( * Remove extents. If we get ENOSPC for a dir we have to move * the last block to the place we want to kill. 
*/ - if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, - &done)) == ENOSPC) { + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == ENOSPC) { if (w != XFS_DATA_FORK) break; - if ((error = xfs_da_swap_lastblock(args, &dead_blkno, - &dead_buf))) + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) break; } else { break; @@ -2279,12 +2669,21 @@ xfs_da_read_buf( magic1 = be32_to_cpu(hdr->magic); if (unlikely( XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_DA3_NODE_MAGIC) && (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_ATTR3_LEAF_MAGIC) && (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR3_LEAF1_MAGIC) && (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic != XFS_DIR3_LEAFN_MAGIC) && (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR3_BLOCK_MAGIC) && (magic1 != XFS_DIR2_DATA_MAGIC) && - (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), + (magic1 != XFS_DIR3_DATA_MAGIC) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR3_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { trace_xfs_da_btree_corrupt(bp, _RET_IP_); @@ -2342,41 +2741,3 @@ out_free: return -1; return mappedbno; } - -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ - -/* - * Allocate a dir-state structure. - * We don't put them on the stack since they're large. - */ -xfs_da_state_t * -xfs_da_state_alloc(void) -{ - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); -} - -/* - * Kill the altpath contents of a da-state structure. - */ -STATIC void -xfs_da_state_kill_altpath(xfs_da_state_t *state) -{ - int i; - - for (i = 0; i < state->altpath.active; i++) - state->altpath.blk[i].bp = NULL; - state->altpath.active = 0; -} - -/* - * Free a da-state structure. 
- */ -void -xfs_da_state_free(xfs_da_state_t *state) -{ - xfs_da_state_kill_altpath(state); -#ifdef DEBUG - memset((char *)state, 0, sizeof(*state)); -#endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index ee5170c46ae..6fb3371c63c 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -20,7 +21,6 @@ struct xfs_bmap_free; struct xfs_inode; -struct xfs_mount; struct xfs_trans; struct zone; @@ -47,6 +47,33 @@ typedef struct xfs_da_blkinfo { } xfs_da_blkinfo_t; /* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* * This is the structure of the root and intermediate nodes in the Btree. 
* The leaf nodes are defined above. * @@ -57,19 +84,76 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + typedef struct xfs_da_intnode { - struct xfs_da_node_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active entries */ - __be16 level; /* level above leaves (leaf == 0) */ - } hdr; - struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ - } btree[1]; /* variable sized array of keys */ + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; } xfs_da_intnode_t; -typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; -typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. 
+ */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + +static inline int +xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return sizeof(struct xfs_da3_node_hdr); + return sizeof(struct xfs_da_node_hdr); +} + +static inline struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap; + return dap3->__btree; + } + return dap->__btree; +} + +extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); #define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize @@ -191,32 +275,34 @@ struct xfs_nameops { /* * Routines used for growing the Btree. */ -int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork); -int xfs_da_split(xfs_da_state_t *state); +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); /* * Routines used for shrinking the Btree. */ -int xfs_da_join(xfs_da_state_t *state); -void xfs_da_fixhashpath(xfs_da_state_t *state, - xfs_da_state_path_t *path_to_to_fix); +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); /* * Routines used for finding things in the Btree. 
*/ -int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); -int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int forward, int release, int *result); /* * Utility routines. */ -int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); -int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int which_fork); +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; + /* * Utility routines. */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 1d9643b3dce..f7a0e95d197 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -19,7 +19,7 @@ #define __XFS_DINODE_H__ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ @@ -70,11 +70,36 @@ typedef struct xfs_dinode { /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ -} __attribute__((packed)) xfs_dinode_t; + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} 
xfs_dinode_t; #define DI_MAX_FLUSH 0xffff /* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. @@ -104,11 +129,11 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp) \ - ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) -#define XFS_BROOT_SIZE_ADJ \ - (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) +#define XFS_BROOT_SIZE_ADJ(ip) \ + (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) /* * Inode data & attribute fork sizes, per inode. @@ -119,10 +144,10 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_DSIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp)) + XFS_LITINO(mp, (dip)->di_version)) #define XFS_DFORK_ASIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. 
*/ #define XFS_DFORK_DPTR(dip) \ - ((char *)(dip) + sizeof(struct xfs_dinode)) + ((char *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 12afe07a91d..e59f5fc816f 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -28,11 +29,13 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Local function prototypes. @@ -56,52 +59,110 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static void -xfs_dir2_block_verify( +static bool +xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } static void -xfs_dir2_block_read_verify( +xfs_dir3_block_read_verify( struct xfs_buf *bp) { 
- xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_block_write_verify( +xfs_dir3_block_write_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_block_buf_ops = { - .verify_read = xfs_dir2_block_read_verify, - .verify_write = xfs_dir2_block_write_verify, +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, }; static int -xfs_dir2_block_read( +xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + int err; - return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, &xfs_dir2_block_buf_ops); + err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, 
XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void @@ -121,7 +182,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = hdr->bestfree; + bf = xfs_dir3_data_bestfree_p(hdr); /* * If there are stale entries we'll use one for the leaf. @@ -303,7 +364,7 @@ xfs_dir2_block_addname( mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; @@ -498,7 +559,7 @@ xfs_dir2_block_addname( xfs_dir2_data_log_header(tp, bp); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -531,7 +592,7 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - error = xfs_dir2_block_read(NULL, dp, &bp); + error = xfs_dir3_block_read(NULL, dp, &bp); if (error) return error; @@ -541,12 +602,12 @@ xfs_dir2_block_getdents( */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. 
*/ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* @@ -665,7 +726,7 @@ xfs_dir2_block_lookup( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -711,12 +772,12 @@ xfs_dir2_block_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -853,7 +914,7 @@ xfs_dir2_block_removename( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ @@ -910,7 +971,7 @@ xfs_dir2_block_replace( */ dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -958,6 +1019,8 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -965,8 +1028,12 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have 
@@ -974,9 +1041,12 @@ xfs_dir2_leaf_to_block( * These will show up in the leaf bests table. */ while (dp->i_d.di_size > mp->m_dirblksize) { + int hdrsz; + + hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + mp->m_dirblksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -988,17 +1058,19 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } hdr = dbp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + /* * Size of the "leaf" area in the block. */ size = (uint)sizeof(xfs_dir2_block_tail_t) + - (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ @@ -1014,8 +1086,8 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - dbp->b_ops = &xfs_dir2_block_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, dbp, dp); + needlog = 1; needscan = 0; /* @@ -1027,18 +1099,17 @@ xfs_dir2_leaf_to_block( * Initialize the block tail. */ btp = xfs_dir2_block_tail_p(mp, hdr); - btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. 
*/ lep = xfs_dir2_block_leaf_p(btp); - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = leaf->ents[from]; + lep[to++] = ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1137,16 +1208,16 @@ xfs_dir2_sf_to_block( return error; } /* - * Initialize the data block. + * Initialize the data block, then convert it to block format. */ - error = xfs_dir2_data_init(args, blkno, &bp); + error = xfs_dir3_data_init(args, blkno, &bp); if (error) { kmem_free(sfp); return error; } - bp->b_ops = &xfs_dir2_block_buf_ops; + xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* * Compute size of block "tail" area. */ @@ -1156,7 +1227,7 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); needlog = needscan = 0; xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, &needscan); @@ -1178,8 +1249,7 @@ xfs_dir2_sf_to_block( /* * Create entry for . */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET); + dep = xfs_dir3_data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; @@ -1192,8 +1262,7 @@ xfs_dir2_sf_to_block( /* * Create entry for .. 
*/ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep = xfs_dir3_data_dotdot_entry_p(hdr); dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; @@ -1203,7 +1272,7 @@ xfs_dir2_sf_to_block( blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)hdr)); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = xfs_dir3_data_first_offset(hdr); /* * Loop over existing entries, stuff them in. */ @@ -1273,6 +1342,6 @@ xfs_dir2_sf_to_block( ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index ffcf1774152..c2930238005 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -30,6 +31,8 @@ #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); @@ -40,7 +43,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); * Return 0 is the buffer is good, otherwise an error. 
*/ int -__xfs_dir2_data_check( +__xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -65,15 +68,17 @@ __xfs_dir2_data_check( mp = bp->b_target->bt_mount; hdr = bp->b_addr; - bf = hdr->bestfree; - p = (char *)(hdr + 1); + bf = xfs_dir3_data_bestfree_p(hdr); + p = (char *)xfs_dir3_data_entry_p(hdr); switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; break; @@ -148,7 +153,8 @@ __xfs_dir2_data_check( (char *)dep - (char *)hdr); count++; lastfree = 0; - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); @@ -168,7 +174,8 @@ __xfs_dir2_data_check( * Need to have seen all the entries and all the bestfree slots. 
*/ XFS_WANT_CORRUPTED_RETURN(freeseen == 7); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) @@ -185,21 +192,27 @@ __xfs_dir2_data_check( return 0; } -static void -xfs_dir2_data_verify( +static bool +xfs_dir3_data_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } /* @@ -208,7 +221,7 @@ xfs_dir2_data_verify( * format buffer or a data format buffer on readahead. 
*/ static void -xfs_dir2_data_reada_verify( +xfs_dir3_data_reada_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -216,11 +229,13 @@ xfs_dir2_data_reada_verify( switch (hdr->magic) { case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - bp->b_ops = &xfs_dir2_block_buf_ops; + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; bp->b_ops->verify_read(bp); return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - xfs_dir2_data_verify(bp); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); @@ -230,51 +245,80 @@ xfs_dir2_data_reada_verify( } static void -xfs_dir2_data_read_verify( +xfs_dir3_data_read_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_data_write_verify( +xfs_dir3_data_write_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_data_buf_ops = { - .verify_read = xfs_dir2_data_read_verify, - .verify_write = xfs_dir2_data_write_verify, +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = 
xfs_dir3_data_write_verify, }; -static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { - .verify_read = xfs_dir2_data_reada_verify, - .verify_write = xfs_dir2_data_write_verify, +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, }; int -xfs_dir2_data_read( +xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir2_data_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; } int -xfs_dir2_data_readahead( +xfs_dir3_data_readahead( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } /* @@ -288,12 +332,15 @@ xfs_dir2_data_freefind( { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ + struct xfs_dir2_data_free *bf; #if defined(DEBUG) && defined(__KERNEL__) int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + bf = xfs_dir3_data_bestfree_p(hdr); + #if defined(DEBUG) && defined(__KERNEL__) /* * Validate some consistency in the bestfree table. @@ -301,9 +348,11 @@ xfs_dir2_data_freefind( * one we're looking for it has to be exact. 
*/ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - for (dfp = &hdr->bestfree[0], seenzero = matched = 0; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -319,7 +368,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &hdr->bestfree[0]) + if (dfp > &bf[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -328,14 +377,12 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. 
*/ - for (dfp = &hdr->bestfree[0]; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; if (be16_to_cpu(dfp->offset) == off) @@ -359,11 +406,12 @@ xfs_dir2_data_freeinsert( xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif - dfp = hdr->bestfree; + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + dfp = xfs_dir3_data_bestfree_p(hdr); new.length = dup->length; new.offset = cpu_to_be16((char *)dup - (char *)hdr); @@ -400,32 +448,36 @@ xfs_dir2_data_freeremove( xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { -#ifdef __KERNEL__ + struct xfs_dir2_data_free *bf; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * It's the first entry, slide the next 2 up. */ - if (dfp == &hdr->bestfree[0]) { - hdr->bestfree[0] = hdr->bestfree[1]; - hdr->bestfree[1] = hdr->bestfree[2]; + bf = xfs_dir3_data_bestfree_p(hdr); + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &hdr->bestfree[1]) - hdr->bestfree[1] = hdr->bestfree[2]; + else if (dfp == &bf[1]) + bf[1] = bf[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &hdr->bestfree[2]); + ASSERT(dfp == &bf[2]); /* * Clear the 3rd entry, must be zero now. 
*/ - hdr->bestfree[2].length = 0; - hdr->bestfree[2].offset = 0; + bf[2].length = 0; + bf[2].offset = 0; *loghead = 1; } @@ -441,23 +493,27 @@ xfs_dir2_data_freescan( xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * Start by clearing the table. */ - memset(hdr->bestfree, 0, sizeof(hdr->bestfree)); + bf = xfs_dir3_data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + p = (char *)xfs_dir3_data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(mp, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else @@ -493,7 +549,7 @@ xfs_dir2_data_freescan( * Give back the buffer for the created block. 
*/ int /* error */ -xfs_dir2_data_init( +xfs_dir3_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ struct xfs_buf **bpp) /* output block buffer */ @@ -502,6 +558,7 @@ xfs_dir2_data_init( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; int error; /* error return value */ int i; /* bestfree index */ xfs_mount_t *mp; /* filesystem mount point */ @@ -518,27 +575,40 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_ops = &xfs_dir2_data_buf_ops; + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* * Initialize the header. */ hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = xfs_dir3_data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - hdr->bestfree[i].length = 0; - hdr->bestfree[i].offset = 0; + bf[i].length = 0; + bf[i].offset = 0; } /* * Set up an unused entry for the block's body. 
*/ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)sizeof(*hdr); - hdr->bestfree[0].length = cpu_to_be16(t); + t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr); + bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* @@ -562,7 +632,9 @@ xfs_dir2_data_log_entry( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - @@ -580,9 +652,11 @@ xfs_dir2_data_log_header( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1); } /* @@ -597,7 +671,9 @@ xfs_dir2_data_log_unused( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); /* * Log the first part of the unused entry. 
@@ -635,6 +711,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; mp = tp->t_mountp; hdr = bp->b_addr; @@ -642,12 +719,14 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) endptr = (char *)hdr + mp->m_dirblksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); btp = xfs_dir2_block_tail_p(mp, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } @@ -655,7 +734,7 @@ xfs_dir2_data_make_free( * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(*hdr)) { + if (offset > xfs_dir3_data_entry_offset(hdr)) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -681,6 +760,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ + bf = xfs_dir3_data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -695,7 +775,7 @@ xfs_dir2_data_make_free( * since the third bestfree is there, there might be more * entries. */ - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); /* * Fix up the new big freespace. */ @@ -711,10 +791,10 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. 
*/ ASSERT(dfp && dfp2); - if (dfp == &hdr->bestfree[1]) { - dfp = &hdr->bestfree[0]; + if (dfp == &bf[1]) { + dfp = &bf[0]; ASSERT(dfp2 == dfp); - dfp2 = &hdr->bestfree[1]; + dfp2 = &bf[1]; } xfs_dir2_data_freeremove(hdr, dfp2, needlogp); xfs_dir2_data_freeremove(hdr, dfp, needlogp); @@ -722,7 +802,7 @@ xfs_dir2_data_make_free( * Now insert the new entry. */ dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); - ASSERT(dfp == &hdr->bestfree[0]); + ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -751,7 +831,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -779,7 +859,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -818,10 +898,13 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)hdr); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); @@ -831,7 +914,8 @@ xfs_dir2_data_use_free( */ dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length)); + bf = xfs_dir3_data_bestfree_p(hdr); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. 
*/ @@ -845,7 +929,7 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (hdr->bestfree[2].offset != 0); + needscan = (bf[2].offset != 0); if (!needscan) xfs_dir2_data_freeremove(hdr, dfp, needlogp); } @@ -875,7 +959,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -902,7 +986,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -930,7 +1014,7 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); if (!needscan) { xfs_dir2_data_freeremove(hdr, dfp, needlogp); xfs_dir2_data_freeinsert(hdr, newdup, needlogp); diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index 07270981f48..a3b1bd841a8 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -36,6 +37,38 @@ #define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ /* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or the format is v3 + * they implement the v3 functionality. This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. 
The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* * Byte offset in data block and shortform entry. */ typedef __uint16_t xfs_dir2_data_off_t; @@ -195,16 +228,6 @@ xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) - -/* * Describe a free area in the data block. * * The freespace will be formatted as a xfs_dir2_data_unused_t. @@ -226,6 +249,39 @@ typedef struct xfs_dir2_data_hdr { } xfs_dir2_data_hdr_t; /* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. 
+ */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +static inline struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr; + return hdr3->best_free; + } + return hdr->bestfree; +} + +/* * Active entry in a data block. * * Aligned to 8 bytes. After the variable length name field there is a @@ -280,6 +336,94 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) be16_to_cpu(dup->length) - sizeof(__be16)); } +static inline size_t +xfs_dir3_data_hdr_size(bool dir3) +{ + if (dir3) + return sizeof(struct xfs_dir3_data_hdr); + return sizeof(struct xfs_dir2_data_hdr); +} + +static inline size_t +xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr) +{ + bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return xfs_dir3_data_hdr_size(dir3); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +static inline struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +/* + * Offsets of . and .. 
in data space (always block 0) + * + * The macros are used for shortform directories as they have no headers to read + * the magic number out of. Shortform directories need to know the size of the + * data block header because the sfe embeds the block offset of the entry into + * it so that it doesn't change when format conversion occurs. Bad Things Happen + * if we don't follow this rule. + */ +#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ + xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) +#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ + (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1)) +#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ + (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2)) + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_entry_offset(hdr); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); +} + +/* + * location of . and .. in data space (always block 0) + */ +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_first_offset(hdr)); +} + /* * Leaf block structures. 
* @@ -329,6 +473,21 @@ typedef struct xfs_dir2_leaf_hdr { __be16 stale; /* count of stale entries */ } xfs_dir2_leaf_hdr_t; +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + /* * Leaf block entry. */ @@ -348,23 +507,50 @@ typedef struct xfs_dir2_leaf_tail { * Leaf block. */ typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[]; /* entries */ + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ } xfs_dir2_leaf_t; -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +static inline int +xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) { - return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) / + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return sizeof(struct xfs_dir3_leaf_hdr); + return sizeof(struct xfs_dir2_leaf_hdr); +} + +static inline int +xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) +{ + return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) / (uint)sizeof(struct xfs_dir2_leaf_entry); } /* * Get address of the bestcount field in the single-leaf block. 
*/ +static inline struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp; + return lp3->__ents; + } + return lp->__ents; +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ static inline struct xfs_dir2_leaf_tail * xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) { @@ -383,6 +569,10 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* * Convert dataptr to byte in file space */ static inline xfs_dir2_off_t @@ -520,19 +710,65 @@ typedef struct xfs_dir2_free { /* unused entries are -1 */ } xfs_dir2_free_t; -static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp) +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. 
+ */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + +static inline int +xfs_dir3_free_hdr_size(struct xfs_mount *mp) { - return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / + if (xfs_sb_version_hascrc(&mp->m_sb)) + return sizeof(struct xfs_dir3_free_hdr); + return sizeof(struct xfs_dir2_free_hdr); +} + +static inline int +xfs_dir3_free_max_bests(struct xfs_mount *mp) +{ + return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) / sizeof(xfs_dir2_data_off_t); } +static inline __be16 * +xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp)); +} + /* * Convert data space db to the corresponding free db. */ static inline xfs_dir2_db_t xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) { - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); + return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp); } /* @@ -541,7 +777,7 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) static inline int xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) { - return db % xfs_dir2_free_max_bests(mp); + return db % xfs_dir3_free_max_bests(mp); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 60cd2fa4e04..721ba2fe8e5 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -33,97 +34,371 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Local function declarations. 
*/ -#ifdef DEBUG -static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leaf_check(dp, bp) -#endif static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, int *indexp, struct xfs_buf **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, +static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); -static void -xfs_dir2_leaf_verify( +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = 
be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + } + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + if (from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC) { + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); + } +} + +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + + ents = xfs_dir3_leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf)) + return false; + + /* Leaves and bests don't overlap in leaf format. 
*/ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. */ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +static bool +xfs_dir3_leaf_verify( struct xfs_buf *bp, - __be16 magic) + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + if ((magic == XFS_DIR2_LEAF1_MAGIC && + leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || + (magic == XFS_DIR2_LEAFN_MAGIC && + leafhdr.magic != XFS_DIR3_LEAFN_MAGIC)) + return false; + + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leafhdr.magic != magic) + return false; + } + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_LEAF_CRC_OFF)) || + !xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; - int block_ok = 
0; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; - block_ok = hdr->info.magic == magic; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + if (!xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + return; } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); } static void -xfs_dir2_leaf1_write_verify( +xfs_dir3_leaf1_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } -void -xfs_dir2_leafn_read_verify( +static void +xfs_dir3_leafn_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -void -xfs_dir2_leafn_write_verify( +static void +xfs_dir3_leafn_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { - .verify_read = xfs_dir2_leaf1_read_verify, - .verify_write = xfs_dir2_leaf1_write_verify, +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, }; -const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { - .verify_read = xfs_dir2_leafn_read_verify, - .verify_write = xfs_dir2_leafn_write_verify, +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, }; static int 
-xfs_dir2_leaf_read( +xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; } int -xfs_dir2_leafn_read( +xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. 
+ */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(tp, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(tp, bp); + *bpp = bp; + return 0; } /* @@ -149,6 +424,9 @@ xfs_dir2_block_to_leaf( int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -168,26 +446,33 @@ xfs_dir2_block_to_leaf( /* * Initialize the leaf block, get a buffer for it. 
*/ - if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) return error; - } - ASSERT(lbp != NULL); + leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + bf = xfs_dir3_data_bestfree_p(hdr); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Set the counts in the leaf header. */ - leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); - leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ - memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* @@ -202,8 +487,13 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. 
*/ - dbp->b_ops = &xfs_dir2_data_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); /* @@ -212,21 +502,22 @@ xfs_dir2_block_to_leaf( ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); - bestsp[0] = hdr->bestfree[0].length; + bestsp[0] = bf[0].length; /* * Log the data header and leaf bests table. */ if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, 0); return 0; } STATIC void -xfs_dir2_leaf_find_stale( - struct xfs_dir2_leaf *leaf, +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int *lowstale, int *highstale) @@ -235,7 +526,7 @@ xfs_dir2_leaf_find_stale( * Find the first stale entry before our index, if any. */ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { - if (leaf->ents[*lowstale].address == + if (ents[*lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; } @@ -245,10 +536,8 @@ xfs_dir2_leaf_find_stale( * Stop if the result would require moving more entries than using * lowstale. 
*/ - for (*highstale = index; - *highstale < be16_to_cpu(leaf->hdr.count); - ++*highstale) { - if (leaf->ents[*highstale].address == + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; if (*lowstale >= 0 && index - *lowstale <= *highstale - index) @@ -257,8 +546,9 @@ xfs_dir2_leaf_find_stale( } struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry( - xfs_dir2_leaf_t *leaf, /* leaf structure */ +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, /* leaf table position */ int compact, /* need to compact leaves */ int lowstale, /* index of prev stale leaf */ @@ -266,7 +556,7 @@ xfs_dir2_leaf_find_entry( int *lfloglow, /* low leaf logging index */ int *lfloghigh) /* high leaf logging index */ { - if (!leaf->hdr.stale) { + if (!leafhdr->stale) { xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ /* @@ -274,18 +564,16 @@ xfs_dir2_leaf_find_entry( * * If there are no stale entries, just insert a hole at index. */ - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) + lep = &ents[index]; + if (index < leafhdr->count) memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * - sizeof(*lep)); + (leafhdr->count - index) * sizeof(*lep)); /* * Record low and high logging indices for the leaf. */ *lfloglow = index; - *lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); + *lfloghigh = leafhdr->count++; return lep; } @@ -299,16 +587,17 @@ xfs_dir2_leaf_find_entry( * entries before and after our insertion point. */ if (compact == 0) - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); /* * If the low one is better, use it. 
*/ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale - 1 < highstale - index)) { ASSERT(index - lowstale - 1 >= 0); - ASSERT(leaf->ents[lowstale].address == + ASSERT(ents[lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* @@ -316,37 +605,34 @@ xfs_dir2_leaf_find_entry( * for the new entry. */ if (index - lowstale - 1 > 0) { - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], + memmove(&ents[lowstale], &ents[lowstale + 1], (index - lowstale - 1) * - sizeof(xfs_dir2_leaf_entry_t)); + sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(lowstale, *lfloglow); *lfloghigh = MAX(index - 1, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index - 1]; + leafhdr->stale--; + return &ents[index - 1]; } /* * The high one is better, so use that one. */ ASSERT(highstale - index >= 0); - ASSERT(leaf->ents[highstale].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* * Copy entries down to cover the stale entry and make room for the * new entry. 
*/ if (highstale - index > 0) { - memmove(&leaf->ents[index + 1], - &leaf->ents[index], + memmove(&ents[index + 1], &ents[index], (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(index, *lfloglow); *lfloghigh = MAX(highstale, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index]; + leafhdr->stale--; + return &ents[index]; } /* @@ -383,6 +669,9 @@ xfs_dir2_leaf_addname( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); @@ -390,7 +679,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; @@ -403,16 +692,19 @@ xfs_dir2_leaf_addname( index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); + /* * See if there are any entries with the same hash value * and space in their block for the new entry. * This is good because it puts multiple same-hash value entries * in a data block, improving the lookup of those entries. */ - for (use_block = -1, lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; @@ -445,7 +737,7 @@ xfs_dir2_leaf_addname( * How many bytes do we need in the leaf block? 
*/ needbytes = 0; - if (!leaf->hdr.stale) + if (!leafhdr.stale) needbytes += sizeof(xfs_dir2_leaf_entry_t); if (use_block == -1) needbytes += sizeof(xfs_dir2_data_off_t); @@ -460,16 +752,15 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) compact = 1; - } + /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( - leaf->hdr.count)] < needbytes) { + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { /* * Just checking or no space reservation, give up. */ @@ -517,15 +808,15 @@ xfs_dir2_leaf_addname( * point later. */ if (compact) { - xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); } /* * There are stale entries, so we'll need log-low and log-high * impossibly bad values later. */ - else if (be16_to_cpu(leaf->hdr.stale)) { - lfloglow = be16_to_cpu(leaf->hdr.count); + else if (leafhdr.stale) { + lfloglow = leafhdr.count; lfloghigh = -1; } /* @@ -544,7 +835,7 @@ xfs_dir2_leaf_addname( /* * Initialize the block. 
*/ - if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { xfs_trans_brelse(tp, lbp); return error; } @@ -557,23 +848,24 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(&ltp->bestcount, 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); hdr = dbp->b_addr; - bestsp[use_block] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; grown = 1; } else { /* * Already had space in some data block. * Just read that one in. */ - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp); if (error) { @@ -581,13 +873,14 @@ xfs_dir2_leaf_addname( return error; } hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); grown = 0; } /* * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* @@ -620,13 +913,13 @@ xfs_dir2_leaf_addname( * If the bests table needs to be changed, do it. * Log the change unless we've already done that.
*/ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) { - bestsp[use_block] = hdr->bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); } - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); /* @@ -638,82 +931,40 @@ xfs_dir2_leaf_addname( /* * Log the leaf fields and give up the buffers. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); return 0; } -#ifdef DEBUG -/* - * Check the internal consistency of a leaf1 block. - * Pop an assert if something is wrong. - */ -STATIC void -xfs_dir2_leaf_check( - struct xfs_inode *dp, /* incore directory inode */ - struct xfs_buf *bp) /* leaf's buffer */ -{ - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ - - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - /* - * This value is not restrictive enough. - * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. - */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Leaves and bests don't overlap. 
- */ - ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)xfs_dir2_leaf_bests_p(ltp)); - /* - * Check hash value order, count stale entries. - */ - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); -} -#endif /* DEBUG */ - /* * Compact out any stale entries in the leaf. * Log the header and changed leaf entries, if any. */ void -xfs_dir2_leaf_compact( +xfs_dir3_leaf_compact( xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; leaf = bp->b_addr; - if (!leaf->hdr.stale) { + if (!leafhdr->stale) return; - } + /* * Compress out the stale entries in place. */ - for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + ents = xfs_dir3_leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -721,19 +972,21 @@ xfs_dir2_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; } to++; } /* * Update and log the header, log the leaf entries. 
*/ - ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(args->trans, bp); + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args->trans, bp); if (loglow != -1) - xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1); } /* @@ -745,8 +998,9 @@ xfs_dir2_leaf_compact( * and leaf logging indices. */ void -xfs_dir2_leaf_compact_x1( - struct xfs_buf *bp, /* leaf buffer */ +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -757,22 +1011,20 @@ xfs_dir2_leaf_compact_x1( int highstale; /* stale entry at/after index */ int index; /* insertion index */ int keepstale; /* source index of kept stale */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int lowstale; /* stale entry before index */ int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->b_addr; - ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + ASSERT(leafhdr->stale > 1); index = *indexp; - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); /* * Pick the better of lowstale and highstale. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale <= highstale - index)) keepstale = lowstale; else @@ -781,15 +1033,14 @@ xfs_dir2_leaf_compact_x1( * Copy the entries in place, removing all the stale entries * except keepstale. 
*/ - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + for (from = to = 0; from < leafhdr->count; from++) { /* * Notice the new value of index. */ if (index == from) newindex = to; if (from != keepstale && - leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -803,7 +1054,7 @@ xfs_dir2_leaf_compact_x1( * Copy only the entries that have moved. */ if (from > to) - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; to++; } ASSERT(from > to); @@ -817,8 +1068,8 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add_cpu(&leaf->hdr.count, -(from - to)); - leaf->hdr.stale = cpu_to_be16(1); + leafhdr->count -= from - to; + leafhdr->stale = 1; /* * Remember the low/high stale value only in the "right" * direction. @@ -826,8 +1077,8 @@ xfs_dir2_leaf_compact_x1( if (lowstale >= newindex) lowstale = -1; else - highstale = be16_to_cpu(leaf->hdr.count); - *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; *lowstalep = lowstale; *highstalep = highstale; } @@ -965,7 +1216,7 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_dir2_data_read(NULL, dp, map->br_startoff, + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); @@ -994,7 +1245,7 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + @@ -1007,7 +1258,7 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. 
*/ else if (i > mip->ra_current) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, -1); mip->ra_current = i; @@ -1133,17 +1384,17 @@ xfs_dir2_leaf_getdents( ASSERT(xfs_dir2_byte_to_db(mp, curoff) == map_info->curdb); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += (uint)sizeof(*hdr); + curoff += xfs_dir3_data_entry_offset(hdr); /* * Skip past entries until we reach our offset. */ @@ -1220,69 +1471,12 @@ xfs_dir2_leaf_getdents( return error; } -/* - * Initialize a new leaf block, leaf1 or leafn magic accepted. - */ -int -xfs_dir2_leaf_init( - xfs_da_args_t *args, /* operation arguments */ - xfs_dir2_db_t bno, /* directory block number */ - struct xfs_buf **bpp, /* out: leaf buffer */ - int magic) /* magic number for block */ -{ - struct xfs_buf *bp; /* leaf buffer */ - xfs_inode_t *dp; /* incore directory inode */ - int error; /* error return code */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - - dp = args->dp; - ASSERT(dp != NULL); - tp = args->trans; - mp = dp->i_mount; - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); - /* - * Get the buffer for the block. - */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) - return error; - - /* - * Initialize the header. 
- */ - leaf = bp->b_addr; - leaf->hdr.info.magic = cpu_to_be16(magic); - leaf->hdr.info.forw = 0; - leaf->hdr.info.back = 0; - leaf->hdr.count = 0; - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(tp, bp); - /* - * If it's a leaf-format directory initialize the tail. - * In this case our caller has the real bests table to copy into - * the block. - */ - if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_ops = &xfs_dir2_leaf1_buf_ops; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = 0; - xfs_dir2_leaf_log_tail(tp, bp); - } else - bp->b_ops = &xfs_dir2_leafn_buf_ops; - *bpp = bp; - return 0; -} /* * Log the bests entries indicated from a leaf1 block. */ static void -xfs_dir2_leaf_log_bests( +xfs_dir3_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1290,11 +1484,12 @@ xfs_dir2_leaf_log_bests( { __be16 *firstb; /* pointer to first entry */ __be16 *lastb; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; @@ -1306,7 +1501,7 @@ xfs_dir2_leaf_log_bests( * Log the leaf entries indicated from a leaf1 or leafn block. 
*/ void -xfs_dir2_leaf_log_ents( +xfs_dir3_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1314,13 +1509,17 @@ xfs_dir2_leaf_log_ents( { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - firstlep = &leaf->ents[first]; - lastlep = &leaf->ents[last]; + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = xfs_dir3_leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1329,34 +1528,38 @@ xfs_dir2_leaf_log_ents( * Log the header of the leaf1 or leafn block. */ void -xfs_dir2_leaf_log_header( +xfs_dir3_leaf_log_header( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - (uint)(sizeof(leaf->hdr) - 1)); + xfs_dir3_leaf_hdr_size(leaf) - 1); } /* * Log the tail of the leaf1 block. 
*/ STATIC void -xfs_dir2_leaf_log_tail( +xfs_dir3_leaf_log_tail( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_mount *mp = tp->t_mountp; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - mp = tp->t_mountp; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); @@ -1380,6 +1583,7 @@ xfs_dir2_leaf_lookup( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_lookup(args); @@ -1391,12 +1595,14 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Point to the data entry. */ @@ -1440,18 +1646,23 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. 
*/ enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + /* * Look for the first leaf entry with our hash value. */ @@ -1460,9 +1671,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1479,7 +1690,7 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &dbp); if (error) { @@ -1520,7 +1731,7 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, cidb), -1, &dbp); if (error) { @@ -1566,6 +1777,9 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1580,16 +1794,19 @@ xfs_dir2_leaf_removename( mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + 
xfs_dir3_data_check(dp, dbp); + bf = xfs_dir3_data_bestfree_p(hdr); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &leaf->ents[index]; + lep = &ents[index]; db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(hdr->bestfree[0].length); + oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); @@ -1602,10 +1819,13 @@ xfs_dir2_leaf_removename( /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, lbp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(tp, lbp, index, index); + /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. @@ -1618,16 +1838,16 @@ xfs_dir2_leaf_removename( * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ - if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) { - bestsp[db] = hdr->bestfree[0].length; - xfs_dir2_leaf_log_bests(tp, lbp, db, db); + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(tp, lbp, db, db); } - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. 
*/ - if (be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) { ASSERT(db != mp->m_dirdatablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* @@ -1638,7 +1858,7 @@ xfs_dir2_leaf_removename( */ if (error == ENOSPC && args->total == 0) error = 0; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); return error; } dbp = NULL; @@ -1661,8 +1881,8 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(&ltp->bestcount, -(db - i)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } @@ -1672,7 +1892,7 @@ xfs_dir2_leaf_removename( else if (db != mp->m_dirdatablk) dbp = NULL; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); /* * See if we can convert to block form. */ @@ -1695,6 +1915,7 @@ xfs_dir2_leaf_replace( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_replace(args); @@ -1706,10 +1927,11 @@ xfs_dir2_leaf_replace( } dp = args->dp; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &leaf->ents[index]; + lep = &ents[index]; /* * Point to the data entry.
*/ @@ -1723,7 +1945,7 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); xfs_trans_brelse(tp, lbp); return 0; } @@ -1745,17 +1967,22 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + #ifndef __KERNEL__ - if (!leaf->hdr.count) + if (!leafhdr.count) return 0; #endif /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + for (lep = ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1807,7 +2034,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. 
*/ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; @@ -1817,10 +2044,12 @@ xfs_dir2_leaf_trim_data( #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); - ASSERT(be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1839,23 +2068,29 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(&ltp->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } static inline size_t -xfs_dir2_leaf_size( - struct xfs_dir2_leaf_hdr *hdr, +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, int counts) { - int entries; + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); - entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale); - return sizeof(xfs_dir2_leaf_hdr_t) + - entries * sizeof(xfs_dir2_leaf_entry_t) + - counts * sizeof(xfs_dir2_data_off_t) + - sizeof(xfs_dir2_leaf_tail_t); + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); } /* @@ -1879,6 +2114,8 @@ xfs_dir2_node_to_leaf(
xfs_mount_t *mp; /* filesystem mount point */ int rval; /* successful free trim? */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; /* * There's more than a leaf level in the btree, so there must @@ -1928,7 +2165,11 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + /* * Read the freespace block. */ @@ -1936,44 +2177,49 @@ xfs_dir2_node_to_leaf( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(!free->hdr.firstdb); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > - mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) { xfs_trans_brelse(tp, fbp); return 0; } /* * If the leaf has any stale entries in it, compress them out. - * The compact routine will log the header. */ - if (be16_to_cpu(leaf->hdr.stale)) - xfs_dir2_leaf_compact(args, lbp); - else - xfs_dir2_leaf_log_header(tp, lbp); + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); - lbp->b_ops = &xfs_dir2_leaf1_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; /* * Set up the leaf tail from the freespace block. 
*/ ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = free->hdr.nvalid; + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + /* * Set up the leaf bests table. */ - memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t)); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_check(dp, lbp); + memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); + /* * Get rid of the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5980f9b7fa9..ecc6c661064 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -32,20 +33,14 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Function declarations. 
*/ static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, int index); -#ifdef DEBUG -static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leafn_check(dp, bp) -#endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, - int start_s, struct xfs_buf *bp_d, - int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -55,52 +50,126 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); -static void -xfs_dir2_free_verify( +/* + * Check internal consistency of a leafn block. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +static bool +xfs_dir3_free_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; - int block_ok = 0; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if 
(!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; } static void -xfs_dir2_free_read_verify( +xfs_dir3_free_read_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_FREE_CRC_OFF)) || + !xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_free_write_verify( +xfs_dir3_free_write_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); } -static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { - .verify_read = xfs_dir2_free_read_verify, - .verify_write = xfs_dir2_free_write_verify, +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, }; static int -__xfs_dir2_free_read( +__xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_free_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, 
&xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; } int @@ -110,7 +179,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); } static int @@ -120,7 +189,95 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + + +void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) { + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + } + + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC || + to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC || + from->magic == XFS_DIR3_FREE_MAGIC); + + if (from->magic == XFS_DIR2_FREE_MAGIC) { + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); + } +} + +static 
int +xfs_dir3_free_get_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + hdr.magic = XFS_DIR2_FREE_MAGIC; + hdr.firstdb = 0; + hdr.nused = 0; + hdr.nvalid = 0; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } + xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; } /* @@ -134,13 +291,16 @@ xfs_dir2_free_log_bests( int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); xfs_trans_log_buf(tp, bp, - (uint)((char *)&free->bests[first] - (char *)free), - (uint)((char *)&free->bests[last] - (char *)free + - sizeof(free->bests[0]) - 1)); + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); } /* @@ -154,9 +314,9 @@ xfs_dir2_free_log_header( xfs_dir2_free_t *free; /* freespace structure */ free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), - (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); + ASSERT(free->hdr.magic 
== cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); } /* @@ -183,6 +343,7 @@ xfs_dir2_leaf_to_node( xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; trace_xfs_dir2_leaf_to_node(args); @@ -199,44 +360,53 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Initialize the freespace block header. - */ - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = 0; - ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); - free->hdr.nvalid = ltp->bestcount; + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / mp->m_dirblksize); + /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; - i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + from = xfs_dir2_leaf_bests_p(ltp); + to = xfs_dir3_free_bests_p(mp, free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; *to = cpu_to_be16(off); } - free->hdr.nused = cpu_to_be32(n); - - lbp->b_ops = &xfs_dir2_leafn_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* - * Log everything. + * Now initialize the freespace block header. 
*/ - xfs_dir2_leaf_log_header(tp, lbp); + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1); xfs_dir2_free_log_header(tp, fbp); - xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_dir2_leafn_check(dp, lbp); + + /* + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. + */ + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); return 0; } @@ -260,6 +430,8 @@ xfs_dir2_leafn_add( int lowstale; /* previous stale entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); @@ -267,6 +439,8 @@ xfs_dir2_leafn_add( mp = dp->i_mount; tp = args->trans; leaf = bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -282,15 +456,15 @@ xfs_dir2_leafn_add( * a compact. 
*/ - if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { - if (!leaf->hdr.stale) + if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) { + if (!leafhdr.stale) return XFS_ERROR(ENOSPC); - compact = be16_to_cpu(leaf->hdr.stale) > 1; + compact = leafhdr.stale > 1; } else compact = 0; - ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; @@ -299,61 +473,51 @@ xfs_dir2_leafn_add( * Compact out all but one stale leaf entry. Leaves behind * the entry closest to index. */ - if (compact) { - xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); - } - /* - * Set impossible logging indices for this case. - */ - else if (leaf->hdr.stale) { - lfloglow = be16_to_cpu(leaf->hdr.count); + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; lfloghigh = -1; } /* * Insert the new entry, log everything. 
*/ - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); - xfs_dir2_leaf_log_header(tp, bp); - xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir2_leafn_check(dp, bp); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, bp); return 0; } #ifdef DEBUG -/* - * Check internal consistency of a leafn block. - */ -void -xfs_dir2_leafn_check( - struct xfs_inode *dp, - struct xfs_buf *bp) +static void +xfs_dir2_free_hdr_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_dir2_db_t db) { - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ + struct xfs_dir3_icfree_hdr hdr; - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) { - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - } - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); + xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); } +#else +#define xfs_dir2_free_hdr_check(mp, dp, db) #endif /* DEBUG */ /* @@ -365,15 +529,22 @@ xfs_dir2_leafn_lasthash( struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - xfs_dir2_leaf_t *leaf; /* leaf 
structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) + *count = leafhdr.count; + if (!leafhdr.count) return 0; - return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); + + ents = xfs_dir3_leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); } /* @@ -402,16 +573,19 @@ xfs_dir2_leafn_lookup_for_addname( xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -424,15 +598,16 @@ xfs_dir2_leafn_lookup_for_addname( curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; free = curbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. 
*/ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -451,6 +626,8 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { + __be16 *bests; + curdb = newdb; /* * Convert the data block to the free block @@ -473,13 +650,8 @@ xfs_dir2_leafn_lookup_for_addname( if (error) return error; free = curbp->b_addr; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - xfs_dir2_free_max_bests(mp)) == 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); + + xfs_dir2_free_hdr_check(mp, curbp, curdb); } /* * Get the index for our entry. @@ -488,8 +660,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * If it has room, return it. */ - if (unlikely(free->bests[fi] == - cpu_to_be16(NULLDATAOFF))) { + bests = xfs_dir3_free_bests_p(mp, free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) @@ -497,7 +669,7 @@ xfs_dir2_leafn_lookup_for_addname( return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) + if (be16_to_cpu(bests[fi]) >= length) goto out; } } @@ -511,6 +683,12 @@ out: state->extrablk.bp = curbp; state->extrablk.index = fi; state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. 
+ */ state->extrablk.magic = XFS_DIR2_FREE_MAGIC; } else { state->extravalid = 0; @@ -545,16 +723,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -569,9 +750,9 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. 
*/ @@ -604,13 +785,13 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &curbp); if (error) return error; } - xfs_dir2_data_check(dp, curbp); + xfs_dir3_data_check(dp, curbp); curdb = newdb; } /* @@ -638,13 +819,13 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } } - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - (args->op_flags & XFS_DA_OP_OKNOENT)); + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); if (curbp) { if (args->cmpresult == XFS_CMP_DIFFERENT) { /* Giving back last used data block. */ @@ -653,7 +834,8 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -689,52 +871,50 @@ xfs_dir2_leafn_lookup_int( * Log entries and headers. Stale entries are preserved. 
*/ static void -xfs_dir2_leafn_moveents( - xfs_da_args_t *args, /* operation arguments */ - struct xfs_buf *bp_s, /* source leaf buffer */ - int start_s, /* source leaf index */ - struct xfs_buf *bp_d, /* destination leaf buffer */ - int start_d, /* destination leaf index */ - int count) /* count of leaves to copy */ +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ { - xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ - xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ - int stale; /* count stale leaves copied */ - xfs_trans_t *tp; /* transaction pointer */ + struct xfs_trans *tp = args->trans; + int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); /* * Silently return if nothing to do. */ - if (count == 0) { + if (count == 0) return; - } - tp = args->trans; - leaf_s = bp_s->b_addr; - leaf_d = bp_d->b_addr; + /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination * to hold the new entries. 
*/ - if (start_d < be16_to_cpu(leaf_d->hdr.count)) { - memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], - (be16_to_cpu(leaf_d->hdr.count) - start_d) * - sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, - count + be16_to_cpu(leaf_d->hdr.count) - 1); + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count, + count + dhdr->count - 1); } /* * If the source has stale leaves, count the ones in the copy range * so we can update the header correctly. */ - if (leaf_s->hdr.stale) { + if (shdr->stale) { int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (leaf_s->ents[i].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -742,29 +922,27 @@ xfs_dir2_leafn_moveents( /* * Copy the leaf entries from source to destination. */ - memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* * If there are source entries after the ones we copied, * delete the ones we copied by sliding the next ones down. */ - if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { - memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); } + /* * Update the headers and log them. 
*/ - be16_add_cpu(&leaf_s->hdr.count, -(count)); - be16_add_cpu(&leaf_s->hdr.stale, -(stale)); - be16_add_cpu(&leaf_d->hdr.count, count); - be16_add_cpu(&leaf_d->hdr.stale, stale); - xfs_dir2_leaf_log_header(tp, bp_s); - xfs_dir2_leaf_log_header(tp, bp_d); - xfs_dir2_leafn_check(args->dp, bp_s); - xfs_dir2_leafn_check(args->dp, bp_d); + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; } /* @@ -773,21 +951,25 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - struct xfs_buf *leaf1_bp, /* leaf1 buffer */ - struct xfs_buf *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { - xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ - xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - if (be16_to_cpu(leaf1->hdr.count) > 0 && - be16_to_cpu(leaf2->hdr.count) > 0 && - (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || - be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < - be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) return 1; return 0; } @@ -816,6 +998,10 @@ 
xfs_dir2_leafn_rebalance( #endif int oldsum; /* old total leaf count */ int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; args = state->args; /* @@ -830,11 +1016,17 @@ xfs_dir2_leafn_rebalance( } leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; #ifdef DEBUG - oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); + oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; + /* * If the old leaf count was odd then the new one will be even, * so we need to divide the new count evenly. @@ -842,10 +1034,10 @@ xfs_dir2_leafn_rebalance( if (oldsum & 1) { xfs_dahash_t midhash; /* middle entry hash value */ - if (mid >= be16_to_cpu(leaf1->hdr.count)) - midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); else - midhash = be32_to_cpu(leaf1->ents[mid].hashval); + midhash = be32_to_cpu(ents1[mid].hashval); isleft = args->hashval <= midhash; } /* @@ -859,30 +1051,42 @@ xfs_dir2_leafn_rebalance( * Calculate moved entry count. Positive means left-to-right, * negative means right-to-left. Then move the entries. 
*/ - count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + count = hdr1.count - mid + (isleft == 0); if (count > 0) - xfs_dir2_leafn_moveents(args, blk1->bp, - be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); else if (count < 0) - xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, - be16_to_cpu(leaf1->hdr.count), count); - ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); - ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1); + xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args->trans, blk1->bp); + xfs_dir3_leaf_log_header(args->trans, blk2->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp); + xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp); + /* * Mark whether we're inserting into the old or new leaf. */ - if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + if (hdr1.count < hdr2.count) state->inleaf = swap; - else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + else if (hdr1.count > hdr2.count) state->inleaf = !swap; else - state->inleaf = - swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + state->inleaf = swap ^ (blk1->index <= hdr1.count); /* * Adjust the expected index for insertion. 
*/ if (!state->inleaf) - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - hdr1.count; /* * Finally sanity check just to make sure we are not returning a @@ -898,7 +1102,7 @@ xfs_dir2_leafn_rebalance( } static int -xfs_dir2_data_block_free( +xfs_dir3_data_block_free( xfs_da_args_t *args, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_free *free, @@ -909,57 +1113,66 @@ xfs_dir2_data_block_free( { struct xfs_trans *tp = args->trans; int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; - if (!hdr) { - /* One less used entry in the free table. */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + if (hdr) { /* - * If this was the last entry in the table, we can trim the - * table size back. There might be other entries at the end - * referring to non-existent data blocks, get those too. + * Data block is not empty, just set the free entry to the new + * value. */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; + } - for (i = findex - 1; i >= 0; i--) { - if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) - break; - } - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } else { - /* Not the last entry, just punch it out. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - int error; + /* One less used entry in the free table. */ + freehdr.nused--; - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. 
In this case some one - * else will eventually get rid of this block. - */ + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; } + freehdr.nvalid = i + 1; + logfree = 0; } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + xfs_dir3_free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; /* - * Data block is not empty, just set the free entry to the new - * value. + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. */ - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; } /* Log the free entry that changed, unless we got rid of it. */ @@ -994,6 +1207,9 @@ xfs_dir2_leafn_remove( int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); @@ -1001,11 +1217,14 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Point to the entry we're removing. 
*/ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Extract the data block and offset from the entry. */ @@ -1013,14 +1232,18 @@ xfs_dir2_leafn_remove( ASSERT(dblk->blkno == db); off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); + /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, bp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(tp, bp, index, index); + /* * Make the data entry free. Keep track of the longest freespace * in the data block in case it changes. @@ -1028,7 +1251,8 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - longest = be16_to_cpu(hdr->bestfree[0].length); + bf = xfs_dir3_data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); @@ -1040,12 +1264,12 @@ xfs_dir2_leafn_remove( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update * the corresponding freeblock entry. 
*/ - if (longest < be16_to_cpu(hdr->bestfree[0].length)) { + if (longest < be16_to_cpu(bf[0].length)) { int error; /* error return value */ struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ @@ -1062,20 +1286,25 @@ xfs_dir2_leafn_remove( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(be32_to_cpu(free->hdr.firstdb) == - xfs_dir2_free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + } +#endif /* * Calculate which entry we need to fix. */ findex = xfs_dir2_db_to_fdindex(mp, db); - longest = be16_to_cpu(hdr->bestfree[0].length); + longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (longest == mp->m_dirblksize - + xfs_dir3_data_entry_offset(hdr)) { /* * Try to punch out the data block. */ @@ -1096,21 +1325,19 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - error = xfs_dir2_data_block_free(args, hdr, free, + error = xfs_dir3_data_block_free(args, hdr, free, fdb, findex, fbp, longest); if (error) return error; } - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_check(mp, bp); /* * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = - ((uint)sizeof(leaf->hdr) + - (uint)sizeof(leaf->ents[0]) * - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < + *rval = (xfs_dir3_leaf_hdr_size(leaf) + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < mp->m_dir_magicpct; return 0; } @@ -1143,11 +1370,11 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. 
*/ - error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), - &newblk->bp, XFS_DIR2_LEAFN_MAGIC); - if (error) { + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) return error; - } + newblk->blkno = blkno; newblk->magic = XFS_DIR2_LEAFN_MAGIC; /* @@ -1155,7 +1382,7 @@ xfs_dir2_leafn_split( * block into the leaves. */ xfs_dir2_leafn_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) { return error; } @@ -1171,8 +1398,8 @@ xfs_dir2_leafn_split( */ oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir2_leafn_check(args->dp, oldblk->bp); - xfs_dir2_leafn_check(args->dp, newblk->bp); + xfs_dir3_leaf_check(mp, oldblk->bp); + xfs_dir3_leaf_check(mp, newblk->bp); return error; } @@ -1198,9 +1425,10 @@ xfs_dir2_leafn_toosmall( int error; /* error return value */ int forward; /* sibling block direction */ int i; /* sibling counter */ - xfs_da_blkinfo_t *info; /* leaf block header */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; /* * Check for the degenerate case of the block being over 50% full. @@ -1208,11 +1436,13 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. 
*/ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + leaf = blk->bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]); if (bytes > (state->blocksize >> 1)) { /* * Blk over 50%, don't try to join. @@ -1231,9 +1461,9 @@ xfs_dir2_leafn_toosmall( * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (leafhdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); if (error) return error; @@ -1247,15 +1477,17 @@ xfs_dir2_leafn_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + forward = leafhdr.forw < leafhdr.back; for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { - blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; if (blkno == 0) continue; /* * Read the sibling leaf block. */ - error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + error = xfs_dir3_leafn_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return error; @@ -1263,13 +1495,15 @@ xfs_dir2_leafn_toosmall( /* * Count bytes in the two blocks combined. 
*/ - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + count = leafhdr.count - leafhdr.stale; bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes -= count * (uint)sizeof(leaf->ents[0]); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + /* * Fits with at least 25% to spare. */ @@ -1291,10 +1525,10 @@ xfs_dir2_leafn_toosmall( */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); else - error = xfs_da_path_shift(state, &state->path, forward, 0, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &rval); if (error) { return error; @@ -1316,34 +1550,53 @@ xfs_dir2_leafn_unbalance( xfs_da_args_t *args; /* operation arguments */ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + + xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = xfs_dir3_leaf_ents_p(save_leaf); + dents = xfs_dir3_leaf_ents_p(drop_leaf); + /* * If there are any stale leaf entries, take this opportunity * to 
purge them. */ - if (drop_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, drop_blk->bp); - if (save_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, save_blk->bp); + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + /* * Move the entries from drop to the appropriate end of save. */ - drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, - be16_to_cpu(drop_leaf->hdr.count)); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); else - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, - be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); - save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); - xfs_dir2_leafn_check(args->dp, save_blk->bp); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args->trans, save_blk->bp); + xfs_dir3_leaf_log_header(args->trans, drop_blk->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp); + xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp); } /* @@ -1372,7 +1625,7 @@ xfs_dir2_node_addname( * Look up the name. We're not supposed to find it, but * this gives us the insertion point. 
*/ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; if (rval != ENOENT) { @@ -1398,7 +1651,7 @@ xfs_dir2_node_addname( * It worked, fix the hash values up the btree. */ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } else { /* * It didn't work, we need to split the leaf block. @@ -1410,7 +1663,7 @@ xfs_dir2_node_addname( /* * Split the leaf block and insert the new entry. */ - rval = xfs_da_split(state); + rval = xfs_da3_split(state); } done: xfs_da_state_free(state); @@ -1447,6 +1700,9 @@ xfs_dir2_node_addname_int( int needscan; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; dp = args->dp; mp = dp->i_mount; @@ -1464,36 +1720,37 @@ xfs_dir2_node_addname_int( */ ifbno = fblk->blkno; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * This means the free entry showed that the data block had * space for our entry, so we remembered it. * Use that data block. */ if (findex >= 0) { - ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); - ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(free->bests[findex]) >= length); - dbno = be32_to_cpu(free->hdr.firstdb) + findex; - } - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - else { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. 
+ * We'll start at the beginning of the freespace entries. + */ dbno = -1; findex = 0; } - } - /* - * Didn't come in with a freespace block, so don't have a data block. - */ - else { + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ ifbno = dbno = -1; fbp = NULL; findex = 0; } + /* * If we don't have a data block yet, we're going to scan the * freespace blocks looking for one. Figure out what the @@ -1547,20 +1804,26 @@ xfs_dir2_node_addname_int( if (!fbp) continue; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } /* * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the buffer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. */ - if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && - be16_to_cpu(free->bests[findex]) >= length) - dbno = be32_to_cpu(free->hdr.firstdb) + findex; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; else { /* * Are we done with the freeblock? */ - if (++findex == be32_to_cpu(free->hdr.nvalid)) { + if (++findex == freehdr.nvalid) { /* * Drop the block. */ @@ -1588,7 +1851,7 @@ xfs_dir2_node_addname_int( if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) + (error = xfs_dir3_data_init(args, dbno, &dbp)))) return error; /* @@ -1614,11 +1877,11 @@ xfs_dir2_node_addname_int( * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one.
*/ - if( fbp == NULL ) { - if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno))) { + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) return error; - } if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { xfs_alert(mp, @@ -1646,27 +1909,24 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; + free = fbp->b_addr; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); /* - * Initialize the new block to be empty, and remember - * its first slot as our empty slot. + * Remember the first slot as our empty slot. */ - free = fbp->b_addr; - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = cpu_to_be32( - (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - xfs_dir2_free_max_bests(mp)); + freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + xfs_dir3_free_max_bests(mp); free->hdr.nvalid = 0; free->hdr.nused = 0; } else { free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); } /* @@ -1677,20 +1937,21 @@ xfs_dir2_node_addname_int( * If it's after the end of the current entries in the * freespace block, extend that table. */ - if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = cpu_to_be32(findex + 1); + if (findex >= freehdr.nvalid) { + ASSERT(findex < xfs_dir3_free_max_bests(mp)); + freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); + bests[findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block * (this should always be true) then update the header. 
*/ - if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) { - be32_add_cpu(&free->hdr.nused, 1); + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); xfs_dir2_free_log_header(tp, fbp); } /* @@ -1699,7 +1960,8 @@ xfs_dir2_node_addname_int( * change again. */ hdr = dbp->b_addr; - free->bests[findex] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1715,19 +1977,20 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), -1, &dbp); if (error) return error; hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); logfree = 0; } - ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); + ASSERT(be16_to_cpu(bf[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. @@ -1758,8 +2021,9 @@ xfs_dir2_node_addname_int( /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) { - free->bests[findex] = hdr->bestfree[0].length; + bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1777,7 +2041,7 @@ xfs_dir2_node_addname_int( /* * Lookup an entry in a node-format directory. - * All the real work happens in xfs_da_node_lookup_int. + * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. */ int /* error */ @@ -1802,7 +2066,7 @@ xfs_dir2_node_lookup( /* * Fill in the path to the entry in the cursor. 
*/ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { @@ -1857,7 +2121,7 @@ xfs_dir2_node_removename( /* * Look up the entry we're deleting, set up the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; /* @@ -1881,12 +2145,12 @@ xfs_dir2_node_removename( /* * Fix the hash values up the btree. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); /* * If we need to join leaf blocks, do it. */ if (rval && state->path.active > 1) - error = xfs_da_join(state); + error = xfs_da3_join(state); /* * If no errors so far, try conversion to leaf format. */ @@ -1928,7 +2192,7 @@ xfs_dir2_node_replace( /* * Lookup the entry to change in the btree. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) { rval = error; } @@ -1937,19 +2201,22 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == EEXIST) { + struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); leaf = blk->bp->b_addr; - lep = &leaf->ents[blk->index]; + ents = xfs_dir3_leaf_ents_p(leaf); + lep = &ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. 
*/ hdr = state->extrablk.bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); @@ -1995,6 +2262,7 @@ xfs_dir2_node_trim_free( xfs_dir2_free_t *free; /* freespace structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; mp = dp->i_mount; @@ -2012,11 +2280,12 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * If there are used entries, there's nothing to do. */ - if (be32_to_cpu(free->hdr.nused) > 0) { + if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7da79f6515f..7cf573c88aa 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,7 +30,7 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ -extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, @@ -43,17 +43,18 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); #else -#define xfs_dir2_data_check(dp,bp) +#define xfs_dir3_data_check(dp,bp) #endif -extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; -extern 
int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * @@ -61,7 +62,7 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, struct xfs_buf **bpp); extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); @@ -77,24 +78,26 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; -extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, +extern void 
xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, +extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); @@ -104,11 +107,18 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, - int lowstale, int highstale, - int *lfloglow, int *lfloghigh); +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); +extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); +extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, + struct 
xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 1b9fc3ec7e4..6157424dbf8 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -278,7 +278,7 @@ xfs_dir2_block_to_sf( * Set up to loop over the block's entries. */ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -535,7 +535,7 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount), oldsfep = xfs_dir2_sf_firstentry(oldsfp), add_datasize = xfs_dir2_data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; @@ -617,7 +617,7 @@ xfs_dir2_sf_addname_pick( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(args->namelen); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -688,7 +688,7 @@ xfs_dir2_sf_check( dp = args->dp; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -812,9 +812,9 @@ xfs_dir2_sf_getdents( * mp->m_dirdatablk. */ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); + XFS_DIR3_DATA_DOT_OFFSET(mp)); dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); + XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); /* * Put . entry unless we're starting past it. 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8025eb23ad7..a41f8bf1da3 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -36,6 +36,7 @@ #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_cksum.h" #include "xfs_trace.h" /* @@ -85,17 +86,23 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_dquot *dq) { - xfs_quotainfo_t *q = mp->m_quotainfo; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; ASSERT(d->d_id); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); - if (q->qi_bhardlimit && !d->d_blk_hardlimit) + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } if (q->qi_isoftlimit && !d->d_ino_softlimit) d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); if (q->qi_ihardlimit && !d->d_ino_hardlimit) @@ -104,6 +111,9 @@ xfs_qm_adjust_dqlimits( d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); } /* @@ -239,6 +249,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); } xfs_trans_dquot_buf(tp, bp, @@ -248,25 +260,113 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } -static void +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. 
+ */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +STATIC void +xfs_dquot_buf_calc_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + offsetof(struct xfs_dqblk, dd_crc)); + } +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. 
+ */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + offsetof(struct xfs_dqblk, dd_crc))) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + + return true; +} + +STATIC bool xfs_dquot_buf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; xfs_dqid_t id = 0; + int ndquots; int i; /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length); + + /* * On the first read of the buffer, verify that each dquot is valid. * We don't know what the id of the dquot is supposed to be, just that * they should be increasing monotonically within the buffer. If the * first id is corrupt, then it will fail on the second dquot in the * buffer so corruptions could point to the wrong dquot in this case. 
*/ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; ddq = &d[i].dd_diskdq; @@ -274,27 +374,37 @@ xfs_dquot_buf_verify( id = be32_to_cpu(ddq->d_id); error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } + "xfs_dquot_buf_verify"); + if (error) + return false; } + return true; } static void xfs_dquot_buf_read_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } void xfs_dquot_buf_write_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -648,6 +758,9 @@ xfs_qm_dqread( dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + /* Mark the buf so that this will stay incore a little longer */ xfs_buf_set_ref(bp, XFS_DQUOT_REF); @@ -1035,6 +1148,17 @@ xfs_qm_dqflush( &dqp->q_logitem.qli_item.li_lsn); /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. 
+ */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + } + + /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c694a8469c4..4f0ebfc43cc 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -32,6 +32,13 @@ struct xfs_mount; struct xfs_trans; +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + /* * The incore dquot structure */ @@ -51,6 +58,9 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; struct mutex q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ @@ -145,14 +155,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, - xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 610456054dc..35d3f5b041d 100644 --- 
a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -66,7 +66,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random32() % randfactor) + if (prandom_u32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); @@ -178,7 +178,7 @@ xfs_corruption_error( inst_t *ra) { if (level <= xfs_error_level) - xfs_hex_dump(p, 16); + xfs_hex_dump(p, 64); xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index feb36d7551a..c0f375087ef 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -50,9 +50,8 @@ xfs_efi_item_free( * Freeing the efi requires that we remove it from the AIL if it has already * been placed there. However, the EFI may not yet have been placed in the AIL * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the - * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees - * the EFI. + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ STATIC void __xfs_efi_release( @@ -60,7 +59,7 @@ __xfs_efi_release( { struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ xfs_trans_ail_delete(ailp, &efip->efi_item, @@ -126,8 +125,8 @@ xfs_efi_item_pin( * which the EFI is manipulated during a transaction. If we are being asked to * remove the EFI it's because the transaction has been cancelled and by * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. 
Otherwise coordinate with xfs_efi_release() (via - * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -171,19 +170,13 @@ xfs_efi_item_unlock( /* * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. For bulk transaction committed - * processing, the EFI may be processed but not yet unpinned prior to the EFD - * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected - * when processing the EFD. + * the lsn at which it's been logged. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { - struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -241,6 +234,7 @@ xfs_efi_init( efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); return efip; } @@ -310,8 +304,13 @@ xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { __xfs_efi_release(efip); + + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 375f68e4253..432222418c5 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -114,16 +114,20 @@ typedef struct xfs_efd_log_format_64 { * Define EFI flag bits. Manipulated by set/clear/test_bit operators. 
*/ #define XFS_EFI_RECOVERED 1 -#define XFS_EFI_COMMITTED 2 /* - * This is the "extent free intention" log item. It is used - * to log the fact that some extents need to be free. It is - * used in conjunction with the "extent free done" log item - * described below. + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; + atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3800128d217..054d60c0ac5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -890,7 +890,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir2_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2866b8c78b7..87595b211da 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -247,6 +247,9 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -265,6 +268,11 @@ xfs_growfs_data_private( } agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + 
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); @@ -296,8 +304,11 @@ xfs_growfs_data_private( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,7 +327,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -339,7 +356,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -363,7 +386,12 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 515bf71ce01..c8f5ae1debf 100644 --- a/fs/xfs/xfs_ialloc.c +++ 
b/fs/xfs/xfs_ialloc.c @@ -36,6 +36,8 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* @@ -165,6 +167,7 @@ xfs_ialloc_inode_init( int version; int i, j; xfs_daddr_t d; + xfs_ino_t ino = 0; /* * Loop over the new block(s), filling in the inodes. @@ -183,13 +186,29 @@ xfs_ialloc_inode_init( } /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have to log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - if (xfs_sb_version_hasnlink(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; else version = 1; @@ -212,17 +231,32 @@ xfs_ialloc_inode_init( * individual transactions causing a lot of log traffic.
*/ fbuf->b_ops = &xfs_inode_buf_ops; - xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); + uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + if (version == 3) { + /* need to log the entire buffer */ + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); } xfs_trans_inode_alloc_buf(tp, fbuf); } @@ -369,7 +403,7 @@ xfs_ialloc_ag_alloc( * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, - args.len, random32()); + args.len, prandom_u32()); if (error) return error; @@ -1453,6 +1487,7 @@ xfs_ialloc_log_agi( /* * Log the allocation group inode header buffer. */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, bp, first, last); } @@ -1470,19 +1505,23 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static void +static bool xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - int agi_ok; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; /* * Validate the magic number of the agi block. 
*/ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -1490,30 +1529,52 @@ xfs_agi_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } xfs_check_agi_unlinked(agi); + return true; } static void xfs_agi_read_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agi_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agi, agi_crc)); + agi_ok = agi_ok && xfs_agi_verify(bp); + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agi_write_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct 
xfs_agi, agi_crc)); } const struct xfs_buf_ops xfs_agi_buf_ops = { diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index bec344b3650..c82ac886742 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -34,6 +34,7 @@ #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC int @@ -182,52 +183,88 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -void +static int xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ - /* magic number and level verification */ - level = be16_to_cpu(block->bb_level); - sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && - level < mp->m_in_maxlevels; + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. 
+ */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + break; + default: + return 0; + } - /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_inobt_read_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_inobt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + 
xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_inobt_buf_ops = { @@ -301,6 +338,8 @@ xfs_inobt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_inobt_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 25c0239a8ea..3ac36b7642e 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -29,7 +29,8 @@ struct xfs_mount; /* * There is a btree for the inode map per allocation group. */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) @@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f201656d2d..558ef494720 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,6 +44,7 @@ #include "xfs_quota.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -786,6 +787,7 @@ xfs_iformat_btree( xfs_dinode_t *dip, int whichfork) { + struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; xfs_ifork_t *ifp; /* REFERENCED */ @@ -794,7 +796,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); /* @@ -805,14 +807,14 @@ xfs_iformat_btree( * blocks. */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return XFS_ERROR(EFSCORRUPTED); } @@ -823,8 +825,7 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. 
*/ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -866,6 +867,17 @@ xfs_dinode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } void @@ -902,6 +914,17 @@ xfs_dinode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } STATIC uint @@ -962,6 +985,47 @@ xfs_dic2xflags( (XFS_DFORK_Q(dip) ? 
XFS_XFLAG_HASATTR : 0); } +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc))) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc)); + dip->di_crc = xfs_end_cksum(crc); +} + /* * Read the disk inode attributes into the in-core inode structure. */ @@ -990,17 +1054,13 @@ xfs_iread( if (error) return error; - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { -#ifdef DEBUG - xfs_alert(mp, - "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", - __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); goto out_brelse; } @@ -1022,10 +1082,20 @@ xfs_iread( goto out_brelse; } } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. 
+ */ ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; ip->i_d.di_gen = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -1161,6 +1231,7 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; uint flags; @@ -1187,7 +1258,7 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error) return error; @@ -1208,7 +1279,7 @@ xfs_ialloc( * the inode version number now. This way we only do the conversion * here rather than here and in the flush/logging code. */ - if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + if (xfs_sb_version_hasnlink(&mp->m_sb) && ip->i_d.di_version == 1) { ip->i_d.di_version = 2; /* @@ -1258,6 +1329,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -2037,7 +2121,7 @@ xfs_iroot_realloc( * allocate it now and get out. 
*/ if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; @@ -2051,9 +2135,9 @@ xfs_iroot_realloc( */ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -2061,7 +2145,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2076,7 +2160,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -2084,7 +2168,8 @@ xfs_iroot_realloc( /* * First copy over the btree block header. 
*/ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2114,7 +2199,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); return; } @@ -2427,7 +2512,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); + XFS_BROOT_SIZE_ADJ(ip))); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -2715,20 +2800,18 @@ abort_out: STATIC int xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - iip = ip->i_itemp; - mp = ip->i_mount; + ASSERT(iip != NULL && iip->ili_fields != 0); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -2789,9 +2872,9 @@ xfs_iflush_int( } /* * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * postdate a log record during recovery. This is redundant as we now + * log every change and hence this can't happen. Still, it doesn't hurt. */ - ip->i_d.di_flushiter++; /* @@ -2867,41 +2950,30 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. 
*/ - if (iip != NULL && iip->ili_fields != 0) { - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_logged = 1; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - ASSERT(bp->b_fspriv != NULL); - ASSERT(bp->b_iodone != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged. For this case we can immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. - */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. 
*/ + xfs_dinode_calc_crc(mp, dip); + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); return 0; corrupt_out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 237e7f6f2ab..91129794aae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -150,13 +150,38 @@ typedef struct xfs_icdinode { __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ } xfs_icdinode_t; +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + /* * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ /* * Per-fork incore inode flags. @@ -180,10 +205,11 @@ typedef struct xfs_icdinode { #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount)) + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) #define XFS_IFORK_ASIZE(ip) \ (XFS_IFORK_Q(ip) ? 
\ - XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ @@ -555,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f034bd1652f..f76ff52e43c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -179,7 +179,7 @@ xfs_inode_item_format( nvecs = 1; vecp->i_addr = &ip->i_d; - vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a30dd899d2..8f8aaee7f37 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -42,6 +42,8 @@ #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -362,10 +364,65 @@ xfs_iomap_eof_prealloc_initial_size( if (imap[0].br_startblock == HOLESTARTBLOCK) return 0; if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount; + return imap[0].br_blockcount << 1; return XFS_B_TO_FSB(mp, offset); } +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < 
dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* over hi wmark, squash the prealloc completely */ + if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size @@ -381,45 +438,89 @@ xfs_iomap_prealloc_size( int nimaps) { xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, imap, nimaps); - if (alloc_blocks > 0) { - int shift = 0; - int64_t freesp; - - alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, - rounddown_pow_of_two(alloc_blocks)); - - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } - if (shift) - alloc_blocks >>= shift; + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; - /* - * If we are still trying to allocate more space than is - * available, squash the prealloc 
hard. This can happen if we - * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. - */ - while (alloc_blocks && alloc_blocks >= freesp) - alloc_blocks >>= 4; + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; } + /* + * Check each quota to cap the prealloc size and provide a shift + * value to throttle with. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. 
+ */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + return alloc_blocks; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index fe7e4df85a7..14e59d953b7 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -72,6 +72,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/list_sort.h> +#include <linux/ratelimit.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index eec226f78a4..b345a7c8515 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3485,7 +3485,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = random32(); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ddc4529d07d..e3d0b85d852 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -668,10 +668,6 @@ xlog_cil_push_foreground( * transaction to the checkpoint context so we carry the busy extents through * to checkpoint completion, and then unlock all the items in the transaction. 
* - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * * Called with the context lock already held in read mode to lock out * background commit, returns without it held once background commits are * allowed again. diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 16d8d12ea3b..b9ea262dd1c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -468,7 +468,6 @@ struct xfs_cil { * threshold, yet give us plenty of space for aggregation on large logs. */ #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) -#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * ticket grant locks, queues and accounting have their own cachlines diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d1dba7ce75a..93f03ec17ee 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -29,6 +29,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -45,6 +46,14 @@ #include "xfs_trace.h" #include "xfs_icache.h" +/* Need all the magic numbers and buffer ops structures from these headers */ +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" + STATIC int xlog_find_zeroed( struct xlog *, @@ -1785,6 +1794,7 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1857,6 +1867,201 @@ xlog_recover_do_inode_buffer( } /* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. 
Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recovery_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); 
+#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = 
&xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates @@ -1928,6 +2133,8 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); + + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2213,6 +2420,7 @@ xlog_recover_inode_pass2( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2238,7 +2446,7 @@ xlog_recover_inode_pass2( trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - NULL); + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; @@ -2349,7 +2557,8 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); @@ -2361,13 +2570,13 @@ xlog_recover_inode_pass2( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + 
sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2451,6 +2660,9 @@ xlog_recover_inode_pass2( } write_inode_buffer: + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); @@ -2948,6 +3160,7 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } @@ -3751,6 +3964,25 @@ xlog_recover( return error; } + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? 
log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 56dc0c17f16..76c81982f96 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -30,6 +30,32 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) 
\ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + extern void assfail(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3806088a8f7..f6bfbd73466 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" #ifdef HAVE_PERCPU_SB @@ -109,6 +111,14 @@ static const struct { { offsetof(xfs_sb_t, sb_logsunit), 0 }, { offsetof(xfs_sb_t, sb_features2), 0 }, { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, { sizeof(xfs_sb_t), 0 } }; @@ -319,11 +329,54 @@ xfs_mount_validate_sb( return XFS_ERROR(EWRONGFS); } + if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. + */ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + xfs_alert(mp, +"Version 5 superblock detected. 
This kernel has EXPERIMENTAL support enabled!\n" +"Use of these features in this kernel is at your own risk!"); + + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -557,6 +610,14 @@ xfs_sb_from_disk( to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); } /* @@ -612,13 +673,12 @@ xfs_sb_to_disk( } } -static void +static int xfs_sb_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct 
xfs_sb sb; - int error; xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); @@ -626,16 +686,46 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); - if (error) - xfs_buf_ioerror(bp, error); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); } +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. + */ static void xfs_sb_read_verify( struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), + offsetof(struct xfs_sb, sb_crc))) { + error = EFSCORRUPTED; + goto out_error; + } + } + error = xfs_sb_verify(bp); + +out_error: + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + } } /* @@ -648,11 +738,10 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_sb sb; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - if (sb.sb_magicnum == XFS_SB_MAGIC) { + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! 
*/ xfs_sb_read_verify(bp); return; @@ -663,9 +752,27 @@ xfs_sb_quiet_read_verify( static void xfs_sb_write_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_sb, sb_crc)); } const struct xfs_buf_ops xfs_sb_buf_ops = { @@ -687,7 +794,8 @@ int xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; - xfs_buf_t *bp; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); @@ -714,7 +822,7 @@ reread: if (bp->b_error) { error = bp->b_error; if (loud) - xfs_warn(mp, "SB validate failed"); + xfs_warn(mp, "SB validate failed with error %d.", error); goto release_buf; } @@ -726,10 +834,10 @@ reread: /* * We must be able to do sector-sized and sector-aligned IO. */ - if (sector_size > mp->m_sb.sb_sectsize) { + if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", - sector_size, mp->m_sb.sb_sectsize); + sector_size, sbp->sb_sectsize); error = ENOSYS; goto release_buf; } @@ -738,15 +846,18 @@ reread: * If device sector size is smaller than the superblock size, * re-read the superblock so the buffer is correctly sized. 
*/ - if (sector_size < mp->m_sb.sb_sectsize) { + if (sector_size < sbp->sb_sectsize) { xfs_buf_relse(bp); - sector_size = mp->m_sb.sb_sectsize; + sector_size = sbp->sb_sectsize; goto reread; } /* Initialize per-cpu counters */ xfs_icsb_reinit_counters(mp); + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; + mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -1633,6 +1744,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, first, last); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bc907061d39..b004cecdfb0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,7 +207,6 @@ typedef struct xfs_mount { trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -392,6 +391,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_calc_crc(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e5b5cf97378..f41702b4300 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -617,6 +617,20 @@ xfs_qm_dqdetach( } } +int +xfs_qm_calc_dquots_per_chunk( + struct xfs_mount *mp, + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -656,9 +670,8 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - 
ASSERT(qinf->qi_dqchunklen); - qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); - do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp, + qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -897,6 +910,10 @@ xfs_qm_dqiter_bufs( if (error) break; + /* + * XXX(hch): need to figure out if it makes sense to validate + * the CRC here. + */ xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); @@ -1057,7 +1074,7 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 44b858b79d7..5d16a6e6900 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -75,6 +75,8 @@ typedef struct xfs_quotainfo { &((qi)->qi_gquota_tree)) +extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, + unsigned int nbblks); extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, xfs_dquot_t *, xfs_dquot_t *, long, long, uint); @@ -116,7 +118,7 @@ extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, fs_disk_quota_t *); -extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, fs_disk_quota_t *); extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index cf9a34051e0..c41190cad6e 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -472,15 +472,15 @@ xfs_qm_scall_getqstat( */ 
int xfs_qm_scall_setqlim( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_dqid_t id, uint type, fs_disk_quota_t *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_disk_dquot_t *ddq; - xfs_dquot_t *dqp; - xfs_trans_t *tp; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; int error; xfs_qcnt_t hard, soft; @@ -529,6 +529,7 @@ xfs_qm_scall_setqlim( if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { q->qi_bhardlimit = hard; q->qi_bsoftlimit = soft; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index b50ec5b95d5..c61e31c7d99 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -77,7 +77,14 @@ typedef struct xfs_disk_dquot { */ typedef struct xfs_dqblk { xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[32]; /* filling for posterity */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. 
+ */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; /* @@ -380,5 +387,7 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __KERNEL__ */ #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index a05b45175fb..2de58a85833 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -32,6 +32,7 @@ struct xfs_mount; #define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ #define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ #define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 #define XFS_SB_VERSION_SASHFBITS 0xf000 @@ -161,6 +162,20 @@ typedef struct xfs_sb { */ __uint32_t sb_bad_features2; + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -229,7 +244,21 @@ typedef struct xfs_dsb { * for features2 bits. Easiest just to mark it bad and not use * it for anything else. 
*/ - __be32 sb_bad_features2; + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -250,7 +279,10 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -276,6 +308,12 @@ typedef enum { #define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) #define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ @@ -283,7 +321,9 @@ typedef enum { XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2) + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + XFS_SB_FEATURES_RO_COMPAT | 
XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) /* @@ -325,6 +365,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) return 1; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return 1; return 0; } @@ -365,7 +407,7 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_2 || sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); } @@ -373,7 +415,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) { if (sbp->sb_versionnum == XFS_SB_VERSION_1) sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; else sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; @@ -382,7 +424,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); } @@ -396,13 +438,13 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; else sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | @@ -411,13 +453,14 @@ static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) static inline int xfs_sb_version_hasalign(xfs_sb_t 
*sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } @@ -429,38 +472,42 @@ static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)); } static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)); } static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)); } static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & 
XFS_SB_VERSION_BORGBIT); } static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)); } /* @@ -475,14 +522,16 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) @@ -500,14 +549,73 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. 
+ * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_ALL 0 +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; } /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c new file mode 100644 index 00000000000..5f234389327 --- /dev/null +++ b/fs/xfs/xfs_symlink.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_itable.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_log_priv.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. 
+ */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int fsblocks = 0; + int len = pathlen; + + do { + fsblocks++; + len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + } while (len > 0); + + ASSERT(fsblocks <= XFS_SYMLINK_MAPS); + return fsblocks; +} + +static int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. 
+ */ +bool +xfs_symlink_hdr_ok( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)) || + !xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, 
BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if 
(xfs_sb_version_hascrc(&mp->m_sb)) {
+			/*
+			 * The read verifier only checks CRC, location and
+			 * bounds; the per-inode offset/length/owner match is
+			 * checked here.
+			 */
+			if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset,
+							byte_cnt, bp)) {
+				error = EFSCORRUPTED;
+				xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+					offset, byte_cnt, ip->i_ino);
+				xfs_buf_relse(bp);
+				goto out;
+
+			}
+
+			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+		}
+
+		/*
+		 * Copy from cur_chunk, not bp->b_addr: when a header is
+		 * present the path bytes start after it, and cur_chunk has
+		 * already been advanced past the header above.
+		 */
+		memcpy(link + offset, cur_chunk, byte_cnt);
+
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_buf_relse(bp);
+	}
+	ASSERT(pathlen == 0);
+
+	link[ip->i_d.di_size] = '\0';
+	error = 0;
+
+ out:
+	return error;
+}
+
+/*
+ * Copy the target of symlink @ip into @link and NUL-terminate it. The inode
+ * is locked shared for the duration. A zero-length target returns success
+ * without writing to @link; a length outside 0..MAXPATHLEN is reported as
+ * EFSCORRUPTED. Inline targets are copied straight from the data fork,
+ * everything else goes through xfs_readlink_bmap().
+ */
+int
+xfs_readlink(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		pathlen;
+	int			error = 0;
+
+	trace_xfs_readlink(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	pathlen = ip->i_d.di_size;
+	if (!pathlen)
+		goto out;
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out;
+	}
+
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+		link[pathlen] = '\0';
+	} else {
+		error = xfs_readlink_bmap(ip, link);
+	}
+
+ out:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+int
+xfs_symlink(
+	struct xfs_inode	*dp,
+	struct xfs_name		*link_name,
+	const char		*target_path,
+	umode_t			mode,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
+	int			error = 0;
+	int			pathlen;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	bool			unlock_dp_on_error = false;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	const char		*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	prid_t			prid;
+	
struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+
+	*ipp = NULL;
+
+	trace_xfs_symlink(dp, link_name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)      /* total string too long */
+		return XFS_ERROR(ENAMETOOLONG);
+
+	udqp = gdqp = NULL;
+	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+		prid = xfs_get_projid(dp);
+	else
+		prid = XFS_PROJID_DEFAULT;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+	if (error)
+		goto std_return;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * The symlink will fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 *
+	 * For a remote symlink, size the allocation with
+	 * xfs_symlink_blocks() rather than a plain byte-to-block conversion:
+	 * each block may begin with a header, so the path can need one more
+	 * block than XFS_B_TO_FSB() reports. The read and inactive paths
+	 * already count blocks this way.
+	 */
+	if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+		fs_blocks = 0;
+	else
+		fs_blocks = xfs_symlink_blocks(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	if (error == ENOSPC && fs_blocks == 0) {
+		/* inline symlink: retry without a block reservation */
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	/*
+	 * Check whether the directory allows new symlinks or not.
+	 */
+	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+		error = XFS_ERROR(EPERM);
+		goto error_return;
+	}
+
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+	if (error)
+		goto error_return;
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+ */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. 
+ */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = ENOMEM; + goto error2; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) { + byte_cnt = pathlen; + } + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. 
+ */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error0; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFDs. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* + * Remove the memory for extent descriptions (just bookkeeping). 
+ */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + + xfs_trans_ijoin(tp, ip, 0); + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + return error; +} diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h new file mode 100644 index 00000000000..b39398d2097 --- /dev/null +++ b/fs/xfs/xfs_symlink.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; +struct xfs_name; + +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. 
+ */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); + +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +#ifdef __KERNEL__ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); + +#endif /* __KERNEL__ */ +#endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 624bedd8135..b6e3897c1d9 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,7 +22,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_alloc.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16a812977ea..aa4db3307d3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -619,6 +619,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class, (char *)__entry->caller_ip) ) +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = 
shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3edf5dbee00..73a5fa457e1 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -659,6 +659,7 @@ xfs_trans_binval( ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); @@ -671,6 +672,7 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); @@ -702,12 +704,13 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. 
* There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -726,6 +729,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* @@ -749,8 +753,43 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of @@ -769,14 +808,28 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; - ASSERT(bp->b_transp == tp); - ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = 
XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 642c2d6e1db..fec75d02370 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -326,12 +326,12 @@ xfs_trans_dqlockedjoin( */ void xfs_trans_apply_dquot_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; - xfs_disk_dquot_t *d; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; long totalbdelta; long totalrtbdelta; @@ -412,7 +412,7 @@ xfs_trans_apply_dquot_deltas( * Start/reset the timer(s) if needed. */ if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); xfs_qm_adjust_dqtimers(tp->t_mountp, d); } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 77ad74834ba..1501f4fa51a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -48,103 +49,8 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 2 extents back from - * bmapi. 
- */ -#define SYMLINK_MAPS 2 - -STATIC int -xfs_readlink_bmap( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - int pathlen = ip->i_d.di_size; - int nmaps = SYMLINK_MAPS; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - int byte_cnt; - int n; - xfs_buf_t *bp; - int error = 0; - - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, - 0); - if (error) - goto out; - - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); - if (!bp) - return XFS_ERROR(ENOMEM); - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - goto out; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; - - memcpy(link, bp->b_addr, byte_cnt); - xfs_buf_relse(bp); - } - - link[ip->i_d.di_size] = '\0'; - error = 0; - - out: - return error; -} - -int -xfs_readlink( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; - - trace_xfs_readlink(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - pathlen = ip->i_d.di_size; - if (!pathlen) - goto out; - - if (pathlen < 0 || pathlen > MAXPATHLEN) { - xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", - __func__, (unsigned long long) ip->i_ino, - (long long) pathlen); - ASSERT(0); - error = XFS_ERROR(EFSCORRUPTED); - goto out; - } - - - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } - - out: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} /* * This is called by xfs_inactive to free any blocks beyond eof @@ -249,145 +155,6 @@ xfs_free_eofblocks( return error; } -/* - * Free a symlink that has blocks associated with it. 
- */ -STATIC int -xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_buf_t *bp; - int committed; - int done; - int error; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - int nmaps; - xfs_trans_t *ntp; - int size; - xfs_trans_t *tp; - - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink that has some - * blocks allocated to it. Free the - * blocks here. We know that we've got - * either 1 or 2 extents and that we can - * free them all in one bunmapi call. - */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - - /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it - * held so the cancel won't rele it, see below. - */ - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - xfs_bmap_init(&free_list, &first_block); - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), - mval, &nmaps, 0); - if (error) - goto error0; - /* - * Invalidate the block(s). - */ - for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = ENOMEM; - goto error1; - } - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the free_list. - */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, &done))) - goto error1; - ASSERT(done); - /* - * Commit the first transaction. This logs the EFI and the inode. 
- */ - if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) - goto error1; - /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* - * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. - */ - error = xfs_trans_commit(tp, 0); - tp = ntp; - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - - /* - * Remove the memory for extent descriptions (just bookkeeping). - */ - if (ip->i_df.if_bytes) - xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. 
- */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - - xfs_trans_ijoin(tp, ip, 0); - *tpp = tp; - return 0; - - error1: - xfs_bmap_cancel(&free_list); - error0: - return error; -} - int xfs_release( xfs_inode_t *ip) @@ -1353,247 +1120,6 @@ xfs_link( } int -xfs_symlink( - xfs_inode_t *dp, - struct xfs_name *link_name, - const char *target_path, - umode_t mode, - xfs_inode_t **ipp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp; - xfs_inode_t *ip; - int error; - int pathlen; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; - uint cancel_flags; - int committed; - xfs_fileoff_t first_fsb; - xfs_filblks_t fs_blocks; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - xfs_buf_t *bp; - prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - *ipp = NULL; - error = 0; - ip = NULL; - tp = NULL; - - trace_xfs_symlink(dp, link_name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Check component lengths of the target path name. - */ - pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. 
- */ - if (pathlen <= XFS_LITINO(mp)) - fs_blocks = 0; - else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - if (error == ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = true; - - /* - * Check whether the directory allows new symlinks or not. - */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - /* - * Check for ability to enter directory entry, if no space reserved. - */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; - /* - * Initialize the bmap freelist prior to calling either - * bmapi or the directory create code. - */ - xfs_bmap_init(&free_list, &first_block); - - /* - * Allocate an inode for the symlink. - */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto error1; - } - - /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the - * error path. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; - - /* - * Also attach the dquot(s) to it, if applicable. - */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); - - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. 
- */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; - - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - - } else { - first_fsb = 0; - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); - if (error) - goto error2; - - if (resblks) - resblks -= fs_blocks; - ip->i_d.di_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = ENOMEM; - goto error2; - } - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } - pathlen -= byte_cnt; - - memcpy(bp->b_addr, cur_chunk, byte_cnt); - cur_chunk += byte_cnt; - - xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); - } - } - - /* - * Create the directory entry for the symlink. - */ - error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error2; - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * symlink transaction goes to disk before returning to - * the user. 
- */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - *ipp = ip; - return 0; - - error2: - IRELE(ip); - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: - return error; -} - -int xfs_set_dmattrs( xfs_inode_t *ip, u_int evmask, |