From 08c987deca56687c0930f308f5148ef1af38dc14 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 11 Apr 2023 19:00:07 -0700
Subject: [PATCH 1/2] xfs: fix rm_offset flag handling in rmap keys

Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:

(physical block, owner, fork, is_btree, offset)

This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.

Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:

(physical block, owner, fork, is_btree, unwritten state, offset)

Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset.  This means that
lookup comparisons are only done with:

(physical block, owner, offset)

Queries can (theoretically) return incorrect results because of this
omission.  On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field.  For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.

Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code.  Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit.  New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.

A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.

Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.

Found by fuzzing keys[1].attrfork = ones on xfs/371.

Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_rmap_btree.c | 40 +++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 56d074b426609e..84e2b692f0349c 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs(
 	return cur->bc_mp->m_rmap_mxr[level != 0];
 }
 
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
 STATIC void
 xfs_rmapbt_init_key_from_rec(
 	union xfs_btree_key		*key,
@@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec(
 {
 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
 	key->rmap.rm_owner = rec->rmap.rm_owner;
-	key->rmap.rm_offset = rec->rmap.rm_offset;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 }
 
 /*
@@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec(
 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
 	be32_add_cpu(&key->rmap.rm_startblock, adj);
 	key->rmap.rm_owner = rec->rmap.rm_owner;
-	key->rmap.rm_offset = rec->rmap.rm_offset;
+	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
 	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
 		return;
@@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur(
 	ptr->s = agf->agf_roots[cur->bc_btnum];
 }
 
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
 STATIC int64_t
 xfs_rmapbt_key_diff(
 	struct xfs_btree_cur		*cur,
@@ -240,8 +260,8 @@ xfs_rmapbt_key_diff(
 	else if (y > x)
 		return -1;
 
-	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
-	y = rec->rm_offset;
+	x = offset_keymask(be64_to_cpu(kp->rm_offset));
+	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
 	if (x > y)
 		return 1;
 	else if (y > x)
@@ -272,8 +292,8 @@ xfs_rmapbt_diff_two_keys(
 	else if (y > x)
 		return -1;
 
-	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
-	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
+	x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+	y = offset_keymask(be64_to_cpu(kp2->rm_offset));
 	if (x > y)
 		return 1;
 	else if (y > x)
@@ -387,8 +407,8 @@ xfs_rmapbt_keys_inorder(
 		return 1;
 	else if (a > b)
 		return 0;
-	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
-	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
 	if (a <= b)
 		return 1;
 	return 0;
@@ -417,8 +437,8 @@ xfs_rmapbt_recs_inorder(
 		return 1;
 	else if (a > b)
 		return 0;
-	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
-	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
 	if (a <= b)
 		return 1;
 	return 0;

From 38384569a2a8a721623d80c5ae3bcf80614ab792 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Tue, 11 Apr 2023 19:00:07 -0700
Subject: [PATCH 2/2] xfs: detect unwritten bit set in rmapbt node block keys

In the last patch, we changed the rmapbt code to remove the UNWRITTEN
bit when creating an rmapbt key from an rmapbt record, and we changed
the rmapbt key comparison code to start considering the ATTR and BMBT
flags during lookup.  This brought the behavior of the rmapbt
implementation in line with its specification.

However, there may exist filesystems that have the unwritten bit still
set in the rmapbt keys.  We should detect these situations and flag the
rmapbt as one that would benefit from optimization.  Eventually, online
repair will be able to do something in response to this.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/btree.c | 10 +++++++++
 fs/xfs/scrub/btree.h |  2 ++
 fs/xfs/scrub/rmap.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+)

diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 626282dbe2e378..de4b29ae083999 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -119,6 +119,16 @@ xchk_btree_xref_set_corrupt(
 			__return_address);
 }
 
+void
+xchk_btree_set_preen(
+	struct xfs_scrub	*sc,
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
+			__return_address);
+}
+
 /*
  * Make sure this record is in order and doesn't stray outside of the parent
  * keys.
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index 70461885c6c7a4..639e44e7544fe7 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc,
 /* Check for btree corruption. */
 void xchk_btree_set_corrupt(struct xfs_scrub *sc,
 		struct xfs_btree_cur *cur, int level);
+void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+		int level);
 
 /* Check for btree xref discrepancies. */
 void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index ef6e4b8546a6ea..8e78e1bc9eef7d 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -87,6 +87,58 @@ xchk_rmapbt_xref(
 		xchk_rmapbt_xref_refc(sc, irec);
 }
 
+/*
+ * Check for bogus UNWRITTEN flags in the rmapbt node block keys.
+ *
+ * In reverse mapping records, the file mapping extent state
+ * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field.  It is not
+ * involved in lookups in any way.  In older kernels, the functions that
+ * convert rmapbt records to keys forgot to filter out the extent state bit,
+ * even though the key comparison functions have filtered the flag correctly.
+ * If we spot an rmap key with the unwritten bit set in rm_offset, we should
+ * mark the btree as needing optimization to rebuild the btree without those
+ * flags.
+ */
+STATIC void
+xchk_rmapbt_check_unwritten_in_keyflags(
+	struct xchk_btree	*bs)
+{
+	struct xfs_scrub	*sc = bs->sc;
+	struct xfs_btree_cur	*cur = bs->cur;
+	struct xfs_btree_block	*keyblock;
+	union xfs_btree_key	*lkey, *hkey;
+	__be64			badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+	unsigned int		level;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+		return;
+
+	for (level = 1; level < cur->bc_nlevels; level++) {
+		struct xfs_buf	*bp;
+		unsigned int	ptr;
+
+		/* Only check the first time we've seen this node block. */
+		if (cur->bc_levels[level].ptr > 1)
+			continue;
+
+		keyblock = xfs_btree_get_block(cur, level, &bp);
+		for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) {
+			lkey = xfs_btree_key_addr(cur, ptr, keyblock);
+
+			if (lkey->rmap.rm_offset & badflag) {
+				xchk_btree_set_preen(sc, cur, level);
+				break;
+			}
+
+			hkey = xfs_btree_high_key_addr(cur, ptr, keyblock);
+			if (hkey->rmap.rm_offset & badflag) {
+				xchk_btree_set_preen(sc, cur, level);
+				break;
+			}
+		}
+	}
+}
+
 /* Scrub an rmapbt record. */
 STATIC int
 xchk_rmapbt_rec(
@@ -101,6 +153,7 @@ xchk_rmapbt_rec(
 		return 0;
 	}
 
+	xchk_rmapbt_check_unwritten_in_keyflags(bs);
 	xchk_rmapbt_xref(bs->sc, &irec);
 	return 0;
 }