Skip to content

Commit 5342e9f

Browse files
fdmananagregkh
authored andcommitted
btrfs: fix lost prealloc extents beyond eof after full fsync
commit d994788 upstream. When doing a full fsync, if we have prealloc extents beyond (or at) eof, and the leaves that contain them were not modified in the current transaction, we end up not logging them. This results in losing those extents when we replay the log after a power failure, since the inode is truncated to the current value of the logged i_size. Just like for the fast fsync path, we need to always log all prealloc extents starting at or beyond i_size. The fast fsync case was fixed in commit 471d557 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") but it missed the full fsync path. The problem exists since the very early days, when the log tree was added by commit e02119d ("Btrfs: Add a write ahead tree log to optimize synchronous operations"). Example reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt # Create our test file with many file extent items, so that they span # several leaves of metadata, even if the node/page size is 64K. Use # direct IO and not fsync/O_SYNC because it's both faster and it avoids # clearing the full sync flag from the inode - we want the fsync below # to trigger the slow full sync code path. $ xfs_io -f -d -c "pwrite -b 4K 0 16M" /mnt/foo # Now add two preallocated extents to our file without extending the # file's size. One right at i_size, and another further beyond, leaving # a gap between the two prealloc extents. $ xfs_io -c "falloc -k 16M 1M" /mnt/foo $ xfs_io -c "falloc -k 20M 1M" /mnt/foo # Make sure everything is durably persisted and the transaction is # committed. This makes all created extents to have a generation lower # than the generation of the transaction used by the next write and # fsync. sync # Now overwrite only the first extent, which will result in modifying # only the first leaf of metadata for our inode. Then fsync it. This # fsync will use the slow code path (inode full sync bit is set) because # it's the first fsync since the inode was created/loaded. $ xfs_io -c "pwrite 0 4K" -c "fsync" /mnt/foo # Extent list before power failure. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x0 3: [32768..34815]: 2172928..2174975 2048 0x800 4: [34816..40959]: hole 6144 5: [40960..43007]: 2174976..2177023 2048 0x801 <power fail> # Mount fs again, trigger log replay. $ mount /dev/sdc /mnt # Extent list after power failure and log replay. $ xfs_io -c "fiemap -v" /mnt/foo /mnt/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..7]: 2178048..2178055 8 0x0 1: [8..16383]: 26632..43007 16376 0x0 2: [16384..32767]: 2156544..2172927 16384 0x1 # The prealloc extents at file offsets 16M and 20M are missing. So fix this by calling btrfs_log_prealloc_extents() when we are doing a full fsync, so that we always log all prealloc extents beyond eof. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 80660a7 commit 5342e9f

1 file changed

Lines changed: 31 additions & 12 deletions

File tree

fs/btrfs/tree-log.c

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4423,7 +4423,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
44234423

44244424
/*
44254425
* Log all prealloc extents beyond the inode's i_size to make sure we do not
4426-
* lose them after doing a fast fsync and replaying the log. We scan the
4426+
* lose them after doing a full/fast fsync and replaying the log. We scan the
44274427
* subvolume's root instead of iterating the inode's extent map tree because
44284428
* otherwise we can log incorrect extent items based on extent map conversion.
44294429
* That can happen due to the fact that extent maps are merged when they
@@ -5208,6 +5208,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
52085208
struct btrfs_log_ctx *ctx,
52095209
bool *need_log_inode_item)
52105210
{
5211+
const u64 i_size = i_size_read(&inode->vfs_inode);
52115212
struct btrfs_root *root = inode->root;
52125213
int ins_start_slot = 0;
52135214
int ins_nr = 0;
@@ -5228,13 +5229,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
52285229
if (min_key->type > max_key->type)
52295230
break;
52305231

5231-
if (min_key->type == BTRFS_INODE_ITEM_KEY)
5232+
if (min_key->type == BTRFS_INODE_ITEM_KEY) {
52325233
*need_log_inode_item = false;
5233-
5234-
if ((min_key->type == BTRFS_INODE_REF_KEY ||
5235-
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5236-
inode->generation == trans->transid &&
5237-
!recursive_logging) {
5234+
} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5235+
min_key->offset >= i_size) {
5236+
/*
5237+
* Extents at and beyond eof are logged with
5238+
* btrfs_log_prealloc_extents().
5239+
* Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5240+
* and no keys greater than that, so bail out.
5241+
*/
5242+
break;
5243+
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5244+
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5245+
inode->generation == trans->transid &&
5246+
!recursive_logging) {
52385247
u64 other_ino = 0;
52395248
u64 other_parent = 0;
52405249

@@ -5265,10 +5274,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
52655274
btrfs_release_path(path);
52665275
goto next_key;
52675276
}
5268-
}
5269-
5270-
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5271-
if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5277+
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5278+
/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
52725279
if (ins_nr == 0)
52735280
goto next_slot;
52745281
ret = copy_items(trans, inode, dst_path, path,
@@ -5321,9 +5328,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
53215328
break;
53225329
}
53235330
}
5324-
if (ins_nr)
5331+
if (ins_nr) {
53255332
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
53265333
ins_nr, inode_only, logged_isize);
5334+
if (ret)
5335+
return ret;
5336+
}
5337+
5338+
if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5339+
/*
5340+
* Release the path because otherwise we might attempt to double
5341+
* lock the same leaf with btrfs_log_prealloc_extents() below.
5342+
*/
5343+
btrfs_release_path(path);
5344+
ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5345+
}
53275346

53285347
return ret;
53295348
}

0 commit comments

Comments
 (0)