Skip to content

Commit 6599d5e

Browse files
josefbacik authored and gregkh committed
btrfs: do not start relocation until in progress drops are done
commit b4be6ae upstream.

We hit a bug with a recovering relocation on mount for one of our file systems in production. I reproduced this locally by injecting errors into snapshot delete with balance running at the same time. This presented as an error while looking up an extent item

  WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680
  CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8
  RIP: 0010:lookup_inline_extent_backref+0x647/0x680
  RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202
  RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000
  RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001
  R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000
  R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000
  FS: 0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0
  Call Trace:
   <TASK>
   insert_inline_extent_backref+0x46/0xd0
   __btrfs_inc_extent_ref.isra.0+0x5f/0x200
   ? btrfs_merge_delayed_refs+0x164/0x190
   __btrfs_run_delayed_refs+0x561/0xfa0
   ? btrfs_search_slot+0x7b4/0xb30
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_run_delayed_refs+0x73/0x1f0
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_commit_transaction+0x50/0xa50
   ? btrfs_update_reloc_root+0x122/0x220
   prepare_to_merge+0x29f/0x320
   relocate_block_group+0x2b8/0x550
   btrfs_relocate_block_group+0x1a6/0x350
   btrfs_relocate_chunk+0x27/0xe0
   btrfs_balance+0x777/0xe60
   balance_kthread+0x35/0x50
   ? btrfs_balance+0xe60/0xe60
   kthread+0x16b/0x190
   ? set_kthread_struct+0x40/0x40
   ret_from_fork+0x22/0x30
   </TASK>

Normally snapshot deletion and relocation are excluded from running at the same time by the fs_info->cleaner_mutex. However if we had a pending balance waiting to get the ->cleaner_mutex, and a snapshot deletion was running, and then the box crashed, we would come up in a state where we have a half deleted snapshot.

Again, in the normal case the snapshot deletion needs to complete before relocation can start, but in this case relocation could very well start before the snapshot deletion completes, as we simply add the root to the dead roots list and wait for the next time the cleaner runs to clean up the snapshot.

Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that had a pending drop_progress key. If they do then we know we were in the middle of the drop operation and set a flag on the fs_info. Then balance can wait until this flag is cleared to start up again.

If there are DEAD_ROOT's that don't have a drop_progress set then we're safe to start balance right away as we'll be properly protected by the cleaner_mutex.

CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 4aef4c9 commit 6599d5e

7 files changed

Lines changed: 91 additions & 1 deletion

File tree

fs/btrfs/ctree.h

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -593,6 +593,9 @@ enum {
593593
/* Indicate whether there are any tree modification log users */
594594
BTRFS_FS_TREE_MOD_LOG_USERS,
595595

596+
/* Indicate we have half completed snapshot deletions pending. */
597+
BTRFS_FS_UNFINISHED_DROPS,
598+
596599
#if BITS_PER_LONG == 32
597600
/* Indicate if we have error/warn message printed on 32bit systems */
598601
BTRFS_FS_32BIT_ERROR,
@@ -1098,8 +1101,15 @@ enum {
10981101
BTRFS_ROOT_HAS_LOG_TREE,
10991102
/* Qgroup flushing is in progress */
11001103
BTRFS_ROOT_QGROUP_FLUSHING,
1104+
/* This root has a drop operation that was started previously. */
1105+
BTRFS_ROOT_UNFINISHED_DROP,
11011106
};
11021107

1108+
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1109+
{
1110+
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
1111+
}
1112+
11031113
/*
11041114
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
11051115
* code. For detail check comment in fs/btrfs/qgroup.c.

fs/btrfs/disk-io.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3659,6 +3659,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
36593659

36603660
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
36613661

3662+
/* Kick the cleaner thread so it'll start deleting snapshots. */
3663+
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3664+
wake_up_process(fs_info->cleaner_kthread);
3665+
36623666
clear_oneshot:
36633667
btrfs_clear_oneshot_options(fs_info);
36643668
return 0;
@@ -4340,6 +4344,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
43404344
*/
43414345
kthread_park(fs_info->cleaner_kthread);
43424346

4347+
/*
4348+
* If we had UNFINISHED_DROPS we could still be processing them, so
4349+
* clear that bit and wake up relocation so it can stop.
4350+
*/
4351+
btrfs_wake_unfinished_drop(fs_info);
4352+
43434353
/* wait for the qgroup rescan worker to stop */
43444354
btrfs_qgroup_wait_for_completion(fs_info, false);
43454355

fs/btrfs/extent-tree.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5541,6 +5541,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
55415541
int ret;
55425542
int level;
55435543
bool root_dropped = false;
5544+
bool unfinished_drop = false;
55445545

55455546
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
55465547

@@ -5583,6 +5584,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
55835584
* already dropped.
55845585
*/
55855586
set_bit(BTRFS_ROOT_DELETING, &root->state);
5587+
unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5588+
55865589
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
55875590
level = btrfs_header_level(root->node);
55885591
path->nodes[level] = btrfs_lock_root_node(root);
@@ -5757,6 +5760,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
57575760
kfree(wc);
57585761
btrfs_free_path(path);
57595762
out:
5763+
/*
5764+
* We were an unfinished drop root, check to see if there are any
5765+
* pending, and if not clear and wake up any waiters.
5766+
*/
5767+
if (!err && unfinished_drop)
5768+
btrfs_maybe_wake_unfinished_drop(fs_info);
5769+
57605770
/*
57615771
* So if we need to stop dropping the snapshot for whatever reason we
57625772
* need to make sure to add it back to the dead root list so that we

fs/btrfs/relocation.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3967,6 +3967,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
39673967
int rw = 0;
39683968
int err = 0;
39693969

3970+
/*
3971+
* This only gets set if we had a half-deleted snapshot on mount. We
3972+
* cannot allow relocation to start while we're still trying to clean up
3973+
* these pending deletions.
3974+
*/
3975+
ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
3976+
if (ret)
3977+
return ret;
3978+
3979+
/* We may have been woken up by close_ctree, so bail if we're closing. */
3980+
if (btrfs_fs_closing(fs_info))
3981+
return -EINTR;
3982+
39703983
bg = btrfs_lookup_block_group(fs_info, group_start);
39713984
if (!bg)
39723985
return -ENOENT;

fs/btrfs/root-tree.c

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -280,6 +280,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
280280

281281
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
282282
if (btrfs_root_refs(&root->root_item) == 0) {
283+
struct btrfs_key drop_key;
284+
285+
btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
286+
/*
287+
* If we have a non-zero drop_progress then we know we
288+
* made it partly through deleting this snapshot, and
289+
* thus we need to make sure we block any balance from
290+
* happening until this snapshot is completely dropped.
291+
*/
292+
if (drop_key.objectid != 0 || drop_key.type != 0 ||
293+
drop_key.offset != 0) {
294+
set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
295+
set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
296+
}
297+
283298
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
284299
btrfs_add_dead_root(root);
285300
}

fs/btrfs/transaction.c

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1340,6 +1340,32 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
13401340
return 0;
13411341
}
13421342

1343+
/*
1344+
* If we had a pending drop we need to see if there are any others left in our
1345+
* dead roots list, and if not clear our bit and wake any waiters.
1346+
*/
1347+
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1348+
{
1349+
/*
1350+
* We put the drop in progress roots at the front of the list, so if the
1351+
* first entry doesn't have UNFINISHED_DROP set we can wake everybody
1352+
* up.
1353+
*/
1354+
spin_lock(&fs_info->trans_lock);
1355+
if (!list_empty(&fs_info->dead_roots)) {
1356+
struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
1357+
struct btrfs_root,
1358+
root_list);
1359+
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
1360+
spin_unlock(&fs_info->trans_lock);
1361+
return;
1362+
}
1363+
}
1364+
spin_unlock(&fs_info->trans_lock);
1365+
1366+
btrfs_wake_unfinished_drop(fs_info);
1367+
}
1368+
13431369
/*
13441370
* dead roots are old snapshots that need to be deleted. This allocates
13451371
* a dirty root struct and adds it into the list of dead roots that need to
@@ -1352,7 +1378,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
13521378
spin_lock(&fs_info->trans_lock);
13531379
if (list_empty(&root->root_list)) {
13541380
btrfs_grab_root(root);
1355-
list_add_tail(&root->root_list, &fs_info->dead_roots);
1381+
1382+
/* We want to process the partially complete drops first. */
1383+
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
1384+
list_add(&root->root_list, &fs_info->dead_roots);
1385+
else
1386+
list_add_tail(&root->root_list, &fs_info->dead_roots);
13561387
}
13571388
spin_unlock(&fs_info->trans_lock);
13581389
}

fs/btrfs/transaction.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
217217

218218
void btrfs_add_dead_root(struct btrfs_root *root);
219219
int btrfs_defrag_root(struct btrfs_root *root);
220+
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
220221
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
221222
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
222223
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);

0 commit comments

Comments
 (0)