@@ -1533,6 +1533,11 @@ out:
  * struct refsort is used to match byte number to slot in the btree block.
  * we sort based on the byte number and then use the slot to actually
  * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr. Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
  */
 struct refsort {
 	u64 bytenr;
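The sort() calls in the hunks below pass a refsort_cmp comparator that lives elsewhere in extent-tree.c and is not shown in this section. As a rough sketch of what such a comparator looks like (an illustration based on how struct refsort is used here, not a quote of the patch), it only needs to order entries by bytenr:

static int refsort_cmp(const void *a_void, const void *b_void)
{
	const struct refsort *a = a_void;
	const struct refsort *b = b_void;

	/* three-way compare on the byte number; the slot just rides along */
	if (a->bytenr < b->bytenr)
		return -1;
	if (a->bytenr > b->bytenr)
		return 1;
	return 0;
}

The explicit three-way compare matters: returning a->bytenr - b->bytenr would truncate a 64-bit difference into the comparator's int return value and could mis-sort large byte numbers. The page-size argument in the comment above also holds up roughly: a refsort entry (a u64 byte number plus a small slot index, on the order of 16 bytes with padding) is smaller than the on-disk item and key-pointer headers, so even a completely full btree block cannot need more refsort entries than fit in one page.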
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice. The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr. The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
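The comment in the hunk above describes the shape of the change: pass one records a (bytenr, slot) pair for every real extent in the leaf, the array is sorted by bytenr, and pass two does the freeing in on-disk order. A small standalone userspace illustration of the same idiom, with purely hypothetical names and qsort() standing in for the kernel's sort():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_refsort {
	uint64_t bytenr;
	uint32_t slot;
};

static int demo_cmp(const void *a_void, const void *b_void)
{
	const struct demo_refsort *a = a_void;
	const struct demo_refsort *b = b_void;

	if (a->bytenr < b->bytenr)
		return -1;
	if (a->bytenr > b->bytenr)
		return 1;
	return 0;
}

int main(void)
{
	/* pretend these are disk_bytenr values found in leaf slots 0..4;
	 * 0 plays the role of a hole, which has no reference to drop */
	uint64_t extents[] = { 81920, 0, 40960, 204800, 122880 };
	struct demo_refsort sorted[5];
	int refi = 0;
	int i;

	/* pass one: remember which slots have real extents */
	for (i = 0; i < 5; i++) {
		if (extents[i] == 0)
			continue;
		sorted[refi].bytenr = extents[i];
		sorted[refi].slot = i;
		refi++;
	}

	qsort(sorted, refi, sizeof(sorted[0]), demo_cmp);

	/* pass two: "free" the extents in ascending bytenr order */
	for (i = 0; i < refi; i++)
		printf("free bytenr %llu from slot %u\n",
		       (unsigned long long)sorted[i].bytenr,
		       (unsigned)sorted[i].slot);
	return 0;
}

The point of the detour through the index array is that the expensive work (here just a printf, in the patch __btrfs_free_extent) happens in ascending byte order, so the updates to the extent allocation tree cluster together instead of hopping around the disk.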
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
 	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
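Both of the hunks above (and drop_level_one_refs further down) hand the refsort array to the kernel's generic heap sort rather than open-coding one. For context, this is the helper declared in include/linux/sort.h on kernels of this vintage (shown here only as a reminder; the exact prototype should be checked against the tree this patch targets). Passing NULL for swap_func lets lib/sort.c fall back to its generic element swap:

void sort(void *base, size_t num, size_t size,
	  int (*cmp_func)(const void *, const void *),
	  void (*swap_func)(void *, void *, int size));

/* as called in the hunks above */
sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);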
@@ -3565,6 +3625,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 	return ret;
 }
 
+/*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order. Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf. If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting. It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together. We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					blocksize, eb->start,
+					root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node. This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to by that node in bulk. So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here. If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has. We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+	/*
+	 * cleanup and free the reference on the last node
+	 * we processed
+	 */
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
+
 	*level += 1;
 	BUG_ON(ret);
 
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	ref->bytenr = buf->start;
 	ref->owner = btrfs_header_owner(buf);
 	ref->generation = btrfs_header_generation(buf);
+
 	ret = btrfs_add_leaf_ref(root, ref, 0);
 	WARN_ON(ret);
 	btrfs_free_leaf_ref(root, ref);