123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500 |
- /*
- * fs/ext4/extents_status.c
- *
- * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
- * Modified by
- * Allison Henderson <achender@linux.vnet.ibm.com>
- * Hugh Dickins <hughd@google.com>
- * Zheng Liu <wenqing.lz@taobao.com>
- *
- * Ext4 extents status tree core functions.
- */
- #include <linux/rbtree.h>
- #include "ext4.h"
- #include "extents_status.h"
- #include "ext4_extents.h"
- #include <trace/events/ext4.h>
- /*
- * According to previous discussion in Ext4 Developer Workshop, we
- * will introduce a new structure called io tree to track all extent
- * status in order to solve some problems that we have met
- * (e.g. Reservation space warning), and provide extent-level locking.
- * Delay extent tree is the first step to achieve this goal. It is
- * original built by Yongqiang Yang. At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
- * simplify the implementation of fiemap and bigalloc, and introduce
- * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
- * delay extent tree at the following comment. But for better
- * understand what it does, it has been rename to extent status tree.
- *
- * Currently the first step has been done. All delay extents are
- * tracked in the tree. It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
- * invalidated. Therefore the implementation of fiemap and bigalloc
- * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
- *
- * The following comment describes the implemenmtation of extent
- * status tree and future works.
- */
- /*
- * extents status tree implementation for ext4.
- *
- *
- * ==========================================================================
- * Extents status encompass delayed extents and extent locks
- *
- * 1. Why delayed extent implementation ?
- *
- * Without delayed extent, ext4 identifies a delayed extent by looking
- * up page cache, this has several deficiencies - complicated, buggy,
- * and inefficient code.
- *
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
- *
- * Let us have a look at how they do without delayed extents implementation.
- * -- FIEMAP
- * FIEMAP looks up page cache to identify delayed allocations from holes.
- *
- * -- SEEK_HOLE/DATA
- * SEEK_HOLE/DATA has the same problem as FIEMAP.
- *
- * -- bigalloc
- * bigalloc looks up page cache to figure out if a block is
- * already under delayed allocation or not to determine whether
- * quota reserving is needed for the cluster.
- *
- * -- punch hole
- * punch hole looks up page cache to identify a delayed extent.
- *
- * -- writeout
- * Writeout looks up whole page cache to see if a buffer is
- * mapped, If there are not very many delayed buffers, then it is
- * time comsuming.
- *
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
- * bigalloc and writeout can figure out if a block or a range of
- * blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
- *
- *
- * ==========================================================================
- * 2. ext4 delayed extents impelmentation
- *
- * -- delayed extent
- * A delayed extent is a range of blocks which are contiguous
- * logically and under delayed allocation. Unlike extent in
- * ext4, delayed extent in ext4 is a in-memory struct, there is
- * no corresponding on-disk data. There is no limit on length of
- * delayed extent, so a delayed extent can contain as many blocks
- * as they are contiguous logically.
- *
- * -- delayed extent tree
- * Every inode has a delayed extent tree and all under delayed
- * allocation blocks are added to the tree as delayed extents.
- * Delayed extents in the tree are ordered by logical block no.
- *
- * -- operations on a delayed extent tree
- * There are three operations on a delayed extent tree: find next
- * delayed extent, adding a space(a range of blocks) and removing
- * a space.
- *
- * -- race on a delayed extent tree
- * Delayed extent tree is protected inode->i_es_lock.
- *
- *
- * ==========================================================================
- * 3. performance analysis
- * -- overhead
- * 1. There is a cache extent for write access, so if writes are
- * not very random, adding space operaions are in O(1) time.
- *
- * -- gain
- * 2. Code is much simpler, more readable, more maintainable and
- * more efficient.
- *
- *
- * ==========================================================================
- * 4. TODO list
- * -- Track all extent status
- *
- * -- Improve get block process
- *
- * -- Extent-level locking
- */
- static struct kmem_cache *ext4_es_cachep;
- int __init ext4_init_es(void)
- {
- ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
- if (ext4_es_cachep == NULL)
- return -ENOMEM;
- return 0;
- }
- void ext4_exit_es(void)
- {
- if (ext4_es_cachep)
- kmem_cache_destroy(ext4_es_cachep);
- }
- void ext4_es_init_tree(struct ext4_es_tree *tree)
- {
- tree->root = RB_ROOT;
- tree->cache_es = NULL;
- }
- #ifdef ES_DEBUG__
- static void ext4_es_print_tree(struct inode *inode)
- {
- struct ext4_es_tree *tree;
- struct rb_node *node;
- printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
- tree = &EXT4_I(inode)->i_es_tree;
- node = rb_first(&tree->root);
- while (node) {
- struct extent_status *es;
- es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
- node = rb_next(node);
- }
- printk(KERN_DEBUG "\n");
- }
- #else
- #define ext4_es_print_tree(inode)
- #endif
- static inline ext4_lblk_t extent_status_end(struct extent_status *es)
- {
- BUG_ON(es->start + es->len < es->start);
- return es->start + es->len - 1;
- }
- /*
- * search through the tree for an delayed extent with a given offset. If
- * it can't be found, try to find next extent.
- */
- static struct extent_status *__es_tree_search(struct rb_root *root,
- ext4_lblk_t offset)
- {
- struct rb_node *node = root->rb_node;
- struct extent_status *es = NULL;
- while (node) {
- es = rb_entry(node, struct extent_status, rb_node);
- if (offset < es->start)
- node = node->rb_left;
- else if (offset > extent_status_end(es))
- node = node->rb_right;
- else
- return es;
- }
- if (es && offset < es->start)
- return es;
- if (es && offset > extent_status_end(es)) {
- node = rb_next(&es->rb_node);
- return node ? rb_entry(node, struct extent_status, rb_node) :
- NULL;
- }
- return NULL;
- }
- /*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
- *
- * @inode: the inode which owns delayed extents
- * @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
- */
- ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
- {
- struct ext4_es_tree *tree = NULL;
- struct extent_status *es1 = NULL;
- struct rb_node *node;
- ext4_lblk_t ret = EXT_MAX_BLOCKS;
- trace_ext4_es_find_extent_enter(inode, es->start);
- read_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- /* find delay extent in cache firstly */
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(es->start, es1->start, es1->len)) {
- es_debug("%u cached by [%u/%u)\n",
- es->start, es1->start, es1->len);
- goto out;
- }
- }
- es->len = 0;
- es1 = __es_tree_search(&tree->root, es->start);
- out:
- if (es1) {
- tree->cache_es = es1;
- es->start = es1->start;
- es->len = es1->len;
- node = rb_next(&es1->rb_node);
- if (node) {
- es1 = rb_entry(node, struct extent_status, rb_node);
- ret = es1->start;
- }
- }
- read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_extent_exit(inode, es, ret);
- return ret;
- }
- static struct extent_status *
- ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
- {
- struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
- if (es == NULL)
- return NULL;
- es->start = start;
- es->len = len;
- return es;
- }
- static void ext4_es_free_extent(struct extent_status *es)
- {
- kmem_cache_free(ext4_es_cachep, es);
- }
- static struct extent_status *
- ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
- {
- struct extent_status *es1;
- struct rb_node *node;
- node = rb_prev(&es->rb_node);
- if (!node)
- return es;
- es1 = rb_entry(node, struct extent_status, rb_node);
- if (es->start == extent_status_end(es1) + 1) {
- es1->len += es->len;
- rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
- es = es1;
- }
- return es;
- }
- static struct extent_status *
- ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
- {
- struct extent_status *es1;
- struct rb_node *node;
- node = rb_next(&es->rb_node);
- if (!node)
- return es;
- es1 = rb_entry(node, struct extent_status, rb_node);
- if (es1->start == extent_status_end(es) + 1) {
- es->len += es1->len;
- rb_erase(node, &tree->root);
- ext4_es_free_extent(es1);
- }
- return es;
- }
- static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
- ext4_lblk_t len)
- {
- struct rb_node **p = &tree->root.rb_node;
- struct rb_node *parent = NULL;
- struct extent_status *es;
- ext4_lblk_t end = offset + len - 1;
- BUG_ON(end < offset);
- es = tree->cache_es;
- if (es && offset == (extent_status_end(es) + 1)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- } else if (es && es->start == end + 1) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- } else if (es && es->start <= offset &&
- end <= extent_status_end(es)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- goto out;
- }
- while (*p) {
- parent = *p;
- es = rb_entry(parent, struct extent_status, rb_node);
- if (offset < es->start) {
- if (es->start == end + 1) {
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- }
- p = &(*p)->rb_left;
- } else if (offset > extent_status_end(es)) {
- if (offset == extent_status_end(es) + 1) {
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- }
- p = &(*p)->rb_right;
- } else {
- if (extent_status_end(es) <= end)
- es->len = offset - es->start + len;
- goto out;
- }
- }
- es = ext4_es_alloc_extent(offset, len);
- if (!es)
- return -ENOMEM;
- rb_link_node(&es->rb_node, parent, p);
- rb_insert_color(&es->rb_node, &tree->root);
- out:
- tree->cache_es = es;
- return 0;
- }
- /*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
- *
- * Return 0 on success, error code on failure.
- */
- int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
- {
- struct ext4_es_tree *tree;
- int err = 0;
- trace_ext4_es_insert_extent(inode, offset, len);
- es_debug("add [%u/%u) to extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- err = __es_insert_extent(tree, offset, len);
- write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_print_tree(inode);
- return err;
- }
- /*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
- *
- * Return 0 on success, error code on failure.
- */
- int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
- {
- struct rb_node *node;
- struct ext4_es_tree *tree;
- struct extent_status *es;
- struct extent_status orig_es;
- ext4_lblk_t len1, len2, end;
- int err = 0;
- trace_ext4_es_remove_extent(inode, offset, len);
- es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
- end = offset + len - 1;
- BUG_ON(end < offset);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- es = __es_tree_search(&tree->root, offset);
- if (!es)
- goto out;
- if (es->start > end)
- goto out;
- /* Simply invalidate cache_es. */
- tree->cache_es = NULL;
- orig_es.start = es->start;
- orig_es.len = es->len;
- len1 = offset > es->start ? offset - es->start : 0;
- len2 = extent_status_end(es) > end ?
- extent_status_end(es) - end : 0;
- if (len1 > 0)
- es->len = len1;
- if (len2 > 0) {
- if (len1 > 0) {
- err = __es_insert_extent(tree, end + 1, len2);
- if (err) {
- es->start = orig_es.start;
- es->len = orig_es.len;
- goto out;
- }
- } else {
- es->start = end + 1;
- es->len = len2;
- }
- goto out;
- }
- if (len1 > 0) {
- node = rb_next(&es->rb_node);
- if (node)
- es = rb_entry(node, struct extent_status, rb_node);
- else
- es = NULL;
- }
- while (es && extent_status_end(es) <= end) {
- node = rb_next(&es->rb_node);
- rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
- if (!node) {
- es = NULL;
- break;
- }
- es = rb_entry(node, struct extent_status, rb_node);
- }
- if (es && es->start < end + 1) {
- len1 = extent_status_end(es) - end;
- es->start = end + 1;
- es->len = len1;
- }
- out:
- write_unlock(&EXT4_I(inode)->i_es_lock);
- ext4_es_print_tree(inode);
- return err;
- }
|