123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551 |
- /*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
- #include <linux/gfp.h>
- #include <linux/slab.h>
- #include <linux/blkdev.h>
- #include "ctree.h"
- #include "transaction.h"
- #include "btrfs_inode.h"
- #include "extent_io.h"
- static u64 entry_end(struct btrfs_ordered_extent *entry)
- {
- if (entry->file_offset + entry->len < entry->file_offset)
- return (u64)-1;
- return entry->file_offset + entry->len;
- }
- static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
- struct rb_node *node)
- {
- struct rb_node ** p = &root->rb_node;
- struct rb_node * parent = NULL;
- struct btrfs_ordered_extent *entry;
- while(*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
- if (file_offset < entry->file_offset)
- p = &(*p)->rb_left;
- else if (file_offset >= entry_end(entry))
- p = &(*p)->rb_right;
- else
- return parent;
- }
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
- }
- static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
- struct rb_node **prev_ret)
- {
- struct rb_node * n = root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *test;
- struct btrfs_ordered_extent *entry;
- struct btrfs_ordered_extent *prev_entry = NULL;
- while(n) {
- entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- prev = n;
- prev_entry = entry;
- if (file_offset < entry->file_offset)
- n = n->rb_left;
- else if (file_offset >= entry_end(entry))
- n = n->rb_right;
- else
- return n;
- }
- if (!prev_ret)
- return NULL;
- while(prev && file_offset >= entry_end(prev_entry)) {
- test = rb_next(prev);
- if (!test)
- break;
- prev_entry = rb_entry(test, struct btrfs_ordered_extent,
- rb_node);
- if (file_offset < entry_end(prev_entry))
- break;
- prev = test;
- }
- if (prev)
- prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
- rb_node);
- while(prev && file_offset < entry_end(prev_entry)) {
- test = rb_prev(prev);
- if (!test)
- break;
- prev_entry = rb_entry(test, struct btrfs_ordered_extent,
- rb_node);
- prev = test;
- }
- *prev_ret = prev;
- return NULL;
- }
- static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
- {
- if (file_offset < entry->file_offset ||
- entry->file_offset + entry->len <= file_offset)
- return 0;
- return 1;
- }
- static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
- u64 file_offset)
- {
- struct rb_root *root = &tree->tree;
- struct rb_node *prev;
- struct rb_node *ret;
- struct btrfs_ordered_extent *entry;
- if (tree->last) {
- entry = rb_entry(tree->last, struct btrfs_ordered_extent,
- rb_node);
- if (offset_in_entry(entry, file_offset))
- return tree->last;
- }
- ret = __tree_search(root, file_offset, &prev);
- if (!ret)
- ret = prev;
- if (ret)
- tree->last = ret;
- return ret;
- }
- /* allocate and add a new ordered_extent into the per-inode tree.
- * file_offset is the logical offset in the file
- *
- * start is the disk block number of an extent already reserved in the
- * extent allocation tree
- *
- * len is the length of the extent
- *
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
- * The tree is given a single reference on the ordered extent that was
- * inserted.
- */
- int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len)
- {
- struct btrfs_ordered_inode_tree *tree;
- struct rb_node *node;
- struct btrfs_ordered_extent *entry;
- tree = &BTRFS_I(inode)->ordered_tree;
- entry = kzalloc(sizeof(*entry), GFP_NOFS);
- if (!entry)
- return -ENOMEM;
- mutex_lock(&tree->mutex);
- entry->file_offset = file_offset;
- entry->start = start;
- entry->len = len;
- /* one ref for the tree */
- atomic_set(&entry->refs, 1);
- init_waitqueue_head(&entry->wait);
- INIT_LIST_HEAD(&entry->list);
- node = tree_insert(&tree->tree, file_offset,
- &entry->rb_node);
- if (node) {
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- atomic_inc(&entry->refs);
- }
- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
- entry_end(entry) - 1, GFP_NOFS);
- mutex_unlock(&tree->mutex);
- BUG_ON(node);
- return 0;
- }
- /*
- * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
- * when an ordered extent is finished. If the list covers more than one
- * ordered extent, it is split across multiples.
- */
- int btrfs_add_ordered_sum(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- struct btrfs_ordered_sum *sum)
- {
- struct btrfs_ordered_inode_tree *tree;
- tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
- list_add_tail(&sum->list, &entry->list);
- mutex_unlock(&tree->mutex);
- return 0;
- }
- /*
- * this is used to account for finished IO across a given range
- * of the file. The IO should not span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
- *
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
- */
- int btrfs_dec_test_ordered_pending(struct inode *inode,
- u64 file_offset, u64 io_size)
- {
- struct btrfs_ordered_inode_tree *tree;
- struct rb_node *node;
- struct btrfs_ordered_extent *entry;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
- GFP_NOFS);
- node = tree_search(tree, file_offset);
- if (!node) {
- ret = 1;
- goto out;
- }
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, file_offset)) {
- ret = 1;
- goto out;
- }
- ret = test_range_bit(io_tree, entry->file_offset,
- entry->file_offset + entry->len - 1,
- EXTENT_ORDERED, 0);
- if (ret == 0)
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- out:
- mutex_unlock(&tree->mutex);
- return ret == 0;
- }
- /*
- * used to drop a reference on an ordered extent. This will free
- * the extent if the last reference is dropped
- */
- int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
- {
- struct list_head *cur;
- struct btrfs_ordered_sum *sum;
- if (atomic_dec_and_test(&entry->refs)) {
- while(!list_empty(&entry->list)) {
- cur = entry->list.next;
- sum = list_entry(cur, struct btrfs_ordered_sum, list);
- list_del(&sum->list);
- kfree(sum);
- }
- kfree(entry);
- }
- return 0;
- }
- /*
- * remove an ordered extent from the tree. No references are dropped
- * but, anyone waiting on this extent is woken up.
- */
- int btrfs_remove_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry)
- {
- struct btrfs_ordered_inode_tree *tree;
- struct rb_node *node;
- tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
- node = &entry->rb_node;
- rb_erase(node, &tree->tree);
- tree->last = NULL;
- set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- mutex_unlock(&tree->mutex);
- wake_up(&entry->wait);
- return 0;
- }
- /*
- * Used to start IO or wait for a given ordered extent to finish.
- *
- * If wait is one, this effectively waits on page writeback for all the pages
- * in the extent, and it waits on the io completion code to insert
- * metadata into the btree corresponding to the extent
- */
- void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
- {
- u64 start = entry->file_offset;
- u64 end = start + entry->len - 1;
- /*
- * pages in the range can be dirty, clean or writeback. We
- * start IO on any dirty ones so the wait doesn't stall waiting
- * for pdflush to find them
- */
- #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
- do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
- #else
- do_sync_mapping_range(inode->i_mapping, start, end,
- SYNC_FILE_RANGE_WRITE);
- #endif
- if (wait)
- wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &entry->flags));
- }
- /*
- * Used to wait on ordered extents across a large range of bytes.
- */
- void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
- {
- u64 end;
- struct btrfs_ordered_extent *ordered;
- int found;
- int should_wait = 0;
- again:
- if (start + len < start)
- end = (u64)-1;
- else
- end = start + len - 1;
- found = 0;
- while(1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
- if (!ordered) {
- break;
- }
- if (ordered->file_offset >= start + len) {
- btrfs_put_ordered_extent(ordered);
- break;
- }
- if (ordered->file_offset + ordered->len < start) {
- btrfs_put_ordered_extent(ordered);
- break;
- }
- btrfs_start_ordered_extent(inode, ordered, should_wait);
- found++;
- end = ordered->file_offset;
- btrfs_put_ordered_extent(ordered);
- if (end == 0)
- break;
- end--;
- }
- if (should_wait && found) {
- should_wait = 0;
- goto again;
- }
- }
- /*
- * find an ordered extent corresponding to file_offset. return NULL if
- * nothing is found, otherwise take a reference on the extent and return it
- */
- struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
- u64 file_offset)
- {
- struct btrfs_ordered_inode_tree *tree;
- struct rb_node *node;
- struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
- node = tree_search(tree, file_offset);
- if (!node)
- goto out;
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, file_offset))
- entry = NULL;
- if (entry)
- atomic_inc(&entry->refs);
- out:
- mutex_unlock(&tree->mutex);
- return entry;
- }
- /*
- * lookup and return any extent before 'file_offset'. NULL is returned
- * if none is found
- */
- struct btrfs_ordered_extent *
- btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
- {
- struct btrfs_ordered_inode_tree *tree;
- struct rb_node *node;
- struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
- mutex_lock(&tree->mutex);
- node = tree_search(tree, file_offset);
- if (!node)
- goto out;
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- atomic_inc(&entry->refs);
- out:
- mutex_unlock(&tree->mutex);
- return entry;
- }
- /*
- * After an extent is done, call this to conditionally update the on disk
- * i_size. i_size is updated to cover any fully written part of the file.
- */
- int btrfs_ordered_update_i_size(struct inode *inode,
- struct btrfs_ordered_extent *ordered)
- {
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- u64 disk_i_size;
- u64 new_i_size;
- u64 i_size_test;
- struct rb_node *node;
- struct btrfs_ordered_extent *test;
- mutex_lock(&tree->mutex);
- disk_i_size = BTRFS_I(inode)->disk_i_size;
- /*
- * if the disk i_size is already at the inode->i_size, or
- * this ordered extent is inside the disk i_size, we're done
- */
- if (disk_i_size >= inode->i_size ||
- ordered->file_offset + ordered->len <= disk_i_size) {
- goto out;
- }
- /*
- * we can't update the disk_isize if there are delalloc bytes
- * between disk_i_size and this ordered extent
- */
- if (test_range_bit(io_tree, disk_i_size,
- ordered->file_offset + ordered->len - 1,
- EXTENT_DELALLOC, 0)) {
- goto out;
- }
- /*
- * walk backward from this ordered extent to disk_i_size.
- * if we find an ordered extent then we can't update disk i_size
- * yet
- */
- node = &ordered->rb_node;
- while(1) {
- node = rb_prev(node);
- if (!node)
- break;
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset + test->len <= disk_i_size)
- break;
- if (test->file_offset >= inode->i_size)
- break;
- if (test->file_offset >= disk_i_size)
- goto out;
- }
- new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
- /*
- * at this point, we know we can safely update i_size to at least
- * the offset from this ordered extent. But, we need to
- * walk forward and see if ios from higher up in the file have
- * finished.
- */
- node = rb_next(&ordered->rb_node);
- i_size_test = 0;
- if (node) {
- /*
- * do we have an area where IO might have finished
- * between our ordered extent and the next one.
- */
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (test->file_offset > entry_end(ordered)) {
- i_size_test = test->file_offset - 1;
- }
- } else {
- i_size_test = i_size_read(inode);
- }
- /*
- * i_size_test is the end of a region after this ordered
- * extent where there are no ordered extents. As long as there
- * are no delalloc bytes in this area, it is safe to update
- * disk_i_size to the end of the region.
- */
- if (i_size_test > entry_end(ordered) &&
- !test_range_bit(io_tree, entry_end(ordered), i_size_test,
- EXTENT_DELALLOC, 0)) {
- new_i_size = min_t(u64, i_size_test, i_size_read(inode));
- }
- BTRFS_I(inode)->disk_i_size = new_i_size;
- out:
- mutex_unlock(&tree->mutex);
- return 0;
- }
- /*
- * search the ordered extents for one corresponding to 'offset' and
- * try to find a checksum. This is used because we allow pages to
- * be reclaimed before their checksum is actually put into the btree
- */
- int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
- {
- struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_sector_sum *sector_sums;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct list_head *cur;
- unsigned long num_sectors;
- unsigned long i;
- u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
- int ret = 1;
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- if (!ordered)
- return 1;
- mutex_lock(&tree->mutex);
- list_for_each_prev(cur, &ordered->list) {
- ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
- if (offset >= ordered_sum->file_offset) {
- num_sectors = ordered_sum->len / sectorsize;
- sector_sums = &ordered_sum->sums;
- for (i = 0; i < num_sectors; i++) {
- if (sector_sums[i].offset == offset) {
- printk("find ordered sum inode %lu offset %Lu\n", inode->i_ino, offset);
- *sum = sector_sums[i].sum;
- ret = 0;
- goto out;
- }
- }
- }
- }
- out:
- mutex_unlock(&tree->mutex);
- return ret;
- }
|