16 years ago · 5a3f23d515
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
 
															 	 */
														
 
															 	struct list_head delalloc_inodes;
														
 
															+	/*
														
 
															+	 * list for tracking inodes that must be sent to disk before a
														
 
															+	 * rename or truncate commit
														
 
															+	 */
														
 
															+	struct list_head ordered_operations;
														
 
															+
														
 
															 	/* the space_info for where this inode's data allocations are done */
														
 
															 	struct btrfs_space_info *space_info;
														
@@ -122,6 +128,18 @@ struct btrfs_inode {
 
															 	 */
														
 
															 	u64 last_unlink_trans;
														
 
															+	/*
														
 
															+	 * ordered_data_close is set by truncate when a file that used
														
 
															+	 * to have good data has been truncated to zero.  When it is set
														
 
															+	 * the btrfs file release call will add this inode to the
														
 
															+	 * ordered operations list so that we make sure to flush out any
														
 
															+	 * new data the application may have written before commit.
														
 
															+	 *
														
 
															+	 * yes, it's silly to have a single bitflag, but we might grow more
														
 
															+	 * of these.
														
 
															+	 */
														
 
															+	unsigned ordered_data_close:1;
														
 
															+
														
 
															 	struct inode vfs_inode;
														
 
															 };
														
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
															 #define BTRFS_MAX_LEVEL 8
														
 
															+/*
														
 
															+ * files bigger than this get some pre-flushing when they are added
														
 
															+ * to the ordered operations list.  That way we limit the total
														
 
															+ * work done by the commit
														
 
															+ */
														
 
															+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
														
 
															+
														
 
															 /* holds pointers to all of the tree roots */
														
 
															 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
														
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
 
															 	struct mutex volume_mutex;
														
 
															 	struct mutex tree_reloc_mutex;
														
 
															+	/*
														
 
															+	 * this protects the ordered operations list only while we are
														
 
															+	 * processing all of the entries on it.  This way we make
														
 
															+	 * sure the commit code doesn't find the list temporarily empty
														
 
															+	 * because another function happens to be doing non-waiting preflush
														
 
															+	 * before jumping into the main commit.
														
 
															+	 */
														
 
															+	struct mutex ordered_operations_mutex;
														
 
															+
														
 
															 	struct list_head trans_list;
														
 
															 	struct list_head hashers;
														
 
															 	struct list_head dead_roots;
														
@@ -741,9 +757,28 @@ struct btrfs_fs_info {
 
															 	 * ordered extents
														
 
															 	 */
														
 
															 	spinlock_t ordered_extent_lock;
														
 
															+
														
 
															+	/*
														
 
															+	 * all of the data=ordered extents pending writeback
														
 
															+	 * these can span multiple transactions and basically include
														
 
															+	 * every dirty data page that isn't from nodatacow
														
 
															+	 */
														
 
															 	struct list_head ordered_extents;
														
 
															+
														
 
															+	/*
														
 
															+	 * all of the inodes that have delalloc bytes.  It is possible for
														
 
															+	 * this list to be empty even when there is still dirty data=ordered
														
 
															+	 * extents waiting to finish IO.
														
 
															+	 */
														
 
															 	struct list_head delalloc_inodes;
														
 
															+	/*
														
 
															+	 * special rename and truncate targets that must be on disk before
														
 
															+	 * we're allowed to commit.  This is basically the ext3 style
														
 
															+	 * data=ordered list.
														
 
															+	 */
														
 
															+	struct list_head ordered_operations;
														
 
															+
														
 
															 	/*
														
 
															 	 * there is a pool of worker threads for checksumming during writes
														
 
															 	 * and a pool for checksumming after reads.  This is because readers
														
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
															 	INIT_LIST_HEAD(&fs_info->dead_roots);
														
 
															 	INIT_LIST_HEAD(&fs_info->hashers);
														
 
															 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
														
 
															+	INIT_LIST_HEAD(&fs_info->ordered_operations);
														
 
															 	spin_lock_init(&fs_info->delalloc_lock);
														
 
															 	spin_lock_init(&fs_info->new_trans_lock);
														
 
															 	spin_lock_init(&fs_info->ref_cache_lock);
														
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
															 	insert_inode_hash(fs_info->btree_inode);
														
 
															 	mutex_init(&fs_info->trans_mutex);
														
 
															+	mutex_init(&fs_info->ordered_operations_mutex);
														
 
															 	mutex_init(&fs_info->tree_log_mutex);
														
 
															 	mutex_init(&fs_info->drop_mutex);
														
 
															 	mutex_init(&fs_info->pinned_mutex);
														
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
 
															 		page_cache_release(pinned[1]);
														
 
															 	*ppos = pos;
														
 
															+	/*
														
 
															+	 * we want to make sure fsync finds this change
														
 
															+	 * but we haven't joined a transaction running right now.
														
 
															+	 *
														
 
															+	 * Later on, someone is sure to update the inode and get the
														
 
															+	 * real transid recorded.
														
 
															+	 *
														
 
															+	 * We set last_trans now to the fs_info generation + 1,
														
 
															+	 * this will either be one more than the running transaction
														
 
															+	 * or the generation used for the next transaction if there isn't
														
 
															+	 * one running right now.
														
 
															+	 */
														
 
															+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
														
 
															+
														
 
															 	if (num_written > 0 && will_write) {
														
 
															 		struct btrfs_trans_handle *trans;
														
@@ -1194,6 +1208,18 @@ out_nolock:
 
															 int btrfs_release_file(struct inode *inode, struct file *filp)
														
 
															 {
														
 
															+	/*
														
 
															+	 * ordered_data_close is set by setattr when we are about to truncate
														
 
															+	 * a file from a non-zero size to a zero size.  This tries to
														
 
															+	 * flush down new bytes that may have been written if the
														
 
															+	 * application were using truncate to replace a file in place.
														
 
															+	 */
														
 
															+	if (BTRFS_I(inode)->ordered_data_close) {
														
 
															+		BTRFS_I(inode)->ordered_data_close = 0;
														
 
															+		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
														
 
															+		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
														
 
															+			filemap_flush(inode->i_mapping);
														
 
															+	}
														
 
															 	if (filp->private_data)
														
 
															 		btrfs_ioctl_trans_end(filp);
														
 
															 	return 0;
														
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
															 	if (err)
														
 
															 		return err;
														
 
															-	if (S_ISREG(inode->i_mode) &&
														
 
															-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
														
 
															-		err = btrfs_cont_expand(inode, attr->ia_size);
														
 
															-		if (err)
														
 
															-			return err;
														
 
															+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
														
 
															+		if (attr->ia_size > inode->i_size) {
														
 
															+			err = btrfs_cont_expand(inode, attr->ia_size);
														
 
															+			if (err)
														
 
															+				return err;
														
 
															+		} else if (inode->i_size > 0 &&
														
 
															+			   attr->ia_size == 0) {
														
 
															+
														
 
															+			/* we're truncating a file that used to have good
														
 
															+			 * data down to zero.  Make sure it gets into
														
 
															+			 * the ordered flush list so that any new writes
														
 
															+			 * get down to disk quickly.
														
 
															+			 */
														
 
															+			BTRFS_I(inode)->ordered_data_close = 1;
														
 
															+		}
														
 
															 	}
														
 
															 	err = inode_setattr(inode, attr);
														
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 
															 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
														
 
															 			     inode->i_mapping, GFP_NOFS);
														
 
															 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
														
 
															+	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
														
 
															 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
														
 
															 	mutex_init(&BTRFS_I(inode)->extent_mutex);
														
 
															 	mutex_init(&BTRFS_I(inode)->log_mutex);
														
@@ -4419,6 +4430,8 @@ again:
 
															 	}
														
 
															 	ClearPageChecked(page);
														
 
															 	set_page_dirty(page);
														
 
															+
														
 
															+	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
														
 
															 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
														
 
															 out_unlock:
														
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
 
															 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
														
 
															 	trans = btrfs_start_transaction(root, 1);
														
 
															+
														
 
															+	/*
														
 
															+	 * setattr is responsible for setting the ordered_data_close flag,
														
 
															+	 * but that is only tested during the last file release.  That
														
 
															+	 * could happen well after the next commit, leaving a great big
														
 
															+	 * window where new writes may get lost if someone chooses to write
														
 
															+	 * to this file after truncating to zero
														
 
															+	 *
														
 
															+	 * The inode doesn't have any dirty data here, and so if we commit
														
 
															+	 * this is a noop.  If someone immediately starts writing to the inode
														
 
															+	 * it is very likely we'll catch some of their writes in this
														
 
															+	 * transaction, and the commit will find this file on the ordered
														
 
															+	 * data list with good things to send down.
														
 
															+	 *
														
 
															+	 * This is a best effort solution, there is still a window where
														
 
															+	 * using truncate to replace the contents of the file will
														
 
															+	 * end up with a zero length file after a crash.
														
 
															+	 */
														
 
															+	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
														
 
															+		btrfs_add_ordered_operation(trans, root, inode);
														
 
															+
														
 
															 	btrfs_set_trans_block_group(trans, inode);
														
 
															 	btrfs_i_size_write(inode, inode->i_size);
														
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
															 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
														
 
															 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
														
 
															 	INIT_LIST_HEAD(&ei->i_orphan);
														
 
															+	INIT_LIST_HEAD(&ei->ordered_operations);
														
 
															 	return &ei->vfs_inode;
														
 
															 }
														
 
															 void btrfs_destroy_inode(struct inode *inode)
														
 
															 {
														
 
															 	struct btrfs_ordered_extent *ordered;
														
 
															+	struct btrfs_root *root = BTRFS_I(inode)->root;
														
 
															+
														
 
															 	WARN_ON(!list_empty(&inode->i_dentry));
														
 
															 	WARN_ON(inode->i_data.nrpages);
														
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
 
															 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
														
 
															 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
														
 
															-	spin_lock(&BTRFS_I(inode)->root->list_lock);
														
 
															+	/*
														
 
															+	 * Make sure we're properly removed from the ordered operation
														
 
															+	 * lists.
														
 
															+	 */
														
 
															+	smp_mb();
														
 
															+	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
														
 
															+		spin_lock(&root->fs_info->ordered_extent_lock);
														
 
															+		list_del_init(&BTRFS_I(inode)->ordered_operations);
														
 
															+		spin_unlock(&root->fs_info->ordered_extent_lock);
														
 
															+	}
														
 
															+
														
 
															+	spin_lock(&root->list_lock);
														
 
															 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
														
 
															 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
														
 
															 		       " list\n", inode->i_ino);
														
 
															 		dump_stack();
														
 
															 	}
														
 
															-	spin_unlock(&BTRFS_I(inode)->root->list_lock);
														
 
															+	spin_unlock(&root->list_lock);
														
 
															 	while (1) {
														
 
															 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
														
@@ -4667,8 +4715,27 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
															 	if (ret)
														
 
															 		goto out_unlock;
														
 
															+	/*
														
 
															+	 * we're using rename to replace one file with another.
														
 
															+	 * and the replacement file is large.  Start IO on it now so
														
 
															+	 * we don't add too much work to the end of the transaction
														
 
															+	 */
														
 
															+	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
														
 
															+	    new_inode->i_size &&
														
 
															+	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
														
 
															+		filemap_flush(old_inode->i_mapping);
														
 
															+
														
 
															 	trans = btrfs_start_transaction(root, 1);
														
 
															+	/*
														
 
															+	 * make sure the inode gets flushed if it is replacing
														
 
															+	 * something.
														
 
															+	 */
														
 
															+	if (new_inode && new_inode->i_size &&
														
 
															+	    old_inode && S_ISREG(old_inode->i_mode)) {
														
 
															+		btrfs_add_ordered_operation(trans, root, old_inode);
														
 
															+	}
														
 
															+
														
 
															 	/*
														
 
															 	 * this is an ugly little race, but the rename is required to make
														
 
															 	 * sure that if we crash, the inode is either at the old name
														
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 
															 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
														
 
															 	list_del_init(&entry->root_extent_list);
														
 
															+
														
 
															+	/*
														
 
															+	 * we have no more ordered extents for this inode and
														
 
															+	 * no dirty pages.  We can safely remove it from the
														
 
															+	 * list of ordered operations
														
 
															+	 */
														
 
															+	if (RB_EMPTY_ROOT(&tree->tree) &&
														
 
															+	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
														
 
															+		list_del_init(&BTRFS_I(inode)->ordered_operations);
														
 
															+	}
														
 
															 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
														
 
															 	mutex_unlock(&tree->mutex);
														
@@ -369,6 +379,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 
															 	return 0;
														
 
															 }
														
 
															+/*
														
 
															+ * this is used during transaction commit to write all the inodes
														
 
															+ * added to the ordered operation list.  These files must be fully on
														
 
															+ * disk before the transaction commits.
														
 
															+ *
														
 
															+ * we have two modes here, one is to just start the IO via filemap_flush
														
 
															+ * and the other is to wait for all the io.  When we wait, we have an
														
 
															+ * extra check to make sure the ordered operation list really is empty
														
 
															+ * before we return
														
 
															+ */
														
 
															+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
														
 
															+{
														
 
															+	struct btrfs_inode *btrfs_inode;
														
 
															+	struct inode *inode;
														
 
															+	struct list_head splice;
														
 
															+
														
 
															+	INIT_LIST_HEAD(&splice);
														
 
															+
														
 
															+	mutex_lock(&root->fs_info->ordered_operations_mutex);
														
 
															+	spin_lock(&root->fs_info->ordered_extent_lock);
														
 
															+again:
														
 
															+	list_splice_init(&root->fs_info->ordered_operations, &splice);
														
 
															+
														
 
															+	while (!list_empty(&splice)) {
														
 
															+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
														
 
															+				   ordered_operations);
														
 
															+
														
 
															+		inode = &btrfs_inode->vfs_inode;
														
 
															+
														
 
															+		list_del_init(&btrfs_inode->ordered_operations);
														
 
															+
														
 
															+		/*
														
 
															+		 * the inode may be getting freed (in sys_unlink path).
														
 
															+		 */
														
 
															+		inode = igrab(inode);
														
 
															+
														
 
															+		if (!wait && inode) {
														
 
															+			list_add_tail(&BTRFS_I(inode)->ordered_operations,
														
 
															+			      &root->fs_info->ordered_operations);
														
 
															+		}
														
 
															+		spin_unlock(&root->fs_info->ordered_extent_lock);
														
 
															+
														
 
															+		if (inode) {
														
 
															+			if (wait)
														
 
															+				btrfs_wait_ordered_range(inode, 0, (u64)-1);
														
 
															+			else
														
 
															+				filemap_flush(inode->i_mapping);
														
 
															+			iput(inode);
														
 
															+		}
														
 
															+
														
 
															+		cond_resched();
														
 
															+		spin_lock(&root->fs_info->ordered_extent_lock);
														
 
															+	}
														
 
															+	if (wait && !list_empty(&root->fs_info->ordered_operations))
														
 
															+		goto again;
														
 
															+
														
 
															+	spin_unlock(&root->fs_info->ordered_extent_lock);
														
 
															+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
														
 
															+
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															 /*
														
 
															  * Used to start IO or wait for a given ordered extent to finish.
														
 
															  *
														
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
															 	return ret;
														
 
															 }
														
 
															+
														
 
															+/*
														
 
															+ * add a given inode to the list of inodes that must be fully on
														
 
															+ * disk before a transaction commit finishes.
														
 
															+ *
														
 
															+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
														
 
															+ * used to make sure renamed files are fully on disk.
														
 
															+ *
														
 
															+ * It is a noop if the inode is already fully on disk.
														
 
															+ *
														
 
															+ * If trans is not null, we'll do a friendly check for a transaction that
														
 
															+ * is already flushing things and force the IO down ourselves.
														
 
															+ */
														
 
															+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
														
 
															+				struct btrfs_root *root,
														
 
															+				struct inode *inode)
														
 
															+{
														
 
															+	u64 last_mod;
														
 
															+
														
 
															+	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
														
 
															+
														
 
															+	/*
														
 
															+	 * if this file hasn't been changed since the last transaction
														
 
															+	 * commit, we can safely return without doing anything
														
 
															+	 */
														
 
															+	if (last_mod < root->fs_info->last_trans_committed)
														
 
															+		return 0;
														
 
															+
														
 
															+	/*
														
 
															+	 * the transaction is already committing.  Just start the IO and
														
 
															+	 * don't bother with all of this list nonsense
														
 
															+	 */
														
 
															+	if (trans && root->fs_info->running_transaction->blocked) {
														
 
															+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
														
 
															+		return 0;
														
 
															+	}
														
 
															+
														
 
															+	spin_lock(&root->fs_info->ordered_extent_lock);
														
 
															+	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
														
 
															+		list_add_tail(&BTRFS_I(inode)->ordered_operations,
														
 
															+			      &root->fs_info->ordered_operations);
														
 
															+	}
														
 
															+	spin_unlock(&root->fs_info->ordered_extent_lock);
														
 
															+
														
 
															+	return 0;
														
 
															+}
														
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
															 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
														
 
															 			   loff_t end, int sync_mode);
														
 
															 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
														
 
															+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
														
 
															+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
														
 
															+				struct btrfs_root *root,
														
 
															+				struct inode *inode);
														
 
															 #endif
														
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
															 	int should_grow = 0;
														
 
															 	unsigned long now = get_seconds();
														
 
															+	btrfs_run_ordered_operations(root, 0);
														
 
															+
														
 
															 	/* make a pass through all the delayed refs we have so far
														
 
															 	 * any runnings procs may add more while we are here
														
 
															 	 */
														
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
															 			BUG_ON(ret);
														
 
															 		}
														
 
															+		/*
														
 
															+		 * rename doesn't use btrfs_join_transaction, so, once we
														
 
															+		 * set the transaction to blocked above, we aren't going
														
 
															+		 * to get any new ordered operations.  We can safely run
														
 
															+		 * it here and know for sure that nothing new will be added
														
 
															+		 * to the list
														
 
															+		 */
														
 
															+		btrfs_run_ordered_operations(root, 1);
														
 
															+
														
 
															 		smp_mb();
														
 
															 		if (cur_trans->num_writers > 1 || should_grow)
														
 
															 			schedule_timeout(timeout);