@@ -0,0 +1,2080 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT	1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
+
+#define RBIO_CACHE_SIZE 1024
+
+struct btrfs_raid_bio {
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_bio *bbio;
+
+	/*
+	 * logical block numbers for the start of each stripe
+	 * The last one or two are p/q.  These are sorted,
+	 * so raid_map[0] is the start of our full stripe
+	 */
+	u64 *raid_map;
+
+	/* while we're doing rmw on a stripe
+	 * we put it into a hash table so we can
+	 * lock the stripe and merge more rbios
+	 * into it.
+	 */
+	struct list_head hash_list;
+
+	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
+	/*
+	 * for scheduling work in the helper threads
+	 */
+	struct btrfs_work work;
+
+	/*
+	 * bio list and bio_list_lock are used
+	 * to add more bios into the stripe
+	 * in hopes of avoiding the full rmw
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/* also protected by the bio_list_lock, the
+	 * plug list is used by the plugging code
+	 * to collect partial bios while plugged.  The
+	 * stripe locking code also uses it to hand off
+	 * the stripe lock to the next pending IO
+	 */
+	struct list_head plug_list;
+
+	/*
+	 * flags that tell us if it is safe to
+	 * merge with this bio
+	 */
+	unsigned long flags;
+
+	/* size of each individual stripe on disk */
+	int stripe_len;
+
+	/* number of data stripes (no p/q) */
+	int nr_data;
+
+	/*
+	 * set if we're doing a parity rebuild
+	 * for a read from higher up, which is handled
+	 * differently from a parity rebuild as part of
+	 * rmw
+	 */
+	int read_rebuild;
+
+	/* first bad stripe */
+	int faila;
+
+	/* second bad stripe (for raid6 use) */
+	int failb;
+
+	/*
+	 * number of pages needed to represent the full
+	 * stripe
+	 */
+	int nr_pages;
+
+	/*
+	 * size of all the bios in the bio_list.  This
+	 * helps us decide if the rbio maps to a full
+	 * stripe or not
+	 */
+	int bio_list_bytes;
+
+	atomic_t refs;
+
+	/*
+	 * these are two arrays of pointers.  We allocate the
+	 * rbio big enough to hold them both and setup their
+	 * locations when the rbio is allocated
+	 */
+
+	/* pointers to pages that we allocated for
+	 * reading/writing stripes directly from the disk (including P/Q)
+	 */
+	struct page **stripe_pages;
+
+	/*
+	 * pointers to the pages in the bio_list.  Stored
+	 * here for faster lookup
+	 */
+	struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash_table *x;
+	struct btrfs_stripe_hash *cur;
+	struct btrfs_stripe_hash *h;
+	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+	int i;
+
+	if (info->stripe_hash_table)
+		return 0;
+
+	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
+	if (!table)
+		return -ENOMEM;
+
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
+	h = table->table;
+
+	for (i = 0; i < num_entries; i++) {
+		cur = h + i;
+		INIT_LIST_HEAD(&cur->hash_list);
+		spin_lock_init(&cur->lock);
+		init_waitqueue_head(&cur->wait);
+	}
+
+	x = cmpxchg(&info->stripe_hash_table, NULL, table);
+	if (x)
+		kfree(x);
+	return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array.  We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+	u64 num = rbio->raid_map[0];
+
+	/*
+	 * we shift down quite a bit.  We're using byte
+	 * addressing, and most of the lower bits are zeros.
+	 * This tends to upset hash_64, and it consistently
+	 * returns just one or two different values.
+	 *
+	 * shifting off the lower bits fixes things.
+	 */
+	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s)) {
+			continue;
+		}
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination.  The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+		       struct btrfs_raid_bio *victim)
+{
+	bio_list_merge(&dest->bio_list, &victim->bio_list);
+	dest->bio_list_bytes += victim->bio_list_bytes;
+	bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache.  The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again under the hash table lock.
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO.  We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	if (!info->stripe_hash_table)
+		return;
+	btrfs_clear_rbio_cache(info);
+	kfree(info->stripe_hash_table);
+	info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache.  It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)) {
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				   struct btrfs_raid_bio,
+				   stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+	return;
+}
+
+/*
+ * helper function to run the xor_blocks api.  It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+	int src_off = 0;
+	int xor_src_cnt = 0;
+	void *dest = pages[src_cnt];
+
+	while (src_cnt > 0) {
+		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+		src_cnt -= xor_src_cnt;
+		src_off += xor_src_cnt;
+	}
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long size = rbio->bio_list_bytes;
+	int ret = 1;
+
+	if (size != rbio->nr_data * rbio->stripe_len)
+		ret = 0;
+
+	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+	return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+	ret = __rbio_is_full(rbio);
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+	return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+			  struct btrfs_raid_bio *cur)
+{
+	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+		return 0;
+
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge the destination
+	 * rbio is going to run our IO for us.  We can
+	 * steal from cached rbio's though, other functions
+	 * handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
+	if (last->raid_map[0] !=
+	    cur->raid_map[0])
+		return 0;
+
+	/* reads can't merge with writes */
+	if (last->read_rebuild !=
+	    cur->read_rebuild) {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+		return NULL;
+
+	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+		PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock.  rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet.  The rbio is given
+ * the lock and 0 is returned.  The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner.  The rbio is freed and the IO will
+ * start automatically along with the existing rbio.  1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list.  When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission.  If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *pending;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
+	int ret = 0;
+	int walk = 0;
+
+	spin_lock_irqsave(&h->lock, flags);
+	list_for_each_entry(cur, &h->hash_list, hash_list) {
+		walk++;
+		if (cur->raid_map[0] == rbio->raid_map[0]) {
+			spin_lock(&cur->bio_list_lock);
+
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
+			/* can we merge into the lock owner? */
+			if (rbio_can_merge(cur, rbio)) {
+				merge_rbio(cur, rbio);
+				spin_unlock(&cur->bio_list_lock);
+				freeit = rbio;
+				ret = 1;
+				goto out;
+			}
+
+
+			/*
+			 * we couldn't merge with the running
+			 * rbio, see if we can merge with the
+			 * pending ones.  We don't have to
+			 * check for rmw_locked because there
+			 * is no way they are inside finish_rmw
+			 * right now
+			 */
+			list_for_each_entry(pending, &cur->plug_list,
+					    plug_list) {
+				if (rbio_can_merge(pending, rbio)) {
+					merge_rbio(pending, rbio);
+					spin_unlock(&cur->bio_list_lock);
+					freeit = rbio;
+					ret = 1;
+					goto out;
+				}
+			}
+
+			/* no merging, put us on the tail of the plug list,
+			 * our rbio will be started when the currently
+			 * running rbio unlocks
+			 */
+			list_add_tail(&rbio->plug_list, &cur->plug_list);
+			spin_unlock(&cur->bio_list_lock);
+			ret = 1;
+			goto out;
+		}
+	}
+lockit:
+	atomic_inc(&rbio->refs);
+	list_add(&rbio->hash_list, &h->hash_list);
+out:
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
+	if (freeit)
+		__free_raid_bio(freeit);
+	return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed.  If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bucket;
+	struct btrfs_stripe_hash *h;
+	unsigned long flags;
+	int keep_cache = 0;
+
+	bucket = rbio_bucket(rbio);
+	h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
+	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
+
+		list_del_init(&rbio->hash_list);
+		atomic_dec(&rbio->refs);
+
+		/*
+		 * we use the plug list to hold all the rbios
+		 * waiting for the chance to lock this stripe.
+		 * hand the lock over to one of them.
+		 */
+		if (!list_empty(&rbio->plug_list)) {
+			struct btrfs_raid_bio *next;
+			struct list_head *head = rbio->plug_list.next;
+
+			next = list_entry(head, struct btrfs_raid_bio,
+					  plug_list);
+
+			list_del_init(&rbio->plug_list);
+
+			list_add(&next->hash_list, &h->hash_list);
+			atomic_inc(&next->refs);
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+
+			if (next->read_rebuild)
+				async_read_rebuild(next);
+			else {
+				steal_rbio(rbio, next);
+				async_rmw_stripe(next);
+			}
+
+			goto done_nolock;
+		} else if (waitqueue_active(&h->wait)) {
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+			wake_up(&h->wait);
+			goto done_nolock;
+		}
+	}
+done:
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	int i;
+
+	WARN_ON(atomic_read(&rbio->refs) < 0);
+	if (!atomic_dec_and_test(&rbio->refs))
+		return;
+
+	WARN_ON(!list_empty(&rbio->stripe_cache));
+	WARN_ON(!list_empty(&rbio->hash_list));
+	WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i]) {
+			__free_page(rbio->stripe_pages[i]);
+			rbio->stripe_pages[i] = NULL;
+		}
+	}
+	kfree(rbio->raid_map);
+	kfree(rbio->bbio);
+	kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	unlock_stripe(rbio);
+	__free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+	struct bio *cur = bio_list_get(&rbio->bio_list);
+	struct bio *next;
+	free_raid_bio(rbio);
+
+	while (cur) {
+		next = cur->bi_next;
+		cur->bi_next = NULL;
+		if (uptodate)
+			set_bit(BIO_UPTODATE, &cur->bi_flags);
+		bio_endio(cur, err);
+		cur = next;
+	}
+}
+
+/*
+ * end io function used by finish_rmw.  When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	/* OK, we have written all the stripes we need to. */
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+	return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else.  This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+				 int index, int pagenr, int bio_list_only)
+{
+	int chunk_page;
+	struct page *p = NULL;
+
+	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	p = rbio->bio_pages[chunk_page];
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	if (p || bio_list_only)
+		return p;
+
+	return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+	unsigned long nr = stripe_len * nr_stripes;
+	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio.  Note that
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	int nr_data = 0;
+	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+	void *p;
+
+	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+			GFP_NOFS);
+	if (!rbio) {
+		kfree(raid_map);
+		kfree(bbio);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	bio_list_init(&rbio->bio_list);
+	INIT_LIST_HEAD(&rbio->plug_list);
+	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
+	INIT_LIST_HEAD(&rbio->hash_list);
+	rbio->bbio = bbio;
+	rbio->raid_map = raid_map;
+	rbio->fs_info = root->fs_info;
+	rbio->stripe_len = stripe_len;
+	rbio->nr_pages = num_pages;
+	rbio->faila = -1;
+	rbio->failb = -1;
+	atomic_set(&rbio->refs, 1);
+
+	/*
+	 * the stripe_pages and bio_pages array point to the extra
+	 * memory we allocated past the end of the rbio
+	 */
+	p = rbio + 1;
+	rbio->stripe_pages = p;
+	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+		nr_data = bbio->num_stripes - 2;
+	else
+		nr_data = bbio->num_stripes - 1;
+
+	rbio->nr_data = nr_data;
+	return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+		ClearPageUptodate(page);
+	}
+	return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+	for (; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+	}
+	return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+		     struct bio_list *bio_list,
+		     struct page *page,
+		     int stripe_nr,
+		     unsigned long page_index,
+		     unsigned long bio_max_len)
+{
+	struct bio *last = bio_list->tail;
+	u64 last_end = 0;
+	int ret;
+	struct bio *bio;
+	struct btrfs_bio_stripe *stripe;
+	u64 disk_start;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+	/* if the device is missing, just fail this stripe */
+	if (!stripe->dev->bdev)
+		return fail_rbio_index(rbio, stripe_nr);
+
+	/* see if we can add this page onto our existing bio */
+	if (last) {
+		last_end = (u64)last->bi_sector << 9;
+		last_end += last->bi_size;
+
+		/*
+		 * we can't merge these if they are from different
+		 * devices or if they are not contiguous
+		 */
+		if (last_end == disk_start && stripe->dev->bdev &&
+		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
+		    last->bi_bdev == stripe->dev->bdev) {
+			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+			if (ret == PAGE_CACHE_SIZE)
+				return 0;
+		}
+	}
+
+	/* put a new bio on the list */
+	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_size = 0;
+	bio->bi_bdev = stripe->dev->bdev;
+	bio->bi_sector = disk_start >> 9;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	bio_list_add(bio_list, bio);
+	return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk.  This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction.  The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+	if (rbio->faila >= 0 || rbio->failb >= 0) {
+		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+		__raid56_parity_recover(rbio);
+	} else {
+		finish_rmw(rbio);
+	}
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+	int index;
+	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+	index += page;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result.  This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	struct bio *bio;
+	u64 start;
+	unsigned long stripe_offset;
+	unsigned long page_index;
+	struct page *p;
+	int i;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	bio_list_for_each(bio, &rbio->bio_list) {
+		start = (u64)bio->bi_sector << 9;
+		stripe_offset = start - rbio->raid_map[0];
+		page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			p = bio->bi_io_vec[i].bv_page;
+			rbio->bio_pages[page_index + i] = p;
+		}
+	}
+	spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations.  We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[bbio->num_stripes];
+	int stripe_len = rbio->stripe_len;
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (bbio->num_stripes - rbio->nr_data == 1) {
+		p_stripe = bbio->num_stripes - 1;
+	} else if (bbio->num_stripes - rbio->nr_data == 2) {
+		p_stripe = bbio->num_stripes - 2;
+		q_stripe = bbio->num_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	/* at this point we either have a full stripe,
+	 * or we've read the full stripe from the drive.
+	 * recalculate the parity and write the new results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	spin_lock_irq(&rbio->bio_list_lock);
+	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * now that we've set rmw_locked, run through the
+	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon.  If they do use it again,
+	 * hopefully they will send another full bio.
+	 */
+	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		struct page *p;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		p = rbio_pstripe_page(rbio, pagenr);
+		SetPageUptodate(p);
+		pointers[stripe++] = kmap(p);
+
+		if (q_stripe != -1) {
+
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			p = rbio_qstripe_page(rbio, pagenr);
+			SetPageUptodate(p);
+			pointers[stripe++] = kmap(p);
+
+			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+
+		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	/*
+	 * time to start writing.  Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
+	 * everything else.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       page, stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio.  Used to figure out which
+ * stripe has failed.  This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	u64 physical = bio->bi_sector;
+	u64 stripe_start;
+	int i;
+	struct btrfs_bio_stripe *stripe;
+
+	physical <<= 9;
+
+	for (i = 0; i < rbio->bbio->num_stripes; i++) {
+		stripe = &rbio->bbio->stripes[i];
+		stripe_start = stripe->physical;
+		if (physical >= stripe_start &&
+		    physical < stripe_start + rbio->stripe_len) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping).  Used to figure out which stripe has
+ * failed.  This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+				   struct bio *bio)
+{
+	u64 logical = bio->bi_sector;
+	u64 stripe_start;
+	int i;
+
+	logical <<= 9;
+
+	for (i = 0; i < rbio->nr_data; i++) {
+		stripe_start = rbio->raid_map[i];
+		if (logical >= stripe_start &&
+		    logical < stripe_start + rbio->stripe_len) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+	/* we already know this stripe is bad, move on */
+	if (rbio->faila == failed || rbio->failb == failed)
+		goto out;
+
+	if (rbio->faila == -1) {
+		/* first failure on this rbio */
+		rbio->faila = failed;
+		atomic_inc(&rbio->bbio->error);
+	} else if (rbio->failb == -1) {
+		/* second failure on this rbio */
+		rbio->failb = failed;
+		atomic_inc(&rbio->bbio->error);
+	} else {
+		ret = -EIO;
+	}
+out:
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+	return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	int failed = find_bio_stripe(rbio, bio);
+
+	if (failed < 0)
+		return -EIO;
+
+	return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate.  It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+	int i;
+	struct page *p;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		p = bio->bi_io_vec[i].bv_page;
+		SetPageUptodate(p);
+	}
+}
+
+/*
+ * end io for the read phase of the rmw cycle.  All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	err = 0;
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		goto cleanup;
+
+	/*
+	 * this will normally call finish_rmw to start our write
+	 * but if there are any failed stripes we'll reconstruct
+	 * from parity first
+	 */
+	validate_rbio_for_rmw(rbio);
+	return;
+
+cleanup:
+
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	rbio->work.flags = 0;
+	rbio->work.func = rmw_work;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+			   &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+	rbio->work.flags = 0;
+	rbio->work.func = read_rebuild_work;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+			   &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller.  It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	index_rbio_pages(rbio);
+
+	atomic_set(&rbio->bbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk.  If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_rmw_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return 0;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+
+finish:
+	validate_rbio_for_rmw(rbio);
+	return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = alloc_rbio_parity_pages(rbio);
+	if (ret)
+		return ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		finish_rmw(rbio);
+	return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		async_rmw_stripe(rbio);
+	return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come in to create
+ * a full stripe.  So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+	/* head off into rmw land if we don't have a full stripe */
+	if (!rbio_is_full(rbio))
+		return partial_stripe_write(rbio);
+	return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list.  When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+	struct blk_plug_cb cb;
+	struct btrfs_fs_info *info;
+	struct list_head rbio_list;
+	struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+						 plug_list);
+	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+						 plug_list);
+	u64 a_sector = ra->bio_list.head->bi_sector;
+	u64 b_sector = rb->bio_list.head->bi_sector;
+
+	if (a_sector < b_sector)
+		return -1;
+	if (a_sector > b_sector)
+		return 1;
+	return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *last = NULL;
+
+	/*
+	 * sort our plug list then try to merge
+	 * everything we can in hopes of creating full
+	 * stripes.
+	 */
+	list_sort(NULL, &plug->rbio_list, plug_cmp);
+	while (!list_empty(&plug->rbio_list)) {
+		cur = list_entry(plug->rbio_list.next,
+				 struct btrfs_raid_bio, plug_list);
+		list_del_init(&cur->plug_list);
+
+		if (rbio_is_full(cur)) {
+			/* we have a full stripe, send it down */
+			full_stripe_write(cur);
+			continue;
+		}
+		if (last) {
+			if (rbio_can_merge(last, cur)) {
+				merge_rbio(last, cur);
+				__free_raid_bio(cur);
+				continue;
+
+			}
+			__raid56_parity_write(last);
+		}
+		last = cur;
+	}
+	if (last) {
+		__raid56_parity_write(last);
+	}
+	kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(work, struct btrfs_plug_cb, work);
+	run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+	if (from_schedule) {
+		plug->work.flags = 0;
+		plug->work.func = unplug_work;
+		btrfs_queue_worker(&plug->info->rmw_workers,
+				   &plug->work);
+		return;
+	}
+	run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			struct btrfs_bio *bbio, u64 *raid_map,
+			u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	struct btrfs_plug_cb *plug = NULL;
+	struct blk_plug_cb *cb;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		kfree(raid_map);
+		kfree(bbio);
+		return PTR_ERR(rbio);
+	}
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	/*
+	 * don't plug on full rbios, just get them out the door
+	 * as quickly as we can
+	 */
+	if (rbio_is_full(rbio))
+		return full_stripe_write(rbio);
+
+	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+			       sizeof(*plug));
+	if (cb) {
+		plug = container_of(cb, struct btrfs_plug_cb, cb);
+		if (!plug->info) {
+			plug->info = root->fs_info;
+			INIT_LIST_HEAD(&plug->rbio_list);
+		}
+		list_add_tail(&rbio->plug_list, &plug->rbio_list);
+	} else {
+		return __raid56_parity_write(rbio);
+	}
+	return 0;
+}
+
+/*
|
|
|
|
+ * all parity reconstruction happens here. We've read in everything
|
|
|
|
+ * we can find from the drives and this does the heavy lifting of
|
|
|
|
+ * sorting the good from the bad.
|
|
|
|
+ */
|
|
|
|
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
|
|
|
|
+{
|
|
|
|
+ int pagenr, stripe;
|
|
|
|
+ void **pointers;
|
|
|
|
+ int faila = -1, failb = -1;
|
|
|
|
+ int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
|
|
|
+ struct page *page;
|
|
|
|
+ int err;
|
|
|
|
+ int i;
|
|
|
|
+
|
|
|
|
+ pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
|
|
|
|
+ GFP_NOFS);
|
|
|
|
+ if (!pointers) {
|
|
|
|
+ err = -ENOMEM;
|
|
|
|
+ goto cleanup_io;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ faila = rbio->faila;
|
|
|
|
+ failb = rbio->failb;
|
|
|
|
+
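+	/*
+	 * for a read rebuild, no more bios may be merged into this rbio;
+	 * mark it rmw locked before we start rebuilding from the pages we
+	 * already have.
+	 */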
+	if (rbio->read_rebuild) {
+		spin_lock_irq(&rbio->bio_list_lock);
+		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+		spin_unlock_irq(&rbio->bio_list_lock);
+	}
+
+	index_rbio_pages(rbio);
+
+	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		/* setup our array of pointers with pages
+		 * from each stripe
+		 */
+		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->read_rebuild &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			pointers[stripe] = kmap(page);
+		}
+
+		/* all raid6 handling here */
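+		/*
+		 * the last entry of the sorted raid_map is the Q stripe only
+		 * for raid6 layouts, so this check separates raid6 from
+		 * plain raid5.
+		 */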
+		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+		    RAID6_Q_STRIPE) {
+
+			/*
+			 * single failure, rebuild from parity raid5
+			 * style
+			 */
+			if (failb < 0) {
+				if (faila == rbio->nr_data) {
+					/*
+					 * Just the P stripe has failed, without
+					 * a bad data or Q stripe.
+					 * TODO, we should redo the xor here.
+					 */
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * a single failure in raid6 is rebuilt
+				 * in the pstripe code below
+				 */
+				goto pstripe;
+			}
+
+			/* make sure our ps and qs are in order */
+			if (faila > failb) {
+				int tmp = failb;
+				failb = faila;
+				faila = tmp;
+			}
+
+			/* if the q stripe is failed, do a pstripe reconstruction
+			 * from the xors.
+			 * If both the q stripe and the P stripe are failed, we're
+			 * here due to a crc mismatch and we can't give them the
+			 * data they want
+			 */
+			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
+				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * otherwise we have one bad data stripe and
+				 * a good P stripe. raid5!
+				 */
+				goto pstripe;
+			}
+
+			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+				raid6_datap_recov(rbio->bbio->num_stripes,
+						  PAGE_SIZE, faila, pointers);
+			} else {
+				raid6_2data_recov(rbio->bbio->num_stripes,
+						  PAGE_SIZE, faila, failb,
+						  pointers);
+			}
+		} else {
+			void *p;
+
+			/* rebuild from P stripe here (raid5 or raid6) */
+			BUG_ON(failb != -1);
+pstripe:
+			/* Copy parity block into failed block to start with */
+			memcpy(pointers[faila],
+			       pointers[rbio->nr_data],
+			       PAGE_CACHE_SIZE);
+
+			/* rearrange the pointer array */
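+			/*
+			 * run_xor() treats the last pointer as the xor
+			 * destination, so move the failed slot (now holding
+			 * a copy of the parity) to the end of the data
+			 * pointers before xoring the survivors into it.
+			 */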
+			p = pointers[faila];
+			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+				pointers[stripe] = pointers[stripe + 1];
+			pointers[rbio->nr_data - 1] = p;
+
+			/* xor in the rest */
+			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+		}
+		/* if we're doing this rebuild as part of an rmw, go through
+		 * and set all of our private rbio pages in the
+		 * failed stripes as uptodate. This way finish_rmw will
+		 * know they can be trusted. If this was a read reconstruction,
+		 * other endio functions will fiddle the uptodate bits
+		 */
+		if (!rbio->read_rebuild) {
+			for (i = 0; i < nr_pages; i++) {
+				if (faila != -1) {
+					page = rbio_stripe_page(rbio, faila, i);
+					SetPageUptodate(page);
+				}
+				if (failb != -1) {
+					page = rbio_stripe_page(rbio, failb, i);
+					SetPageUptodate(page);
+				}
+			}
+		}
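+		/*
+		 * unmap everything; the page selection below mirrors the
+		 * kmap loop above so we unmap exactly the pages we mapped.
+		 */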
+		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->read_rebuild &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			kunmap(page);
+		}
+	}
+
+	err = 0;
+cleanup:
+	kfree(pointers);
+
+cleanup_io:
+
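+	/*
+	 * on a successful read rebuild the stripe pages now hold good data,
+	 * so let the stripe cache keep them; on failure make sure nobody
+	 * trusts the cached copies.
+	 */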
+	if (rbio->read_rebuild) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+		rbio_orig_end_io(rbio, err, err == 0);
+	} else if (err == 0) {
+		rbio->faila = -1;
+		rbio->failb = -1;
+		finish_rmw(rbio);
+	} else {
+		rbio_orig_end_io(rbio, err, 0);
+	}
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	/*
+	 * we only read stripe pages off the disk, set them
+	 * up to date if there were no errors
+	 */
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+	bio_put(bio);
+
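+	/*
+	 * the last bio to complete kicks off the actual reconstruction,
+	 * provided we haven't lost more stripes than we can tolerate.
+	 */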
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	else
+		__raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * read everything that hasn't failed. Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		if (rbio->faila == stripe ||
+		    rbio->failb == stripe)
+			continue;
+
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *p;
+
+			/*
+			 * the rmw code may have already read this
+			 * page in
+			 */
+			p = rbio_stripe_page(rbio, stripe, pagenr);
+			if (PageUptodate(p))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       rbio_stripe_page(rbio, stripe, pagenr),
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * we might have no bios to read just because the pages
+		 * were up to date, or we might have no bios to read because
+		 * the devices were gone.
+		 */
+		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+			__raid_recover_end_io(rbio);
+			goto out;
+		} else {
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio. Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_recover_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+out:
+	return 0;
+
+cleanup:
+	if (rbio->read_rebuild)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers. This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len, int mirror_num)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		return PTR_ERR(rbio);
+	}
+
+	rbio->read_rebuild = 1;
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		BUG();
+		kfree(rbio);
+		return -EIO;
+	}
+
+	/*
+	 * reconstruct from the q stripe if they are
+	 * asking for mirror 3
+	 */
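+	/*
+	 * num_stripes - 2 is the P stripe in the sorted raid_map; marking
+	 * it as failed pushes the recovery code down the Q based
+	 * (raid6_datap_recov) path for the bad data stripe.
+	 */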
+	if (mirror_num == 3)
+		rbio->failb = bbio->num_stripes - 2;
+
+	ret = lock_stripe_add(rbio);
+
+	/*
+	 * __raid56_parity_recover will end the bio with
+	 * any errors it hits. We don't want to return
+	 * its error value up the stack because our caller
+	 * will end up calling bio_endio with any nonzero
+	 * return
+	 */
+	if (ret == 0)
+		__raid56_parity_recover(rbio);
+	/*
+	 * our rbio has been added to the list of
+	 * rbios that will be handled after the
+	 * current lock owner is done
+	 */
+	return 0;
+
+}
+
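+/*
+ * work item handlers; these run from the rmw helper threads when the
+ * stripe work has been deferred instead of run inline.
+ */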
+static void rmw_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	__raid56_parity_recover(rbio);
+}