@@ -47,6 +47,20 @@
 /* set when additional merges to this rbio are not allowed */
 #define RBIO_RMW_LOCKED_BIT	1
 
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
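+/* bound on the number of rbios we keep in the stripe cache LRU */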
+#define RBIO_CACHE_SIZE 1024
+
 struct btrfs_raid_bio {
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_bio *bbio;
@@ -65,6 +79,11 @@ struct btrfs_raid_bio {
 	 */
 	struct list_head hash_list;
 
+	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
 	/*
 	 * for scheduling work in the helper threads
 	 */
@@ -176,7 +195,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	if (!table)
 		return -ENOMEM;
 
-	table->table = (void *)(table + 1);
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
 	h = table->table;
 
 	for (i = 0; i < num_entries; i++) {
@@ -192,6 +213,43 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 	return 0;
 }
 
+/*
+ * caching an rbio means copying anything from the
+ * bio_pages array into the stripe_pages array.  We
+ * use the page uptodate bit in the stripe_pages array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
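+	/* if we can't get the pages, we just skip caching this rbio */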
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
 /*
  * we hash on the first logical address of the stripe
  */
@@ -210,6 +267,34 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 }
 
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
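+	/* src and dest describe the same stripe, so the page arrays line up */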
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s))
+			continue;
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
 /*
  * merging means we take the bio_list from the victim and
  * splice it into the destination.  The victim should
@@ -226,16 +311,171 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
 }
 
 /*
- * free the hash table used by unmount
+ * used to prune items that are in the cache.  The caller
+ * must hold the table's cache_lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again now that we hold the cache_lock
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO.  We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
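+	/* each cached rbio has RBIO_CACHE_BIT set, so each pass removes the head */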
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
  */
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 {
 	if (!info->stripe_hash_table)
 		return;
+	btrfs_clear_rbio_cache(info);
 	kfree(info->stripe_hash_table);
 	info->stripe_hash_table = NULL;
 }
 
+/*
+ * insert an rbio into the stripe cache.  It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
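+	/* RBIO_CACHE_BIT is also checked under the bio_list_lock, take both */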
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)) {
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				   struct btrfs_raid_bio,
+				   stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
 /*
  * helper function to run the xor_blocks api.  It is only
  * able to do MAX_XOR_BLOCKS at a time, so we need to
@@ -303,6 +542,17 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 		return 0;
 
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge, the destination
+	 * rbio is going to run our IO for us.  We can
+	 * steal from cached rbios, though; other
+	 * functions handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
 	if (last->raid_map[0] !=
 	    cur->raid_map[0])
 		return 0;
@@ -370,6 +620,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	unsigned long flags;
 	DEFINE_WAIT(wait);
 	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
 	int walk = 0;
 
@@ -379,6 +630,21 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 		if (cur->raid_map[0] == rbio->raid_map[0]) {
 			spin_lock(&cur->bio_list_lock);
 
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
 			/* can we merge into the lock owner? */
 			if (rbio_can_merge(cur, rbio)) {
 				merge_rbio(cur, rbio);
@@ -388,6 +654,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 				goto out;
 			}
 
+
 			/*
 			 * we couldn't merge with the running
 			 * rbio, see if we can merge with the
@@ -417,11 +684,13 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 			goto out;
 		}
 	}
-
+lockit:
 	atomic_inc(&rbio->refs);
 	list_add(&rbio->hash_list, &h->hash_list);
 out:
 	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
 	if (freeit)
 		__free_raid_bio(freeit);
 	return ret;
@@ -436,14 +705,31 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	int bucket;
 	struct btrfs_stripe_hash *h;
 	unsigned long flags;
+	int keep_cache = 0;
 
 	bucket = rbio_bucket(rbio);
 	h = rbio->fs_info->stripe_hash_table->table + bucket;
 
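+	/* nobody is queued behind us, so try to cache our pages for later */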
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
 	spin_lock_irqsave(&h->lock, flags);
 	spin_lock(&rbio->bio_list_lock);
 
 	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
 
 		list_del_init(&rbio->hash_list);
 		atomic_dec(&rbio->refs);
@@ -469,11 +754,12 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 
 			if (next->read_rebuild)
 				async_read_rebuild(next);
-			else
+			else {
+				steal_rbio(rbio, next);
 				async_rmw_stripe(next);
+			}
 
 			goto done_nolock;
-
 		} else if (waitqueue_active(&h->wait)) {
 			spin_unlock(&rbio->bio_list_lock);
 			spin_unlock_irqrestore(&h->lock, flags);
@@ -481,11 +767,13 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			goto done_nolock;
 		}
 	}
+done:
 	spin_unlock(&rbio->bio_list_lock);
 	spin_unlock_irqrestore(&h->lock, flags);
 
 done_nolock:
-	return;
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
 }
 
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
@@ -496,6 +784,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 	if (!atomic_dec_and_test(&rbio->refs))
 		return;
 
+	WARN_ON(!list_empty(&rbio->stripe_cache));
 	WARN_ON(!list_empty(&rbio->hash_list));
 	WARN_ON(!bio_list_empty(&rbio->bio_list));
 
@@ -630,6 +919,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 	bio_list_init(&rbio->bio_list);
 	INIT_LIST_HEAD(&rbio->plug_list);
 	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
 	INIT_LIST_HEAD(&rbio->hash_list);
 	rbio->bbio = bbio;
 	rbio->raid_map = raid_map;
@@ -864,8 +1154,17 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	/*
 	 * now that we've set rmw_locked, run through the
 	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon.  If they do use it again,
+	 * hopefully they will send another full bio.
 	 */
 	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
 	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
 		struct page *p;
@@ -1155,6 +1454,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 				continue;
 
 			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the stripe cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
 			ret = rbio_add_io_page(rbio, &bio_list, page,
 				       stripe, pagenr, rbio->stripe_len);
 			if (ret)
@@ -1440,6 +1746,11 @@ cleanup:
 cleanup_io:
 
 	if (rbio->read_rebuild) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
 		rbio_orig_end_io(rbio, err, err == 0);
 	} else if (err == 0) {
 		rbio->faila = -1;
@@ -1505,7 +1816,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	atomic_set(&rbio->bbio->error, 0);
 
 	/*
-	 * read everything that hasn't failed.
+	 * read everything that hasn't failed.  Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
 	 */
 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
 		if (rbio->faila == stripe ||