
md: resolve external metadata handling deadlock in md_allow_write

md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete.  For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.
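
Illustratively, the pre-patch sequence is roughly the following (a condensed
sketch of the two code paths, not verbatim kernel source):

	/* Kernel side: runs with mddev->lock (mddev_lock) held. */
	md_allow_write(mddev);
		/* marks the array dirty ... */
		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		md_update_sb(mddev, 0);
		sysfs_notify(&mddev->kobj, NULL, "array_state");
		/* ... then sleeps until the dirty state is recorded */
		wait_event(mddev->sb_wait, ...);

	/* Userspace (mdmon) side: to record the update it writes 'active'
	 * to the 'array_state' sysfs attribute, whose handler must take
	 * mddev->lock -- still held by the sleeper above.  Deadlock. */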

Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN.  The expected side effects while waiting
for userspace to write 'active' to 'array_state' are: holding off reshape
(the code currently handles -ENOMEM), causing some 'stripe_cache_size' change
requests to fail, causing some GET_BITMAP_FILE ioctl requests to fall back to
GFP_NOIO, and causing updates to 'raid_disks' to fail.  Except for
'stripe_cache_size' changes, these failures can be mitigated by coordinating
with mdmon.
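
The caller-side contract this creates (condensed from the hunks below) is to
either propagate the error so the operation can be retried once mdmon has
marked the array active, or, where failing is not an option, fall back to an
allocation mode that cannot recurse into writeback:

	err = md_allow_write(mddev);	/* -EAGAIN for ->external metadata */
	if (err)
		return err;	/* retried after mdmon writes 'active' */

	/* GET_BITMAP_FILE-style fallback instead of failing: */
	if (md_allow_write(mddev))
		file = kmalloc(sizeof(*file), GFP_NOIO);
	else
		file = kmalloc(sizeof(*file), GFP_KERNEL);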

md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action, as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.
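
Condensed sketch of that gate (not the verbatim wait condition):

	/* In md_write_start(): writes are held until the metadata handler
	 * clears the dirty flag, for internal and external metadata alike. */
	wait_event(mddev->sb_wait,
		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags));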

[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Dan Williams, 17 years ago
Commit: b5470dc5fc
4 changed files, with 30 insertions and 17 deletions:

  1. drivers/md/md.c          (+16, -11)
  2. drivers/md/raid1.c       (+4, -2)
  3. drivers/md/raid5.c       (+9, -3)
  4. include/linux/raid/md.h  (+1, -1)

drivers/md/md.c (+16, -11):

@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
  */
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
 {
 	if (!mddev->pers)
-		return;
+		return 0;
 	if (mddev->ro)
-		return;
+		return 0;
 	if (!mddev->pers->sync_request)
-		return;
+		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-
 		sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
+
+	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		return -EAGAIN;
+	else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 

drivers/md/raid1.c (+4, -2):

@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 

drivers/md/raid5.c (+9, -3):

@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err = 0;
+	int err;
 	struct kmem_cache *sc;
 	int i;
 
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
-	md_allow_write(conf->mddev);
+	err = md_allow_write(conf->mddev);
+	if (err)
+		return err;
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long new;
+	int err;
+
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;

include/linux/raid/md.h (+1, -1):

@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
-extern void md_allow_write(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 
 #endif /* CONFIG_MD */