@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-		disable_lumpy_reclaim_mode(sc);
+	if (!may_write_to_queue(mapping->backing_dev_info, sc))
 		return PAGE_KEEP;
-	}
 
 	if (clear_page_dirty_for_io(page)) {
 		int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+				      struct zone *zone,
 				      struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
+			nr_dirty++;
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
+				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem
+	 */
+	if (nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
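
The two counters threaded through shrink_page_list() feed the check above: nr_dirty counts every dirty page the scan sees, and nr_congested counts the subset whose pageout() attempt came back PAGE_KEEP because the backing device refused the write. The stand-alone sketch below (hypothetical struct and helper names, not kernel interfaces) shows the same accounting in isolation; note that a scan which meets no dirty pages at all also satisfies nr_dirty == nr_congested (0 == 0) and therefore tags the zone.

	#include <stdbool.h>
	#include <stddef.h>

	/* Hypothetical stand-ins for PageDirty() and a PAGE_KEEP result from pageout() */
	struct scanned_page {
		bool dirty;
		bool backing_dev_congested;
	};

	/* Returns true when the scan above would set ZONE_CONGESTED on the zone */
	static bool scan_marks_zone_congested(const struct scanned_page *pages, size_t n)
	{
		size_t nr_dirty = 0, nr_congested = 0;

		for (size_t i = 0; i < n; i++) {
			if (!pages[i].dirty)
				continue;
			nr_dirty++;			/* PageDirty(page) */
			if (pages[i].backing_dev_congested)
				nr_congested++;		/* pageout() returned PAGE_KEEP */
		}

		return nr_dirty == nr_congested;
	}
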
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	nr_reclaimed = shrink_page_list(&page_list, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_lumpy_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
 	local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+		    priority < DEF_PRIORITY - 2) {
+			struct zone *preferred_zone;
+
+			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+						NULL, &preferred_zone);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+		}
 	}
 
 out:
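
Here the unconditional congestion_wait(), which sleeps for up to the timeout regardless of the state of the zone, is replaced by wait_iff_congested() against the preferred zone of the allocation. The sketch below paraphrases the decision that helper makes; the real implementation lives in mm/backing-dev.c and sleeps on the per-sync congestion wait queue. zone_is_reclaim_congested() tests the ZONE_CONGESTED flag set in shrink_page_list() above, and nr_bdi_congested[] tracks how many backing devices are currently marked congested (both are mm-internal).

	/*
	 * Rough paraphrase of wait_iff_congested(), not the real
	 * mm/backing-dev.c code: only sleep if at least one BDI is congested
	 * *and* the zone being reclaimed has been flagged ZONE_CONGESTED;
	 * otherwise just yield.
	 */
	static long wait_iff_congested_sketch(struct zone *zone, int sync, long timeout)
	{
		if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
		    !zone_is_reclaim_congested(zone)) {
			/* No relevant congestion: give up the CPU if needed, do not sleep */
			cond_resched();
			return 0;
		}

		/* Zone is congested: fall back to the old behaviour and sleep */
		return congestion_wait(sync, timeout);
	}
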
@@ -2282,6 +2301,15 @@ loop_again:
 				if (!zone_watermark_ok(zone, order,
 					min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
+			} else {
+				/*
+				 * If a zone reaches its high watermark,
+				 * consider it to be no longer congested. It's
+				 * possible there are dirty pages backed by
+				 * congested BDIs but as pressure is relieved,
+				 * spectulatively avoid congestion waits
+				 */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 
 		}
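
Taken together, the ZONE_CONGESTED flag introduced by this patch has a simple lifecycle: shrink_page_list() sets it when every dirty page it met was backed by a congested BDI, wait_iff_congested() tests it before deciding whether to sleep, and balance_pgdat() (the last hunk) clears it once the zone is back above its high watermark. The flag helpers are the usual bit operations on zone->flags; their approximate shape in include/linux/mmzone.h is shown below (a paraphrase for orientation, with zone_is_reclaim_congested() also being added by this series):

	/* Approximate shape of the include/linux/mmzone.h helpers used above */
	static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
	{
		set_bit(flag, &zone->flags);
	}

	static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
	{
		clear_bit(flag, &zone->flags);
	}

	static inline int zone_is_reclaim_congested(const struct zone *zone)
	{
		return test_bit(ZONE_CONGESTED, &zone->flags);
	}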