@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
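For readers following the logic rather than the diff, here is a minimal userspace sketch of the two decisions the patch adds. It is not kernel code: classify_writeback_page() and writeback_threshold() are hypothetical helpers invented for illustration, the kernel predicates (current_is_kswapd(), PageReclaim(), zone_is_reclaim_writeback(), global_reclaim(), the __GFP_IO test) are reduced to plain booleans, and only DEF_PRIORITY == 12 is taken from the kernel itself.

#include <assert.h>
#include <stdbool.h>

enum wb_action {
	WB_STALL_THEN_KEEP,	/* Case 1: congestion_wait(HZ/10), clear ZONE_WRITEBACK, goto keep */
	WB_MARK_AND_SKIP,	/* Case 2: SetPageReclaim(), nr_writeback++, goto keep_locked */
	WB_WAIT_FOR_IO,		/* Case 3: wait_on_page_writeback() */
};

/* Mirrors the if/else-if/else chain added to shrink_page_list(). */
static enum wb_action classify_writeback_page(bool is_kswapd,
					      bool page_reclaim,
					      bool zone_writeback,
					      bool is_global_reclaim,
					      bool may_enter_io)
{
	/* Case 1: kswapd sees a PageReclaim page cycling through a
	 * zone already flagged ZONE_WRITEBACK. */
	if (is_kswapd && page_reclaim && zone_writeback)
		return WB_STALL_THEN_KEEP;
	/* Case 2: global reclaim, or a page not yet marked for
	 * immediate reclaim, or a caller without __GFP_IO. */
	if (is_global_reclaim || !page_reclaim || !may_enter_io)
		return WB_MARK_AND_SKIP;
	/* Case 3: memcg reclaim, PageReclaim set, __GFP_IO allowed. */
	return WB_WAIT_FOR_IO;
}

#define DEF_PRIORITY 12

/*
 * The shrink_inactive_list() hunk sets ZONE_WRITEBACK once nr_writeback
 * crosses this threshold: at DEF_PRIORITY every isolated page must be
 * under writeback, and each priority drop halves the bar, so a
 * harder-pressed reclaim pass flags the zone sooner.
 */
static unsigned long writeback_threshold(unsigned long nr_taken, int priority)
{
	return nr_taken >> (DEF_PRIORITY - priority);
}

int main(void)
{
	assert(classify_writeback_page(true, true, true, true, true) ==
	       WB_STALL_THEN_KEEP);
	assert(classify_writeback_page(false, false, false, false, true) ==
	       WB_MARK_AND_SKIP);
	assert(classify_writeback_page(false, true, false, false, true) ==
	       WB_WAIT_FOR_IO);
	assert(writeback_threshold(32, DEF_PRIORITY) == 32);
	assert(writeback_threshold(32, DEF_PRIORITY - 2) == 8);
	return 0;
}

Note also that kswapd_shrink_zone() now clears ZONE_WRITEBACK on every pass, so the flag only reflects the most recent scan of the zone and a Case 1 stall is re-validated each time rather than persisting indefinitely.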