@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
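For readers following the logic rather than the diff, here is a minimal userspace sketch of the two decisions the patch adds. It is not kernel code: classify_writeback_page() and writeback_threshold() are hypothetical helpers invented for illustration, the kernel predicates (current_is_kswapd(), PageReclaim(), zone_is_reclaim_writeback(), global_reclaim(), the __GFP_IO test) are reduced to plain booleans, and only DEF_PRIORITY == 12 is taken from the kernel itself.

#include <assert.h>
#include <stdbool.h>

enum wb_action {
	WB_STALL_THEN_KEEP,	/* Case 1: congestion_wait(HZ/10), clear ZONE_WRITEBACK, goto keep */
	WB_MARK_AND_SKIP,	/* Case 2: SetPageReclaim(), nr_writeback++, goto keep_locked */
	WB_WAIT_FOR_IO,		/* Case 3: wait_on_page_writeback() */
};

/* Mirrors the if/else-if/else chain added to shrink_page_list(). */
static enum wb_action classify_writeback_page(bool is_kswapd,
					      bool page_reclaim,
					      bool zone_writeback,
					      bool is_global_reclaim,
					      bool may_enter_io)
{
	/* Case 1: kswapd sees a PageReclaim page cycling through a
	 * zone already flagged ZONE_WRITEBACK. */
	if (is_kswapd && page_reclaim && zone_writeback)
		return WB_STALL_THEN_KEEP;
	/* Case 2: global reclaim, or a page not yet marked for
	 * immediate reclaim, or a caller without __GFP_IO. */
	if (is_global_reclaim || !page_reclaim || !may_enter_io)
		return WB_MARK_AND_SKIP;
	/* Case 3: memcg reclaim, PageReclaim set, __GFP_IO allowed. */
	return WB_WAIT_FOR_IO;
}

#define DEF_PRIORITY 12

/*
 * The shrink_inactive_list() hunk sets ZONE_WRITEBACK once nr_writeback
 * crosses this threshold: at DEF_PRIORITY every isolated page must be
 * under writeback, and each priority drop halves the bar, so a
 * harder-pressed reclaim pass flags the zone sooner.
 */
static unsigned long writeback_threshold(unsigned long nr_taken, int priority)
{
	return nr_taken >> (DEF_PRIORITY - priority);
}

int main(void)
{
	assert(classify_writeback_page(true, true, true, true, true) ==
	       WB_STALL_THEN_KEEP);
	assert(classify_writeback_page(false, false, false, false, true) ==
	       WB_MARK_AND_SKIP);
	assert(classify_writeback_page(false, true, false, false, true) ==
	       WB_WAIT_FOR_IO);
	assert(writeback_threshold(32, DEF_PRIORITY) == 32);
	assert(writeback_threshold(32, DEF_PRIORITY - 2) == 8);
	return 0;
}

Note also that kswapd_shrink_zone() now clears ZONE_WRITEBACK on every pass, so the flag only reflects the most recent scan of the zone and a Case 1 stall is re-validated each time rather than persisting indefinitely.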