@@ -1457,47 +1457,171 @@ try_next_zone:
	return page;
}

-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long pages_reclaimed)
{
-	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zoneref *z;
-	struct zone *zone;
-	struct page *page;
-	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
-	int do_retry;
-	int alloc_flags;
-	unsigned long did_some_progress;
-	unsigned long pages_reclaimed = 0;
+	/* Do not loop if specifically requested */
+	if (gfp_mask & __GFP_NORETRY)
+		return 0;

-	lockdep_trace_alloc(gfp_mask);
+	/*
+	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+	 * means __GFP_NOFAIL, but that may not be true in other
+	 * implementations.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return 1;

-	might_sleep_if(wait);
+	/*
+	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+	 * specified, then we retry until we no longer reclaim any pages
+	 * (above), or we've reclaimed an order of pages at least as
+	 * large as the allocation's order. In both cases, if the
+	 * allocation still fails, we stop retrying.
+	 */
+	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+		return 1;

-	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+	/*
+	 * Don't let big-order allocations loop unless the caller
+	 * explicitly requests that.
+	 */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;

-	/* the list of zones suitable for gfp_mask */
-	z = zonelist->_zonerefs;
-	if (unlikely(!z->zone)) {
-		/*
-		 * Happens if we have an empty zonelist as a result of
-		 * GFP_THISNODE being used on a memoryless node
-		 */
+	return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	/* Acquire the OOM killer lock for the zones in zonelist */
+	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+		schedule_timeout_uninterruptible(1);
		return NULL;
	}

-restart:
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	/*
+	 * Go through the zonelist yet one more time, keep very high watermark
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure.
+	 */
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+		order, zonelist, high_zoneidx,
+		ALLOC_WMARK_HIGH|ALLOC_CPUSET);
	if (page)
-		goto got_pg;
+		goto out;
+
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* Exhausted what can be done so it's blamo time */
+	out_of_memory(zonelist, gfp_mask, order);
+
+out:
+	clear_zonelist_oom(zonelist, gfp_mask);
+	return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+
+	cond_resched();
+
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
+
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	p->flags |= PF_MEMALLOC;
+	lockdep_set_current_reclaim_state(gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (order != 0)
+		drain_all_pages();
+
+	if (likely(*did_some_progress))
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
+	return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt())
+		return 1;
+	return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	do {
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+		if (!page && gfp_mask & __GFP_NOFAIL)
+			congestion_wait(WRITE, HZ/50);
+	} while (!page && (gfp_mask & __GFP_NOFAIL));
+
+	return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+						enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct page *page = NULL;
+	int alloc_flags;
+	unsigned long pages_reclaimed = 0;
+	unsigned long did_some_progress;
+	struct task_struct *p = current;

	/*
	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
		goto nopage;

-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+	wake_all_kswapd(order, zonelist, high_zoneidx);

	/*
	 * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
	if (wait)
		alloc_flags |= ALLOC_CPUSET;

+restart:
	/*
	 * Go through the zonelist again. Let __GFP_HIGH and allocations
	 * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
	if (page)
		goto got_pg;

-	/* This allocation should allow future memory freeing. */
-
rebalance:
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
+	/* Allocate without watermarks if the context allows */
+	if (is_allocation_high_priority(p, gfp_mask)) {
+		/* Do not dip into emergency reserves if specified */
		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, nodemask, order,
-				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+			page = __alloc_pages_high_priority(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
			if (page)
				goto got_pg;
-			if (gfp_mask & __GFP_NOFAIL) {
-				congestion_wait(WRITE, HZ/50);
-				goto nofail_alloc;
-			}
		}
+
+		/* Ensure no recursion into the allocator */
		goto nopage;
	}

@@ -1568,93 +1687,42 @@ nofail_alloc:
	if (!wait)
		goto nopage;

-	cond_resched();
-
-	/* We now go into synchronous reclaim */
-	cpuset_memory_pressure_bump();
-
-	p->flags |= PF_MEMALLOC;
-
-	lockdep_set_current_reclaim_state(gfp_mask);
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
-
-	did_some_progress = try_to_free_pages(zonelist, order,
-						gfp_mask, nodemask);
-
-	p->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	/* Try direct reclaim and then allocating */
+	page = __alloc_pages_direct_reclaim(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, &did_some_progress);
+	if (page)
+		goto got_pg;

-	cond_resched();
+	/*
+	 * If we failed to make any progress reclaiming, then we are
+	 * running out of options and have to consider going OOM
+	 */
+	if (!did_some_progress) {
+		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+			page = __alloc_pages_may_oom(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask);
+			if (page)
+				goto got_pg;

-	if (order != 0)
-		drain_all_pages();
+			/*
+			 * The OOM killer does not trigger for high-order allocations
+			 * but if no progress is being made, there are no other
+			 * options and retrying is unlikely to help
+			 */
+			if (order > PAGE_ALLOC_COSTLY_ORDER)
+				goto nopage;

-	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx, alloc_flags);
-		if (page)
-			goto got_pg;
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist, gfp_mask)) {
-			schedule_timeout_uninterruptible(1);
			goto restart;
		}
-
-		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
-		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-			order, zonelist, high_zoneidx,
-			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto got_pg;
-		}
-
-		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto nopage;
-		}
-
-		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist, gfp_mask);
-		goto restart;
	}

-	/*
-	 * Don't let big-order allocations loop unless the caller explicitly
-	 * requests that. Wait for some write requests to complete then retry.
-	 *
-	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-	 * means __GFP_NOFAIL, but that may not be true in other
-	 * implementations.
-	 *
-	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-	 * specified, then we retry until we no longer reclaim any pages
-	 * (above), or we've reclaimed an order of pages at least as
-	 * large as the allocation's order. In both cases, if the
-	 * allocation still fails, we stop retrying.
-	 */
+	/* Check if we should retry the allocation */
	pages_reclaimed += did_some_progress;
-	do_retry = 0;
-	if (!(gfp_mask & __GFP_NORETRY)) {
-		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-			do_retry = 1;
-		} else {
-			if (gfp_mask & __GFP_REPEAT &&
-				pages_reclaimed < (1 << order))
-				do_retry = 1;
-		}
-		if (gfp_mask & __GFP_NOFAIL)
-			do_retry = 1;
-	}
-	if (do_retry) {
+	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+		/* Wait for some write requests to complete then retry */
		congestion_wait(WRITE, HZ/50);
		goto rebalance;
	}
@@ -1669,6 +1737,41 @@ nopage:
	}
got_pg:
	return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct page *page;
+
+	lockdep_trace_alloc(gfp_mask);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (unlikely(!page))
+		page = __alloc_pages_slowpath(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
+
+	return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
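
A note on the retry policy the patch factors out of the old do_retry block: should_alloc_retry() is a pure decision function, so it can be exercised in isolation. The sketch below is a hypothetical userspace rendering of that decision table, not kernel code; the MY_GFP_* values and the costly-order cutoff of 3 are stand-ins for __GFP_NORETRY, __GFP_REPEAT, __GFP_NOFAIL and PAGE_ALLOC_COSTLY_ORDER.

#include <stdio.h>

#define MY_GFP_NORETRY	0x1u	/* stand-in for __GFP_NORETRY */
#define MY_GFP_REPEAT	0x2u	/* stand-in for __GFP_REPEAT */
#define MY_GFP_NOFAIL	0x4u	/* stand-in for __GFP_NOFAIL */
#define MY_COSTLY_ORDER	3u	/* stand-in for PAGE_ALLOC_COSTLY_ORDER */

static int my_should_retry(unsigned int flags, unsigned int order,
				unsigned long pages_reclaimed)
{
	/* Caller opted out of looping entirely. */
	if (flags & MY_GFP_NORETRY)
		return 0;

	/* Small orders are treated as implicitly nofail: always retry. */
	if (order <= MY_COSTLY_ORDER)
		return 1;

	/*
	 * Costly orders with REPEAT: retry only while cumulative reclaim
	 * still falls short of the 2^order pages the request needs.
	 */
	if ((flags & MY_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;

	/* NOFAIL callers demanded success, so keep looping. */
	if (flags & MY_GFP_NOFAIL)
		return 1;

	return 0;
}

int main(void)
{
	/* order-5 request (32 pages): retry at 16 pages reclaimed, stop at 64. */
	printf("%d\n", my_should_retry(MY_GFP_REPEAT, 5, 16));	/* 1 */
	printf("%d\n", my_should_retry(MY_GFP_REPEAT, 5, 64));	/* 0 */
	printf("%d\n", my_should_retry(MY_GFP_NORETRY, 0, 0));	/* 0 */
	return 0;
}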
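
Similarly, the do/while in __alloc_pages_high_priority() is a plain retry-with-backoff loop once the kernel context is stripped away. A minimal sketch of the __GFP_NOFAIL case, assuming a hypothetical try_alloc_no_watermarks() that eventually succeeds and using usleep() in place of congestion_wait(WRITE, HZ/50):

#include <stdio.h>
#include <unistd.h>

static int tries;

/* Hypothetical stand-in for the no-watermark attempt: fails twice, then succeeds. */
static void *try_alloc_no_watermarks(void)
{
	return ++tries >= 3 ? (void *)&tries : NULL;
}

/* __GFP_NOFAIL semantics: loop until the attempt succeeds. */
static void *alloc_must_not_fail(void)
{
	void *page;

	do {
		page = try_alloc_no_watermarks();

		/* Back off ~20ms between attempts, as congestion_wait(WRITE, HZ/50) would. */
		if (!page)
			usleep(20 * 1000);
	} while (!page);

	return page;
}

int main(void)
{
	alloc_must_not_fail();
	printf("succeeded after %d tries\n", tries);
	return 0;
}

Without __GFP_NOFAIL the kernel version makes exactly one pass, since both the wait and the loop condition are gated on that flag.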