@@ -255,6 +255,7 @@ struct mem_cgroup {
 
 	bool		oom_lock;
 	atomic_t	under_oom;
+	atomic_t	oom_wakeups;
 
 	int	swappiness;
 	/* OOM-Killer disable */
@@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+	atomic_inc(&memcg->oom_wakeups);
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
 */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	struct oom_wait_info owait;
 	bool locked;
+	int wakeups;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
+	if (!current->memcg_oom.may_oom)
+		return;
+
+	current->memcg_oom.in_memcg_oom = 1;
 
 	/*
 	 * As with any blocking lock, a contender needs to start
@@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 	 * otherwise it can miss the wakeup from the unlock and sleep
 	 * indefinitely. This is just open-coded because our locking
 	 * is so particular to memcg hierarchies.
-	 *
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	wakeups = atomic_read(&memcg->oom_wakeups);
 	mem_cgroup_mark_under_oom(memcg);
 
 	locked = mem_cgroup_oom_trylock(memcg);
@@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 
 	if (locked && !memcg->oom_kill_disable) {
 		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges. Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
 	} else {
-		schedule();
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		/*
+		 * A system call can just return -ENOMEM, but if this
+		 * is a page fault and somebody else is handling the
+		 * OOM already, we need to sleep on the OOM waitqueue
+		 * for this memcg until the situation is resolved.
+		 * Which can take some time because it might be
+		 * handled by a userspace task.
+		 *
+		 * However, this is the charge context, which means
+		 * that we may sit on a large call stack and hold
+		 * various filesystem locks, the mmap_sem etc. and we
+		 * don't want the OOM handler to deadlock on them
+		 * while we sit here and wait. Store the current OOM
+		 * context in the task_struct, then return -ENOMEM.
+		 * At the end of the page fault handler, with the
+		 * stack unwound, pagefault_out_of_memory() will check
+		 * back with us by calling
+		 * mem_cgroup_oom_synchronize(), possibly putting the
+		 * task to sleep.
+		 */
+		current->memcg_oom.oom_locked = locked;
+		current->memcg_oom.wakeups = wakeups;
+		css_get(&memcg->css);
+		current->memcg_oom.wait_on_memcg = memcg;
 	}
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+	struct oom_wait_info owait;
+	struct mem_cgroup *memcg;
+
+	/* OOM is global, do not handle */
+	if (!current->memcg_oom.in_memcg_oom)
+		return false;
+
+	/*
+	 * We invoked the OOM killer but there is a chance that a kill
+	 * did not free up any charges. Everybody else might already
+	 * be sleeping, so restart the fault and keep the rampage
+	 * going until some charges are released.
+	 */
+	memcg = current->memcg_oom.wait_on_memcg;
+	if (!memcg)
+		goto out;
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		goto out_memcg;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
 
-	if (locked) {
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	/* Only sleep if we didn't miss any wakeups since OOM */
+	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+		schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+	mem_cgroup_unmark_under_oom(memcg);
+	if (current->memcg_oom.oom_locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 		 */
 		memcg_oom_recover(memcg);
 	}
-
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+	css_put(&memcg->css);
+	current->memcg_oom.wait_on_memcg = NULL;
+out:
+	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
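For reference, the code above stores and reads a per-task OOM state through current->memcg_oom; the corresponding task_struct member is added in <linux/sched.h> elsewhere in this series and is not part of this excerpt. A minimal sketch of what that state could look like, reconstructed only from the fields referenced here (may_oom, in_memcg_oom, oom_locked, wakeups, wait_on_memcg), so treat the exact types and layout as an assumption:

	struct memcg_oom_info {
		unsigned int may_oom:1;		/* charge context is allowed to OOM */
		unsigned int in_memcg_oom:1;	/* hit a memcg OOM during this charge */
		unsigned int oom_locked:1;	/* this task took the memcg OOM lock */
		int wakeups;			/* oom_wakeups snapshot taken at OOM time */
		struct mem_cgroup *wait_on_memcg; /* memcg whose waitqueue to sleep on */
	} memcg_oom;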
@@ -2400,12 +2475,11 @@ enum {
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
@@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check || !current->memcg_oom.may_oom)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
 }
 
 /*
@@ -2572,7 +2642,7 @@ again:
 	}
 
 	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;
 
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
@@ -2580,14 +2650,8 @@ again:
 			goto bypass;
 		}
 
-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-					   oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
@@ -2600,16 +2664,12 @@ again:
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
-			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
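The comments above defer the actual sleeping to the end of the page fault, when pagefault_out_of_memory() checks back via mem_cgroup_oom_synchronize(). That hook lives in mm/oom_kill.c and is not part of this excerpt; a rough sketch of how it could consume the recorded state, based on the 3.11-era pagefault_out_of_memory() and the return convention documented above (the zonelist handling shown is only illustrative):

void pagefault_out_of_memory(void)
{
	struct zonelist *zonelist;

	/*
	 * A memcg OOM was recorded during the charge attempt. Now that
	 * the fault stack is unwound and no filesystem locks or the
	 * mmap_sem are held, finish it (possibly sleeping on the
	 * memcg's OOM waitqueue) and then just retry the fault.
	 */
	if (mem_cgroup_oom_synchronize())
		return;

	/* Otherwise fall back to the global OOM killer. */
	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
		out_of_memory(NULL, 0, 0, NULL, false);
		clear_zonelist_oom(zonelist, GFP_KERNEL);
	}
}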