@@ -203,7 +203,7 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long last_oom_jiffies;
+	atomic_t oom_lock;
 	atomic_t refcnt;
 
 	unsigned int swappiness;
@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
 {
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
+	int *val = (int *)data;
+	int x;
+	/*
+	 * Logically, we could stop scanning as soon as we find a memcg
+	 * that is already locked. But considering unlock ops and the
+	 * creation/removal of memcgs, scanning them all is the simpler operation.
+	 */
+	x = atomic_inc_return(&mem->oom_lock);
+	*val = max(x, *val);
+	return 0;
+}
+/*
+ * Check whether the OOM killer is already running under our hierarchy.
+ * If someone else is running it, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+	int lock_count = 0;
 
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
+	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+	if (lock_count == 1)
+		return true;
+	return false;
 }
 
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
 {
-	mem->last_oom_jiffies = jiffies;
+	/*
+	 * When a new child is created while the hierarchy is under oom,
+	 * mem_cgroup_oom_lock() may not have been called. We have to use
+	 * atomic_add_unless() here.
+	 */
+	atomic_add_unless(&mem->oom_lock, -1, 0);
 	return 0;
 }
 
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+	DEFINE_WAIT(wait);
+	bool locked;
+
+	/* At first, try to OOM-lock the hierarchy under mem. */
+	mutex_lock(&memcg_oom_mutex);
+	locked = mem_cgroup_oom_lock(mem);
+	/*
+	 * Even if signal_pending(), we can't quit the charge() loop without
+	 * accounting. So, UNINTERRUPTIBLE would be appropriate. But SIGKILL
+	 * under OOM is always welcome, so use TASK_KILLABLE here.
+	 */
+	if (!locked)
+		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (locked)
+		mem_cgroup_out_of_memory(mem, mask);
+	else {
+		schedule();
+		finish_wait(&memcg_oom_waitq, &wait);
+	}
+	mutex_lock(&memcg_oom_mutex);
+	mem_cgroup_oom_unlock(mem);
+	/*
+	 * Here, we use a global waitq. Would a more fine-grained waitq help?
+	 * Assume the following hierarchy.
+	 *	A/
+	 *	 01
+	 *	 02
+	 * Assume OOM happens in both A and 01 at the same time. They are
+	 * mutually exclusive by the lock. (A kill in 01 helps A.)
+	 * With a per-memcg waitq, we would have to wake up waiters on A and 02
+	 * in addition to waiters on 01. We use a global waitq to avoid the mess.
+	 * It will not be a big problem.
+	 * (And a task may be moved to other groups while it's waiting for OOM.)
+	 */
+	wake_up_all(&memcg_oom_waitq);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		return false;
+	/* Give the dying process a chance to exit. */
+	schedule_timeout(1);
+	return true;
 }
 
 /*
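The core of the hunk above is the subtree OOM lock: mem_cgroup_oom_lock() walks every memcg under the given root, bumps each per-memcg oom_lock counter, and records the largest value seen. A maximum of 1 means nobody else held any node in the subtree, so the caller may run the OOM killer; everyone else sleeps on memcg_oom_waitq until the owner drops the counters and wakes them. What follows is a minimal userspace sketch of that counting idea only, not kernel code: the node type, the explicit child array, and the recursive walk are simplified stand-ins for struct mem_cgroup and mem_cgroup_walk_tree().

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for struct mem_cgroup: a counter plus child pointers. */
struct node {
	atomic_int oom_lock;
	struct node *children[4];
	size_t nr_children;
};

/* Walk the whole subtree, increment every counter, remember the maximum. */
static void lock_walk(struct node *n, int *max_seen)
{
	int x = atomic_fetch_add(&n->oom_lock, 1) + 1;

	if (x > *max_seen)
		*max_seen = x;
	for (size_t i = 0; i < n->nr_children; i++)
		lock_walk(n->children[i], max_seen);
}

/* Decrement, but never below zero, like atomic_add_unless(..., -1, 0). */
static void unlock_walk(struct node *n)
{
	int cur = atomic_load(&n->oom_lock);

	while (cur > 0 &&
	       !atomic_compare_exchange_weak(&n->oom_lock, &cur, cur - 1))
		;
	for (size_t i = 0; i < n->nr_children; i++)
		unlock_walk(n->children[i]);
}

/* True only if no other locker held any node in this subtree. */
static bool try_oom_lock(struct node *root)
{
	int max_seen = 0;

	lock_walk(root, &max_seen);
	return max_seen == 1;
}

int main(void)
{
	struct node child = { .nr_children = 0 };
	struct node root = { .children = { &child }, .nr_children = 1 };

	printf("first locker: %d\n", try_oom_lock(&root));	/* 1: owns the OOM path */
	printf("second locker: %d\n", try_oom_lock(&root));	/* 0: must wait */
	unlock_walk(&root);
	unlock_walk(&root);
	return 0;
}

Running this shows the first locker of the subtree winning and the second losing until both unlock again, mirroring how a second OOM in the same hierarchy waits instead of killing twice.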
@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct res_counter *fail_res;
 	int csize = CHARGE_SIZE;
 
-	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-		/* Don't account this! */
-		*memcg = NULL;
-		return 0;
-	}
+	/*
+	 * Unlike the global VM's OOM kill, we're not in a system-wide
+	 * memory shortage. So, let a dying process go ahead, in addition
+	 * to a MEMDIE process.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)
+			|| fatal_signal_pending(current)))
+		goto bypass;
 
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 
 		if (!nr_retries--) {
-			if (oom) {
-				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				record_last_oom(mem_over_limit);
+			if (!oom)
+				goto nomem;
+			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+				continue;
 			}
-			goto nomem;
+			/* When we reach here, the current task is dying. */
+			css_put(&mem->css);
+			goto bypass;
 		}
 	}
 	if (csize > PAGE_SIZE)
@@ -1574,6 +1651,9 @@ done:
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
+bypass:
+	*memcg = NULL;
+	return 0;
 }
 
 /*
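Taken together, the last three hunks change what __mem_cgroup_try_charge() does when its reclaim retries run out: with oom disabled it still fails with -ENOMEM, otherwise it calls mem_cgroup_handle_oom() and either restarts the whole retry budget or, if that function reports that the current task is dying, drops the css reference and bypasses accounting entirely via the new bypass label. Below is a compact, standalone sketch of that control flow only; charge_once() and handle_oom() are hypothetical stubs standing in for the real charging and OOM machinery, and the retry count of 5 merely plays the role of MEM_CGROUP_RECLAIM_RETRIES.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stubs: pretend charging always fails and the caller is dying. */
static bool charge_once(void) { return false; }
static bool handle_oom(void)  { return false; }

enum charge_result { CHARGED, NOMEM, BYPASSED };

static enum charge_result try_charge(bool oom)
{
	int nr_retries = 5;

	while (!charge_once()) {
		if (nr_retries--)
			continue;		/* reclaim and try again */
		if (!oom)
			return NOMEM;		/* the patch's "goto nomem" */
		if (handle_oom()) {
			nr_retries = 5;		/* OOM handled or waited out: start over */
			continue;
		}
		return BYPASSED;		/* current task is dying: don't account */
	}
	return CHARGED;
}

int main(void)
{
	/* With the stubs above this prints 2, i.e. BYPASSED. */
	printf("%d\n", try_charge(true));
	return 0;
}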