@@ -214,6 +214,8 @@ struct mem_cgroup {
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
+	/* OOM-Killer disable */
+	int		oom_kill_disable;
 
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
@@ -235,7 +237,6 @@ struct mem_cgroup {
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long	move_charge_at_immigrate;
-
 	/*
 	 * percpu counter.
 	 */
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
 }
 
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+	if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+		memcg_wakeup_oom(mem);
+}
+
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
 	struct oom_wait_info owait;
-	bool locked;
+	bool locked, need_to_kill;
 
 	owait.mem = mem;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-
+	need_to_kill = true;
 	/* At first, try to OOM lock hierarchy under mem.*/
 	mutex_lock(&memcg_oom_mutex);
 	locked = mem_cgroup_oom_lock(mem);
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	if (!locked)
-		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	else
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	if (!locked || mem->oom_kill_disable)
+		need_to_kill = false;
+	if (locked)
 		mem_cgroup_oom_notify(mem);
 	mutex_unlock(&memcg_oom_mutex);
 
-	if (locked)
+	if (need_to_kill) {
+		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(mem, mask);
-	else {
+	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	/* If swapout, usage of swap doesn't decrease */
 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		uncharge_memsw = false;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
 
 	batch = &current->memcg_batch;
 	/*
@@ -2180,6 +2180,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	 */
 	if (!batch->memcg)
 		batch->memcg = mem;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+
+	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
 	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
@@ -2196,6 +2207,8 @@ direct_uncharge:
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (unlikely(batch->memcg != mem))
+		memcg_oom_recover(mem);
 	return;
 }
 
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
 	if (batch->memsw_bytes)
 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memswlimit;
+	u64 memswlimit, memlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
 	u64 curusage, oldusage;
+	int enlarge;
 
 	/*
 	 * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 
 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 
+	enlarge = 0;
 	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit < val)
+			enlarge = 1;
+
 		ret = res_counter_set_limit(&memcg->res, val);
 		if (!ret) {
 			if (memswlimit == val)
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 
 	return ret;
 }
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memlimit, oldusage, curusage;
+	u64 memlimit, memswlimit, oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
+	int enlarge = 0;
 
 	/* see mem_cgroup_resize_res_limit */
 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val)
+			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
 		if (!ret) {
 			if (memlimit == val)
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 	return ret;
 }
 
@@ -2865,6 +2894,7 @@ move_account:
 		if (ret)
 			break;
 	}
+	memcg_oom_recover(mem);
 	/* it seems parent cgroup doesn't have enough mem */
 	if (ret == -ENOMEM)
 		goto try_to_free;
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	return 0;
 }
 
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+	struct cftype *cft, struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+	if (atomic_read(&mem->oom_lock))
+		cb->fill(cb, "under_oom", 1);
+	else
+		cb->fill(cb, "under_oom", 0);
+	return 0;
+}
+
+/*
+ */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+	struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent;
+
+	/* cannot set to root cgroup and only 0 and 1 are allowed */
+	if (!cgrp->parent || !((val == 0) || (val == 1)))
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(cgrp->parent);
+
+	cgroup_lock();
+	/* oom-kill-disable is a flag for subhierarchy. */
+	if ((parent->use_hierarchy) ||
+	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+		cgroup_unlock();
+		return -EINVAL;
+	}
+	mem->oom_kill_disable = val;
+	cgroup_unlock();
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "oom_control",
+		.read_map = mem_cgroup_oom_control_read,
+		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
+		mem->oom_kill_disable = parent->oom_kill_disable;
 	}
 
 	if (parent && parent->use_hierarchy) {
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
+		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
+		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {