@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an
+ * empty or stale mask here, we can start from some node and traverse
+ * all nodes based on the zonelist. So update the mask loosely, once
+ * every 10 seconds.
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* build a nodemask of the nodes this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce
+ * the usage counter, starting from anywhere is OK. Reclaiming from the
+ * current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we'll use or have used, so it may disturb the LRU ordering. And
+ * if several threads hit their limits at once, they will contend on one
+ * node. But freeing from a remote node costs more because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to
+	 * an LRU. The LRUs may hold no pages because all pages are
+	 * UNEVICTABLE, or because the memcg is too small and its pages are
+	 * not on any LRU. In that case, use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)