@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -68,6 +69,15 @@ struct scan_control {
 	int all_unreclaimable;
 
 	int order;
+
+	/* Which cgroup do we reclaim from */
+	struct mem_cgroup *mem_cgroup;
+
+	/* Pluggable isolate pages callback */
+	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+			unsigned long *scanned, int order, int mode,
+			struct zone *z, struct mem_cgroup *mem_cont,
+			int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -626,7 +636,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
 	int ret = -EINVAL;
 
@@ -760,6 +770,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active)
+{
+	if (active)
+		return isolate_lru_pages(nr, &z->active_list, dst,
+						scanned, order, mode);
+	else
+		return isolate_lru_pages(nr, &z->inactive_list, dst,
+						scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
@@ -801,11 +826,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_freed;
 		unsigned long nr_active;
 
-		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-			     &zone->inactive_list,
+		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			     &page_list, &nr_scan, sc->order,
 			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-					     ISOLATE_BOTH : ISOLATE_INACTIVE);
+					     ISOLATE_BOTH : ISOLATE_INACTIVE,
+				zone, sc->mem_cgroup, 0);
 		nr_active = clear_active_flags(&page_list);
 		__count_vm_events(PGDEACTIVATE, nr_active);
 
@@ -1018,8 +1043,9 @@ force_reclaim_mapped:
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-			    &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
+	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+					ISOLATE_ACTIVE, zone,
+					sc->mem_cgroup, 1);
 	zone->pages_scanned += pgscanned;
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
@@ -1051,6 +1077,7 @@ force_reclaim_mapped:
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
+		mem_cgroup_move_lists(page_get_page_cgroup(page), false);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1106,7 @@ force_reclaim_mapped:
 		SetPageLRU(page);
 		VM_BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
+		mem_cgroup_move_lists(page_get_page_cgroup(page), true);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1206,7 +1234,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+					struct scan_control *sc)
 {
 	int priority;
 	int ret = 0;
@@ -1215,14 +1244,6 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
-	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
-		.may_writepage = !laptop_mode,
-		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
-		.swappiness = vm_swappiness,
-		.order = order,
-	};
 
 	count_vm_event(ALLOCSTALL);
 
@@ -1237,17 +1258,22 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_scanned = 0;
+		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
-		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		nr_reclaimed += shrink_zones(priority, zones, sc);
+		/*
+		 * Don't shrink slabs when reclaiming memory from
+		 * over limit cgroups
+		 */
+		if (sc->mem_cgroup == NULL)
+			shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
-		total_scanned += sc.nr_scanned;
-		if (nr_reclaimed >= sc.swap_cluster_max) {
+		total_scanned += sc->nr_scanned;
+		if (nr_reclaimed >= sc->swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -1259,18 +1285,18 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max +
-					sc.swap_cluster_max / 2) {
+		if (total_scanned > sc->swap_cluster_max +
+					sc->swap_cluster_max / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-			sc.may_writepage = 1;
+			sc->may_writepage = 1;
 		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+		if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
 			congestion_wait(WRITE, HZ/10);
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
-	if (!sc.all_unreclaimable)
+	if (!sc->all_unreclaimable && sc->mem_cgroup == NULL)
 		ret = 1;
 out:
 	/*
@@ -1293,6 +1319,54 @@ out:
 	return ret;
 }
 
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+		.swappiness = vm_swappiness,
+		.order = order,
+		.mem_cgroup = NULL,
+		.isolate_pages = isolate_pages_global,
+	};
+
+	return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_CONT
+
+#ifdef CONFIG_HIGHMEM
+#define ZONE_USERPAGES ZONE_HIGHMEM
+#else
+#define ZONE_USERPAGES ZONE_NORMAL
+#endif
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = !laptop_mode,
+		.may_swap = 1,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
+		.order = 0,
+		.mem_cgroup = mem_cont,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	int node;
+	struct zone **zones;
+
+	for_each_online_node(node) {
+		zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones;
+		if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1328,6 +1402,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
+		.mem_cgroup = NULL,
+		.isolate_pages = isolate_pages_global,
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
@@ -1649,6 +1725,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.swappiness = vm_swappiness,
+		.isolate_pages = isolate_pages_global,
 	};
 
 	current->reclaim_state = &reclaim_state;
@@ -1834,6 +1911,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.isolate_pages = isolate_pages_global,
 	};
 	unsigned long slab_reclaimable;
 