/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES      5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account         (0)
#endif

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
        /*
         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
         */
        MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
        MEM_CGROUP_STAT_RSS,            /* # of pages charged as rss */
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */

        MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
        s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
        struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * Accounting happens with irqs disabled, so there is no need to
 * bump the preempt count here.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
                enum mem_cgroup_stat_index idx, int val)
{
        stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
                enum mem_cgroup_stat_index idx)
{
        int cpu;
        s64 ret = 0;

        for_each_possible_cpu(cpu)
                ret += stat->cpustat[cpu].count[idx];
        return ret;
}

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
        /*
         * spin_lock to protect the per-cgroup LRU
         */
        spinlock_t              lru_lock;
        struct list_head        lists[NR_LRU_LISTS];
        unsigned long           count[NR_LRU_LISTS];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])

struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
        struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         */
        struct mem_cgroup_lru_info info;

        int     prev_priority;  /* for recording reclaim priority */
        /*
         * statistics. This must be placed at the end of memcg.
         */
        struct mem_cgroup_stat stat;
};

enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE      (1UL << PCG_CACHE)
#define PCGF_USED       (1UL << PCG_USED)
#define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
#define PCGF_LOCK       (1UL << PCG_LOCK)
#define PCGF_FILE       (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
        PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
        PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
        PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
        0, /* FORCE */
};

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable() here.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
{
        int val = (charge) ? 1 : -1;
        struct mem_cgroup_stat *stat = &mem->stat;
        struct mem_cgroup_stat_cpu *cpustat;

        VM_BUG_ON(!irqs_disabled());

        cpustat = &stat->cpustat[smp_processor_id()];
        if (PageCgroupCache(pc))
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
        else
                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

        if (charge)
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
        else
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
        struct mem_cgroup *mem = pc->mem_cgroup;
        int nid = page_cgroup_nid(pc);
        int zid = page_cgroup_zid(pc);

        return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
{
        int nid, zid;
        struct mem_cgroup_per_zone *mz;
        u64 total = 0;

        for_each_online_node(nid)
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        mz = mem_cgroup_zoneinfo(mem, nid, zid);
                        total += MEM_CGROUP_ZSTAT(mz, idx);
                }
        return total;
}
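
/*
 * Helpers that map a cgroup or a task to its mem_cgroup through the
 * generic cgroup subsystem state.
 */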
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        /*
         * mm_update_next_owner() may clear mm->owner to NULL
         * if it races with swapoff, page migration, etc.
         * So this can be called with p == NULL.
         */
        if (unlikely(!p))
                return NULL;

        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}

static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
                                        struct page_cgroup *pc)
{
        int lru = LRU_BASE;

        if (PageCgroupUnevictable(pc))
                lru = LRU_UNEVICTABLE;
        else {
                if (PageCgroupActive(pc))
                        lru += LRU_ACTIVE;
                if (PageCgroupFile(pc))
                        lru += LRU_FILE;
        }

        MEM_CGROUP_ZSTAT(mz, lru) -= 1;

        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
        list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
                                struct page_cgroup *pc, bool hot)
{
        int lru = LRU_BASE;

        if (PageCgroupUnevictable(pc))
                lru = LRU_UNEVICTABLE;
        else {
                if (PageCgroupActive(pc))
                        lru += LRU_ACTIVE;
                if (PageCgroupFile(pc))
                        lru += LRU_FILE;
        }

        MEM_CGROUP_ZSTAT(mz, lru) += 1;
        if (hot)
                list_add(&pc->lru, &mz->lists[lru]);
        else
                list_add_tail(&pc->lru, &mz->lists[lru]);

        mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
{
        struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
        int active = PageCgroupActive(pc);
        int file = PageCgroupFile(pc);
        int unevictable = PageCgroupUnevictable(pc);
        enum lru_list from = unevictable ? LRU_UNEVICTABLE :
                                (LRU_FILE * !!file + !!active);

        if (lru == from)
                return;

        MEM_CGROUP_ZSTAT(mz, from) -= 1;
        /*
         * While this is done under mz->lru_lock, other flags, which are
         * not related to the LRU, may be modified without the lock held.
         * We therefore have to use the atomic set/clear flag operations.
         */
        if (is_unevictable_lru(lru)) {
                ClearPageCgroupActive(pc);
                SetPageCgroupUnevictable(pc);
        } else {
                if (is_active_lru(lru))
                        SetPageCgroupActive(pc);
                else
                        ClearPageCgroupActive(pc);
                ClearPageCgroupUnevictable(pc);
        }

        MEM_CGROUP_ZSTAT(mz, lru) += 1;
        list_move(&pc->lru, &mz->lists[lru]);
}
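
/*
 * Test whether @task's mm is accounted to @mem; the OOM killer uses
 * this to pick victims from the over-limit cgroup only.
 */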
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
        int ret;

        task_lock(task);
        ret = task->mm && mm_match_cgroup(task->mm, mem);
        task_unlock(task);
        return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
        struct page_cgroup *pc;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        if (mem_cgroup_subsys.disabled)
                return;

        /*
         * We cannot lock_page_cgroup while holding the zone's lru_lock,
         * because other holders of lock_page_cgroup can be interrupted
         * with an attempt to rotate_reclaimable_page. But we cannot
         * safely get to page_cgroup without it, so just try_lock it:
         * mem_cgroup_isolate_pages allows for pages left on the wrong list.
         */
        pc = lookup_page_cgroup(page);
        if (!trylock_page_cgroup(pc))
                return;
        if (pc && PageCgroupUsed(pc)) {
                mz = page_cgroup_zoneinfo(pc);
                spin_lock_irqsave(&mz->lru_lock, flags);
                __mem_cgroup_move_lists(pc, lru);
                spin_unlock_irqrestore(&mz->lru_lock, flags);
        }
        unlock_page_cgroup(pc);
}

/*
 * Calculate mapped_ratio under the memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
        long total, rss;

        /*
         * usage is recorded in bytes. But, here, we assume the number of
         * physical pages can be represented by "long" on any arch.
         */
        total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
        rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
        return (int)((rss * 100L) / total);
}

/*
 * prev_priority control...this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
        return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
        if (priority < mem->prev_priority)
                mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
        mem->prev_priority = priority;
}

/*
 * Calculate the number of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
                                        int priority, enum lru_list lru)
{
        long nr_pages;
        int nid = zone->zone_pgdat->node_id;
        int zid = zone_idx(zone);
        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

        nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

        return (nr_pages >> priority);
}
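
/*
 * Scan the given per-cgroup/zone LRU list from the tail and try to
 * isolate up to nr_to_scan pages onto @dst. Pages found on the wrong
 * list are rotated to the right one instead of being isolated.
 * Returns the number of pages taken; *scanned is set to the number of
 * pages examined.
 */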
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
                                        int mode, struct zone *z,
                                        struct mem_cgroup *mem_cont,
                                        int active, int file)
{
        unsigned long nr_taken = 0;
        struct page *page;
        unsigned long scan;
        LIST_HEAD(pc_list);
        struct list_head *src;
        struct page_cgroup *pc, *tmp;
        int nid = z->zone_pgdat->node_id;
        int zid = zone_idx(z);
        struct mem_cgroup_per_zone *mz;
        int lru = LRU_FILE * !!file + !!active;

        BUG_ON(!mem_cont);
        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
        src = &mz->lists[lru];

        spin_lock(&mz->lru_lock);
        scan = 0;
        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
                if (scan >= nr_to_scan)
                        break;
                if (unlikely(!PageCgroupUsed(pc)))
                        continue;
                page = pc->page;

                if (unlikely(!PageLRU(page)))
                        continue;

                /*
                 * TODO: play better with lumpy reclaim, grabbing anything.
                 */
                if (PageUnevictable(page) ||
                    (PageActive(page) && !active) ||
                    (!PageActive(page) && active)) {
                        __mem_cgroup_move_lists(pc, page_lru(page));
                        continue;
                }

                scan++;
                list_move(&pc->lru, &pc_list);

                if (__isolate_lru_page(page, mode, file) == 0) {
                        list_move(&page->lru, dst);
                        nr_taken++;
                }
        }

        list_splice(&pc_list, src);
        spin_unlock(&mz->lru_lock);

        *scanned = scan;
        return nr_taken;
}

/*
 * Unlike the exported interface, an "oom" parameter is added. If
 * oom == true, the OOM killer may be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
        struct mem_cgroup *mem;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set; if so, charge the init_mm (happens for pagecache usage).
         */
        if (likely(!*memcg)) {
                rcu_read_lock();
                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (unlikely(!mem)) {
                        rcu_read_unlock();
                        return 0;
                }
                /*
                 * For every charge from the cgroup, increment the
                 * reference count.
                 */
                css_get(&mem->css);
                *memcg = mem;
                rcu_read_unlock();
        } else {
                mem = *memcg;
                css_get(&mem->css);
        }

        while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;

                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
                        continue;

                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
                 * picture of reclaim. Some pages are reclaimed and might be
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up.
                 */
                if (res_counter_check_under_limit(&mem->res))
                        continue;

                if (!nr_retries--) {
                        if (oom)
                                mem_cgroup_out_of_memory(mem, gfp_mask);
                        goto nomem;
                }
        }
        return 0;
nomem:
        css_put(&mem->css);
        return -ENOMEM;
}

/**
 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 * @mm: the mm_struct which is charged against (when *memcg is NULL).
 * @gfp_mask: gfp_mask for reclaim.
 * @memcg: a pointer to the memory cgroup which is charged against.
 *
 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
 * the memory cgroup is looked up from @mm and stored in *memcg.
 *
 * Returns 0 on success, -ENOMEM on failure.
 * This call can invoke the OOM killer.
 */
int mem_cgroup_try_charge(struct mm_struct *mm,
                          gfp_t mask, struct mem_cgroup **memcg)
{
        return __mem_cgroup_try_charge(mm, mask, memcg, true);
}

/*
 * Commit a charge obtained with mem_cgroup_try_charge() and make the
 * page_cgroup enter the USED state. If it is already USED, uncharge
 * and return.
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
                                       struct page_cgroup *pc,
                                       enum charge_type ctype)
{
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        /* try_charge() may leave *memcg == NULL on success; handle it here. */
        if (!mem)
                return;

        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
                res_counter_uncharge(&mem->res, PAGE_SIZE);
                css_put(&mem->css);
                return;
        }
        pc->mem_cgroup = mem;
        /*
         * If a page is accounted as page cache, insert it on the inactive
         * list. If anon, insert it on the active list.
         */
        pc->flags = pcg_default_flags[ctype];

        mz = page_cgroup_zoneinfo(pc);

        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_add_list(mz, pc, true);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);
}

/**
 * mem_cgroup_move_account - move the accounting of a page
 * @pc: the page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must ensure the following:
 * 1. irqs are disabled.
 * 2. the lru_lock of the old mem_cgroup (@from) is held.
 *
 * Returns 0 on success, or -EBUSY when the lock is contended or "pc"
 * is unstable.
 *
 * This function does the "uncharge" from the old cgroup but does not
 * do the "charge" to the new cgroup. That must be done by the caller.
 */
static int mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to)
{
        struct mem_cgroup_per_zone *from_mz, *to_mz;
        int nid, zid;
        int ret = -EBUSY;

        VM_BUG_ON(!irqs_disabled());
        VM_BUG_ON(from == to);

        nid = page_cgroup_nid(pc);
        zid = page_cgroup_zid(pc);
        from_mz = mem_cgroup_zoneinfo(from, nid, zid);
        to_mz = mem_cgroup_zoneinfo(to, nid, zid);

        if (!trylock_page_cgroup(pc))
                return ret;

        if (!PageCgroupUsed(pc))
                goto out;

        if (pc->mem_cgroup != from)
                goto out;

        if (spin_trylock(&to_mz->lru_lock)) {
                __mem_cgroup_remove_list(from_mz, pc);
                css_put(&from->css);
                res_counter_uncharge(&from->res, PAGE_SIZE);
                pc->mem_cgroup = to;
                css_get(&to->css);
                __mem_cgroup_add_list(to_mz, pc, false);
                ret = 0;
                spin_unlock(&to_mz->lru_lock);
        }
out:
        unlock_page_cgroup(pc);
        return ret;
}

/*
 * Move the charges of a page to its parent cgroup.
 */
static int mem_cgroup_move_parent(struct page_cgroup *pc,
                                  struct mem_cgroup *child,
                                  gfp_t gfp_mask)
{
        struct cgroup *cg = child->css.cgroup;
        struct cgroup *pcg = cg->parent;
        struct mem_cgroup *parent;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;
        int ret;

        /* Is this the root cgroup? */
        if (!pcg)
                return -EINVAL;

        parent = mem_cgroup_from_cont(pcg);

        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
        if (ret)
                return ret;

        mz = mem_cgroup_zoneinfo(child,
                        page_cgroup_nid(pc), page_cgroup_zid(pc));

        spin_lock_irqsave(&mz->lru_lock, flags);
        ret = mem_cgroup_move_account(pc, child, parent);
        spin_unlock_irqrestore(&mz->lru_lock, flags);

        /* drop the extra refcnt taken by try_charge */
        css_put(&parent->css);
        /* uncharge if the move fails */
        if (ret)
                res_counter_uncharge(&parent->res, PAGE_SIZE);

        return ret;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask, enum charge_type ctype,
                                struct mem_cgroup *memcg)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc;
        int ret;

        pc = lookup_page_cgroup(page);
        /* can happen at boot */
        if (unlikely(!pc))
                return 0;
        prefetchw(pc);

        mem = memcg;
        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
        if (ret)
                return ret;

        __mem_cgroup_commit_charge(mem, pc, ctype);
        return 0;
}
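
/*
 * Charge a newly faulted-in page as anon (MAPPED). Pages that are
 * already mapped and pages that belong to the page cache are skipped
 * by the checks below.
 */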
int mem_cgroup_newpage_charge(struct page *page,
                              struct mm_struct *mm, gfp_t gfp_mask)
{
        if (mem_cgroup_subsys.disabled)
                return 0;
        if (PageCompound(page))
                return 0;
        /*
         * If already mapped, we don't have to account.
         * If page cache, page->mapping has an address_space.
         * But page->mapping may hold an out-of-use anon_vma pointer;
         * detect that with a PageAnon() check. A newly-mapped-anon
         * page's page->mapping is NULL.
         */
        if (page_mapped(page) || (page->mapping && !PageAnon(page)))
                return 0;
        if (unlikely(!mm))
                mm = &init_mm;
        return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask)
{
        if (mem_cgroup_subsys.disabled)
                return 0;
        if (PageCompound(page))
                return 0;
        /*
         * Corner case handling. This is usually called from
         * add_to_page_cache(). But some filesystems (shmem) precharge
         * the page before calling it and then call add_to_page_cache()
         * with GFP_NOWAIT.
         *
         * In the GFP_NOWAIT case, the page may have been pre-charged
         * before add_to_page_cache() was called. (See shmem.c.) Check
         * for that here and avoid charging twice. (It works, but at a
         * slightly larger cost.)
         */
        if (!(gfp_mask & __GFP_WAIT)) {
                struct page_cgroup *pc;

                pc = lookup_page_cgroup(page);
                if (!pc)
                        return 0;
                lock_page_cgroup(pc);
                if (PageCgroupUsed(pc)) {
                        unlock_page_cgroup(pc);
                        return 0;
                }
                unlock_page_cgroup(pc);
        }

        if (unlikely(!mm))
                mm = &init_mm;

        if (page_is_file_cache(page))
                return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
        else
                return mem_cgroup_charge_common(page, mm, gfp_mask,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
}

#ifdef CONFIG_SWAP
int mem_cgroup_cache_charge_swapin(struct page *page,
                        struct mm_struct *mm, gfp_t mask, bool locked)
{
        int ret = 0;

        if (mem_cgroup_subsys.disabled)
                return 0;
        if (unlikely(!mm))
                mm = &init_mm;
        if (!locked)
                lock_page(page);
        /*
         * If not locked, the page can be dropped from the swap cache
         * before we reach here.
         */
        if (PageSwapCache(page)) {
                ret = mem_cgroup_charge_common(page, mm, mask,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
        }
        if (!locked)
                unlock_page(page);

        return ret;
}
#endif
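
/*
 * Second half of the two-step swap-in charge: commit a charge obtained
 * earlier with mem_cgroup_try_charge() to the page's page_cgroup. A
 * NULL @ptr (nothing was charged) is simply ignored.
 */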
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
        struct page_cgroup *pc;

        if (mem_cgroup_subsys.disabled)
                return;
        if (!ptr)
                return;
        pc = lookup_page_cgroup(page);
        __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
        if (mem_cgroup_subsys.disabled)
                return;
        if (!mem)
                return;
        res_counter_uncharge(&mem->res, PAGE_SIZE);
        css_put(&mem->css);
}

/*
 * Uncharge if !page_mapped(page).
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
        struct page_cgroup *pc;
        struct mem_cgroup *mem;
        struct mem_cgroup_per_zone *mz;
        unsigned long flags;

        if (mem_cgroup_subsys.disabled)
                return;

        if (PageSwapCache(page))
                return;

        /*
         * Check if our page_cgroup is valid.
         */
        pc = lookup_page_cgroup(page);
        if (unlikely(!pc || !PageCgroupUsed(pc)))
                return;

        lock_page_cgroup(pc);
        if (!PageCgroupUsed(pc))
                goto unlock_out;

        switch (ctype) {
        case MEM_CGROUP_CHARGE_TYPE_MAPPED:
                if (page_mapped(page))
                        goto unlock_out;
                break;
        case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
                if (!PageAnon(page)) {  /* Shared memory */
                        if (page->mapping && !page_is_file_cache(page))
                                goto unlock_out;
                } else if (page_mapped(page)) /* Anon */
                        goto unlock_out;
                break;
        default:
                break;
        }

        ClearPageCgroupUsed(pc);
        mem = pc->mem_cgroup;

        mz = page_cgroup_zoneinfo(pc);
        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_remove_list(mz, pc);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);

        res_counter_uncharge(&mem->res, PAGE_SIZE);
        css_put(&mem->css);

        return;

unlock_out:
        unlock_page_cgroup(pc);
        return;
}
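
/*
 * The wrappers below pass the charge type that matches how the page
 * was originally charged; __mem_cgroup_uncharge_common() uses it to
 * decide whether the page can really be uncharged yet.
 */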
void mem_cgroup_uncharge_page(struct page *page)
{
        /* early check */
        if (page_mapped(page))
                return;
        if (page->mapping && !PageAnon(page))
                return;
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
        VM_BUG_ON(page_mapped(page));
        VM_BUG_ON(page->mapping);
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

void mem_cgroup_uncharge_swapcache(struct page *page)
{
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
}

/*
 * Before starting migration, account PAGE_SIZE to the mem_cgroup that
 * the old page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
{
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
        int ret = 0;

        if (mem_cgroup_subsys.disabled)
                return 0;

        pc = lookup_page_cgroup(page);
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                css_get(&mem->css);
        }
        unlock_page_cgroup(pc);
        if (mem) {
                ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
                css_put(&mem->css);
        }
        *ptr = mem;
        return ret;
}

/* remove the redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
                struct page *oldpage, struct page *newpage)
{
        struct page *target, *unused;
        struct page_cgroup *pc;
        enum charge_type ctype;

        if (!mem)
                return;

        /* on migration success, oldpage->mapping is NULL. */
        if (oldpage->mapping) {
                target = oldpage;
                unused = NULL;
        } else {
                target = newpage;
                unused = oldpage;
        }

        if (PageAnon(target))
                ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
        else if (page_is_file_cache(target))
                ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
        else
                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

        /* the unused page is not on the radix-tree now. */
        if (unused)
                __mem_cgroup_uncharge_common(unused, ctype);

        pc = lookup_page_cgroup(target);
        /*
         * __mem_cgroup_commit_charge() checks the PCG_USED bit of the
         * page_cgroup, so double-counting is effectively avoided.
         */
        __mem_cgroup_commit_charge(mem, pc, ctype);

        /*
         * Both oldpage and newpage are still under lock_page(), so we
         * don't have to worry about races in the radix-tree. But we do
         * have to be careful about whether this page is mapped or not.
         *
         * There is a case for !page_mapped(): at the start of migration,
         * oldpage was mapped, but now it has been zapped. We do know that
         * the *target* page is not freed/reused under us, and
         * mem_cgroup_uncharge_page() does all the necessary checks.
         */
        if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
                mem_cgroup_uncharge_page(target);
}

/*
 * A call to try to shrink memory usage under the specified resource
 * controller. This is typically used to reclaim pages on behalf of
 * shmem, to reduce the side effects of page allocations made from
 * shmem, which may be in use by some mem_cgroup.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
        struct mem_cgroup *mem;
        int progress = 0;
        int retry = MEM_CGROUP_RECLAIM_RETRIES;

        if (mem_cgroup_subsys.disabled)
                return 0;
        if (!mm)
                return 0;

        rcu_read_lock();
        mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (unlikely(!mem)) {
                rcu_read_unlock();
                return 0;
        }
        css_get(&mem->css);
        rcu_read_unlock();

        do {
                progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
                progress += res_counter_check_under_limit(&mem->res);
        } while (!progress && --retry);

        css_put(&mem->css);
        if (!retry)
                return -ENOMEM;
        return 0;
}
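
/*
 * Try to set a new limit on the res_counter. If the new limit is
 * below the current usage, reclaim from the cgroup until the limit
 * can be set, a signal is pending, or the retries are used up.
 */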
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                                   unsigned long long val)
{
        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
        int progress;
        int ret = 0;

        while (res_counter_set_limit(&memcg->res, val)) {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
                if (!retry_count) {
                        ret = -EBUSY;
                        break;
                }
                progress = try_to_free_mem_cgroup_pages(memcg,
                                                        GFP_HIGHUSER_MOVABLE);
                if (!progress)
                        retry_count--;
        }
        return ret;
}

/*
 * This routine traverses the page_cgroups on the given list and drops
 * them all. It does *not* reclaim the pages themselves; it only removes
 * the page_cgroups.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
                                struct mem_cgroup_per_zone *mz,
                                enum lru_list lru)
{
        struct page_cgroup *pc, *busy;
        unsigned long flags;
        unsigned long loop;
        struct list_head *list;
        int ret = 0;

        list = &mz->lists[lru];

        loop = MEM_CGROUP_ZSTAT(mz, lru);
        /* give some margin against EBUSY etc... */
        loop += 256;
        busy = NULL;
        while (loop--) {
                ret = 0;
                spin_lock_irqsave(&mz->lru_lock, flags);
                if (list_empty(list)) {
                        spin_unlock_irqrestore(&mz->lru_lock, flags);
                        break;
                }
                pc = list_entry(list->prev, struct page_cgroup, lru);
                if (busy == pc) {
                        list_move(&pc->lru, list);
                        busy = NULL;
                        spin_unlock_irqrestore(&mz->lru_lock, flags);
                        continue;
                }
                spin_unlock_irqrestore(&mz->lru_lock, flags);

                ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
                if (ret == -ENOMEM)
                        break;

                if (ret == -EBUSY || ret == -EINVAL) {
                        /* found lock contention or "pc" is obsolete. */
                        busy = pc;
                        cond_resched();
                } else
                        busy = NULL;
        }
        if (!ret && !list_empty(list))
                return -EBUSY;
        return ret;
}

/*
 * Make the mem_cgroup's charge 0 if there is no task in it.
 * This enables the mem_cgroup to be deleted.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
{
        int ret;
        int node, zid, shrink;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct cgroup *cgrp = mem->css.cgroup;

        css_get(&mem->css);

        shrink = 0;
        /* should we free everything? */
        if (free_all)
                goto try_to_free;
move_account:
        while (mem->res.usage > 0) {
                ret = -EBUSY;
                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
                        goto out;
                ret = -EINTR;
                if (signal_pending(current))
                        goto out;
                /* This makes sure all *used* pages are on an LRU. */
                lru_add_drain_all();
                ret = 0;
                for_each_node_state(node, N_POSSIBLE) {
                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                                struct mem_cgroup_per_zone *mz;
                                enum lru_list l;
                                mz = mem_cgroup_zoneinfo(mem, node, zid);
                                for_each_lru(l) {
                                        ret = mem_cgroup_force_empty_list(mem,
                                                                mz, l);
                                        if (ret)
                                                break;
                                }
                        }
                        if (ret)
                                break;
                }
                /* it seems the parent cgroup doesn't have enough memory */
                if (ret == -ENOMEM)
                        goto try_to_free;
                cond_resched();
        }
        ret = 0;
out:
        css_put(&mem->css);
        return ret;

try_to_free:
        /* returns -EBUSY if there is a task or if we come here twice. */
        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
                ret = -EBUSY;
                goto out;
        }
        /* we call try-to-free pages to make this cgroup empty */
        lru_add_drain_all();
        /* try to free all pages in this cgroup */
        shrink = 1;
        while (nr_retries && mem->res.usage > 0) {
                int progress;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
                progress = try_to_free_mem_cgroup_pages(mem,
                                                  GFP_HIGHUSER_MOVABLE);
                if (!progress) {
                        nr_retries--;
                        /* maybe some writeback is necessary */
                        congestion_wait(WRITE, HZ/10);
                }
        }
        /* try move_account...there may be some *locked* pages. */
        if (mem->res.usage)
                goto move_account;
        ret = 0;
        goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
        return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
        return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
                                    cft->private);
}

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                            const char *buffer)
{
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        unsigned long long val;
        int ret;

        switch (cft->private) {
        case RES_LIMIT:
                /* This function does all the necessary parsing...reuse it */
                ret = res_counter_memparse_write_strategy(buffer, &val);
                if (!ret)
                        ret = mem_cgroup_resize_limit(memcg, val);
                break;
        default:
                ret = -EINVAL; /* should be BUG() ? */
                break;
        }
        return ret;
}
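
/*
 * Trigger handler for the resettable control files: clears either the
 * max-usage watermark or the failure counter, depending on which file
 * was written.
 */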
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_cont(cont);
        switch (event) {
        case RES_MAX_USAGE:
                res_counter_reset_max(&mem->res);
                break;
        case RES_FAILCNT:
                res_counter_reset_failcnt(&mem->res);
                break;
        }
        return 0;
}

static const struct mem_cgroup_stat_desc {
        const char *msg;
        u64 unit;
} mem_cgroup_stat_desc[] = {
        [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
        [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
        [MEM_CGROUP_STAT_PGPGIN_COUNT] = { "pgpgin", 1, },
        [MEM_CGROUP_STAT_PGPGOUT_COUNT] = { "pgpgout", 1, },
};
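
/*
 * Emit the per-cgroup statistics for the "memory.stat" control file:
 * first the raw counters (scaled to bytes where the unit says so),
 * then the page counts of each per-cgroup LRU list.
 */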
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
                                 struct cgroup_map_cb *cb)
{
        struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
        struct mem_cgroup_stat *stat = &mem_cont->stat;
        int i;

        for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
                s64 val;

                val = mem_cgroup_read_stat(stat, i);
                val *= mem_cgroup_stat_desc[i].unit;
                cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
        }
        /* showing # of active pages */
        {
                unsigned long active_anon, inactive_anon;
                unsigned long active_file, inactive_file;
                unsigned long unevictable;

                inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_INACTIVE_ANON);
                active_anon = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_ACTIVE_ANON);
                inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_INACTIVE_FILE);
                active_file = mem_cgroup_get_all_zonestat(mem_cont,
                                                LRU_ACTIVE_FILE);
                unevictable = mem_cgroup_get_all_zonestat(mem_cont,
                                                        LRU_UNEVICTABLE);

                cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
                cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
                cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
                cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
                cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
        }
        return 0;
}

static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
                .private = RES_USAGE,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "max_usage_in_bytes",
                .private = RES_MAX_USAGE,
                .trigger = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "limit_in_bytes",
                .private = RES_LIMIT,
                .write_string = mem_cgroup_write,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .trigger = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read,
        },
        {
                .name = "stat",
                .read_map = mem_control_stat_show,
        },
        {
                .name = "force_empty",
                .trigger = mem_cgroup_force_empty_write,
        },
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup_per_zone *mz;
        enum lru_list l;
        int zone, tmp = node;
        /*
         * This routine is called against possible nodes.
         * But it is a BUG to call kmalloc() against an offline node.
         *
         * TODO: this routine can waste a lot of memory for nodes which
         *       will never be onlined. It's better to use a memory
         *       hotplug callback function.
         */
        if (!node_state(node, N_NORMAL_MEMORY))
                tmp = -1;
        pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
        if (!pn)
                return 1;

        mem->info.nodeinfo[node] = pn;
        memset(pn, 0, sizeof(*pn));

        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                spin_lock_init(&mz->lru_lock);
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
        }
        return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
        kfree(mem->info.nodeinfo[node]);
}

static int mem_cgroup_size(void)
{
        int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
        return sizeof(struct mem_cgroup) + cpustat_size;
}
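
/*
 * struct mem_cgroup ends in a flexible per-cpu statistics array, so
 * its total size depends on nr_cpu_ids and may exceed a page; fall
 * back to vmalloc() for the large case.
 */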
static struct mem_cgroup *mem_cgroup_alloc(void)
{
        struct mem_cgroup *mem;
        int size = mem_cgroup_size();

        if (size < PAGE_SIZE)
                mem = kmalloc(size, GFP_KERNEL);
        else
                mem = vmalloc(size);

        if (mem)
                memset(mem, 0, size);
        return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
        if (mem_cgroup_size() < PAGE_SIZE)
                kfree(mem);
        else
                vfree(mem);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
        if (!mem_cgroup_subsys.disabled && really_do_swap_account)
                do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;
        int node;

        mem = mem_cgroup_alloc();
        if (!mem)
                return ERR_PTR(-ENOMEM);

        res_counter_init(&mem->res);

        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;
        /* root ? */
        if (cont->parent == NULL)
                enable_swap_cgroup();

        return &mem->css;
free_out:
        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);
        mem_cgroup_free(mem);
        return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
{
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
        mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        int node;
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);

        mem_cgroup_free(mem);
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}
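
/*
 * Called when a task is moved between cgroups. No charges are moved
 * here: both the leader check and the fall-through land on the same
 * label, so this attach callback currently only takes and releases a
 * reference on the task's mm.
 */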
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p)
{
        struct mm_struct *mm;
        struct mem_cgroup *mem, *old_mem;

        mm = get_task_mm(p);
        if (mm == NULL)
                return;

        mem = mem_cgroup_from_cont(cont);
        old_mem = mem_cgroup_from_cont(old_cont);

        /*
         * Only thread group leaders are allowed to migrate, the mm_struct
         * is in effect owned by the leader.
         */
        if (!thread_group_leader(p))
                goto out;

out:
        mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .pre_destroy = mem_cgroup_pre_destroy,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .attach = mem_cgroup_move_task,
        .early_init = 0,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init disable_swap_account(char *s)
{
        really_do_swap_account = 0;
        return 1;
}
__setup("noswapaccount", disable_swap_account);
#endif