/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented later.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         * TODO: Consider making these lists per zone
         */
        struct list_head active_list;
        struct list_head inactive_list;
        /*
         * spin_lock to protect the per cgroup LRU
         */
        spinlock_t lru_lock;
        unsigned long control_type;     /* control RSS or RSS+Pagecache */
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two-byte
 * aligned (based on comments from Nick Piggin)
 */
#define PAGE_CGROUP_LOCK_BIT    0x0
#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)

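/*
 * The resulting encoding of page->page_cgroup is a pointer with the lock
 * bit folded into the low bit:
 *
 *      page->page_cgroup == (unsigned long)pc | PAGE_CGROUP_LOCK   (locked)
 *      page->page_cgroup == (unsigned long)pc                      (unlocked)
 *
 * which is why page_get_page_cgroup() below masks the lock bit off and
 * page_assign_page_cgroup() preserves it while swapping the pointer.
 */
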
/*
 * A page_cgroup is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup
 */
struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
        atomic_t ref_cnt;               /* Helpful when pages move between */
                                        /* mapped and cached states */
};

enum {
        MEM_CGROUP_TYPE_UNSPEC = 0,
        MEM_CGROUP_TYPE_MAPPED,
        MEM_CGROUP_TYPE_CACHED,
        MEM_CGROUP_TYPE_ALL,
        MEM_CGROUP_TYPE_MAX,
};

static struct mem_cgroup init_mem_cgroup;

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}

inline struct mem_cgroup *mm_cgroup(struct mm_struct *mm)
{
        return rcu_dereference(mm->mem_cgroup);
}

void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_task(p);
        css_get(&mem->css);
        mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
        css_put(&mm->mem_cgroup->css);
}

static inline int page_cgroup_locked(struct page *page)
{
        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
                                        &page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
        int locked;

        /*
         * While resetting the page_cgroup we might not hold the
         * page_cgroup lock. free_hot_cold_page() is an example
         * of such a scenario
         */
        if (pc)
                VM_BUG_ON(!page_cgroup_locked(page));
        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
        page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
        return (struct page_cgroup *)
                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void __always_inline lock_page_cgroup(struct page *page)
{
        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
        VM_BUG_ON(!page_cgroup_locked(page));
}

static void __always_inline unlock_page_cgroup(struct page *page)
{
        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

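/*
 * Typical use of the helpers above, as in the charge/uncharge paths below:
 *
 *      lock_page_cgroup(page);
 *      pc = page_get_page_cgroup(page);
 *      ... inspect or reassign the page's page_cgroup ...
 *      unlock_page_cgroup(page);
 */
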
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
        if (active)
                list_move(&pc->lru, &pc->mem_cgroup->active_list);
        else
                list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
        struct mem_cgroup *mem;

        if (!pc)
                return;

        mem = pc->mem_cgroup;
        spin_lock(&mem->lru_lock);
        __mem_cgroup_move_lists(pc, active);
        spin_unlock(&mem->lru_lock);
}

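/*
 * Scan up to @nr_to_scan entries from the tail of @mem_cont's active or
 * inactive list (selected by @active), isolate the pages that belong to
 * zone @z onto @dst and return the number taken. Pages found on the wrong
 * list are first moved to the correct one (without being counted as
 * scanned). *@scanned is set to the number of entries scanned.
 */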
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
                                        int mode, struct zone *z,
                                        struct mem_cgroup *mem_cont,
                                        int active)
{
        unsigned long nr_taken = 0;
        struct page *page;
        unsigned long scan;
        LIST_HEAD(pc_list);
        struct list_head *src;
        struct page_cgroup *pc;

        if (active)
                src = &mem_cont->active_list;
        else
                src = &mem_cont->inactive_list;

        spin_lock(&mem_cont->lru_lock);
        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                pc = list_entry(src->prev, struct page_cgroup, lru);
                page = pc->page;
                VM_BUG_ON(!pc);

                if (PageActive(page) && !active) {
                        __mem_cgroup_move_lists(pc, true);
                        scan--;
                        continue;
                }
                if (!PageActive(page) && active) {
                        __mem_cgroup_move_lists(pc, false);
                        scan--;
                        continue;
                }

                /*
                 * Reclaim, per zone
                 * TODO: make the active/inactive lists per zone
                 */
                if (page_zone(page) != z)
                        continue;

                /*
                 * Check if the meta page went away from under us
                 */
                if (!list_empty(&pc->lru))
                        list_move(&pc->lru, &pc_list);
                else
                        continue;

                if (__isolate_lru_page(page, mode) == 0) {
                        list_move(&page->lru, dst);
                        nr_taken++;
                }
        }

        list_splice(&pc_list, src);
        spin_unlock(&mem_cont->lru_lock);

        *scanned = scan;
        return nr_taken;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc, *race_pc;
        unsigned long flags;
        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

        /*
         * Should page_cgroups go to their own slab?
         * One could optimize the performance of the charging routine
         * by saving a bit in the page_flags and using it as a lock
         * to see if the cgroup page already has a page_cgroup associated
         * with it
         */
retry:
        lock_page_cgroup(page);
        pc = page_get_page_cgroup(page);
        /*
         * The page_cgroup exists and the page has already been accounted
         */
        if (pc) {
                if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
                        /* is this page being uncharged? */
                        unlock_page_cgroup(page);
                        cpu_relax();
                        goto retry;
                } else
                        goto done;
        }

        unlock_page_cgroup(page);

        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
        if (pc == NULL)
                goto err;

        rcu_read_lock();
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set; if so, charge the init_mm (happens for pagecache usage).
         */
        if (!mm)
                mm = &init_mm;

        mem = rcu_dereference(mm->mem_cgroup);
        /*
         * For every charge from the cgroup, increment reference
         * count
         */
        css_get(&mem->css);
        rcu_read_unlock();

        /*
         * If we created the page_cgroup, we should free it on exceeding
         * the cgroup limit.
         */
        while (res_counter_charge(&mem->res, PAGE_SIZE)) {
                if (try_to_free_mem_cgroup_pages(mem))
                        continue;

                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
                 * picture of reclaim. Some pages are reclaimed and might be
                 * moved to swap cache or just unmapped from the cgroup.
                 * Check the limit again to see if the reclaim reduced the
                 * current usage of the cgroup before giving up
                 */
                if (res_counter_check_under_limit(&mem->res))
                        continue;
                /*
                 * Since we control both RSS and cache, we end up reclaiming
                 * memory (essentially RSS); because the reclaimed memory is
                 * pushed to swap cache, we eventually end up adding those
                 * pages back to our list. Hence we give ourselves a
                 * few chances before we fail
                 */
                else if (nr_retries--) {
                        congestion_wait(WRITE, HZ/10);
                        continue;
                }

                css_put(&mem->css);
                mem_cgroup_out_of_memory(mem, GFP_KERNEL);
                goto free_pc;
        }

        lock_page_cgroup(page);
        /*
         * Check if somebody else beat us to allocating the page_cgroup
         */
        race_pc = page_get_page_cgroup(page);
        if (race_pc) {
                kfree(pc);
                pc = race_pc;
                atomic_inc(&pc->ref_cnt);
                res_counter_uncharge(&mem->res, PAGE_SIZE);
                css_put(&mem->css);
                goto done;
        }

        atomic_set(&pc->ref_cnt, 1);
        pc->mem_cgroup = mem;
        pc->page = page;
        page_assign_page_cgroup(page, pc);

        spin_lock_irqsave(&mem->lru_lock, flags);
        list_add(&pc->lru, &mem->active_list);
        spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
        unlock_page_cgroup(page);
        return 0;
free_pc:
        kfree(pc);
err:
        return -ENOMEM;
}

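/*
 * Example of how a caller pairs charge and uncharge (illustrative only;
 * the real call sites live in the page fault and page cache paths):
 *
 *      if (mem_cgroup_charge(page, mm))
 *              goto oom;               (charge failed after reclaim retries)
 *      ... map or add the page ...
 *      mem_cgroup_uncharge(page_get_page_cgroup(page));
 */
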
/*
 * See if the cached pages should be charged at all?
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm)
{
        struct mem_cgroup *mem;

        if (!mm)
                mm = &init_mm;

        mem = rcu_dereference(mm->mem_cgroup);
        if (mem->control_type == MEM_CGROUP_TYPE_ALL)
                return mem_cgroup_charge(page, mm);
        else
                return 0;
}

/*
 * Uncharging is always a welcome operation; we never complain, we simply
 * uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
        struct mem_cgroup *mem;
        struct page *page;
        unsigned long flags;

        /*
         * This can handle cases when a page is not charged at all and we
         * are switching between handling the control_type.
         */
        if (!pc)
                return;

        if (atomic_dec_and_test(&pc->ref_cnt)) {
                page = pc->page;
                lock_page_cgroup(page);
                mem = pc->mem_cgroup;
                css_put(&mem->css);
                page_assign_page_cgroup(page, NULL);
                unlock_page_cgroup(page);
                res_counter_uncharge(&mem->res, PAGE_SIZE);

                spin_lock_irqsave(&mem->lru_lock, flags);
                list_del_init(&pc->lru);
                spin_unlock_irqrestore(&mem->lru_lock, flags);
                kfree(pc);
        }
}

int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
        *tmp = memparse(buf, &buf);
        if (*buf != '\0')
                return -EINVAL;

        /*
         * Round the value up to the nearest page boundary
         */
        *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
        return 0;
}

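/*
 * For example, with 4KiB pages, writing "4000" to the limit_in_bytes file
 * is parsed by memparse() and rounded up here to 4096, i.e. one page;
 * "1M" stays 1048576 since it is already page aligned.
 */
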
static ssize_t mem_cgroup_read(struct cgroup *cont,
                        struct cftype *cft, struct file *file,
                        char __user *userbuf, size_t nbytes, loff_t *ppos)
{
        return res_counter_read(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos,
                                NULL);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                                struct file *file, const char __user *userbuf,
                                size_t nbytes, loff_t *ppos)
{
        return res_counter_write(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos,
                                mem_cgroup_write_strategy);
}

static ssize_t mem_control_type_write(struct cgroup *cont,
                        struct cftype *cft, struct file *file,
                        const char __user *userbuf,
                        size_t nbytes, loff_t *pos)
{
        int ret;
        char *buf, *end;
        unsigned long tmp;
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_cont(cont);
        buf = kmalloc(nbytes + 1, GFP_KERNEL);
        ret = -ENOMEM;
        if (buf == NULL)
                goto out;

        buf[nbytes] = 0;
        ret = -EFAULT;
        if (copy_from_user(buf, userbuf, nbytes))
                goto out_free;

        ret = -EINVAL;
        tmp = simple_strtoul(buf, &end, 10);
        if (*end != '\0')
                goto out_free;

        if (tmp <= MEM_CGROUP_TYPE_UNSPEC || tmp >= MEM_CGROUP_TYPE_MAX)
                goto out_free;

        mem->control_type = tmp;
        ret = nbytes;
out_free:
        kfree(buf);
out:
        return ret;
}

static ssize_t mem_control_type_read(struct cgroup *cont,
                                struct cftype *cft,
                                struct file *file, char __user *userbuf,
                                size_t nbytes, loff_t *ppos)
{
        unsigned long val;
        char buf[64], *s;
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_cont(cont);
        s = buf;
        val = mem->control_type;
        s += sprintf(s, "%lu\n", val);
        return simple_read_from_buffer((void __user *)userbuf, nbytes,
                        ppos, buf, s - buf);
}

static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
                .private = RES_USAGE,
                .read = mem_cgroup_read,
        },
        {
                .name = "limit_in_bytes",
                .private = RES_LIMIT,
                .write = mem_cgroup_write,
                .read = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .read = mem_cgroup_read,
        },
        {
                .name = "control_type",
                .write = mem_control_type_write,
                .read = mem_control_type_read,
        },
};

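/*
 * With the "memory" subsystem mounted, these entries appear per cgroup
 * roughly as follows (illustrative):
 *
 *      # echo 4M > memory.limit_in_bytes
 *      # cat memory.usage_in_bytes
 *      # echo 1 > memory.control_type          (charge RSS only)
 *      # echo 3 > memory.control_type          (charge RSS + page cache)
 */
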
static struct mem_cgroup init_mem_cgroup;

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;

        if (unlikely((cont->parent) == NULL)) {
                mem = &init_mem_cgroup;
                init_mm.mem_cgroup = mem;
        } else
                mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

        if (mem == NULL)
                return NULL;

        res_counter_init(&mem->res);
        INIT_LIST_HEAD(&mem->active_list);
        INIT_LIST_HEAD(&mem->inactive_list);
        spin_lock_init(&mem->lru_lock);
        mem->control_type = MEM_CGROUP_TYPE_ALL;
        return &mem->css;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}

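/*
 * Called by the cgroup core when task @p is attached to @cont. The charge
 * target of the task's mm is switched from the old cgroup to the new one,
 * but only when the task is the thread group leader, since the mm is in
 * effect owned by the leader.
 */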
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct cgroup *cont,
                                struct cgroup *old_cont,
                                struct task_struct *p)
{
        struct mm_struct *mm;
        struct mem_cgroup *mem, *old_mem;

        mm = get_task_mm(p);
        if (mm == NULL)
                return;

        mem = mem_cgroup_from_cont(cont);
        old_mem = mem_cgroup_from_cont(old_cont);

        if (mem == old_mem)
                goto out;

        /*
         * Only thread group leaders are allowed to migrate, the mm_struct is
         * in effect owned by the leader
         */
        if (p->tgid != p->pid)
                goto out;

        css_get(&mem->css);
        rcu_assign_pointer(mm->mem_cgroup, mem);
        css_put(&old_mem->css);

out:
        mmput(mm);
        return;
}

struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .attach = mem_cgroup_move_task,
        .early_init = 1,
};