vmstat.c

/*
 * linux/mm/vmstat.c
 *
 * Manages VM statistics
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * zoned VM statistics
 * Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
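
/*
 * Illustrative usage (not part of the original source): callers pass an
 * array with NR_VM_EVENT_ITEMS slots, e.g.
 *
 *	unsigned long events[NR_VM_EVENT_ITEMS];
 *
 *	all_vm_events(events);
 *	faults = events[PGFAULT];
 *
 * which is how vmstat_start() below gathers the event counters shown in
 * /proc/vmstat.
 */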

#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, while more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
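
/*
 * Worked example (illustrative, not part of the original source): on a
 * machine with 2 online CPUs and a zone of 1-2 GB, mem is the zone size
 * in 128 MB units (8-15), so fls(mem) == 4 and fls(2) == 2, giving
 * threshold = 2 * 2 * (1 + 4) = 20, which matches the sample table above.
 */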

/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_populated_zone(zone) {
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
					int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
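
/*
 * Illustrative note (not part of the original source): with a stat_threshold
 * of 32, the increment that pushes a per-cpu differential to 33 folds
 * 33 + 16 = 49 into the zone and global counters and resets the differential
 * to -16, so roughly another 1.5 * threshold increments in the same direction
 * are absorbed locally before the next fold.
 */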

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

/*
 * Update the zone counters for one cpu.
 *
 * The cpu specified must be either the current cpu or a processor that
 * is not online. If it is the current cpu then the execution thread must
 * be pinned to the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				unsigned long flags;
				int v;

				local_irq_save(flags);
				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
				atomic_long_add(v, &zone->vm_stat[i]);
				global_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				p->expire = 3;
#endif
			}
		cond_resched();
#ifdef CONFIG_NUMA
		/*
		 * Deal with draining the remote pageset of this
		 * processor
		 *
		 * Check if there are pages remaining in this pageset
		 * if not then there is nothing to expire.
		 */
		if (!p->expire || !p->pcp.count)
			continue;

		/*
		 * We never drain zones local to this processor.
		 */
		if (zone_to_nid(zone) == numa_node_id()) {
			p->expire = 0;
			continue;
		}

		p->expire--;
		if (p->expire)
			continue;

		if (p->pcp.count)
			drain_zone_pages(zone, &p->pcp);
#endif
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (global_diff[i])
			atomic_long_add(global_diff[i], &vm_stat[i]);
}

#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
	if (z->node == numa_node_id())
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif
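
/*
 * Illustrative note (not part of the original source): for a CPU on node 0
 * that prefers a node 0 zone but is handed a page from a node 1 zone,
 * zone_statistics() counts NUMA_MISS and NUMA_OTHER on the node 1 zone and
 * NUMA_FOREIGN on the preferred node 0 zone; had the request been satisfied
 * on node 0, that zone would count NUMA_HIT and NUMA_LOCAL instead.
 */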

#ifdef CONFIG_COMPACTION
struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
}

/* Same as __fragmentation_index but allocates contig_page_info on the stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif
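
/*
 * Worked example (illustrative, not part of the original source): a zone
 * with 64 free pages, all of them order-0 blocks, queried for order 2 has
 * requested = 4, free_blocks_total = 64 and free_blocks_suitable = 0, so
 * the index is 1000 - (1000 + 64 * 1000 / 4) / 64 = 1000 - 265 = 735,
 * i.e. 0.735: the failure is mostly due to fragmentation, suggesting
 * compaction rather than reclaim.
 */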

#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Reclaimable",
	"Movable",
	"Reserve",
	"Isolate",
};

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = start_pfn + zone->spanned_pages;
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static int fragmentation_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations fragmentation_file_operations = {
	.open		= fragmentation_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_file_ops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_mlock",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",
	"nr_writeback_temp",
	"nr_isolated_anon",
	"nr_isolated_file",
	"nr_shmem",
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	TEXTS_FOR_ZONES("pgrefill")
	TEXTS_FOR_ZONES("pgsteal")
	TEXTS_FOR_ZONES("pgscan_kswapd")
	TEXTS_FOR_ZONES("pgscan_direct")

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"kswapd_skip_congestion_wait",
	"pageoutrun",
	"allocstall",

	"pgrotated",

#ifdef CONFIG_COMPACTION
	"compact_blocks_moved",
	"compact_pages_moved",
	"compact_pagemigrate_failed",
	"compact_stall",
	"compact_fail",
	"compact_success",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",
	"unevictable_pgs_mlockfreed",
#endif
};

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;

	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	seq_printf(m,
		   "\n pages free %lu"
		   "\n min %lu"
		   "\n low %lu"
		   "\n high %lu"
		   "\n scanned %lu"
		   "\n spanned %lu"
		   "\n present %lu",
		   zone_nr_free_pages(zone),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->pages_scanned,
		   zone->spanned_pages,
		   zone->present_pages);

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
				zone_page_state(zone, i));

	seq_printf(m,
		   "\n protection: (%lu",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
	seq_printf(m,
		   ")"
		   "\n pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n cpu: %i"
			   "\n count: %i"
			   "\n high: %i"
			   "\n batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n all_unreclaimable: %u"
		   "\n start_pfn: %lu"
		   "\n inactive_ratio: %u",
		   zone->all_unreclaimable,
		   zone->zone_start_pfn,
		   zone->inactive_ratio);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}

static const struct file_operations proc_zoneinfo_file_operations = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}
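
/*
 * Illustrative note (not part of the original source): each line of
 * /proc/vmstat therefore has the form "<name> <value>\n", e.g.
 * "nr_free_pages 123456", with the zone counters first in vmstat_text[]
 * order followed by the event counters collected by vmstat_start() above.
 */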

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};

static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

static const struct file_operations proc_vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

static void vmstat_update(struct work_struct *w)
{
	refresh_cpu_vm_stats(smp_processor_id());
	schedule_delayed_work(&__get_cpu_var(vmstat_work),
		round_jiffies_relative(sysctl_stat_interval));
}

static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(vmstat_work, cpu);

	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}

/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		refresh_zone_stat_thresholds();
		start_cpu_timer(cpu);
		node_set_state(cpu_to_node(cpu), N_CPU);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
		per_cpu(vmstat_work, cpu).work.func = NULL;
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		start_cpu_timer(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };
#endif

static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
	int cpu;

	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);

	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);
#endif
#ifdef CONFIG_PROC_FS
	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
	return 0;
}
module_init(setup_vmstat)

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
#include <linux/debugfs.h>

static struct dentry *extfrag_debug_root;

/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
				struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory is unusable */
	if (info->free_pages == 0)
		return 1000;

	/*
	 * Index should be a value between 0 and 1. Return a value to 3
	 * decimal places.
	 *
	 * 0 => no fragmentation
	 * 1 => high fragmentation
	 */
	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
}
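
/*
 * Worked example (illustrative, not part of the original source): a zone
 * with 100 free pages of which 20 order-1 blocks (40 pages) can satisfy an
 * order-1 request gives (100 - 20 * 2) * 1000 / 100 = 600, i.e. 0.600:
 * 60% of the free memory is unusable for that allocation size.
 */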

static void unusable_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = unusable_free_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of free memory is
 * unusable and by implication, the worse the external fragmentation is. This
 * can be expressed as a percentage by multiplying by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, unusable_show_print);

	return 0;
}

static const struct seq_operations unusable_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};

static int unusable_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &unusable_op);
}

static const struct file_operations unusable_file_ops = {
	.open		= unusable_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static void extfrag_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;

	/* Alloc on stack as interrupts are disabled for zone walk */
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = __fragmentation_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display fragmentation index for orders that allocations would fail for
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, extfrag_show_print);

	return 0;
}

static const struct seq_operations extfrag_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};

static int extfrag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &extfrag_op);
}

static const struct file_operations extfrag_file_ops = {
	.open		= extfrag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init extfrag_debug_init(void)
{
	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
	if (!extfrag_debug_root)
		return -ENOMEM;

	if (!debugfs_create_file("unusable_index", 0444,
			extfrag_debug_root, NULL, &unusable_file_ops))
		return -ENOMEM;

	if (!debugfs_create_file("extfrag_index", 0444,
			extfrag_debug_root, NULL, &extfrag_file_ops))
		return -ENOMEM;

	return 0;
}

module_init(extfrag_debug_init);
#endif