/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/cpu.h>

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
{
	int cpu = 0;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	cpu = first_cpu(*cpumask);
	while (cpu < NR_CPUS) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		cpu = next_cpu(cpu, *cpumask);
		if (cpu < NR_CPUS)
			prefetch(&per_cpu(vm_event_states, cpu));

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	sum_vm_events(ret, &cpu_online_map);
}
EXPORT_SYMBOL_GPL(all_vm_events);
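/*
 * Illustrative sketch (hypothetical, not part of this file): a caller
 * can take two all_vm_events() snapshots and diff them to measure
 * activity over an interval.  Because the per-cpu counters keep moving
 * while they are summed, the delta is only approximate.
 */
#if 0
static void example_vm_event_delta(void)
{
	unsigned long before[NR_VM_EVENT_ITEMS];
	unsigned long after[NR_VM_EVENT_ITEMS];

	all_vm_events(before);
	/* ... let some workload run ... */
	all_vm_events(after);

	printk(KERN_DEBUG "page faults over interval: %lu\n",
		after[PGFAULT] - before[PGFAULT]);
}
#endif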
#ifdef CONFIG_HOTPLUG
/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}
#endif /* CONFIG_HOTPLUG */

#endif /* CONFIG_VM_EVENT_COUNTERS */
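/*
 * Illustrative sketch (hypothetical; the real caller is the page
 * allocator's CPU-hotplug callback): vm_events_fold_cpu() is meant to
 * run when a processor goes away, roughly along these lines.
 */
#if 0
static int example_cpu_dead_notify(struct notifier_block *self,
				unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;

	if (action == CPU_DEAD) {
		/* Fold the dead CPU's event counts into this CPU's. */
		local_irq_disable();
		vm_events_fold_cpu(cpu);
		local_irq_enable();
	}
	return NOTIFY_OK;
}
#endif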
/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP

static int calculate_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */
	mem = zone->present_pages >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
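/*
 * Worked example (illustrative): on a 16-processor machine with a zone
 * smaller than 128 MB, mem = 0, so 1 + fls(0) = 1 and fls(16) = 5,
 * giving threshold = 2 * 5 * 1 = 10 -- the "<128M" row for 16
 * processors in the table above.
 */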
/*
 * Refresh the thresholds for each zone.
 */
static void refresh_zone_stat_thresholds(void)
{
	struct zone *zone;
	int cpu;
	int threshold;

	for_each_zone(zone) {

		if (!zone->present_pages)
			continue;

		threshold = calculate_threshold(zone);

		for_each_online_cpu(cpu)
			zone_pcp(zone, cpu)->stat_threshold = threshold;
	}
}

/*
 * For use when we know that interrupts are disabled.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;
	long x;

	x = delta + *p;

	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	*p = x;
}
EXPORT_SYMBOL(__mod_zone_page_state);

/*
 * For an unknown interrupt state
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
				int delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);
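/*
 * Illustrative sketch (hypothetical helper, not part of this file): a
 * caller that cannot guarantee interrupts are disabled uses the
 * irq-safe variant, e.g. to account a newly allocated bounce page.
 */
#if 0
static void example_account_bounce_page(struct page *page)
{
	mod_zone_page_state(page_zone(page), NR_BOUNCE, 1);
}
#endif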
/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)++;

	if (unlikely(*p > pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p + overstep, zone, item);
		*p = -overstep;
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
	s8 *p = pcp->vm_stat_diff + item;

	(*p)--;

	if (unlikely(*p < -pcp->stat_threshold)) {
		int overstep = pcp->stat_threshold / 2;

		zone_page_state_add(*p - overstep, zone, item);
		*p = overstep;
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
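/*
 * Illustrative sketch (hypothetical, not part of this file): the
 * __-prefixed variants are for paths that already run with interrupts
 * disabled; a caller that has just disabled them itself can use one
 * directly, e.g. when a page enters writeback.
 */
#if 0
static void example_mark_writeback(struct page *page)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_zone_page_state(page, NR_WRITEBACK);
	local_irq_restore(flags);
}
#endif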
/*
 * Update the zone counters for one cpu.
 */
void refresh_cpu_vm_stats(int cpu)
{
	struct zone *zone;
	int i;
	unsigned long flags;

	for_each_zone(zone) {
		struct per_cpu_pageset *pcp;

		if (!populated_zone(zone))
			continue;

		pcp = zone_pcp(zone, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (pcp->vm_stat_diff[i]) {
				local_irq_save(flags);
				zone_page_state_add(pcp->vm_stat_diff[i],
					zone, i);
				pcp->vm_stat_diff[i] = 0;
				local_irq_restore(flags);
			}
	}
}

static void __refresh_cpu_vm_stats(void *dummy)
{
	refresh_cpu_vm_stats(smp_processor_id());
}

/*
 * Consolidate all counters.
 *
 * Note that the result is less inaccurate but still inaccurate
 * if concurrent processes are allowed to run.
 */
void refresh_vm_stats(void)
{
	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
}
EXPORT_SYMBOL(refresh_vm_stats);
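/*
 * Illustrative sketch (hypothetical, not part of this file): flushing
 * the per-cpu differentials first makes a subsequent global read more
 * accurate.
 */
#if 0
static unsigned long example_accurate_free_pages(void)
{
	refresh_vm_stats();
	return global_page_state(NR_FREE_PAGES);
}
#endif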
#endif

#ifdef CONFIG_NUMA
/*
 * zonelist = the list of zones passed to the allocator
 * z	    = the zone from which the allocation occurred.
 *
 * Must be called with interrupts disabled.
 */
void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
	if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
		__inc_zone_state(z, NUMA_HIT);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
	}
	if (z->node == numa_node_id())
		__inc_zone_state(z, NUMA_LOCAL);
	else
		__inc_zone_state(z, NUMA_OTHER);
}
#endif
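/*
 * Example reading of the counters above (illustrative): an allocation
 * whose preferred zone sits on node 0 but which is satisfied from a
 * zone on node 1 bumps NUMA_MISS on the node 1 zone and NUMA_FOREIGN
 * on the preferred node 0 zone, while NUMA_LOCAL/NUMA_OTHER record
 * whether the satisfying zone belongs to the allocating CPU's node.
 */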
#ifdef CONFIG_PROC_FS

#include <linux/seq_file.h>

static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;
	int order;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
		for (order = 0; order < MAX_ORDER; ++order)
			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
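/*
 * Illustrative sketch (hypothetical names; the real wiring lives in
 * fs/proc/proc_misc.c): hooking fragmentation_op up to a /proc file is
 * a matter of a seq_open() wrapper plus a file_operations table.
 */
#if 0
static int example_frag_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &fragmentation_op);
}

static const struct file_operations example_frag_fops = {
	.open		= example_frag_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
#endif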
#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx)

static const char * const vmstat_text[] = {
	/* Zoned VM counters */
	"nr_free_pages",
	"nr_active",
	"nr_inactive",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_page_table_pages",
	"nr_unstable",
	"nr_bounce",
	"nr_vmscan_write",

#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
  375. "numa_interleave",
  376. "numa_local",
  377. "numa_other",
  378. #endif
  379. #ifdef CONFIG_VM_EVENT_COUNTERS
  380. "pgpgin",
  381. "pgpgout",
  382. "pswpin",
  383. "pswpout",
  384. TEXTS_FOR_ZONES("pgalloc")
  385. "pgfree",
  386. "pgactivate",
  387. "pgdeactivate",
  388. "pgfault",
  389. "pgmajfault",
  390. TEXTS_FOR_ZONES("pgrefill")
  391. TEXTS_FOR_ZONES("pgsteal")
  392. TEXTS_FOR_ZONES("pgscan_kswapd")
  393. TEXTS_FOR_ZONES("pgscan_direct")
  394. "pginodesteal",
  395. "slabs_scanned",
  396. "kswapd_steal",
  397. "kswapd_inodesteal",
  398. "pageoutrun",
  399. "allocstall",
  400. "pgrotated",
  401. #endif
  402. };
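/*
 * The strings above must stay in the same order as the zone_stat_item
 * and vm_event_item enums: vmstat_start() fills one flat buffer in
 * enum order, and vmstat_show() indexes vmstat_text with the same
 * offset.
 */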
/*
 * Output information about zones in @pgdat.
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = arg;
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
		int i;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
		seq_printf(m,
			   "\n  pages free     %lu"
			   "\n        min      %lu"
			   "\n        low      %lu"
			   "\n        high     %lu"
			   "\n        scanned  %lu (a: %lu i: %lu)"
			   "\n        spanned  %lu"
			   "\n        present  %lu",
			   zone_page_state(zone, NR_FREE_PAGES),
			   zone->pages_min,
			   zone->pages_low,
			   zone->pages_high,
			   zone->pages_scanned,
			   zone->nr_scan_active, zone->nr_scan_inactive,
			   zone->spanned_pages,
			   zone->present_pages);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
					zone_page_state(zone, i));

		seq_printf(m,
			   "\n        protection: (%lu",
			   zone->lowmem_reserve[0]);
		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
		seq_printf(m,
			   ")"
			   "\n  pagesets");
		for_each_online_cpu(i) {
			struct per_cpu_pageset *pageset;
			int j;

			pageset = zone_pcp(zone, i);
			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				seq_printf(m,
					   "\n    cpu: %i pcp: %i"
					   "\n              count: %i"
					   "\n              high:  %i"
					   "\n              batch: %i",
					   i, j,
					   pageset->pcp[j].count,
					   pageset->pcp[j].high,
					   pageset->pcp[j].batch);
			}
#ifdef CONFIG_SMP
			seq_printf(m, "\n  vm stats threshold: %d",
					pageset->stat_threshold);
#endif
		}
		seq_printf(m,
			   "\n  all_unreclaimable: %u"
			   "\n  prev_priority:     %i"
			   "\n  start_pfn:         %lu",
			   zone->all_unreclaimable,
			   zone->prev_priority,
			   zone->zone_start_pfn);
		spin_unlock_irqrestore(&zone->lock, flags);
		seq_putc(m, '\n');
	}
	return 0;
}

const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
	unsigned long *e;
#endif
	int i;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

#ifdef CONFIG_VM_EVENT_COUNTERS
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
			+ sizeof(struct vm_event_state), GFP_KERNEL);
#else
	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
			GFP_KERNEL);
#endif
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
	e = v + NR_VM_ZONE_STAT_ITEMS;
	all_vm_events(e);
	e[PGPGIN] /= 2;		/* sectors -> kbytes */
	e[PGPGOUT] /= 2;
#endif
	return v + *pos;
}
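/*
 * At this point m->private holds the NR_VM_ZONE_STAT_ITEMS zone
 * counters followed immediately by the NR_VM_EVENT_ITEMS event
 * counters, so a single flat position indexes both halves of
 * vmstat_text.
 */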
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
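/*
 * Example /proc/vmstat output produced by vmstat_op, one
 * "<name> <value>" pair per line (values are illustrative):
 *
 *	nr_free_pages 81120
 *	nr_mapped 14224
 *	...
 *	pgfault 1091362
 */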
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
/*
 * Use the cpu notifier to ensure that the thresholds are recalculated
 * when necessary.
 */
static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		refresh_zone_stat_thresholds();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata vmstat_notifier =
	{ &vmstat_cpuup_callback, NULL, 0 };

int __init setup_vmstat(void)
{
	refresh_zone_stat_thresholds();
	register_cpu_notifier(&vmstat_notifier);
	return 0;
}
module_init(setup_vmstat)
#endif