numa.c 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. /*
  2. * Generic VM initialization for x86-64 NUMA setups.
  3. * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  4. */
  5. #include <linux/kernel.h>
  6. #include <linux/mm.h>
  7. #include <linux/string.h>
  8. #include <linux/init.h>
  9. #include <linux/bootmem.h>
  10. #include <linux/mmzone.h>
  11. #include <linux/ctype.h>
  12. #include <linux/module.h>
  13. #include <linux/nodemask.h>
  14. #include <asm/e820.h>
  15. #include <asm/proto.h>
  16. #include <asm/dma.h>
  17. #include <asm/numa.h>
  18. #include <asm/acpi.h>
  19. #ifndef Dprintk
  20. #define Dprintk(x...)
  21. #endif
  22. struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
  23. bootmem_data_t plat_node_bdata[MAX_NUMNODES];
  24. int memnode_shift;
  25. u8 memnodemap[NODEMAPSIZE];
  26. unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
  27. [0 ... NR_CPUS-1] = NUMA_NO_NODE
  28. };
  29. unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
  30. [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
  31. };
  32. cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
  33. int numa_off __initdata;
  34. /*
  35. * Given a shift value, try to populate memnodemap[]
  36. * Returns :
  37. * 1 if OK
  38. * 0 if memnodmap[] too small (of shift too small)
  39. * -1 if node overlap or lost ram (shift too big)
  40. */
  41. static int __init populate_memnodemap(
  42. const struct node *nodes, int numnodes, int shift)
  43. {
  44. int i;
  45. int res = -1;
  46. unsigned long addr, end;
  47. memset(memnodemap, 0xff, sizeof(memnodemap));
  48. for (i = 0; i < numnodes; i++) {
  49. addr = nodes[i].start;
  50. end = nodes[i].end;
  51. if (addr >= end)
  52. continue;
  53. if ((end >> shift) >= NODEMAPSIZE)
  54. return 0;
  55. do {
  56. if (memnodemap[addr >> shift] != 0xff)
  57. return -1;
  58. memnodemap[addr >> shift] = i;
  59. addr += (1 << shift);
  60. } while (addr < end);
  61. res = 1;
  62. }
  63. return res;
  64. }
  65. int __init compute_hash_shift(struct node *nodes, int numnodes)
  66. {
  67. int shift = 20;
  68. while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
  69. shift++;
  70. printk(KERN_DEBUG "Using %d for the hash shift.\n",
  71. shift);
  72. if (populate_memnodemap(nodes, numnodes, shift) != 1) {
  73. printk(KERN_INFO
  74. "Your memory is not aligned you need to rebuild your kernel "
  75. "with a bigger NODEMAPSIZE shift=%d\n",
  76. shift);
  77. return -1;
  78. }
  79. return shift;
  80. }
  #ifdef CONFIG_SPARSEMEM
  /*
   * Return the node id for a page frame number: the pfn is turned into a
   * physical address and handed to phys_to_nid().
   */
  int early_pfn_to_nid(unsigned long pfn)
  {
      unsigned long phys = pfn << PAGE_SHIFT;

      return phys_to_nid(phys);
  }
  #endif
  87. /* Initialize bootmem allocator for a node */
  88. void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
  89. {
  90. unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
  91. unsigned long nodedata_phys;
  92. const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
  93. start = round_up(start, ZONE_ALIGN);
  94. printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
  95. start_pfn = start >> PAGE_SHIFT;
  96. end_pfn = end >> PAGE_SHIFT;
  97. nodedata_phys = find_e820_area(start, end, pgdat_size);
  98. if (nodedata_phys == -1L)
  99. panic("Cannot find memory pgdat in node %d\n", nodeid);
  100. Dprintk("nodedata_phys %lx\n", nodedata_phys);
  101. node_data[nodeid] = phys_to_virt(nodedata_phys);
  102. memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
  103. NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
  104. NODE_DATA(nodeid)->node_start_pfn = start_pfn;
  105. NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
  106. /* Find a place for the bootmem map */
  107. bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
  108. bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
  109. bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
  110. if (bootmap_start == -1L)
  111. panic("Not enough continuous space for bootmap on node %d", nodeid);
  112. Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
  113. bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
  114. bootmap_start >> PAGE_SHIFT,
  115. start_pfn, end_pfn);
  116. e820_bootmem_free(NODE_DATA(nodeid), start, end);
  117. reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
  118. reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
  119. node_set_online(nodeid);
  120. }
  121. /* Initialize final allocator for a zone */
  122. void __init setup_node_zones(int nodeid)
  123. {
  124. unsigned long start_pfn, end_pfn;
  125. unsigned long zones[MAX_NR_ZONES];
  126. unsigned long holes[MAX_NR_ZONES];
  127. start_pfn = node_start_pfn(nodeid);
  128. end_pfn = node_end_pfn(nodeid);
  129. Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
  130. nodeid, start_pfn, end_pfn);
  131. size_zones(zones, holes, start_pfn, end_pfn);
  132. free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
  133. start_pfn, holes);
  134. }
  135. void __init numa_init_array(void)
  136. {
  137. int rr, i;
  138. /* There are unfortunately some poorly designed mainboards around
  139. that only connect memory to a single CPU. This breaks the 1:1 cpu->node
  140. mapping. To avoid this fill in the mapping for all possible
  141. CPUs, as the number of CPUs is not known yet.
  142. We round robin the existing nodes. */
  143. rr = first_node(node_online_map);
  144. for (i = 0; i < NR_CPUS; i++) {
  145. if (cpu_to_node[i] != NUMA_NO_NODE)
  146. continue;
  147. numa_set_node(i, rr);
  148. rr = next_node(rr, node_online_map);
  149. if (rr == MAX_NUMNODES)
  150. rr = first_node(node_online_map);
  151. }
  152. }
  #ifdef CONFIG_NUMA_EMU
  /* Number of fake nodes requested via "fake=" (see numa_setup); 0 = none. */
  int numa_fake __initdata = 0;

  /*
   * NUMA emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
   *
   * The per-node size is the total size divided by numa_fake, rounded down
   * to a power of two when it is not one already. The last node takes
   * whatever is left up to end_pfn. The caller must ensure numa_fake is
   * non-zero (it is used as a divisor).
   *
   * Marked __init: it reads the __initdata variable numa_fake and calls only
   * __init functions, so it must not outlive the init sections.
   *
   * Returns 0 on success, -1 when no hash shift could be computed for the
   * fake layout (memnode_shift is reset to 0 in that case).
   */
  static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  {
      int i;
      struct node nodes[MAX_NUMNODES];
      unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

      /* Kludge needed for the hash function */
      if (hweight64(sz) > 1) {
          unsigned long x = 1;

          /* Largest power of two below sz. */
          while ((x << 1) < sz)
              x <<= 1;
          if (x < sz/2)
              printk("Numa emulation unbalanced. Complain to maintainer\n");
          sz = x;
      }

      memset(&nodes,0,sizeof(nodes));
      for (i = 0; i < numa_fake; i++) {
          nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
          /* The last node absorbs the remainder of the range. */
          if (i == numa_fake-1)
              sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
          nodes[i].end = nodes[i].start + sz;
          printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                 i,
                 nodes[i].start, nodes[i].end,
                 (nodes[i].end - nodes[i].start) >> 20);
          node_set_online(i);
      }

      memnode_shift = compute_hash_shift(nodes, numa_fake);
      if (memnode_shift < 0) {
          memnode_shift = 0;
          printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
          return -1;
      }

      for_each_online_node(i)
          setup_node_bootmem(i, nodes[i].start, nodes[i].end);
      numa_init_array();
      return 0;
  }
  #endif
  194. void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
  195. {
  196. int i;
  197. #ifdef CONFIG_NUMA_EMU
  198. if (numa_fake && !numa_emulation(start_pfn, end_pfn))
  199. return;
  200. #endif
  201. #ifdef CONFIG_ACPI_NUMA
  202. if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
  203. end_pfn << PAGE_SHIFT))
  204. return;
  205. #endif
  206. #ifdef CONFIG_K8_NUMA
  207. if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
  208. return;
  209. #endif
  210. printk(KERN_INFO "%s\n",
  211. numa_off ? "NUMA turned off" : "No NUMA configuration found");
  212. printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
  213. start_pfn << PAGE_SHIFT,
  214. end_pfn << PAGE_SHIFT);
  215. /* setup dummy node covering all memory */
  216. memnode_shift = 63;
  217. memnodemap[0] = 0;
  218. nodes_clear(node_online_map);
  219. node_set_online(0);
  220. for (i = 0; i < NR_CPUS; i++)
  221. numa_set_node(i, 0);
  222. node_to_cpumask[0] = cpumask_of_cpu(0);
  223. setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
  224. }
  225. __cpuinit void numa_add_cpu(int cpu)
  226. {
  227. set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
  228. }
  229. void __cpuinit numa_set_node(int cpu, int node)
  230. {
  231. cpu_pda[cpu].nodenumber = node;
  232. cpu_to_node[cpu] = node;
  233. }
  234. unsigned long __init numa_free_all_bootmem(void)
  235. {
  236. int i;
  237. unsigned long pages = 0;
  238. for_each_online_node(i) {
  239. pages += free_all_bootmem_node(NODE_DATA(i));
  240. }
  241. return pages;
  242. }
  #ifdef CONFIG_SPARSEMEM
  /*
   * Report each online node's pfn range through memory_present(), then run
   * sparse_init().
   */
  static void __init arch_sparse_init(void)
  {
      int nid;

      for_each_online_node(nid) {
          memory_present(nid, node_start_pfn(nid), node_end_pfn(nid));
      }

      sparse_init();
  }
  #else
  /* Without CONFIG_SPARSEMEM there is nothing to do. */
  #define arch_sparse_init() do {} while (0)
  #endif
  254. void __init paging_init(void)
  255. {
  256. int i;
  257. arch_sparse_init();
  258. for_each_online_node(i) {
  259. setup_node_zones(i);
  260. }
  261. }
  262. /* [numa=off] */
  263. __init int numa_setup(char *opt)
  264. {
  265. if (!strncmp(opt,"off",3))
  266. numa_off = 1;
  267. #ifdef CONFIG_NUMA_EMU
  268. if(!strncmp(opt, "fake=", 5)) {
  269. numa_fake = simple_strtoul(opt+5,NULL,0); ;
  270. if (numa_fake >= MAX_NUMNODES)
  271. numa_fake = MAX_NUMNODES;
  272. }
  273. #endif
  274. #ifdef CONFIG_ACPI_NUMA
  275. if (!strncmp(opt,"noacpi",6))
  276. acpi_numa = -1;
  277. #endif
  278. return 1;
  279. }
  /* NUMA lookup tables made available to modules. */
  EXPORT_SYMBOL(node_data);
  EXPORT_SYMBOL(memnodemap);
  EXPORT_SYMBOL(memnode_shift);
  EXPORT_SYMBOL(cpu_to_node);
  EXPORT_SYMBOL(node_to_cpumask);