numa.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. /*
  2. * Generic VM initialization for x86-64 NUMA setups.
  3. * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  4. */
  5. #include <linux/kernel.h>
  6. #include <linux/mm.h>
  7. #include <linux/string.h>
  8. #include <linux/init.h>
  9. #include <linux/bootmem.h>
  10. #include <linux/mmzone.h>
  11. #include <linux/ctype.h>
  12. #include <linux/module.h>
  13. #include <linux/nodemask.h>
  14. #include <asm/e820.h>
  15. #include <asm/proto.h>
  16. #include <asm/dma.h>
  17. #include <asm/numa.h>
  18. #include <asm/acpi.h>
  19. #ifndef Dprintk
  20. #define Dprintk(x...)
  21. #endif
  22. struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
  23. bootmem_data_t plat_node_bdata[MAX_NUMNODES];
  24. struct memnode memnode;
  25. unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
  26. [0 ... NR_CPUS-1] = NUMA_NO_NODE
  27. };
  28. unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
  29. [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
  30. };
  31. cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
  32. int numa_off __initdata;
  33. /*
  34. * Given a shift value, try to populate memnodemap[]
  35. * Returns :
  36. * 1 if OK
  37. * 0 if memnodmap[] too small (of shift too small)
  38. * -1 if node overlap or lost ram (shift too big)
  39. */
  40. static int __init
  41. populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
  42. {
  43. int i;
  44. int res = -1;
  45. unsigned long addr, end;
  46. if (shift >= 64)
  47. return -1;
  48. memset(memnodemap, 0xff, sizeof(memnodemap));
  49. for (i = 0; i < numnodes; i++) {
  50. addr = nodes[i].start;
  51. end = nodes[i].end;
  52. if (addr >= end)
  53. continue;
  54. if ((end >> shift) >= NODEMAPSIZE)
  55. return 0;
  56. do {
  57. if (memnodemap[addr >> shift] != 0xff)
  58. return -1;
  59. memnodemap[addr >> shift] = i;
  60. addr += (1UL << shift);
  61. } while (addr < end);
  62. res = 1;
  63. }
  64. return res;
  65. }
  66. int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
  67. {
  68. int shift = 20;
  69. while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
  70. shift++;
  71. printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
  72. shift);
  73. if (populate_memnodemap(nodes, numnodes, shift) != 1) {
  74. printk(KERN_INFO
  75. "Your memory is not aligned you need to rebuild your kernel "
  76. "with a bigger NODEMAPSIZE shift=%d\n",
  77. shift);
  78. return -1;
  79. }
  80. return shift;
  81. }
  #ifdef CONFIG_SPARSEMEM
  /* Map a page frame number to its node id via the physical address. */
  int early_pfn_to_nid(unsigned long pfn)
  {
      unsigned long phys = pfn << PAGE_SHIFT;

      return phys_to_nid(phys);
  }
  #endif
  88. static void * __init
  89. early_node_mem(int nodeid, unsigned long start, unsigned long end,
  90. unsigned long size)
  91. {
  92. unsigned long mem = find_e820_area(start, end, size);
  93. void *ptr;
  94. if (mem != -1L)
  95. return __va(mem);
  96. ptr = __alloc_bootmem_nopanic(size,
  97. SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
  98. if (ptr == 0) {
  99. printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
  100. size, nodeid);
  101. return NULL;
  102. }
  103. return ptr;
  104. }
  105. /* Initialize bootmem allocator for a node */
  106. void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
  107. {
  108. unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
  109. unsigned long nodedata_phys;
  110. void *bootmap;
  111. const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
  112. start = round_up(start, ZONE_ALIGN);
  113. printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
  114. start_pfn = start >> PAGE_SHIFT;
  115. end_pfn = end >> PAGE_SHIFT;
  116. node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
  117. if (node_data[nodeid] == NULL)
  118. return;
  119. nodedata_phys = __pa(node_data[nodeid]);
  120. memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
  121. NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
  122. NODE_DATA(nodeid)->node_start_pfn = start_pfn;
  123. NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
  124. /* Find a place for the bootmem map */
  125. bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
  126. bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
  127. bootmap = early_node_mem(nodeid, bootmap_start, end,
  128. bootmap_pages<<PAGE_SHIFT);
  129. if (bootmap == NULL) {
  130. if (nodedata_phys < start || nodedata_phys >= end)
  131. free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
  132. node_data[nodeid] = NULL;
  133. return;
  134. }
  135. bootmap_start = __pa(bootmap);
  136. Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
  137. bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
  138. bootmap_start >> PAGE_SHIFT,
  139. start_pfn, end_pfn);
  140. free_bootmem_with_active_regions(nodeid, end);
  141. reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
  142. reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
  143. #ifdef CONFIG_ACPI_NUMA
  144. srat_reserve_add_area(nodeid);
  145. #endif
  146. node_set_online(nodeid);
  147. }
  148. /* Initialize final allocator for a zone */
  149. void __init setup_node_zones(int nodeid)
  150. {
  151. unsigned long start_pfn, end_pfn, memmapsize, limit;
  152. start_pfn = node_start_pfn(nodeid);
  153. end_pfn = node_end_pfn(nodeid);
  154. Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
  155. nodeid, start_pfn, end_pfn);
  156. /* Try to allocate mem_map at end to not fill up precious <4GB
  157. memory. */
  158. memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
  159. limit = end_pfn << PAGE_SHIFT;
  160. #ifdef CONFIG_FLAT_NODE_MEM_MAP
  161. NODE_DATA(nodeid)->node_mem_map =
  162. __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
  163. memmapsize, SMP_CACHE_BYTES,
  164. round_down(limit - memmapsize, PAGE_SIZE),
  165. limit);
  166. #endif
  167. }
  168. void __init numa_init_array(void)
  169. {
  170. int rr, i;
  171. /* There are unfortunately some poorly designed mainboards around
  172. that only connect memory to a single CPU. This breaks the 1:1 cpu->node
  173. mapping. To avoid this fill in the mapping for all possible
  174. CPUs, as the number of CPUs is not known yet.
  175. We round robin the existing nodes. */
  176. rr = first_node(node_online_map);
  177. for (i = 0; i < NR_CPUS; i++) {
  178. if (cpu_to_node[i] != NUMA_NO_NODE)
  179. continue;
  180. numa_set_node(i, rr);
  181. rr = next_node(rr, node_online_map);
  182. if (rr == MAX_NUMNODES)
  183. rr = first_node(node_online_map);
  184. }
  185. }
  #ifdef CONFIG_NUMA_EMU
  /* Number of fake nodes requested with "numa=fake=N"; 0 means none. */
  int numa_fake __initdata = 0;

  /*
   * Numa emulation: carve [start_pfn, end_pfn) into numa_fake nodes.
   *
   * Returns 0 on success, -1 if no hash shift could be found for the
   * resulting layout.  The caller only invokes this with numa_fake != 0.
   */
  static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  {
      struct bootnode nodes[MAX_NUMNODES];
      unsigned long node_size = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
      int i;

      /* Kludge needed for the hash function: round down to a power of two */
      if (hweight64(node_size) > 1) {
          unsigned long pow2 = 1;

          while ((pow2 << 1) < node_size)
              pow2 <<= 1;
          if (pow2 < node_size/2)
              printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
          node_size = pow2;
      }

      memset(&nodes, 0, sizeof(nodes));

      for (i = 0; i < numa_fake; i++) {
          struct bootnode *nd = &nodes[i];

          nd->start = (start_pfn<<PAGE_SHIFT) + i*node_size;
          /* The last node absorbs whatever is left up to end_pfn. */
          if (i == numa_fake-1)
              node_size = (end_pfn<<PAGE_SHIFT) - nd->start;
          nd->end = nd->start + node_size;

          printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                 i,
                 nd->start, nd->end,
                 (nd->end - nd->start) >> 20);
          node_set_online(i);
      }

      memnode_shift = compute_hash_shift(nodes, numa_fake);
      if (memnode_shift < 0) {
          memnode_shift = 0;
          printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
          return -1;
      }

      for_each_online_node(i) {
          e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                       nodes[i].end >> PAGE_SHIFT);
          setup_node_bootmem(i, nodes[i].start, nodes[i].end);
      }

      numa_init_array();
      return 0;
  }
  #endif
  230. void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
  231. {
  232. int i;
  233. #ifdef CONFIG_NUMA_EMU
  234. if (numa_fake && !numa_emulation(start_pfn, end_pfn))
  235. return;
  236. #endif
  237. #ifdef CONFIG_ACPI_NUMA
  238. if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
  239. end_pfn << PAGE_SHIFT))
  240. return;
  241. #endif
  242. #ifdef CONFIG_K8_NUMA
  243. if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
  244. return;
  245. #endif
  246. printk(KERN_INFO "%s\n",
  247. numa_off ? "NUMA turned off" : "No NUMA configuration found");
  248. printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
  249. start_pfn << PAGE_SHIFT,
  250. end_pfn << PAGE_SHIFT);
  251. /* setup dummy node covering all memory */
  252. memnode_shift = 63;
  253. memnodemap[0] = 0;
  254. nodes_clear(node_online_map);
  255. node_set_online(0);
  256. for (i = 0; i < NR_CPUS; i++)
  257. numa_set_node(i, 0);
  258. node_to_cpumask[0] = cpumask_of_cpu(0);
  259. e820_register_active_regions(0, start_pfn, end_pfn);
  260. setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
  261. }
  262. __cpuinit void numa_add_cpu(int cpu)
  263. {
  264. set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
  265. }
  266. void __cpuinit numa_set_node(int cpu, int node)
  267. {
  268. cpu_pda(cpu)->nodenumber = node;
  269. cpu_to_node[cpu] = node;
  270. }
  271. unsigned long __init numa_free_all_bootmem(void)
  272. {
  273. int i;
  274. unsigned long pages = 0;
  275. for_each_online_node(i) {
  276. pages += free_all_bootmem_node(NODE_DATA(i));
  277. }
  278. return pages;
  279. }
  #ifdef CONFIG_SPARSEMEM
  /*
   * Report each online node's pfn range via memory_present(), then call
   * sparse_init().
   */
  static void __init arch_sparse_init(void)
  {
      int nid;

      for_each_online_node(nid) {
          memory_present(nid, node_start_pfn(nid), node_end_pfn(nid));
      }

      sparse_init();
  }
  #else
  /* Nothing to do without SPARSEMEM. */
  #define arch_sparse_init() do {} while (0)
  #endif
  291. void __init paging_init(void)
  292. {
  293. int i;
  294. unsigned long max_zone_pfns[MAX_NR_ZONES];
  295. memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
  296. max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
  297. max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
  298. max_zone_pfns[ZONE_NORMAL] = end_pfn;
  299. arch_sparse_init();
  300. for_each_online_node(i) {
  301. setup_node_zones(i);
  302. }
  303. free_area_init_nodes(max_zone_pfns);
  304. }
  305. static __init int numa_setup(char *opt)
  306. {
  307. if (!opt)
  308. return -EINVAL;
  309. if (!strncmp(opt,"off",3))
  310. numa_off = 1;
  311. #ifdef CONFIG_NUMA_EMU
  312. if(!strncmp(opt, "fake=", 5)) {
  313. numa_fake = simple_strtoul(opt+5,NULL,0); ;
  314. if (numa_fake >= MAX_NUMNODES)
  315. numa_fake = MAX_NUMNODES;
  316. }
  317. #endif
  318. #ifdef CONFIG_ACPI_NUMA
  319. if (!strncmp(opt,"noacpi",6))
  320. acpi_numa = -1;
  321. if (!strncmp(opt,"hotadd=", 7))
  322. hotadd_percent = simple_strtoul(opt+7, NULL, 10);
  323. #endif
  324. return 0;
  325. }
  326. early_param("numa", numa_setup);
  327. /*
  328. * Setup early cpu_to_node.
  329. *
  330. * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
  331. * and apicid_to_node[] tables have valid entries for a CPU.
  332. * This means we skip cpu_to_node[] initialisation for NUMA
  333. * emulation and faking node case (when running a kernel compiled
  334. * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
  335. * is already initialized in a round robin manner at numa_init_array,
  336. * prior to this call, and this initialization is good enough
  337. * for the fake NUMA cases.
  338. */
  339. void __init init_cpu_to_node(void)
  340. {
  341. int i;
  342. for (i = 0; i < NR_CPUS; i++) {
  343. u8 apicid = x86_cpu_to_apicid[i];
  344. if (apicid == BAD_APICID)
  345. continue;
  346. if (apicid_to_node[apicid] == NUMA_NO_NODE)
  347. continue;
  348. numa_set_node(i,apicid_to_node[apicid]);
  349. }
  350. }
  /* NUMA lookup tables exported for use by modules. */
  EXPORT_SYMBOL(cpu_to_node);
  EXPORT_SYMBOL(node_to_cpumask);
  EXPORT_SYMBOL(memnode);
  EXPORT_SYMBOL(node_data);
  #ifdef CONFIG_DISCONTIGMEM
  /*
   * Functions to convert PFNs from/to per node page addresses.
   * These are out of line because they are quite big.
   * They could all be tuned by pre-caching more state.
   * Should do that.
   */

  /*
   * Returns non-zero when `pfn` is below num_physpages, maps to a node
   * (node id other than 0xff) and lies inside that node's pfn range;
   * 0 otherwise.
   */
  int pfn_valid(unsigned long pfn)
  {
      unsigned node;

      if (pfn < num_physpages) {
          node = pfn_to_nid(pfn);
          if (node != 0xff)
              return pfn >= node_start_pfn(node) &&
                     (pfn) < node_end_pfn(node);
      }
      return 0;
  }
  EXPORT_SYMBOL(pfn_valid);
  #endif