numa_emulation.c

/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>

#include "numa_internal.h"
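
/*
 * emu_nid_to_phys[] maps each emulated node to the physical node its
 * memory was carved from; emu_cmdline stashes the "numa=fake=" argument
 * until numa_emulation() consumes it.
 */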
static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

/*
 * Sets up an emulated node covering @size bytes carved from the start of
 * physical block @phys_blk of @pi, and appends it to the emulated meminfo
 * @ei.  The return value is -errno if something went wrong, 0 otherwise.
 */
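/*
 * Illustration (hypothetical numbers): carving 512MB off a physical block
 * spanning 0-2GB shrinks the physical block to 512MB-2GB, so the next
 * call for the same physical node continues where this one left off.
 */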
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       eb->start, eb->end, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from
 * addr to max_addr.  Returns 0 on success, negative on failure.  Once
 * nr_nodes fake nodes have been created the nid counter wraps around, so
 * further memblks are appended to already-existing fake nodes and a fake
 * node may end up spanning several physical nodes.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
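
	/*
	 * Illustrative numbers, assuming FAKE_NODE_MIN_SIZE is 32MB: with
	 * 1000MB of usable memory and nr_nodes = 3, size is ~333MB, leaving
	 * a ~13MB per-node remainder.  big = (13MB * 3) / 32MB = 1 and size
	 * rounds down to 320MB, so one node gets 352MB and the other two get
	 * 320MB; whatever is still left over is absorbed by the
	 * boundary-extension checks in the loop below.
	 */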
	if (!size) {
		pr_err("Not enough memory for each node.  "
			"NUMA emulation disabled.\n");
		return -1;
	}

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start -
			       memblock_x86_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that it covers at least `size' of
 * non-reserved memory, or `max_addr' if that is reached first.
 */
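/*
 * Illustration (hypothetical numbers, assuming FAKE_NODE_MIN_SIZE is
 * 32MB): with start = 0, size = 64MB and a 16MB hole below 64MB, the
 * loop extends end in 32MB steps until at least 64MB of non-reserved
 * memory is covered, ending at 96MB.
 */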
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - memblock_x86_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.  Returns 0 on success, negative on failure.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 min_size;
	int nid = 0;
	int i, ret;

	if (!size)
		return -1;

	/*
	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
	 * increased accordingly if the requested size is too small.  This
	 * creates a uniform distribution of node sizes across the entire
	 * machine (but not necessarily over physical nodes).
	 */
	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
						MAX_NUMNODES;
	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
			FAKE_NODE_MIN_HASH_MASK;
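
	/*
	 * e.g., assuming FAKE_NODE_MIN_SIZE is 32MB, a computed min_size of
	 * 40MB is not 32MB-aligned and is rounded up to 64MB above.
	 */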
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size &= FAKE_NODE_MIN_HASH_MASK;

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
	 */
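	/*
	 * e.g. "numa=fake=8" splits the machine into 8 equally sized fake
	 * nodes, while "numa=fake=512M" carves it into 512MB fake nodes.
	 */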
	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, NULL, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/*
	 * Copy the physical distance table.  numa_reset_distance() below
	 * tears down the live table, so the physical distances must be
	 * stashed away before the emulated table is rebuilt from them.
	 */
	if (numa_dist_cnt) {
		u64 phys;

		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
					      phys_size, PAGE_SIZE);
		if (phys == MEMBLOCK_ERROR) {
			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}
		memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
		phys_dist = __va(phys);

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = 0;
	dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (dfl_phys_nid == NUMA_NO_NODE)
				dfl_phys_nid = emu_nid_to_phys[i];
		}
	}
	if (dfl_phys_nid == NUMA_NO_NODE) {
		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
		goto no_emu;
	}

	/* commit */
	*numa_meminfo = ei;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
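	/*
	 * e.g. a CPU whose APIC ID pointed at physical node 1 is remapped
	 * to the first emulated node that was carved from physical node 1.
	 */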
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
	}

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/*
	 * Transform the distance table.  Emulated nodes inherit the distances
	 * of the physical nodes they were carved from, so two emulated nodes
	 * on the same physical node report that node's self-distance
	 * (normally LOCAL_DISTANCE) to each other.
	 */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	if (phys_dist)
		memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
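/*
 * With CONFIG_DEBUG_PER_CPU_MAPS the cpumask updates are routed through
 * debug_cpumask_set_cpu() so that bogus cpu/node combinations are caught
 * and reported instead of silently corrupting the maps.
 */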
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */