numa.c

/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/lmb.h>
#include <asm/machdep.h>
#include <asm/abs_addr.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;
static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

#ifdef DEBUG_NUMA
#define ARRAY_INITIALISER -1
#else
#define ARRAY_INITIALISER 0
#endif

int numa_cpu_lookup_table[NR_CPUS] = { [0 ... (NR_CPUS - 1)] =
        ARRAY_INITIALISER};
char *numa_memory_lookup_table;
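/*
 * Note: numa_memory_lookup_table records a node id for every
 * MEMORY_INCREMENT-sized granule of physical memory; physical address p
 * maps to entry p >> MEMORY_INCREMENT_SHIFT.  As an illustration, with a
 * 16MB increment an address of 0x5000000 (80MB) would land in entry 5.
 */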
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] = 0};
struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;

/*
 * We need somewhere to store start/span for each node until we have
 * allocated the real node_data structures.
 */
static struct {
        unsigned long node_start_pfn;
        unsigned long node_end_pfn;
        unsigned long node_present_pages;
} init_node_data[MAX_NUMNODES] __initdata;

EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_memory_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(nr_cpus_in_node);
static inline void map_cpu_to_node(int cpu, int node)
{
        numa_cpu_lookup_table[cpu] = node;
        if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) {
                cpu_set(cpu, numa_cpumask_lookup_table[node]);
                nr_cpus_in_node[node]++;
        }
}
#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
        int node = numa_cpu_lookup_table[cpu];

        dbg("removing cpu %lu from node %d\n", cpu, node);

        if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
                cpu_clear(cpu, numa_cpumask_lookup_table[node]);
                nr_cpus_in_node[node]--;
        } else {
                printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
                       cpu, node);
        }
}
#endif /* CONFIG_HOTPLUG_CPU */
static struct device_node * __devinit find_cpu_node(unsigned int cpu)
{
        unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
        struct device_node *cpu_node = NULL;
        unsigned int *interrupt_server, *reg;
        int len;

        while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
                /* Try interrupt server first */
                interrupt_server = (unsigned int *)get_property(cpu_node,
                                        "ibm,ppc-interrupt-server#s", &len);

                len = len / sizeof(u32);

                if (interrupt_server && (len > 0)) {
                        while (len--) {
                                if (interrupt_server[len] == hw_cpuid)
                                        return cpu_node;
                        }
                } else {
                        reg = (unsigned int *)get_property(cpu_node,
                                                           "reg", &len);
                        if (reg && (len > 0) && (reg[0] == hw_cpuid))
                                return cpu_node;
                }
        }

        return NULL;
}
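/*
 * Note on find_cpu_node(): a cpu device node's
 * "ibm,ppc-interrupt-server#s" property typically carries one interrupt
 * server number per hardware thread, which is why hw_cpuid is compared
 * against every entry rather than just the first; cpu nodes lacking the
 * property are matched on "reg" instead.
 */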
/* must hold reference to node during call */
static int *of_get_associativity(struct device_node *dev)
{
        return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
}
static int of_node_numa_domain(struct device_node *device)
{
        int numa_domain;
        unsigned int *tmp;

        if (min_common_depth == -1)
                return 0;

        tmp = of_get_associativity(device);
        if (tmp && (tmp[0] >= min_common_depth)) {
                numa_domain = tmp[min_common_depth];
        } else {
                dbg("WARNING: no NUMA information for %s\n",
                    device->full_name);
                numa_domain = 0;
        }
        return numa_domain;
}
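/*
 * Illustration: if the "ibm,associativity" property decoded into
 * tmp[] = { 5, 0, 0, 0, 2, 3 } (the first cell being the number of
 * entries that follow) and min_common_depth were 4, then tmp[0] >= 4,
 * so the function above would return tmp[4] == 2 as the numa domain.
 */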
/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine.  This resource then has different associativity
 * characteristics relative to its multiple connections.  We ignore
 * this for now.  We also assume that all cpu and memory sets have
 * their distances represented at a common level.  This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
        int depth;
        unsigned int *ref_points;
        struct device_node *rtas_root;
        unsigned int len;

        rtas_root = of_find_node_by_path("/rtas");

        if (!rtas_root)
                return -1;

        /*
         * this property is 2 32-bit integers, each representing a level of
         * depth in the associativity nodes.  The first is for an SMP
         * configuration (should be all 0's) and the second is for a normal
         * NUMA configuration.
         */
        ref_points = (unsigned int *)get_property(rtas_root,
                        "ibm,associativity-reference-points", &len);

        if ((len >= 1) && ref_points) {
                depth = ref_points[1];
        } else {
                dbg("WARNING: could not find NUMA "
                    "associativity reference point\n");
                depth = -1;
        }

        of_node_put(rtas_root);
        return depth;
}
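/*
 * Illustration: if "ibm,associativity-reference-points" decoded to
 * { 0, 4 }, then ref_points[1] == 4 becomes min_common_depth, and
 * of_node_numa_domain() will read entry 4 of each decoded
 * "ibm,associativity" list to identify a device's NUMA domain.
 */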
static int __init get_mem_addr_cells(void)
{
        struct device_node *memory = NULL;
        int rc;

        memory = of_find_node_by_type(memory, "memory");
        if (!memory)
                return 0; /* it won't matter */

        rc = prom_n_addr_cells(memory);
        return rc;
}

static int __init get_mem_size_cells(void)
{
        struct device_node *memory = NULL;
        int rc;

        memory = of_find_node_by_type(memory, "memory");
        if (!memory)
                return 0; /* it won't matter */

        rc = prom_n_size_cells(memory);
        return rc;
}
static unsigned long read_n_cells(int n, unsigned int **buf)
{
        unsigned long result = 0;

        while (n--) {
                result = (result << 32) | **buf;
                (*buf)++;
        }
        return result;
}
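/*
 * Worked example: with n == 2 and *buf pointing at the cells
 * { 0x00000001, 0x20000000 }, read_n_cells() returns 0x0000000120000000
 * and leaves *buf advanced past both cells, ready for the next
 * (address, size) pair of a "reg" property.
 */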
/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
        int numa_domain = 0;
        struct device_node *cpu = find_cpu_node(lcpu);

        if (!cpu) {
                WARN_ON(1);
                goto out;
        }

        numa_domain = of_node_numa_domain(cpu);

        if (numa_domain >= num_online_nodes()) {
                /*
                 * POWER4 LPAR uses 0xffff as an invalid node;
                 * don't warn in this case.
                 */
                if (numa_domain != 0xffff)
                        printk(KERN_ERR "WARNING: cpu %ld "
                               "maps to invalid NUMA node %d\n",
                               lcpu, numa_domain);
                numa_domain = 0;
        }
out:
        node_set_online(numa_domain);

        map_cpu_to_node(lcpu, numa_domain);

        of_node_put(cpu);

        return numa_domain;
}
static int cpu_numa_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
{
        unsigned long lcpu = (unsigned long)hcpu;
        int ret = NOTIFY_DONE;

        switch (action) {
        case CPU_UP_PREPARE:
                if (min_common_depth == -1 || !numa_enabled)
                        map_cpu_to_node(lcpu, 0);
                else
                        numa_setup_cpu(lcpu);
                ret = NOTIFY_OK;
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
        case CPU_UP_CANCELED:
                unmap_cpu_from_node(lcpu);
                ret = NOTIFY_OK;
                break;
#endif
        }
        return ret;
}
/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
                                                      unsigned long size)
{
        /*
         * We use lmb_end_of_DRAM() in here instead of memory_limit because
         * we've already adjusted it for the limit and it takes care of
         * having memory holes below the limit.
         */

        if (!memory_limit)
                return size;

        if (start + size <= lmb_end_of_DRAM())
                return size;

        if (start >= lmb_end_of_DRAM())
                return 0;

        return lmb_end_of_DRAM() - start;
}
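/*
 * Illustration: suppose a mem= limit has clamped lmb_end_of_DRAM() to
 * 0x40000000 (1GB).  A region starting at 0x30000000 with size
 * 0x20000000 is truncated to 0x10000000, a region starting at
 * 0x50000000 returns 0 (and is discarded), and any region lying
 * entirely below 1GB keeps its original size.
 */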
static int __init parse_numa_properties(void)
{
        struct device_node *cpu = NULL;
        struct device_node *memory = NULL;
        int addr_cells, size_cells;
        int max_domain = 0;
        long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
        unsigned long i;

        if (numa_enabled == 0) {
                printk(KERN_WARNING "NUMA disabled by user\n");
                return -1;
        }

        numa_memory_lookup_table =
                (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
        memset(numa_memory_lookup_table, 0, entries * sizeof(char));

        for (i = 0; i < entries ; i++)
                numa_memory_lookup_table[i] = ARRAY_INITIALISER;

        min_common_depth = find_min_common_depth();

        dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
        if (min_common_depth < 0)
                return min_common_depth;

        max_domain = numa_setup_cpu(boot_cpuid);

        /*
         * Even though we connect cpus to numa domains later in SMP init,
         * we need to know the maximum node id now. This is because each
         * node id must have NODE_DATA etc backing it.
         * As a result of hotplug we could still have cpus appear later on
         * with larger node ids. In that case we force the cpu into node 0.
         */
        for_each_cpu(i) {
                int numa_domain;

                cpu = find_cpu_node(i);

                if (cpu) {
                        numa_domain = of_node_numa_domain(cpu);
                        of_node_put(cpu);

                        if (numa_domain < MAX_NUMNODES &&
                            max_domain < numa_domain)
                                max_domain = numa_domain;
                }
        }

        addr_cells = get_mem_addr_cells();
        size_cells = get_mem_size_cells();
        memory = NULL;
        while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
                unsigned long start;
                unsigned long size;
                int numa_domain;
                int ranges;
                unsigned int *memcell_buf;
                unsigned int len;

                memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
                if (!memcell_buf || len <= 0)
                        continue;

                ranges = memory->n_addrs;
new_range:
                /* these are order-sensitive, and modify the buffer pointer */
                start = read_n_cells(addr_cells, &memcell_buf);
                size = read_n_cells(size_cells, &memcell_buf);

                start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
                size = _ALIGN_UP(size, MEMORY_INCREMENT);

                numa_domain = of_node_numa_domain(memory);

                if (numa_domain >= MAX_NUMNODES) {
                        if (numa_domain != 0xffff)
                                printk(KERN_ERR "WARNING: memory at %lx maps "
                                       "to invalid NUMA node %d\n", start,
                                       numa_domain);
                        numa_domain = 0;
                }

                if (max_domain < numa_domain)
                        max_domain = numa_domain;

                if (!(size = numa_enforce_memory_limit(start, size))) {
                        if (--ranges)
                                goto new_range;
                        else
                                continue;
                }

                /*
                 * Initialize new node struct, or add to an existing one.
                 */
                if (init_node_data[numa_domain].node_end_pfn) {
                        if ((start / PAGE_SIZE) <
                            init_node_data[numa_domain].node_start_pfn)
                                init_node_data[numa_domain].node_start_pfn =
                                        start / PAGE_SIZE;
                        if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
                            init_node_data[numa_domain].node_end_pfn)
                                init_node_data[numa_domain].node_end_pfn =
                                        (start / PAGE_SIZE) +
                                        (size / PAGE_SIZE);

                        init_node_data[numa_domain].node_present_pages +=
                                size / PAGE_SIZE;
                } else {
                        node_set_online(numa_domain);

                        init_node_data[numa_domain].node_start_pfn =
                                start / PAGE_SIZE;
                        init_node_data[numa_domain].node_end_pfn =
                                init_node_data[numa_domain].node_start_pfn +
                                size / PAGE_SIZE;
                        init_node_data[numa_domain].node_present_pages =
                                size / PAGE_SIZE;
                }

                for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
                        numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
                                numa_domain;

                if (--ranges)
                        goto new_range;
        }

        for (i = 0; i <= max_domain; i++)
                node_set_online(i);

        return 0;
}
static void __init setup_nonnuma(void)
{
        unsigned long top_of_ram = lmb_end_of_DRAM();
        unsigned long total_ram = lmb_phys_mem_size();
        unsigned long i;

        printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
               top_of_ram, total_ram);
        printk(KERN_INFO "Memory hole size: %ldMB\n",
               (top_of_ram - total_ram) >> 20);

        if (!numa_memory_lookup_table) {
                long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
                numa_memory_lookup_table =
                        (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
                memset(numa_memory_lookup_table, 0, entries * sizeof(char));
                for (i = 0; i < entries ; i++)
                        numa_memory_lookup_table[i] = ARRAY_INITIALISER;
        }

        map_cpu_to_node(boot_cpuid, 0);

        node_set_online(0);

        init_node_data[0].node_start_pfn = 0;
        init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
        init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;

        for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
                numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
}
static void __init dump_numa_topology(void)
{
        unsigned int node;
        unsigned int count;

        if (min_common_depth == -1 || !numa_enabled)
                return;

        for_each_online_node(node) {
                unsigned long i;

                printk(KERN_INFO "Node %d Memory:", node);

                count = 0;

                for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) {
                        if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) {
                                if (count == 0)
                                        printk(" 0x%lx", i);
                                ++count;
                        } else {
                                if (count > 0)
                                        printk("-0x%lx", i);
                                count = 0;
                        }
                }

                if (count > 0)
                        printk("-0x%lx", i);
                printk("\n");
        }
        return;
}
/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the physical address of the memory.
 */
static unsigned long careful_allocation(int nid, unsigned long size,
                                        unsigned long align, unsigned long end)
{
        unsigned long ret = lmb_alloc_base(size, align, end);

        /* retry over all memory */
        if (!ret)
                ret = lmb_alloc_base(size, align, lmb_end_of_DRAM());

        if (!ret)
                panic("numa.c: cannot allocate %lu bytes on node %d",
                      size, nid);

        /*
         * If the memory came from a previously allocated node, we must
         * retry with the bootmem allocator.
         */
        if (pa_to_nid(ret) < nid) {
                nid = pa_to_nid(ret);
                ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
                                size, align, 0);

                if (!ret)
                        panic("numa.c: cannot allocate %lu bytes on node %d",
                              size, nid);

                ret = virt_to_abs(ret);

                dbg("alloc_bootmem %lx %lx\n", ret, size);
        }

        return ret;
}
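/*
 * Why the bootmem retry above: do_init_bootmem() walks nodes in
 * ascending order, so if the lmb allocator hands back memory belonging
 * to a lower-numbered node, that node's bootmem map is already live and
 * knows nothing about the lmb allocation.  Re-allocating through
 * __alloc_bootmem_node() on that node keeps the two allocators from
 * handing out the same range twice.
 */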
void __init do_init_bootmem(void)
{
        int nid;
        int addr_cells, size_cells;
        struct device_node *memory = NULL;
        static struct notifier_block ppc64_numa_nb = {
                .notifier_call = cpu_numa_callback,
                .priority = 1 /* Must run before sched domains notifier. */
        };

        min_low_pfn = 0;
        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
        max_pfn = max_low_pfn;

        if (parse_numa_properties())
                setup_nonnuma();
        else
                dump_numa_topology();

        register_cpu_notifier(&ppc64_numa_nb);

        for_each_online_node(nid) {
                unsigned long start_paddr, end_paddr;
                int i;
                unsigned long bootmem_paddr;
                unsigned long bootmap_pages;

                start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
                end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;

                /* Allocate the node structure node local if possible */
                NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
                                        sizeof(struct pglist_data),
                                        SMP_CACHE_BYTES, end_paddr);
                NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid));
                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));

                dbg("node %d\n", nid);
                dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

                NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
                NODE_DATA(nid)->node_start_pfn =
                        init_node_data[nid].node_start_pfn;
                NODE_DATA(nid)->node_spanned_pages =
                        end_paddr - start_paddr;

                if (NODE_DATA(nid)->node_spanned_pages == 0)
                        continue;

                dbg("start_paddr = %lx\n", start_paddr);
                dbg("end_paddr = %lx\n", end_paddr);

                bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT);

                bootmem_paddr = careful_allocation(nid,
                                bootmap_pages << PAGE_SHIFT,
                                PAGE_SIZE, end_paddr);
                memset(abs_to_virt(bootmem_paddr), 0,
                       bootmap_pages << PAGE_SHIFT);
                dbg("bootmap_paddr = %lx\n", bootmem_paddr);

                init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
                                  start_paddr >> PAGE_SHIFT,
                                  end_paddr >> PAGE_SHIFT);

                /*
                 * We need to do another scan of all memory sections to
                 * associate memory with the correct node.
                 */
                addr_cells = get_mem_addr_cells();
                size_cells = get_mem_size_cells();
                memory = NULL;
                while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
                        unsigned long mem_start, mem_size;
                        int numa_domain, ranges;
                        unsigned int *memcell_buf;
                        unsigned int len;

                        memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
                        if (!memcell_buf || len <= 0)
                                continue;

                        ranges = memory->n_addrs;       /* ranges in cell */
new_range:
                        mem_start = read_n_cells(addr_cells, &memcell_buf);
                        mem_size = read_n_cells(size_cells, &memcell_buf);
                        if (numa_enabled) {
                                numa_domain = of_node_numa_domain(memory);
                                if (numa_domain >= MAX_NUMNODES)
                                        numa_domain = 0;
                        } else
                                numa_domain = 0;

                        if (numa_domain != nid)
                                continue;

                        mem_size = numa_enforce_memory_limit(mem_start, mem_size);
                        if (mem_size) {
                                dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
                                free_bootmem_node(NODE_DATA(nid), mem_start, mem_size);
                        }

                        if (--ranges)           /* process all ranges in cell */
                                goto new_range;
                }

                /*
                 * Mark reserved regions on this node
                 */
                for (i = 0; i < lmb.reserved.cnt; i++) {
                        unsigned long physbase = lmb.reserved.region[i].base;
                        unsigned long size = lmb.reserved.region[i].size;

                        if (pa_to_nid(physbase) != nid &&
                            pa_to_nid(physbase+size-1) != nid)
                                continue;

                        if (physbase < end_paddr &&
                            (physbase+size) > start_paddr) {
                                /* overlaps */
                                if (physbase < start_paddr) {
                                        size -= start_paddr - physbase;
                                        physbase = start_paddr;
                                }

                                if (size > end_paddr - physbase)
                                        size = end_paddr - physbase;

                                dbg("reserve_bootmem %lx %lx\n", physbase,
                                    size);
                                reserve_bootmem_node(NODE_DATA(nid), physbase,
                                                     size);
                        }
                }

                /*
                 * This loop may look familiar, but we have to do it again
                 * after marking our reserved memory to mark memory present
                 * for sparsemem.
                 */
                addr_cells = get_mem_addr_cells();
                size_cells = get_mem_size_cells();
                memory = NULL;
                while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
                        unsigned long mem_start, mem_size;
                        int numa_domain, ranges;
                        unsigned int *memcell_buf;
                        unsigned int len;

                        memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
                        if (!memcell_buf || len <= 0)
                                continue;

                        ranges = memory->n_addrs;       /* ranges in cell */
new_range2:
                        mem_start = read_n_cells(addr_cells, &memcell_buf);
                        mem_size = read_n_cells(size_cells, &memcell_buf);
                        if (numa_enabled) {
                                numa_domain = of_node_numa_domain(memory);
                                if (numa_domain >= MAX_NUMNODES)
                                        numa_domain = 0;
                        } else
                                numa_domain = 0;

                        if (numa_domain != nid)
                                continue;

                        mem_size = numa_enforce_memory_limit(mem_start, mem_size);
                        memory_present(numa_domain, mem_start >> PAGE_SHIFT,
                                       (mem_start + mem_size) >> PAGE_SHIFT);

                        if (--ranges)           /* process all ranges in cell */
                                goto new_range2;
                }
        }
}
void __init paging_init(void)
{
        unsigned long zones_size[MAX_NR_ZONES];
        unsigned long zholes_size[MAX_NR_ZONES];
        int nid;

        memset(zones_size, 0, sizeof(zones_size));
        memset(zholes_size, 0, sizeof(zholes_size));

        for_each_online_node(nid) {
                unsigned long start_pfn;
                unsigned long end_pfn;

                start_pfn = init_node_data[nid].node_start_pfn;
                end_pfn = init_node_data[nid].node_end_pfn;

                zones_size[ZONE_DMA] = end_pfn - start_pfn;
                zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
                        init_node_data[nid].node_present_pages;

                dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
                    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);

                free_area_init_node(nid, NODE_DATA(nid), zones_size,
                                    start_pfn, zholes_size);
        }
}
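/*
 * Example of the accounting above: a node spanning pfns 0x0-0x4000 with
 * only 0x3000 pages actually present gets zones_size[ZONE_DMA] = 0x4000
 * and zholes_size[ZONE_DMA] = 0x1000, so free_area_init_node() knows a
 * quarter of the span is a hole.
 */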
static int __init early_numa(char *p)
{
        if (!p)
                return 0;

        if (strstr(p, "off"))
                numa_enabled = 0;

        if (strstr(p, "debug"))
                numa_debug = 1;

        return 0;
}
early_param("numa", early_numa);