kexec.c

/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
		return 1;
	return 0;
}
/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size is given
 * by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single page
 * of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
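/*
 * A sketch of the descriptor list mentioned above, as built by the
 * kimage_add_entry()/kimage_set_destination()/kimage_add_page() helpers
 * later in this file: each kimage_entry_t is a page-aligned physical
 * address with type flags in its low bits, roughly
 *
 *	dest | IND_DESTINATION   set the current destination address
 *	src  | IND_SOURCE        copy this source page to the destination,
 *	                         then advance the destination by PAGE_SIZE
 *	next | IND_INDIRECTION   continue reading entries from this page
 *	IND_DONE                 end of the list
 *
 * The flag values themselves are defined in <linux/kexec.h>.
 */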
/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);
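/*
 * Allocate and minimally initialize a struct kimage: copy the segment
 * descriptors in from user space and sanity check them (page aligned,
 * below KEXEC_DESTINATION_MEMORY_LIMIT, non-overlapping, bufsz <= memsz).
 * On success *rimage points at the new image; on failure it is freed.
 */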
static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of RAM.  We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}
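/*
 * Low-level page helpers: kimage_alloc_pages() grabs a 2^order block,
 * marks each page PG_reserved and stashes the order in page_private so
 * kimage_free_pages() can undo both; kimage_free_page_list() frees every
 * page queued on one of the image's bookkeeping lists.
 */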
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}
static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}

	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}
static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}
struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}
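/*
 * Append one entry to the image's page list.  When the current
 * indirection page fills up, a fresh page is allocated, linked in with
 * an IND_INDIRECTION entry, and entries continue there.  The list is
 * always kept terminated with a zero entry until kimage_terminate()
 * replaces it with IND_DONE.
 */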
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				    ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
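/*
 * Copy one segment of the new kernel from the user buffer into freshly
 * allocated source pages, recording each page in the image's entry list.
 * Any part of memsz beyond bufsz is left zero-filled.  Copying is done a
 * page at a time so highmem pages can be kmap()ed.
 */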
static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
							<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}
static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}
static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}
/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing ongoing DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
 */
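/*
 * Illustrative user-space view (an assumption, not part of this file):
 * a loader such as kexec-tools fills in an array of struct kexec_segment
 * descriptors {buf, bufsz, mem, memsz} and issues the raw system call,
 * roughly
 *
 *	syscall(__NR_kexec_load, entry, nr_segments, segments, flags);
 *
 * The switch to the loaded kernel happens later: a normal image is
 * typically started via reboot(LINUX_REBOOT_CMD_KEXEC), while a
 * KEXEC_ON_CRASH image is entered from crash_kexec() below when the
 * kernel panics.
 */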
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
			       struct kexec_segment __user *segments,
			       unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
						     nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						    nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	locked = xchg(&kexec_lock, 0); /* Release the mutex */
	BUG_ON(!locked);
	kimage_free(image);

	return result;
}
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				      unsigned long nr_segments,
				      struct compat_kexec_segment __user *segments,
				      unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
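/*
 * crash_kexec() is the crash-time entry point, reached from the panic
 * and fatal-oops paths: if a crash kernel has been loaded with
 * KEXEC_ON_CRASH it saves the register state, runs the machine specific
 * crash shutdown, and jumps straight into the loaded image without
 * returning.
 */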
void crash_kexec(struct pt_regs *regs)
{
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		locked = xchg(&kexec_lock, 0);
		BUG_ON(!locked);
	}
}
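/*
 * append_elf_note() lays out a single ELF note in the per-cpu buffer:
 * an elf_note header (n_namesz, n_descsz, n_type) followed by the name
 * and then the descriptor data, each padded to a 4-byte boundary.
 * final_note() terminates the buffer with an all-zero header.
 */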
static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
			    size_t data_len)
{
	struct elf_note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = data_len;
	note.n_type   = type;
	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3)/4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3)/4;
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3)/4;

	return buf;
}

static void final_note(u32 *buf)
{
	struct elf_note note;

	note.n_namesz = 0;
	note.n_descsz = 0;
	note.n_type   = 0;
	memcpy(buf, &note, sizeof(note));
}
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= NR_CPUS))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away.  ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.pr_pid = current->pid;
	elf_core_copy_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
			      sizeof(prstatus));
	final_note(buf);
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk(KERN_ERR "Kexec: Memory allocation for saving cpu register"
		       " states failed\n");
		return -ENOMEM;
	}
	return 0;
}
module_init(crash_notes_memory_init)