kexec.c
/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t *crash_notes;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
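
/* An oops in interrupt context, in the idle task (pid 0), in init
 * (pid 1), or while panic_on_oops is set is treated as fatal and
 * should trigger the loaded crash kernel.
 */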
int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
			   unsigned long nr_segments,
			   struct kexec_segment __user *segments)
{
	size_t segment_bytes;
	struct kimage *image;
	unsigned long i;
	int result;

	/* Allocate a controlling structure */
	result = -ENOMEM;
	image = kmalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		goto out;

	memset(image, 0, sizeof(*image));
	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->start = entry;
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unuseable pages */
	INIT_LIST_HEAD(&image->unuseable_pages);

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	result = copy_from_user(image->segment, segments, segment_bytes);
	if (result)
		goto out;

	/*
	 * Verify we have good destination addresses.  The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM.  This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned.  Too many
	 * special cases crop up when we don't do this.  The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			goto out;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			goto out;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				goto out;
		}
	}

	/* Ensure our buffer sizes are strictly less than
	 * our memory sizes.  This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
			       unsigned long nr_segments,
			       struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;

	/* Allocate and initialize a controlling structure */
	image = NULL;
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	*rimage = image;

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add it
	 * to the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
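
/* Return 1 if the range [start, end) overlaps the destination range
 * of any segment in the image, 0 otherwise.
 */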
static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}
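
/* Allocate 2^order contiguous pages for a kimage.  The order is
 * stashed in page_private() and each page is marked Reserved, so the
 * pages are left alone until kimage_free_pages() releases them.
 */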
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						     unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel.  All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
			break;
		if (hole_end > crashk_res.end)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}

struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}
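
/*
 * The image is described to the relocation stub as a flat list of
 * kimage_entry_t words.  Each word is a page-aligned physical address
 * with flag bits in the low bits: IND_DESTINATION sets the current
 * destination address, IND_SOURCE names a source page to copy there
 * (implicitly advancing the destination by PAGE_SIZE), IND_INDIRECTION
 * continues the list on another page, and IND_DONE terminates it.
 * kimage_add_entry() appends one word, chaining in a fresh indirection
 * page whenever the current one fills up.
 */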
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				    ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unuseable pages I have cached */
	kimage_free_page_list(&image->unuseable_pages);
}
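
/* Seal the entry list with IND_DONE so the relocation code knows
 * where to stop.
 */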
static int kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;

	return 0;
}
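
/* Iterate over every entry in the image list, following
 * IND_INDIRECTION links and stopping at IND_DONE.
 */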
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION)? \
			phys_to_virt((entry & PAGE_MASK)): ptr +1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}
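
/* Tear down an image: free every source page and indirection page
 * recorded in the entry list, plus any cached extra pages and the
 * control pages, then the kimage structure itself.
 */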
static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}
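
/* Walk the entry list and return a pointer to the IND_SOURCE entry
 * whose implied destination is @page, or NULL if no source page has
 * been assigned that destination yet.
 */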
static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used, file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want, use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
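
/* Load one segment of a normal (non-crash) image: allocate a page for
 * each destination page, record it in the entry list, and copy the
 * user-supplied data into it, zero-filling any memsz beyond bufsz.
 */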
static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		memset(ptr, 0, PAGE_SIZE);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes)
			uchunk = ubytes;

		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	unsigned long ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
		if (mchunk > mbytes)
			mchunk = mbytes;

		uchunk = mchunk;
		if (uchunk > ubytes) {
			uchunk = ubytes;
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = (result < 0) ? result : -EIO;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
			       struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down.  Preventing on-going DMAs, and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync, or unmount filesystems so if you need
 * that to happen you need to do that yourself.
 */
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				struct kexec_segment __user *segments,
				unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags.
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
							nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						     nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel, and uninstall the old */
	image = xchg(dest_image, image);

out:
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}

#ifdef CONFIG_COMPAT
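/* 32-bit compat entry point: convert each compat_kexec_segment to a
 * native kexec_segment in compat-allocated user space, then hand off
 * to sys_kexec_load().
 */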
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
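
/* Entered on the panic path: if a crash image has been loaded, grab
 * the kexec lock without sleeping, capture the registers, perform the
 * machine-specific shutdown and jump into the crash kernel.
 */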
void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;

	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient.  But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(image);
		}
		xchg(&kexec_lock, 0);
	}
}

static int __init crash_notes_memory_init(void)
{
	/* Allocate memory for saving cpu registers. */
	crash_notes = alloc_percpu(note_buf_t);
	if (!crash_notes) {
		printk(KERN_ERR "Kexec: Memory allocation for saving cpu register"
		" states failed\n");
		return -ENOMEM;
	}

	return 0;
}
module_init(crash_notes_memory_init)