kexec.c

/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
#include <asm/semaphore.h>

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
        .name  = "Crash kernel",
        .start = 0,
        .end   = 0,
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
        if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
                return 1;
        return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses.  On processors
 * where you can disable the MMU this is trivial, and easy.  For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

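/*
 * Roughly, the descriptor list handed to the assembly stub is a flat
 * array of kimage_entry_t values, each a page-aligned physical address
 * with a type flag in the low bits (IND_DESTINATION, IND_SOURCE,
 * IND_INDIRECTION, IND_DONE from <linux/kexec.h>).  An illustrative
 * (not literal) layout for a two-page segment might be:
 *
 *      dest_addr  | IND_DESTINATION     set the copy destination
 *      src_page_0 | IND_SOURCE          copy one page, advance destination
 *      src_page_1 | IND_SOURCE
 *      next_page  | IND_INDIRECTION     continue in another descriptor page
 *      ...
 *      0          | IND_DONE            end of list
 *
 * kimage_add_entry() below builds this list one entry at a time.
 */
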
/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
                                       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
                                      gfp_t gfp_mask,
                                      unsigned long dest);

static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
                           unsigned long nr_segments,
                           struct kexec_segment __user *segments)
{
        size_t segment_bytes;
        struct kimage *image;
        unsigned long i;
        int result;

        /* Allocate a controlling structure */
        result = -ENOMEM;
        image = kmalloc(sizeof(*image), GFP_KERNEL);
        if (!image)
                goto out;

        memset(image, 0, sizeof(*image));
        image->head = 0;
        image->entry = &image->head;
        image->last_entry = &image->head;
        image->control_page = ~0; /* By default this does not apply */
        image->start = entry;
        image->type = KEXEC_TYPE_DEFAULT;

        /* Initialize the list of control pages */
        INIT_LIST_HEAD(&image->control_pages);

        /* Initialize the list of destination pages */
        INIT_LIST_HEAD(&image->dest_pages);

        /* Initialize the list of unusable pages */
        INIT_LIST_HEAD(&image->unuseable_pages);

        /* Read in the segments */
        image->nr_segments = nr_segments;
        segment_bytes = nr_segments * sizeof(*segments);
        result = copy_from_user(image->segment, segments, segment_bytes);
        if (result)
                goto out;

        /*
         * Verify we have good destination addresses.  The caller is
         * responsible for making certain we don't attempt to load
         * the new image into invalid or reserved areas of RAM.  This
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
         * the destination addresses are page aligned.  Too many
         * special cases crop up when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
         * granularity.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;
                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
                        goto out;
                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
                        goto out;
        }

        /* Verify our destination addresses do not overlap.
         * If we allowed overlapping destination addresses
         * through, very weird things can happen with no
         * easy explanation as one segment stomps on another.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;
                unsigned long j;

                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;
                for (j = 0; j < i; j++) {
                        unsigned long pstart, pend;

                        pstart = image->segment[j].mem;
                        pend   = pstart + image->segment[j].memsz;
                        /* Do the segments overlap ? */
                        if ((mend > pstart) && (mstart < pend))
                                goto out;
                }
        }

        /* Ensure our buffer sizes are strictly less than
         * our memory sizes.  This should always be the case,
         * and it is easier to check up front than to be surprised
         * later on.
         */
        result = -EINVAL;
        for (i = 0; i < nr_segments; i++) {
                if (image->segment[i].bufsz > image->segment[i].memsz)
                        goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

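/*
 * For illustration only: a single segment that satisfies the checks in
 * do_kimage_alloc() above.  The buffer pointer and length here are
 * hypothetical user-space values, not anything defined in this file.
 *
 *      struct kexec_segment seg = {
 *              .buf   = kernel_buf,                 (user pointer)
 *              .bufsz = kernel_len,                 (<= memsz)
 *              .mem   = 0x100000,                   (page aligned)
 *              .memsz = PAGE_ALIGN(kernel_len),     (page aligned)
 *      };
 *
 * .mem and .mem + .memsz are page aligned, the segment overlaps no
 * other segment, and bufsz does not exceed memsz, so the loops above
 * accept it.
 */
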
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                               unsigned long nr_segments,
                               struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;

        /* Allocate and initialize a controlling structure */
        image = NULL;
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        *rimage = image;

        /*
         * Find a location for the control code buffer, and add it to
         * the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                                        get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
                              unsigned long nr_segments,
                              struct kexec_segment __user *segments)
{
        int result;
        struct kimage *image;
        unsigned long i;

        image = NULL;
        /* Verify we have a valid entry point */
        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
                result = -EADDRNOTAVAIL;
                goto out;
        }

        /* Allocate and initialize a controlling structure */
        result = do_kimage_alloc(&image, entry, nr_segments, segments);
        if (result)
                goto out;

        /* Enable the special crash kernel control page
         * allocation policy.
         */
        image->control_page = crashk_res.start;
        image->type = KEXEC_TYPE_CRASH;

        /*
         * Verify we have good destination addresses.  Normally
         * the caller is responsible for making certain we don't
         * attempt to load the new image into invalid or reserved
         * areas of RAM.  But crash kernels are preloaded into a
         * reserved area of RAM.  We must ensure the addresses
         * are in the reserved area otherwise preloading the
         * kernel could corrupt things.
         */
        result = -EADDRNOTAVAIL;
        for (i = 0; i < nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz - 1;
                /* Ensure we are within the crash kernel limits */
                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
                        goto out;
        }

        /*
         * Find a location for the control code buffer, and add it to
         * the vector of segments so that its pages will also be
         * counted as destination pages.
         */
        result = -ENOMEM;
        image->control_code_page = kimage_alloc_control_pages(image,
                                        get_order(KEXEC_CONTROL_CODE_SIZE));
        if (!image->control_code_page) {
                printk(KERN_ERR "Could not allocate control_code_buffer\n");
                goto out;
        }

        result = 0;
out:
        if (result == 0)
                *rimage = image;
        else
                kfree(image);

        return result;
}

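/*
 * Note (not from the original source): crashk_res is normally reserved
 * at boot via the "crashkernel=size@offset" command line option, e.g.
 * "crashkernel=64M@16M".  Every crash segment, and the entry point,
 * must fall inside that window or kimage_crash_alloc() rejects the
 * load with -EADDRNOTAVAIL.
 */
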
static int kimage_is_destination_range(struct kimage *image,
                                       unsigned long start,
                                       unsigned long end)
{
        unsigned long i;

        for (i = 0; i < image->nr_segments; i++) {
                unsigned long mstart, mend;

                mstart = image->segment[i].mem;
                mend = mstart + image->segment[i].memsz;
                if ((end > mstart) && (start < mend))
                        return 1;
        }

        return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *pages;

        pages = alloc_pages(gfp_mask, order);
        if (pages) {
                unsigned int count, i;

                pages->mapping = NULL;
                pages->private = order;
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
        }

        return pages;
}

static void kimage_free_pages(struct page *page)
{
        unsigned int order, count, i;

        order = page->private;
        count = 1 << order;
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
        __free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
        struct list_head *pos, *next;

        list_for_each_safe(pos, next, list) {
                struct page *page;

                page = list_entry(pos, struct page, lru);
                list_del(&page->lru);
                kimage_free_pages(page);
        }
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
                                                      unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * The only case where we really need more than one of
         * these are for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * At worst this runs in O(N) of the image size.
         */
        struct list_head extra_pages;
        struct page *pages;
        unsigned int count;

        count = 1 << order;
        INIT_LIST_HEAD(&extra_pages);

        /* Loop while I can allocate a page and the page allocated
         * is a destination page.
         */
        do {
                unsigned long pfn, epfn, addr, eaddr;

                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
                pfn   = page_to_pfn(pages);
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
                if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
                    kimage_is_destination_range(image, addr, eaddr)) {
                        list_add(&pages->lru, &extra_pages);
                        pages = NULL;
                }
        } while (!pages);

        if (pages) {
                /* Remember the allocated page... */
                list_add(&pages->lru, &image->control_pages);

                /* Because the page is already in its destination
                 * location we will never allocate another page at
                 * that address.  Therefore kimage_alloc_pages
                 * will not return it (again) and we don't need
                 * to give it an entry in image->segment[].
                 */
        }
        /* Deal with the destination pages I have inadvertently allocated.
         *
         * Ideally I would convert multi-page allocations into single
         * page allocations, and add everything to image->dest_pages.
         *
         * For now it is simpler to just free the pages.
         */
        kimage_free_page_list(&extra_pages);

        return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                     unsigned int order)
{
        /* Control pages are special, they are the intermediaries
         * that are needed while we copy the rest of the pages
         * to their final resting place.  As such they must
         * not conflict with either the destination addresses
         * or memory the kernel is already using.
         *
         * Control pages are also the only pages we must allocate
         * when loading a crash kernel.  All of the other pages
         * are specified by the segments and we just memcpy
         * into them directly.
         *
         * The only case where we really need more than one of
         * these are for architectures where we cannot disable
         * the MMU and must instead generate an identity mapped
         * page table for all of the memory.
         *
         * Given the low demand this implements a very simple
         * allocator that finds the first hole of the appropriate
         * size in the reserved memory region, and allocates all
         * of the memory up to and including the hole.
         */
        unsigned long hole_start, hole_end, size;
        struct page *pages;

        pages = NULL;
        size = (1 << order) << PAGE_SHIFT;
        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
        hole_end   = hole_start + size - 1;
        while (hole_end <= crashk_res.end) {
                unsigned long i;

                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
                        break;
                if (hole_end > crashk_res.end)
                        break;
                /* See if I overlap any of the segments */
                for (i = 0; i < image->nr_segments; i++) {
                        unsigned long mstart, mend;

                        mstart = image->segment[i].mem;
                        mend   = mstart + image->segment[i].memsz - 1;
                        if ((hole_end >= mstart) && (hole_start <= mend)) {
                                /* Advance the hole to the end of the segment */
                                hole_start = (mend + (size - 1)) & ~(size - 1);
                                hole_end   = hole_start + size - 1;
                                break;
                        }
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
        if (pages)
                image->control_page = hole_end;

        return pages;
}

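/*
 * Worked example (illustrative, assumes 4 KiB pages): with
 * crashk_res.start = 0x1000000 and order = 1, size is 0x2000, so the
 * first candidate hole is [0x1000000, 0x1001fff].  If a segment already
 * covers that range, hole_start is advanced to the next 0x2000-aligned
 * address past the segment's end and the scan repeats until a
 * non-overlapping hole is found or crashk_res.end is reached.
 */
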
struct page *kimage_alloc_control_pages(struct kimage *image,
                                        unsigned int order)
{
        struct page *pages = NULL;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                pages = kimage_alloc_normal_control_pages(image, order);
                break;
        case KEXEC_TYPE_CRASH:
                pages = kimage_alloc_crash_control_pages(image, order);
                break;
        }

        return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
        if (*image->entry != 0)
                image->entry++;

        if (image->entry == image->last_entry) {
                kimage_entry_t *ind_page;
                struct page *page;

                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
                if (!page)
                        return -ENOMEM;

                ind_page = page_address(page);
                *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
        }
        *image->entry = entry;
        image->entry++;
        *image->entry = 0;

        return 0;
}

static int kimage_set_destination(struct kimage *image,
                                  unsigned long destination)
{
        int result;

        destination &= PAGE_MASK;
        result = kimage_add_entry(image, destination | IND_DESTINATION);
        if (result == 0)
                image->destination = destination;

        return result;
}

static int kimage_add_page(struct kimage *image, unsigned long page)
{
        int result;

        page &= PAGE_MASK;
        result = kimage_add_entry(image, page | IND_SOURCE);
        if (result == 0)
                image->destination += PAGE_SIZE;

        return result;
}

static void kimage_free_extra_pages(struct kimage *image)
{
        /* Walk through and free any extra destination pages I may have */
        kimage_free_page_list(&image->dest_pages);

        /* Walk through and free any unusable pages I have cached */
        kimage_free_page_list(&image->unuseable_pages);
}

static int kimage_terminate(struct kimage *image)
{
        if (*image->entry != 0)
                image->entry++;

        *image->entry = IND_DONE;

        return 0;
}

#define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION) ? \
                        phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

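/*
 * Illustrative traversal (not from the original source): for a finished
 * list the macro above visits entries in order, e.g.
 *
 *      DEST0|IND_DESTINATION, SRC0|IND_SOURCE, SRC1|IND_SOURCE,
 *      NEXT|IND_INDIRECTION, ... , IND_DONE (stops the loop)
 *
 * An IND_INDIRECTION entry is visited like any other; on the next
 * iteration ptr jumps to the descriptor page it points at.  kimage_free()
 * relies on seeing those entries to free the descriptor pages themselves.
 */
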
static void kimage_free_entry(kimage_entry_t entry)
{
        struct page *page;

        page = pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
        kimage_entry_t *ptr, entry;
        kimage_entry_t ind = 0;

        if (!image)
                return;

        kimage_free_extra_pages(image);
        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_INDIRECTION) {
                        /* Free the previous indirection page */
                        if (ind & IND_INDIRECTION)
                                kimage_free_entry(ind);
                        /* Save this indirection page until we are
                         * done with it.
                         */
                        ind = entry;
                }
                else if (entry & IND_SOURCE)
                        kimage_free_entry(entry);
        }
        /* Free the final indirection page */
        if (ind & IND_INDIRECTION)
                kimage_free_entry(ind);

        /* Handle any machine specific cleanup */
        machine_kexec_cleanup(image);

        /* Free the kexec control pages... */
        kimage_free_page_list(&image->control_pages);
        kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
                                       unsigned long page)
{
        kimage_entry_t *ptr, entry;
        unsigned long destination = 0;

        for_each_kimage_entry(image, ptr, entry) {
                if (entry & IND_DESTINATION)
                        destination = entry & PAGE_MASK;
                else if (entry & IND_SOURCE) {
                        if (page == destination)
                                return ptr;
                        destination += PAGE_SIZE;
                }
        }

        return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
                                      gfp_t gfp_mask,
                                      unsigned long destination)
{
        /*
         * Here we implement safeguards to ensure that a source page
         * is not copied to its destination page before the data on
         * the destination page is no longer useful.
         *
         * To do this we maintain the invariant that a source page is
         * either its own destination page, or it is not a
         * destination page at all.
         *
         * That is slightly stronger than required, but the proof
         * that no problems will occur is trivial, and the
         * implementation is simple to verify.
         *
         * When allocating all pages normally this algorithm will run
         * in O(N) time, but in the worst case it will run in O(N^2)
         * time.  If the runtime is a problem the data structures can
         * be fixed.
         */
        struct page *page;
        unsigned long addr;

        /*
         * Walk through the list of destination pages, and see if I
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
                addr = page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
                }
        }
        page = NULL;
        while (1) {
                kimage_entry_t *old;

                /* Allocate a page, if we run out of memory give up */
                page = kimage_alloc_pages(gfp_mask, 0);
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
                if (page_to_pfn(page) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
                addr = page_to_pfn(page) << PAGE_SHIFT;

                /* If it is the destination page we want, use it */
                if (addr == destination)
                        break;

                /* If the page is not a destination page use it */
                if (!kimage_is_destination_range(image, addr,
                                                 addr + PAGE_SIZE))
                        break;

                /*
                 * I know that the page is someone's destination page.
                 * See if there is already a source page for this
                 * destination page.  And if so swap the source pages.
                 */
                old = kimage_dst_used(image, addr);
                if (old) {
                        /* If so move it */
                        unsigned long old_addr;
                        struct page *old_page;

                        old_addr = *old & PAGE_MASK;
                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);

                        /* The old page I have found cannot be a
                         * destination page, so return it.
                         */
                        addr = old_addr;
                        page = old_page;
                        break;
                }
                else {
                        /* Place the page on the destination list; I
                         * will use it later.
                         */
                        list_add(&page->lru, &image->dest_pages);
                }
        }

        return page;
}

static int kimage_load_normal_segment(struct kimage *image,
                                      struct kexec_segment *segment)
{
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char __user *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;

        result = kimage_set_destination(image, maddr);
        if (result < 0)
                goto out;

        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                result = kimage_add_page(image, page_to_pfn(page)
                                                                << PAGE_SHIFT);
                if (result < 0)
                        goto out;

                ptr = kmap(page);
                /* Start with a clear page */
                memset(ptr, 0, PAGE_SIZE);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes)
                        uchunk = ubytes;

                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr  += mchunk;
                buf    += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

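/*
 * Worked example (illustrative, assumes 4 KiB pages): for a segment
 * with bufsz = 5000, memsz = 8192 and a page-aligned mem, the first
 * pass copies 4096 user bytes into the first destination page; the
 * second pass copies the remaining 904 bytes and, because each page is
 * cleared before the copy, the rest of that page stays zero-filled up
 * to memsz.
 */
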
static int kimage_load_crash_segment(struct kimage *image,
                                     struct kexec_segment *segment)
{
        /* For crash dump kernels we simply copy the data from
         * user space to its destination.
         * We do things a page at a time for the sake of kmap.
         */
        unsigned long maddr;
        unsigned long ubytes, mbytes;
        int result;
        unsigned char __user *buf;

        result = 0;
        buf = segment->buf;
        ubytes = segment->bufsz;
        mbytes = segment->memsz;
        maddr = segment->mem;
        while (mbytes) {
                struct page *page;
                char *ptr;
                size_t uchunk, mchunk;

                page = pfn_to_page(maddr >> PAGE_SHIFT);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                ptr = kmap(page);
                ptr += maddr & ~PAGE_MASK;
                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
                if (mchunk > mbytes)
                        mchunk = mbytes;

                uchunk = mchunk;
                if (uchunk > ubytes) {
                        uchunk = ubytes;
                        /* Zero the trailing part of the page */
                        memset(ptr + uchunk, 0, mchunk - uchunk);
                }
                result = copy_from_user(ptr, buf, uchunk);
                kunmap(page);
                if (result) {
                        result = (result < 0) ? result : -EIO;
                        goto out;
                }
                ubytes -= uchunk;
                maddr  += mchunk;
                buf    += mchunk;
                mbytes -= mchunk;
        }
out:
        return result;
}

static int kimage_load_segment(struct kimage *image,
                               struct kexec_segment *segment)
{
        int result = -ENOMEM;

        switch (image->type) {
        case KEXEC_TYPE_DEFAULT:
                result = kimage_load_normal_segment(image, segment);
                break;
        case KEXEC_TYPE_CRASH:
                result = kimage_load_crash_segment(image, segment);
                break;
        }

        return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMA and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination, and
 *   jumps into the image at entry.
 *
 * kexec does not sync, or unmount filesystems so if you need
 * that to happen you need to do that yourself.
 */

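/*
 * Usage sketch (illustrative, not part of the original source): the
 * usual caller is the user-space kexec-tools package ("kexec -l ..."
 * followed by "kexec -e"), but a raw invocation looks roughly like:
 *
 *      struct kexec_segment segs[2];     filled in by the loader
 *      syscall(__NR_kexec_load, entry, 2, segs, KEXEC_ARCH_DEFAULT);
 *
 * where entry, the segment contents, and the segment count are values
 * the loader computed from the kernel image being loaded.
 */
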
struct kimage *kexec_image = NULL;
static struct kimage *kexec_crash_image = NULL;
/*
 * A home-grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock = 0;

asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                               struct kexec_segment __user *segments,
                               unsigned long flags)
{
        struct kimage **dest_image, *image;
        int locked;
        int result;

        /* We only trust the superuser with rebooting the system. */
        if (!capable(CAP_SYS_BOOT))
                return -EPERM;

        /*
         * Verify we have a legal set of flags
         * This leaves us room for future extensions.
         */
        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
                return -EINVAL;

        /* Verify we are on the appropriate architecture */
        if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
            ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
                return -EINVAL;

        /* Put an artificial cap on the number
         * of segments passed to kexec_load.
         */
        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        image = NULL;
        result = 0;

        /* Because we write directly to the reserved memory
         * region when loading crash kernels we need a mutex here to
         * prevent multiple crash kernels from attempting to load
         * simultaneously, and to prevent a crash kernel from loading
         * over the top of an in-use crash kernel.
         *
         * KISS: always take the mutex.
         */
        locked = xchg(&kexec_lock, 1);
        if (locked)
                return -EBUSY;

        dest_image = &kexec_image;
        if (flags & KEXEC_ON_CRASH)
                dest_image = &kexec_crash_image;
        if (nr_segments > 0) {
                unsigned long i;

                /* Loading another kernel to reboot into */
                if ((flags & KEXEC_ON_CRASH) == 0)
                        result = kimage_normal_alloc(&image, entry,
                                                     nr_segments, segments);
                /* Loading another kernel to switch to if this one crashes */
                else if (flags & KEXEC_ON_CRASH) {
                        /* Free any current crash dump kernel before
                         * we corrupt it.
                         */
                        kimage_free(xchg(&kexec_crash_image, NULL));
                        result = kimage_crash_alloc(&image, entry,
                                                    nr_segments, segments);
                }
                if (result)
                        goto out;

                result = machine_kexec_prepare(image);
                if (result)
                        goto out;

                for (i = 0; i < nr_segments; i++) {
                        result = kimage_load_segment(image, &image->segment[i]);
                        if (result)
                                goto out;
                }
                result = kimage_terminate(image);
                if (result)
                        goto out;
        }
        /* Install the new kernel, and uninstall the old */
        image = xchg(dest_image, image);

out:
        xchg(&kexec_lock, 0); /* Release the mutex */
        kimage_free(image);

        return result;
}

#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
                                      unsigned long nr_segments,
                                      struct compat_kexec_segment __user *segments,
                                      unsigned long flags)
{
        struct compat_kexec_segment in;
        struct kexec_segment out, __user *ksegments;
        unsigned long i, result;

        /* Don't allow clients that don't understand the native
         * architecture to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
                return -EINVAL;

        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
        for (i = 0; i < nr_segments; i++) {
                result = copy_from_user(&in, &segments[i], sizeof(in));
                if (result)
                        return -EFAULT;

                out.buf   = compat_ptr(in.buf);
                out.bufsz = in.bufsz;
                out.mem   = in.mem;
                out.memsz = in.memsz;

                result = copy_to_user(&ksegments[i], &out, sizeof(out));
                if (result)
                        return -EFAULT;
        }

        return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

void crash_kexec(struct pt_regs *regs)
{
        struct kimage *image;
        int locked;

        /* Take the kexec_lock here to prevent sys_kexec_load
         * running on one cpu from replacing the crash kernel
         * we are using after a panic on a different cpu.
         *
         * If the crash kernel was not located in a fixed area
         * of memory the xchg(&kexec_crash_image) would be
         * sufficient.  But since I reuse the memory...
         */
        locked = xchg(&kexec_lock, 1);
        if (!locked) {
                image = xchg(&kexec_crash_image, NULL);
                if (image) {
                        machine_crash_shutdown(regs);
                        machine_kexec(image);
                }
                xchg(&kexec_lock, 0);
        }
}