/* lguest.c */
  1. /* Simple program to layout "physical" memory for new lguest guest.
  2. * Linked high to avoid likely physical memory. */
  3. #define _LARGEFILE64_SOURCE
  4. #define _GNU_SOURCE
  5. #include <stdio.h>
  6. #include <string.h>
  7. #include <unistd.h>
  8. #include <err.h>
  9. #include <stdint.h>
  10. #include <stdlib.h>
  11. #include <elf.h>
  12. #include <sys/mman.h>
  13. #include <sys/types.h>
  14. #include <sys/stat.h>
  15. #include <sys/wait.h>
  16. #include <fcntl.h>
  17. #include <stdbool.h>
  18. #include <errno.h>
  19. #include <ctype.h>
  20. #include <sys/socket.h>
  21. #include <sys/ioctl.h>
  22. #include <sys/time.h>
  23. #include <time.h>
  24. #include <netinet/in.h>
  25. #include <net/if.h>
  26. #include <linux/sockios.h>
  27. #include <linux/if_tun.h>
  28. #include <sys/uio.h>
  29. #include <termios.h>
  30. #include <getopt.h>
  31. #include <zlib.h>
  32. typedef unsigned long long u64;
  33. typedef uint32_t u32;
  34. typedef uint16_t u16;
  35. typedef uint8_t u8;
  36. #include "../../include/linux/lguest_launcher.h"
  37. #include "../../include/asm-i386/e820.h"
/* Guest page-table entry flags (0x7 = bits 0-2 set).
 * NOTE(review): on i386 bit 2 is the User bit, not Execute -- the flag
 * value works, but confirm the comment's "Execute" wording. */
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
/* The tap device is always peer number 1 in its network page. */
#define NET_PEERNUM 1
/* --tunnet argument prefix selecting "attach to bridge" mode. */
#define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
/* Set by --verbose; read only by the function-like macro below (the
 * variable and macro can share a name because the macro takes args). */
static bool verbose;
#define verbose(args...) \
do { if (verbose) printf(args); } while(0)
/* Write end of the pipe to the waker child; set up in main(). */
static int waker_fd;
/* Bookkeeping for all devices, plus the fd set we select() on. */
struct device_list
{
	/* Every fd we watch for input. */
	fd_set infds;
	/* Highest fd in infds, for select()'s nfds argument. */
	int max_infd;

	/* Head of the singly-linked device list... */
	struct device *dev;
	/* ...and the tail pointer where the next device is appended. */
	struct device **lastdev;
};
/* One virtual device and its host-side plumbing. */
struct device
{
	/* Next device in the list. */
	struct device *next;
	/* Descriptor the Guest sees (relocated into guest memory by
	 * map_device_descriptors()). */
	struct lguest_device_desc *desc;
	/* Start of this device's shared pages (guest physical == our
	 * virtual). */
	void *mem;

	/* Watch this fd if handle_input non-NULL. */
	int fd;
	bool (*handle_input)(int fd, struct device *me);

	/* Watch DMA to this key if handle_output non-NULL. */
	unsigned long watch_key;
	u32 (*handle_output)(int fd, const struct iovec *iov,
			     unsigned int num, struct device *me);

	/* Device-specific data. */
	void *priv;
};
  70. static int open_or_die(const char *name, int flags)
  71. {
  72. int fd = open(name, flags);
  73. if (fd < 0)
  74. err(1, "Failed to open %s", name);
  75. return fd;
  76. }
/* Map num zeroed pages at exactly addr (MAP_FIXED) by privately mapping
 * /dev/zero.  Exits on failure.  The /dev/zero fd is opened once and
 * cached across calls. */
static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
	static int fd = -1;

	if (fd == -1)
		fd = open_or_die("/dev/zero", O_RDONLY);

	/* O_RDONLY is fine: with MAP_PRIVATE our writes never reach the
	 * backing file. */
	if (mmap((void *)addr, getpagesize() * num,
		 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
	    != (void *)addr)
		err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
	return (void *)addr;
}
  88. /* Find magic string marking entry point, return entry point. */
  89. static unsigned long entry_point(void *start, void *end,
  90. unsigned long page_offset)
  91. {
  92. void *p;
  93. for (p = start; p < end; p++)
  94. if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
  95. return (long)p + strlen("GenuineLguest") + page_offset;
  96. err(1, "Is this image a genuine lguest?");
  97. }
/* Returns the entry point.
 * Maps the PT_LOAD segments of an ELF vmlinux directly into guest-physical
 * memory (our virtual == guest physical) and sets *page_offset to the
 * kernel's constant virtual-to-physical delta. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
			     unsigned long *page_offset)
{
	void *addr;
	Elf32_Phdr phdr[ehdr->e_phnum];
	unsigned int i;
	unsigned long start = -1UL, end = 0;

	/* Sanity checks: a 32-bit i386 executable with plausible program
	 * headers. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	*page_offset = 0;
	/* We map the loadable segments at virtual addresses corresponding
	 * to their physical addresses (our virtual == guest physical). */
	for (i = 0; i < ehdr->e_phnum; i++) {
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* All segments must share one vaddr-to-paddr delta. */
		if (!*page_offset)
			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
		else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
			errx(1, "Page offset of section %i different", i);

		/* Track the file-backed span, for entry_point()'s scan. */
		if (phdr[i].p_paddr < start)
			start = phdr[i].p_paddr;
		if (phdr[i].p_paddr + phdr[i].p_filesz > end)
			end = phdr[i].p_paddr + phdr[i].p_filesz;

		/* We map everything private, writable. */
		addr = mmap((void *)phdr[i].p_paddr,
			    phdr[i].p_filesz,
			    PROT_READ|PROT_WRITE|PROT_EXEC,
			    MAP_FIXED|MAP_PRIVATE,
			    elf_fd, phdr[i].p_offset);
		if (addr != (void *)phdr[i].p_paddr)
			err(1, "Mmaping vmlinux seg %i gave %p not %p",
			    i, addr, (void *)phdr[i].p_paddr);
	}

	return entry_point((void *)start, (void *)end, *page_offset);
}
  145. /* This is amazingly reliable. */
  146. static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
  147. {
  148. unsigned int i, possibilities[256] = { 0 };
  149. for (i = 0; i + 4 < len; i++) {
  150. /* mov 0xXXXXXXXX,%eax */
  151. if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
  152. return (unsigned long)img[i+4] << 24;
  153. }
  154. errx(1, "could not determine page offset");
  155. }
/* fd is positioned at a gzip stream: inflate the whole kernel to the
 * fixed i386 load address (1M), then work out its page offset and entry
 * point. */
static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
{
	gzFile f;
	int ret, len = 0;
	/* i386 kernels are linked to run at 1M. */
	void *img = (void *)0x100000;

	f = gzdopen(fd, "rb");
	while ((ret = gzread(f, img + len, 65536)) > 0)
		len += ret;
	/* NOTE(review): err() reports errno, but zlib failures don't
	 * reliably set errno -- gzerror() would give the real cause. */
	if (ret < 0)
		err(1, "reading image from bzImage");

	verbose("Unpacked size %i addr %p\n", len, img);
	*page_offset = intuit_page_offset(img, len);

	return entry_point(img, img + len, *page_offset);
}
/* A bzImage is a real-mode stub with a gzip-compressed kernel inside; we
 * don't parse its headers, we just scan byte-by-byte for the 10-byte gzip
 * header, rewind to its start and inflate from there. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
	unsigned char c;
	int state = 0;

	/* Ugly brute force search for gzip header. */
	while (read(fd, &c, 1) == 1) {
		switch (state) {
		case 0:
			/* First magic byte 0x1F. */
			if (c == 0x1F)
				state++;
			break;
		case 1:
			/* Second magic byte 0x8B must follow directly. */
			if (c == 0x8B)
				state++;
			else
				state = 0;
			break;
		case 2 ... 8:
			/* Skip method, flags, mtime, xfl (7 bytes). */
			state++;
			break;
		case 9:
			/* Byte 10 is the OS field; rewind to the header
			 * start either way. */
			lseek(fd, -10, SEEK_CUR);
			if (c != 0x03) /* Compressed under UNIX. */
				state = -1;
			else
				return unpack_bzimage(fd, page_offset);
			/* NOTE(review): state = -1 matches no case and is
			 * never reset, so no later gzip header can match;
			 * confirm this give-up-after-first-candidate
			 * behaviour is intended. */
		}
	}
	errx(1, "Could not find kernel in bzImage");
}
  200. static unsigned long load_kernel(int fd, unsigned long *page_offset)
  201. {
  202. Elf32_Ehdr hdr;
  203. if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
  204. err(1, "Reading kernel");
  205. if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
  206. return map_elf(fd, &hdr, page_offset);
  207. return load_bzimage(fd, page_offset);
  208. }
  209. static inline unsigned long page_align(unsigned long addr)
  210. {
  211. return ((addr + getpagesize()-1) & ~(getpagesize()-1));
  212. }
/* initrd gets loaded at top of memory: return length (page-aligned). */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;
	void *iaddr;

	ifd = open_or_die(name, O_RDONLY);
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/* The mapping sits immediately below mem, page-aligned. */
	len = page_align(st.st_size);
	iaddr = mmap((void *)mem - len, st.st_size,
		     PROT_READ|PROT_EXEC|PROT_WRITE,
		     MAP_FIXED|MAP_PRIVATE, ifd, 0);
	if (iaddr != (void *)mem - len)
		err(1, "Mmaping initrd '%s' returned %p not %p",
		    name, iaddr, (void *)mem - len);
	/* The fd can be closed: the mapping persists. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
	return len;
}
/* Build the Guest's initial page tables: a top-level pgdir plus enough
 * linearly-mapping PTE pages to make guest memory appear at page_offset.
 * They are placed at the top of memory, just below the initrd.  Returns
 * the guest-physical address of the pgdir. */
static unsigned long setup_pagetables(unsigned long mem,
				      unsigned long initrd_size,
				      unsigned long page_offset)
{
	u32 *pgdir, *linear;
	unsigned int mapped_pages, i, linear_pages;
	unsigned int ptes_per_page = getpagesize()/sizeof(u32);

	/* If we can map all of memory above page_offset, we do so. */
	if (mem <= -page_offset)
		mapped_pages = mem/getpagesize();
	else
		mapped_pages = -page_offset/getpagesize();

	/* Each linear PTE page can map ptes_per_page pages. */
	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;

	/* We lay out top-level then linear mapping immediately below initrd */
	pgdir = (void *)mem - initrd_size - getpagesize();
	linear = (void *)pgdir - linear_pages*getpagesize();

	/* Identity PTEs: entry i maps guest-physical page i. */
	for (i = 0; i < mapped_pages; i++)
		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);

	/* Now set up pgd so that this memory is at page_offset */
	for (i = 0; i < mapped_pages; i += ptes_per_page) {
		pgdir[(i + page_offset/getpagesize())/ptes_per_page]
			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
	}

	verbose("Linear mapping of %u pages in %u pte pages at %p\n",
		mapped_pages, linear_pages, linear);

	return (unsigned long)pgdir;
}
  262. static void concat(char *dst, char *args[])
  263. {
  264. unsigned int i, len = 0;
  265. for (i = 0; args[i]; i++) {
  266. strcpy(dst+len, args[i]);
  267. strcat(dst+len, " ");
  268. len += strlen(args[i]) + 1;
  269. }
  270. /* In case it's empty. */
  271. dst[len] = '\0';
  272. }
/* Hand the Guest's layout (top pfn, pgdir, entry point, page offset) to
 * the lguest driver via LHREQ_INITIALIZE.  Returns the /dev/lguest fd,
 * which controls this Guest from now on. */
static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
{
	u32 args[] = { LHREQ_INITIALIZE,
		       LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
		       pgdir, start, page_offset };
	int fd;

	fd = open_or_die("/dev/lguest", O_RDWR);
	if (write(fd, args, sizeof(args)) < 0)
		err(1, "Writing to /dev/lguest");
	return fd;
}
  284. static void set_fd(int fd, struct device_list *devices)
  285. {
  286. FD_SET(fd, &devices->infds);
  287. if (fd > devices->max_infd)
  288. devices->max_infd = fd;
  289. }
/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN.
 * This runs in the forked "waker" child: it watches every device fd and,
 * on activity, writes LHREQ_BREAK so the parent's Guest-running read()
 * returns.  The parent writes an fd down pipefd to stop us watching it;
 * a zero-length read (parent closed the pipe) means exit. */
static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
{
	set_fd(pipefd, devices);
	for (;;) {
		fd_set rfds = devices->infds;
		u32 args[] = { LHREQ_BREAK, 1 };

		select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
		if (FD_ISSET(pipefd, &rfds)) {
			int ignorefd;
			/* EOF: parent is gone, so are we. */
			if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
				exit(0);
			/* Otherwise it's an fd to stop watching. */
			FD_CLR(ignorefd, &devices->infds);
		} else
			/* Device activity: break the Guest out. */
			write(lguest_fd, args, sizeof(args));
	}
}
  307. static int setup_waker(int lguest_fd, struct device_list *device_list)
  308. {
  309. int pipefd[2], child;
  310. pipe(pipefd);
  311. child = fork();
  312. if (child == -1)
  313. err(1, "forking");
  314. if (child == 0) {
  315. close(pipefd[1]);
  316. wake_parent(pipefd[0], lguest_fd, device_list);
  317. }
  318. close(pipefd[0]);
  319. return pipefd[1];
  320. }
  321. static void *_check_pointer(unsigned long addr, unsigned int size,
  322. unsigned int line)
  323. {
  324. if (addr >= LGUEST_GUEST_TOP || addr + size >= LGUEST_GUEST_TOP)
  325. errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
  326. return (void *)addr;
  327. }
  328. #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
/* Returns pointer to dma->used_len.
 * Converts the Guest's lguest_dma structure at guest address dma into a
 * bounds-checked host iovec array, setting *num to the number of
 * populated entries. */
static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
{
	unsigned int i;
	struct lguest_dma *udma;

	/* Check the struct itself, then every section it describes. */
	udma = check_pointer(dma, sizeof(*udma));
	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
		/* A zero length terminates the section list. */
		if (!udma->len[i])
			break;

		iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
		iov[i].iov_len = udma->len[i];
	}
	*num = i;
	return &udma->used_len;
}
/* Ask the lguest driver (LHREQ_GETDMA) for a Guest DMA buffer registered
 * under key.  Returns NULL if the Guest has none pending; otherwise fills
 * iov/num and stores the Guest's bound irq number in *irq.  The returned
 * pointer is the dma's used_len field, to report the transfer size. */
static u32 *get_dma_buffer(int fd, void *key,
			   struct iovec iov[], unsigned int *num, u32 *irq)
{
	u32 buf[] = { LHREQ_GETDMA, (u32)key };
	unsigned long udma;
	u32 *res;

	/* The chardev abuses write()'s return value to carry the guest
	 * address of the lguest_dma struct; -1 means none available. */
	udma = write(fd, buf, sizeof(buf));
	if (udma == (unsigned long)-1)
		return NULL;

	/* Kernel stashes irq in ->used_len. */
	res = dma2iov(udma, iov, num);
	*irq = *res;
	return res;
}
/* Queue interrupt irq for the Guest via LHREQ_IRQ.
 * NOTE(review): success is taken to be a zero return from write() -- this
 * matches the lguest chardev's convention rather than ordinary write()
 * semantics; confirm against the driver. */
static void trigger_irq(int fd, u32 irq)
{
	u32 buf[] = { LHREQ_IRQ, irq };
	if (write(fd, buf, sizeof(buf)) != 0)
		err(1, "Triggering irq %i", irq);
}
  364. static void discard_iovec(struct iovec *iov, unsigned int *num)
  365. {
  366. static char discard_buf[1024];
  367. *num = 1;
  368. iov->iov_base = discard_buf;
  369. iov->iov_len = sizeof(discard_buf);
  370. }
/* The console's original terminal settings, saved by setup_console(). */
static struct termios orig_term;
/* atexit() hook: put the terminal back the way we found it. */
static void restore_term(void)
{
	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
/* State for the "three ^C within a second aborts the Guest" escape hatch. */
struct console_abort
{
	/* Consecutive ^C characters seen so far. */
	int count;
	/* When the first ^C of the current run arrived. */
	struct timeval start;
};
/* We DMA input to buffer bound at start of console page.
 * Reads stdin into the Guest's registered buffer and interrupts it;
 * returns false when the console is dead and should stop being watched. */
static bool handle_console_input(int fd, struct device *dev)
{
	u32 irq = 0, *lenp;
	int len;
	unsigned int num;
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
	struct console_abort *abort = dev->priv;

	lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
	if (!lenp) {
		/* No Guest buffer: drain stdin into scratch anyway, or the
		 * fd stays readable and we would spin. */
		warn("console: no dma buffer!");
		discard_iovec(iov, &num);
	}

	len = readv(dev->fd, iov, num);
	if (len <= 0) {
		/* EOF or error on stdin. */
		warnx("Failed to get console input, ignoring console.");
		len = 0;
	}

	/* Report the byte count and interrupt the Guest. */
	if (lenp) {
		*lenp = len;
		trigger_irq(fd, irq);
	}

	/* Three ^C within one second?  Exit. */
	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
		if (!abort->count++)
			gettimeofday(&abort->start, NULL);
		else if (abort->count == 3) {
			struct timeval now;
			gettimeofday(&now, NULL);
			if (now.tv_sec <= abort->start.tv_sec+1) {
				/* Make sure waker is not blocked in BREAK */
				u32 args[] = { LHREQ_BREAK, 0 };
				close(waker_fd);
				write(fd, args, sizeof(args));
				exit(2);
			}
			/* Too slow: start counting again. */
			abort->count = 0;
		}
	} else
		/* Any other input resets the ^C run. */
		abort->count = 0;

	/* len == 0 means console is done: restore the terminal and tell
	 * the caller to stop watching this fd. */
	if (!len) {
		restore_term();
		return false;
	}
	return true;
}
/* Console output DMA: write the Guest's buffers straight to stdout.
 * Returns the byte count (stored into used_len by our caller); fd and
 * dev are unused. */
static u32 handle_console_output(int fd, const struct iovec *iov,
				 unsigned num, struct device*dev)
{
	return writev(STDOUT_FILENO, iov, num);
}
/* Network output DMA: hand the packet to the tap device. */
static u32 handle_tun_output(int fd, const struct iovec *iov,
			     unsigned num, struct device *dev)
{
	/* Now we've seen output, we should warn if we can't get buffers. */
	*(bool *)dev->priv = true;
	return writev(dev->fd, iov, num);
}
  439. static unsigned long peer_offset(unsigned int peernum)
  440. {
  441. return 4 * peernum;
  442. }
/* A packet arrived on the tap fd: DMA it into the Guest's receive buffer
 * (registered under peer 1's slot) and interrupt the Guest. */
static bool handle_tun_input(int fd, struct device *dev)
{
	u32 irq = 0, *lenp;
	int len;
	unsigned num;
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];

	lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
			      &irq);
	if (!lenp) {
		/* Only warn once the Guest has shown it uses the device
		 * (the priv flag is set on first output). */
		if (*(bool *)dev->priv)
			warn("network: no dma buffer!");
		/* Read the packet anyway, into scratch, to drain the fd. */
		discard_iovec(iov, &num);
	}

	len = readv(dev->fd, iov, num);
	if (len <= 0)
		err(1, "reading network");
	if (lenp) {
		*lenp = len;
		trigger_irq(fd, irq);
	}
	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
		((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
		lenp ? "sent" : "discarded");
	return true;
}
/* The Guest touched the block device's control page: perform the READ or
 * WRITE it describes, set the result code, and interrupt the Guest. */
static u32 handle_block_output(int fd, const struct iovec *iov,
			       unsigned num, struct device *dev)
{
	struct lguest_block_page *p = dev->mem;
	u32 irq, *lenp;
	unsigned int len, reply_num;
	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
	off64_t device_len, off = (off64_t)p->sector * 512;

	/* Backing-file length, cached by setup_block_file(). */
	device_len = *(off64_t *)dev->priv;
	if (off >= device_len)
		err(1, "Bad offset %llu vs %llu", off, device_len);
	if (lseek64(dev->fd, off, SEEK_SET) != off)
		err(1, "Bad seek to sector %i", p->sector);

	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);

	/* The Guest must have registered a reply buffer on this page. */
	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
	if (!lenp)
		err(1, "Block request didn't give us a dma buffer");

	if (p->type) {
		/* WRITE: data arrives in iov; the reply has no payload. */
		len = writev(dev->fd, iov, num);
		if (off + len > device_len) {
			/* Trim any overshoot off the file before dying. */
			ftruncate(dev->fd, device_len);
			errx(1, "Write past end %llu+%u", off, len);
		}
		*lenp = 0;
	} else {
		/* READ: fill the Guest's reply buffer. */
		len = readv(dev->fd, reply, reply_num);
		*lenp = len;
	}

	/* Result code: 1 = OK, 2 = short transfer. */
	p->result = 1 + (p->bytes != len);
	trigger_irq(fd, irq);
	return 0;
}
/* The Guest sent DMA out under key: route it to the device watching that
 * key; the handler's return value becomes the dma's used_len. */
static void handle_output(int fd, unsigned long dma, unsigned long key,
			  struct device_list *devices)
{
	struct device *i;
	u32 *lenp;
	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
	unsigned num = 0;

	lenp = dma2iov(dma, iov, &num);
	for (i = devices->dev; i; i = i->next) {
		if (i->handle_output && key == i->watch_key) {
			*lenp = i->handle_output(fd, iov, num, i);
			return;
		}
	}
	/* No device claims this key: log and drop (used_len untouched). */
	warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
}
/* Service all pending device input: poll (zero timeout) the watched fds
 * and call each ready device's input handler until nothing is ready. */
static void handle_input(int fd, struct device_list *devices)
{
	/* Zero timeout: only service what is already pending. */
	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };

	for (;;) {
		struct device *i;
		fd_set fds = devices->infds;

		/* Nothing ready?  Done. */
		if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
			break;

		for (i = devices->dev; i; i = i->next) {
			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
				/* A false return means "stop watching". */
				if (!i->handle_input(fd, i)) {
					FD_CLR(i->fd, &devices->infds);
					/* Tell waker to ignore it too... */
					write(waker_fd, &i->fd, sizeof(i->fd));
				}
			}
		}
	}
}
  535. static struct lguest_device_desc *new_dev_desc(u16 type, u16 features,
  536. u16 num_pages)
  537. {
  538. static unsigned long top = LGUEST_GUEST_TOP;
  539. struct lguest_device_desc *desc;
  540. desc = malloc(sizeof(*desc));
  541. desc->type = type;
  542. desc->num_pages = num_pages;
  543. desc->features = features;
  544. desc->status = 0;
  545. if (num_pages) {
  546. top -= num_pages*getpagesize();
  547. map_zeroed_pages(top, num_pages);
  548. desc->pfn = top / getpagesize();
  549. } else
  550. desc->pfn = 0;
  551. return desc;
  552. }
  553. static struct device *new_device(struct device_list *devices,
  554. u16 type, u16 num_pages, u16 features,
  555. int fd,
  556. bool (*handle_input)(int, struct device *),
  557. unsigned long watch_off,
  558. u32 (*handle_output)(int,
  559. const struct iovec *,
  560. unsigned,
  561. struct device *))
  562. {
  563. struct device *dev = malloc(sizeof(*dev));
  564. /* Append to device list. */
  565. *devices->lastdev = dev;
  566. dev->next = NULL;
  567. devices->lastdev = &dev->next;
  568. dev->fd = fd;
  569. if (handle_input)
  570. set_fd(dev->fd, devices);
  571. dev->desc = new_dev_desc(type, features, num_pages);
  572. dev->mem = (void *)(dev->desc->pfn * getpagesize());
  573. dev->handle_input = handle_input;
  574. dev->watch_key = (unsigned long)dev->mem + watch_off;
  575. dev->handle_output = handle_output;
  576. return dev;
  577. }
/* Create the console device: stdin (raw, no echo) for input, stdout for
 * output.  The original terminal state is restored at exit. */
static void setup_console(struct device_list *devices)
{
	struct device *dev;

	/* Only fiddle with the terminal if stdin actually is one. */
	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
		struct termios term = orig_term;
		/* No signals, no line buffering, no echo. */
		term.c_lflag &= ~(ISIG|ICANON|ECHO);
		tcsetattr(STDIN_FILENO, TCSANOW, &term);
		/* Restore on any exit path. */
		atexit(restore_term);
	}

	/* We don't currently require a page for the console. */
	dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
			 STDIN_FILENO, handle_console_input,
			 LGUEST_CONSOLE_DMA_KEY, handle_console_output);
	/* Per-device state for the triple-^C abort check.
	 * NOTE(review): malloc result unchecked here. */
	dev->priv = malloc(sizeof(struct console_abort));
	((struct console_abort *)dev->priv)->count = 0;

	verbose("device %p: console\n",
		(void *)(dev->desc->pfn * getpagesize()));
}
/* Create a block device backed by filename. */
static void setup_block_file(const char *filename, struct device_list *devices)
{
	int fd;
	struct device *dev;
	off64_t *device_len;
	struct lguest_block_page *p;

	/* O_DIRECT: the Guest does its own caching. */
	fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
	dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
			 LGUEST_DEVICE_F_RANDOMNESS,
			 fd, NULL, 0, handle_block_output);
	/* Cache the file length for handle_block_output()'s bounds checks.
	 * NOTE(review): malloc result unchecked here. */
	device_len = dev->priv = malloc(sizeof(*device_len));
	*device_len = lseek64(fd, 0, SEEK_END);

	/* Tell the Guest the size in 512-byte sectors. */
	p = dev->mem;
	p->num_sectors = *device_len/512;
	verbose("device %p: block %i sectors\n",
		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
}
/* We use fnctl locks to reserve network slots (autocleanup!)
 * Each slot is one byte of the shared file; the first byte we can
 * write-lock is our slot number.  Locks die with the process, so slots
 * free themselves when a launcher exits. */
static unsigned int find_slot(int netfd, const char *filename)
{
	struct flock fl;

	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_len = 1;
	for (fl.l_start = 0;
	     fl.l_start < getpagesize()/sizeof(struct lguest_net);
	     fl.l_start++) {
		if (fcntl(netfd, F_SETLK, &fl) == 0)
			return fl.l_start;
	}
	errx(1, "No free slots in network file %s", filename);
}
/* Create a "shared net" device backed by a one-page file mmapped by every
 * guest on the same network; the file is created on first use. */
static void setup_net_file(const char *filename,
			   struct device_list *devices)
{
	int netfd;
	struct device *dev;

	netfd = open(filename, O_RDWR, 0);
	if (netfd < 0) {
		if (errno == ENOENT) {
			/* Doesn't exist yet: create it, one zeroed page. */
			netfd = open(filename, O_RDWR|O_CREAT, 0600);
			if (netfd >= 0) {
				char page[getpagesize()];
				memset(page, 0, sizeof(page));
				write(netfd, page, sizeof(page));
			}
		}
		if (netfd < 0)
			err(1, "cannot open net file '%s'", filename);
	}

	/* Our slot number is carried in the features word. */
	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
			 find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
			 -1, NULL, 0, NULL);

	/* We overwrite the /dev/zero mapping with the actual file. */
	if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
		 MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
		err(1, "could not mmap '%s'", filename);

	verbose("device %p: shared net %s, peer %i\n",
		(void *)(dev->desc->pfn * getpagesize()), filename,
		dev->desc->features & ~LGUEST_NET_F_NOCSUM);
}
  657. static u32 str2ip(const char *ipaddr)
  658. {
  659. unsigned int byte[4];
  660. sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
  661. return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
  662. }
  663. /* adapted from libbridge */
  664. static void add_to_bridge(int fd, const char *if_name, const char *br_name)
  665. {
  666. int ifidx;
  667. struct ifreq ifr;
  668. if (!*br_name)
  669. errx(1, "must specify bridge name");
  670. ifidx = if_nametoindex(if_name);
  671. if (!ifidx)
  672. errx(1, "interface %s does not exist!", if_name);
  673. strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
  674. ifr.ifr_ifindex = ifidx;
  675. if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
  676. err(1, "can't add %s to bridge %s", if_name, br_name);
  677. }
/* Give network interface devname the address ipaddr (host order), bring
 * it up, and return its MAC address through hwaddr.
 * NOTE(review): devname goes through strcpy; callers pass kernel-supplied
 * names (< IFNAMSIZ) -- confirm no longer name can reach here. */
static void configure_device(int fd, const char *devname, u32 ipaddr,
			     unsigned char hwaddr[6])
{
	struct ifreq ifr;
	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, devname);
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = htonl(ipaddr);
	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
		err(1, "Setting %s interface address", devname);
	ifr.ifr_flags = IFF_UP;
	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
		err(1, "Bringing interface %s up", devname);
	/* The Guest needs the host-side MAC for its shared-net slot. */
	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
		err(1, "getting hw address for %s", devname);
	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
/* Create a TAP-backed network device.  arg is either an IP address for
 * the host end of the link, or "bridge:<name>" to enslave the tap to an
 * existing bridge. */
static void setup_tun_net(const char *arg, struct device_list *devices)
{
	struct device *dev;
	struct ifreq ifr;
	int netfd, ipfd;
	u32 ip;
	const char *br_name = NULL;

	netfd = open_or_die("/dev/net/tun", O_RDWR);
	memset(&ifr, 0, sizeof(ifr));
	/* TAP (ethernet frames), without the extra packet-info header. */
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
	strcpy(ifr.ifr_name, "tap%d");
	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
		err(1, "configuring /dev/net/tun");
	/* Matches the Guest's LGUEST_NET_F_NOCSUM. */
	ioctl(netfd, TUNSETNOCSUM, 1);

	/* You will be peer 1: we should create enough jitter to randomize */
	dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
			 NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
			 handle_tun_input, peer_offset(0), handle_tun_output);
	/* priv flag: "has the Guest produced output yet?" -- gates the
	 * no-dma-buffer warning.  NOTE(review): malloc unchecked here. */
	dev->priv = malloc(sizeof(bool));
	*(bool *)dev->priv = false;

	/* A throwaway socket for the interface ioctls. */
	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (ipfd < 0)
		err(1, "opening IP socket");

	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
		/* Bridge mode: no host IP, attach the tap to the bridge. */
		ip = INADDR_ANY;
		br_name = arg + strlen(BRIDGE_PFX);
		add_to_bridge(ipfd, ifr.ifr_name, br_name);
	} else
		ip = str2ip(arg);

	/* We are peer 0, ie. first slot. */
	configure_device(ipfd, ifr.ifr_name, ip, dev->mem);

	/* Set "promisc" bit: we want every single packet. */
	*((u8 *)dev->mem) |= 0x1;

	close(ipfd);
	verbose("device %p: tun net %u.%u.%u.%u\n",
		(void *)(dev->desc->pfn * getpagesize()),
		(u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
	if (br_name)
		verbose("attached to bridge: %s\n", br_name);
}
/* Now we know how much memory we have, we copy in device descriptors:
 * each malloc'd descriptor is copied into the page mapped just above
 * guest memory, and the device repointed at the in-guest copy. */
static void map_device_descriptors(struct device_list *devs, unsigned long mem)
{
	struct device *i;
	unsigned int num;
	struct lguest_device_desc *descs;

	/* Device descriptor array sits just above top of normal memory */
	descs = map_zeroed_pages(mem, 1);

	for (i = devs->dev, num = 0; i; i = i->next, num++) {
		if (num == LGUEST_MAX_DEVICES)
			errx(1, "too many devices");
		verbose("Device %i: %s\n", num,
			i->desc->type == LGUEST_DEVICE_T_NET ? "net"
			: i->desc->type == LGUEST_DEVICE_T_CONSOLE ? "console"
			: i->desc->type == LGUEST_DEVICE_T_BLOCK ? "block"
			: "unknown");
		/* Copy in, free the original, repoint at the guest copy. */
		descs[num] = *i->desc;
		free(i->desc);
		i->desc = &descs[num];
	}
}
/* The main loop: reading /dev/lguest runs the Guest until something
 * needs our attention. */
static void __attribute__((noreturn))
run_guest(int lguest_fd, struct device_list *device_list)
{
	for (;;) {
		u32 args[] = { LHREQ_BREAK, 0 };
		unsigned long arr[2];
		int readval;

		/* We read from the /dev/lguest device to run the Guest. */
		readval = read(lguest_fd, arr, sizeof(arr));

		if (readval == sizeof(arr)) {
			/* A full pair means Guest output DMA:
			 * arr[0] = dma address, arr[1] = key. */
			handle_output(lguest_fd, arr[0], arr[1], device_list);
			continue;
		} else if (errno == ENOENT) {
			/* Guest shut down: the next read yields the
			 * human-readable reason. */
			char reason[1024] = { 0 };
			read(lguest_fd, reason, sizeof(reason)-1);
			errx(1, "%s", reason);
		} else if (errno != EAGAIN)
			err(1, "Running guest failed");

		/* EAGAIN: the waker kicked us out.  Service device input,
		 * then clear the break so the Guest can run again. */
		handle_input(lguest_fd, device_list);
		if (write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "Resetting break");
	}
}
/* Long options; only --verbose also has a short form (-v). */
static struct option opts[] = {
	{ "verbose", 0, NULL, 'v' },
	{ "sharenet", 1, NULL, 's' },
	{ "tunnet", 1, NULL, 't' },
	{ "block", 1, NULL, 'b' },
	{ "initrd", 1, NULL, 'i' },
	{ NULL },
};

/* Print the usage summary and exit with status 1. */
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
	     "|--block=<filename>|--initrd=<filename>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}
/* Lay out guest memory, create devices, load the kernel (and initrd),
 * hand everything to /dev/lguest and run the Guest forever. */
int main(int argc, char *argv[])
{
	/* Memory size, top pagetable, entry point, kernel virtual offset,
	 * and initrd size. */
	unsigned long mem, pgdir, start, page_offset, initrd_size = 0;
	int c, lguest_fd;
	struct device_list device_list;
	/* Guest-physical page 0 doubles as the Linux boot header. */
	void *boot = (void *)0;
	const char *initrd_name = NULL;

	/* Start with an empty device list. */
	device_list.max_infd = -1;
	device_list.dev = NULL;
	device_list.lastdev = &device_list.dev;
	FD_ZERO(&device_list.infds);

	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
		switch (c) {
		case 'v':
			verbose = true;
			break;
		case 's':
			setup_net_file(optarg, &device_list);
			break;
		case 't':
			setup_tun_net(optarg, &device_list);
			break;
		case 'b':
			setup_block_file(optarg, &device_list);
			break;
		case 'i':
			initrd_name = optarg;
			break;
		default:
			warnx("Unknown argument %s", argv[optind]);
			usage();
		}
	}
	/* After the options we need at least <mem-in-mb> and vmlinux. */
	if (optind + 2 > argc)
		usage();

	/* We need a console device */
	setup_console(&device_list);

	/* First we map /dev/zero over all of guest-physical memory. */
	mem = atoi(argv[optind]) * 1024 * 1024;
	map_zeroed_pages(0, mem / getpagesize());

	/* Now we load the kernel */
	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
			    &page_offset);

	/* Write the device descriptors into memory. */
	map_device_descriptors(&device_list, mem);

	/* Map the initrd image if requested */
	if (initrd_name) {
		initrd_size = load_initrd(initrd_name, mem);
		/* Boot-header fields: ramdisk address, ramdisk size, and
		 * loader type 0xFF ("unknown").  NOTE(review): offsets
		 * assumed to match the i386 boot protocol -- confirm
		 * against Documentation/i386/boot.txt. */
		*(unsigned long *)(boot+0x218) = mem - initrd_size;
		*(unsigned long *)(boot+0x21c) = initrd_size;
		*(unsigned char *)(boot+0x210) = 0xFF;
	}

	/* Set up the initial linear pagetables. */
	pgdir = setup_pagetables(mem, initrd_size, page_offset);

	/* E820 memory map: ours is a simple, single region. */
	*(char*)(boot+E820NR) = 1;
	*((struct e820entry *)(boot+E820MAP))
		= ((struct e820entry) { 0, mem, E820_RAM });

	/* Command line pointer and command line (at 4096) */
	*(void **)(boot + 0x228) = boot + 4096;
	concat(boot + 4096, argv+optind+2);

	/* Paravirt type: 1 == lguest */
	*(int *)(boot + 0x23c) = 1;

	/* Initialize the kernel side, fork the waker, and run the Guest
	 * (run_guest never returns). */
	lguest_fd = tell_kernel(pgdir, start, page_offset);
	waker_fd = setup_waker(lguest_fd, &device_list);

	run_guest(lguest_fd, &device_list);
}