/*
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright 2003 Andi Kleen, SuSE Labs.
 *
 * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
 *
 * Thanks to hpa@transmeta.com for some useful hints.
 * Special thanks to Ingo Molnar for his early experience with
 * a different vsyscall implementation for Linux/IA32 and for the name.
 *
 * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 * at virtual address -10Mbyte+1024bytes etc... There are at most 4
 * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 * jumping out of line if necessary. We cannot add more with this
 * mechanism because older kernels won't return -ENOSYS.
 *
 * Note: the concept clashes with User Mode Linux. UML users should
 * use the vDSO.
 */
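
/*
 * Illustration (a sketch, not part of the build): the legacy entry points
 * sit at fixed, ABI-visible addresses one 1024-byte slot apart from
 * VSYSCALL_START. A caller simply calls the address directly:
 *
 *      struct timeval tv;
 *      long (*vgettimeofday)(struct timeval *, struct timezone *) =
 *              (void *)0xffffffffff600000;
 *      vgettimeofday(&tv, NULL);
 *
 * Slot 0 (0xffffffffff600000) is gettimeofday(), slot 1
 * (0xffffffffff600400) is time() and slot 2 (0xffffffffff600800) is
 * getcpu(), matching the switch in emulate_vsyscall() below.
 */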

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

DEFINE_VVAR(int, vgetcpu_mode);
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
        .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
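
/*
 * Reader-side sketch (the real consumer is the vDSO, not this file):
 * DEFINE_VVAR places these variables in the vvar page, which
 * map_vsyscall() below maps read-only for userspace at VVAR_ADDRESS.
 * Reader code accesses them through the VVAR() macro, e.g.:
 *
 *      if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP)
 *              ... read the cpu/node pair via rdtscp ...
 */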

static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

static int __init vsyscall_setup(char *str)
{
        if (str) {
                if (!strcmp("emulate", str))
                        vsyscall_mode = EMULATE;
                else if (!strcmp("native", str))
                        vsyscall_mode = NATIVE;
                else if (!strcmp("none", str))
                        vsyscall_mode = NONE;
                else
                        return -EINVAL;

                return 0;
        }

        return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
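
/*
 * Boot-time usage example: "vsyscall=emulate" (the default) leaves the
 * vsyscall page mapped non-executable so that calls into it trap and are
 * handled by emulate_vsyscall() below; "vsyscall=native" maps the page
 * executable (PAGE_KERNEL_VSYSCALL in map_vsyscall()); "vsyscall=none"
 * makes any call into the page fail with SIGSEGV.
 */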

void update_vsyscall_tz(void)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
        /* sys_tz has changed */
        vsyscall_gtod_data.sys_tz = sys_tz;
        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
                     struct clocksource *clock, u32 mult)
{
        unsigned long flags;

        write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);

        /* copy vsyscall data */
        vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
        vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
        vsyscall_gtod_data.clock.mask = clock->mask;
        vsyscall_gtod_data.clock.mult = mult;
        vsyscall_gtod_data.clock.shift = clock->shift;
        vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
        vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
        vsyscall_gtod_data.wall_to_monotonic = *wtm;
        vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();

        write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
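
/*
 * Matching reader sketch (illustrative only; the real readers are the
 * vDSO/vsyscall time functions): the writers above publish under the
 * seqlock, so a reader retries until it observes a consistent snapshot:
 *
 *      unsigned seq;
 *      do {
 *              seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
 *              secs  = VVAR(vsyscall_gtod_data).wall_time_sec;
 *              nsecs = VVAR(vsyscall_gtod_data).wall_time_nsec;
 *      } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
 */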

static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
{
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
        struct task_struct *tsk;

        if (!show_unhandled_signals || !__ratelimit(&rs))
                return;

        tsk = current;

        printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
               level, tsk->comm, task_pid_nr(tsk),
               message, regs->ip, regs->cs,
               regs->sp, regs->ax, regs->si, regs->di);
}

static int addr_to_vsyscall_nr(unsigned long addr)
{
        int nr;

        if ((addr & ~0xC00UL) != VSYSCALL_START)
                return -EINVAL;

        nr = (addr & 0xC00UL) >> 10;
        if (nr >= 3)
                return -EINVAL;

        return nr;
}
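
/*
 * Worked example for the decoding above: address 0xffffffffff600400 has
 * (addr & ~0xC00UL) == 0xffffffffff600000 == VSYSCALL_START and
 * (addr & 0xC00UL) >> 10 == 1, so it selects vsyscall 1 (time()).
 * 0xffffffffff600c00 decodes to nr == 3 and is rejected; only slots
 * 0-2 exist.
 */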

bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
        struct task_struct *tsk;
        unsigned long caller;
        int vsyscall_nr;
        long ret;

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */
        WARN_ON_ONCE(address != regs->ip);

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        vsyscall_nr = addr_to_vsyscall_nr(address);

        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        tsk = current;
        if (seccomp_mode(&tsk->seccomp))
                do_exit(SIGKILL);

        switch (vsyscall_nr) {
        case 0:
                ret = sys_gettimeofday(
                        (struct timeval __user *)regs->di,
                        (struct timezone __user *)regs->si);
                break;

        case 1:
                ret = sys_time((time_t __user *)regs->di);
                break;

        case 2:
                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
                                 0);
                break;
        }

        if (ret == -EFAULT) {
                /*
                 * Bad news -- userspace fed a bad pointer to a vsyscall.
                 *
                 * With a real vsyscall, that would have caused SIGSEGV.
                 * To make writing reliable exploits using the emulated
                 * vsyscalls harder, generate SIGSEGV here as well.
                 */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
                goto sigsegv;
        }

        regs->ax = ret;

        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;

        return true;

sigsegv:
        force_sig(SIGSEGV, current);
        return true;
}
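
/*
 * Sketch of the flow being emulated (assuming the default
 * vsyscall=emulate mode): 64-bit userspace executes something like
 *
 *      mov     $0xffffffffff600000, %rax
 *      call    *%rax
 *
 * The call pushes an 8-byte return address; the non-executable mapping
 * makes the instruction fetch fault, the fault handler passes the target
 * address here in regs->ip, and after performing the system call the
 * code above pops the return address by hand (regs->ip = caller;
 * regs->sp += 8).
 */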

/*
 * Assume __initcall executes before all user space. Hopefully kmod
 * doesn't violate that. We'll find out if it does.
 */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
        unsigned long d;
        unsigned long node = 0;
#ifdef CONFIG_NUMA
        node = cpu_to_node(cpu);
#endif
        if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
                write_rdtscp_aux((node << 12) | cpu);

        /*
         * Store cpu number in limit so that it can be loaded quickly
         * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
         */
        d = 0x0f40000000000ULL;
        d |= cpu;
        d |= (node & 0xf) << 12;
        d |= (node >> 4) << 48;

        write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
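
/*
 * Decoding sketch (the real code is the vDSO's vgetcpu; reproduced here
 * only to document the encoding): the descriptor built above yields a
 * segment limit of (node << 12) | cpu, so userspace can recover both
 * values with a single LSL instruction:
 *
 *      unsigned int p;
 *      asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
 *      cpu  = p & 0xfff;
 *      node = p >> 12;
 *
 * The RDTSCP path above stores the same (node << 12) | cpu value in the
 * TSC_AUX register.
 */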

static void __cpuinit cpu_vsyscall_init(void *arg)
{
        /* preemption should be already off */
        vsyscall_set_cpu(raw_smp_processor_id());
}

static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
        long cpu = (long)arg;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
                smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);

        return NOTIFY_DONE;
}

void __init map_vsyscall(void)
{
        extern char __vsyscall_page;
        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
        extern char __vvar_page;
        unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);

        __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
                     vsyscall_mode == NATIVE
                     ? PAGE_KERNEL_VSYSCALL
                     : PAGE_KERNEL_VVAR);
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
                     (unsigned long)VSYSCALL_START);

        __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
                     (unsigned long)VVAR_ADDRESS);
}
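
/*
 * For illustration: after this runs, a process sees the page in
 * /proc/self/maps as something like
 *
 *      ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
 *
 * although whether the page is truly executable depends on vsyscall_mode
 * and the fixmap protection chosen above.
 */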

static int __init vsyscall_init(void)
{
        BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));

        on_each_cpu(cpu_vsyscall_init, NULL, 1);
        /* notifier priority > KVM */
        hotcpu_notifier(cpu_vsyscall_notifier, 30);

        return 0;
}
__initcall(vsyscall_init);