|
@@ -2,6 +2,8 @@
|
|
|
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
|
|
|
* Copyright 2003 Andi Kleen, SuSE Labs.
|
|
|
*
|
|
|
+ * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
|
|
|
+ *
|
|
|
* Thanks to hpa@transmeta.com for some useful hint.
|
|
|
* Special thanks to Ingo Molnar for his early experience with
|
|
|
* a different vsyscall implementation for Linux/IA32 and for the name.
|
|
@@ -11,10 +13,9 @@
|
|
|
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
|
|
|
* jumping out of line if necessary. We cannot add more with this
|
|
|
* mechanism because older kernels won't return -ENOSYS.
|
|
|
- * If we want more than four we need a vDSO.
|
|
|
*
|
|
|
- * Note: the concept clashes with user mode linux. If you use UML and
|
|
|
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
|
|
|
+ * Note: the concept clashes with User Mode Linux (UML). UML users
|
|
|
+ * should use the vDSO.
|
|
|
*/
|
|
|
|
|
|
/* Disable profiling for userspace code: */
|
|
@@ -32,9 +33,12 @@
|
|
|
#include <linux/cpu.h>
|
|
|
#include <linux/smp.h>
|
|
|
#include <linux/notifier.h>
|
|
|
+#include <linux/syscalls.h>
|
|
|
+#include <linux/ratelimit.h>
|
|
|
|
|
|
#include <asm/vsyscall.h>
|
|
|
#include <asm/pgtable.h>
|
|
|
+#include <asm/compat.h>
|
|
|
#include <asm/page.h>
|
|
|
#include <asm/unistd.h>
|
|
|
#include <asm/fixmap.h>
|
|
@@ -44,16 +48,12 @@
|
|
|
#include <asm/desc.h>
|
|
|
#include <asm/topology.h>
|
|
|
#include <asm/vgtod.h>
|
|
|
-
|
|
|
-#define __vsyscall(nr) \
|
|
|
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
|
|
|
-#define __syscall_clobber "r11","cx","memory"
|
|
|
+#include <asm/traps.h>
|
|
|
|
|
|
DEFINE_VVAR(int, vgetcpu_mode);
|
|
|
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
|
|
|
{
|
|
|
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
|
|
|
- .sysctl_enabled = 1,
|
|
|
};
|
|
|
|
|
|
void update_vsyscall_tz(void)
|
|
@@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
|
|
|
unsigned long flags;
|
|
|
|
|
|
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
|
|
|
+
|
|
|
/* copy vsyscall data */
|
|
|
- vsyscall_gtod_data.clock.vread = clock->vread;
|
|
|
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
|
|
|
- vsyscall_gtod_data.clock.mask = clock->mask;
|
|
|
- vsyscall_gtod_data.clock.mult = mult;
|
|
|
- vsyscall_gtod_data.clock.shift = clock->shift;
|
|
|
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
|
|
|
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
|
|
|
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
|
|
|
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
|
|
|
+ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
|
|
|
+ vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
|
|
|
+ vsyscall_gtod_data.clock.mask = clock->mask;
|
|
|
+ vsyscall_gtod_data.clock.mult = mult;
|
|
|
+ vsyscall_gtod_data.clock.shift = clock->shift;
|
|
|
+ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
|
|
|
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
|
|
|
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
|
|
|
+ vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
|
|
|
+
|
|
|
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
|
|
|
}
|
|
|
|
|
|
-/* RED-PEN may want to readd seq locking, but then the variable should be
|
|
|
- * write-once.
|
|
|
- */
|
|
|
-static __always_inline void do_get_tz(struct timezone * tz)
|
|
|
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
|
|
|
+ const char *message)
|
|
|
{
|
|
|
- *tz = VVAR(vsyscall_gtod_data).sys_tz;
|
|
|
-}
|
|
|
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
|
|
|
+ struct task_struct *tsk;
|
|
|
|
|
|
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
|
|
|
-{
|
|
|
- int ret;
|
|
|
- asm volatile("syscall"
|
|
|
- : "=a" (ret)
|
|
|
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
|
|
|
- : __syscall_clobber );
|
|
|
- return ret;
|
|
|
-}
|
|
|
+ if (!show_unhandled_signals || !__ratelimit(&rs))
|
|
|
+ return;
|
|
|
|
|
|
-static __always_inline long time_syscall(long *t)
|
|
|
-{
|
|
|
- long secs;
|
|
|
- asm volatile("syscall"
|
|
|
- : "=a" (secs)
|
|
|
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
|
|
|
- return secs;
|
|
|
-}
|
|
|
+ tsk = current;
|
|
|
|
|
|
-static __always_inline void do_vgettimeofday(struct timeval * tv)
|
|
|
-{
|
|
|
- cycle_t now, base, mask, cycle_delta;
|
|
|
- unsigned seq;
|
|
|
- unsigned long mult, shift, nsec;
|
|
|
- cycle_t (*vread)(void);
|
|
|
- do {
|
|
|
- seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
|
|
|
-
|
|
|
- vread = VVAR(vsyscall_gtod_data).clock.vread;
|
|
|
- if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
|
|
|
- !vread)) {
|
|
|
- gettimeofday(tv,NULL);
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- now = vread();
|
|
|
- base = VVAR(vsyscall_gtod_data).clock.cycle_last;
|
|
|
- mask = VVAR(vsyscall_gtod_data).clock.mask;
|
|
|
- mult = VVAR(vsyscall_gtod_data).clock.mult;
|
|
|
- shift = VVAR(vsyscall_gtod_data).clock.shift;
|
|
|
-
|
|
|
- tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
|
|
|
- nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
|
|
|
- } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
|
|
|
-
|
|
|
- /* calculate interval: */
|
|
|
- cycle_delta = (now - base) & mask;
|
|
|
- /* convert to nsecs: */
|
|
|
- nsec += (cycle_delta * mult) >> shift;
|
|
|
-
|
|
|
- while (nsec >= NSEC_PER_SEC) {
|
|
|
- tv->tv_sec += 1;
|
|
|
- nsec -= NSEC_PER_SEC;
|
|
|
- }
|
|
|
- tv->tv_usec = nsec / NSEC_PER_USEC;
|
|
|
+ printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
|
|
|
+ level, tsk->comm, task_pid_nr(tsk),
|
|
|
+ message, regs->ip - 2, regs->cs,
|
|
|
+ regs->sp, regs->ax, regs->si, regs->di);
|
|
|
}
|
|
|
|
|
|
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
|
|
|
+static int addr_to_vsyscall_nr(unsigned long addr)
|
|
|
{
|
|
|
- if (tv)
|
|
|
- do_vgettimeofday(tv);
|
|
|
- if (tz)
|
|
|
- do_get_tz(tz);
|
|
|
- return 0;
|
|
|
-}
|
|
|
+ int nr;
|
|
|
|
|
|
-/* This will break when the xtime seconds get inaccurate, but that is
|
|
|
- * unlikely */
|
|
|
-time_t __vsyscall(1) vtime(time_t *t)
|
|
|
-{
|
|
|
- unsigned seq;
|
|
|
- time_t result;
|
|
|
- if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
|
|
|
- return time_syscall(t);
|
|
|
+ if ((addr & ~0xC00UL) != VSYSCALL_START)
|
|
|
+ return -EINVAL;
|
|
|
|
|
|
- do {
|
|
|
- seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
|
|
|
+ nr = (addr & 0xC00UL) >> 10;
|
|
|
+ if (nr >= 3)
|
|
|
+ return -EINVAL;
|
|
|
|
|
|
- result = VVAR(vsyscall_gtod_data).wall_time_sec;
|
|
|
+ return nr;
|
|
|
+}
|
|
|
|
|
|
- } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
|
|
|
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
|
|
|
+{
|
|
|
+ struct task_struct *tsk;
|
|
|
+ unsigned long caller;
|
|
|
+ int vsyscall_nr;
|
|
|
+ long ret;
|
|
|
+
|
|
|
+ local_irq_enable();
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Real 64-bit user mode code has cs == __USER_CS. Anything else
|
|
|
+ * is bogus.
|
|
|
+ */
|
|
|
+ if (regs->cs != __USER_CS) {
|
|
|
+ /*
|
|
|
+ * If we trapped from kernel mode, we might as well OOPS now
|
|
|
+ * instead of returning to some random address and OOPSing
|
|
|
+ * then.
|
|
|
+ */
|
|
|
+ BUG_ON(!user_mode(regs));
|
|
|
+
|
|
|
+ /* Compat mode and non-compat 32-bit CS should both segfault. */
|
|
|
+ warn_bad_vsyscall(KERN_WARNING, regs,
|
|
|
+ "illegal int 0xcc from 32-bit mode");
|
|
|
+ goto sigsegv;
|
|
|
+ }
|
|
|
|
|
|
- if (t)
|
|
|
- *t = result;
|
|
|
- return result;
|
|
|
-}
|
|
|
+ /*
|
|
|
+ * x86-ism here: regs->ip points to the instruction after the int 0xcc,
|
|
|
+ * and int 0xcc is two bytes long.
|
|
|
+ */
|
|
|
+ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
|
|
|
+ if (vsyscall_nr < 0) {
|
|
|
+ warn_bad_vsyscall(KERN_WARNING, regs,
|
|
|
+ "illegal int 0xcc (exploit attempt?)");
|
|
|
+ goto sigsegv;
|
|
|
+ }
|
|
|
|
|
|
-/* Fast way to get current CPU and node.
|
|
|
- This helps to do per node and per CPU caches in user space.
|
|
|
- The result is not guaranteed without CPU affinity, but usually
|
|
|
- works out because the scheduler tries to keep a thread on the same
|
|
|
- CPU.
|
|
|
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
|
|
|
+ warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
|
|
|
+ goto sigsegv;
|
|
|
+ }
|
|
|
|
|
|
- tcache must point to a two element sized long array.
|
|
|
- All arguments can be NULL. */
|
|
|
-long __vsyscall(2)
|
|
|
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
|
|
|
-{
|
|
|
- unsigned int p;
|
|
|
- unsigned long j = 0;
|
|
|
-
|
|
|
- /* Fast cache - only recompute value once per jiffies and avoid
|
|
|
- relatively costly rdtscp/cpuid otherwise.
|
|
|
- This works because the scheduler usually keeps the process
|
|
|
- on the same CPU and this syscall doesn't guarantee its
|
|
|
- results anyways.
|
|
|
- We do this here because otherwise user space would do it on
|
|
|
- its own in a likely inferior way (no access to jiffies).
|
|
|
- If you don't like it pass NULL. */
|
|
|
- if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
|
|
|
- p = tcache->blob[1];
|
|
|
- } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
|
|
|
- /* Load per CPU data from RDTSCP */
|
|
|
- native_read_tscp(&p);
|
|
|
- } else {
|
|
|
- /* Load per CPU data from GDT */
|
|
|
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
|
|
|
+ tsk = current;
|
|
|
+ if (seccomp_mode(&tsk->seccomp))
|
|
|
+ do_exit(SIGKILL);
|
|
|
+
|
|
|
+ switch (vsyscall_nr) {
|
|
|
+ case 0:
|
|
|
+ ret = sys_gettimeofday(
|
|
|
+ (struct timeval __user *)regs->di,
|
|
|
+ (struct timezone __user *)regs->si);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case 1:
|
|
|
+ ret = sys_time((time_t __user *)regs->di);
|
|
|
+ break;
|
|
|
+
|
|
|
+ case 2:
|
|
|
+ ret = sys_getcpu((unsigned __user *)regs->di,
|
|
|
+ (unsigned __user *)regs->si,
|
|
|
+ 0);
|
|
|
+ break;
|
|
|
}
|
|
|
- if (tcache) {
|
|
|
- tcache->blob[0] = j;
|
|
|
- tcache->blob[1] = p;
|
|
|
+
|
|
|
+ if (ret == -EFAULT) {
|
|
|
+ /*
|
|
|
+ * Bad news -- userspace fed a bad pointer to a vsyscall.
|
|
|
+ *
|
|
|
+ * With a real vsyscall, that would have caused SIGSEGV.
|
|
|
+ * To make writing reliable exploits using the emulated
|
|
|
+ * vsyscalls harder, generate SIGSEGV here as well.
|
|
|
+ */
|
|
|
+ warn_bad_vsyscall(KERN_INFO, regs,
|
|
|
+ "vsyscall fault (exploit attempt?)");
|
|
|
+ goto sigsegv;
|
|
|
}
|
|
|
- if (cpu)
|
|
|
- *cpu = p & 0xfff;
|
|
|
- if (node)
|
|
|
- *node = p >> 12;
|
|
|
- return 0;
|
|
|
-}
|
|
|
|
|
|
-static long __vsyscall(3) venosys_1(void)
|
|
|
-{
|
|
|
- return -ENOSYS;
|
|
|
-}
|
|
|
+ regs->ax = ret;
|
|
|
|
|
|
-#ifdef CONFIG_SYSCTL
|
|
|
-static ctl_table kernel_table2[] = {
|
|
|
- { .procname = "vsyscall64",
|
|
|
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
|
|
|
- .mode = 0644,
|
|
|
- .proc_handler = proc_dointvec },
|
|
|
- {}
|
|
|
-};
|
|
|
+ /* Emulate a ret instruction. */
|
|
|
+ regs->ip = caller;
|
|
|
+ regs->sp += 8;
|
|
|
|
|
|
-static ctl_table kernel_root_table2[] = {
|
|
|
- { .procname = "kernel", .mode = 0555,
|
|
|
- .child = kernel_table2 },
|
|
|
- {}
|
|
|
-};
|
|
|
-#endif
|
|
|
+ local_irq_disable();
|
|
|
+ return;
|
|
|
+
|
|
|
+sigsegv:
|
|
|
+ regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
|
|
|
+ force_sig(SIGSEGV, current);
|
|
|
+ local_irq_disable();
|
|
|
+}
|
|
|
|
|
|
-/* Assume __initcall executes before all user space. Hopefully kmod
|
|
|
- doesn't violate that. We'll find out if it does. */
|
|
|
+/*
|
|
|
+ * Assume __initcall executes before all user space. Hopefully kmod
|
|
|
+ * doesn't violate that. We'll find out if it does.
|
|
|
+ */
|
|
|
static void __cpuinit vsyscall_set_cpu(int cpu)
|
|
|
{
|
|
|
unsigned long d;
|
|
@@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
|
|
|
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
|
|
|
write_rdtscp_aux((node << 12) | cpu);
|
|
|
|
|
|
- /* Store cpu number in limit so that it can be loaded quickly
|
|
|
- in user space in vgetcpu.
|
|
|
- 12 bits for the CPU and 8 bits for the node. */
|
|
|
+ /*
|
|
|
+ * Store the cpu number in the segment limit of the per-CPU GDT entry so
|
|
|
+ * that vgetcpu can load it quickly in user space. (12 bits for the CPU and 8 bits for the node)
|
|
|
+ */
|
|
|
d = 0x0f40000000000ULL;
|
|
|
d |= cpu;
|
|
|
d |= (node & 0xf) << 12;
|
|
|
d |= (node >> 4) << 48;
|
|
|
+
|
|
|
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
|
|
|
}
|
|
|
|
|
@@ -275,8 +247,10 @@ static int __cpuinit
|
|
|
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
|
|
|
{
|
|
|
long cpu = (long)arg;
|
|
|
+
|
|
|
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
|
|
|
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
|
|
|
+
|
|
|
return NOTIFY_DONE;
|
|
|
}
|
|
|
|
|
@@ -284,25 +258,23 @@ void __init map_vsyscall(void)
|
|
|
{
|
|
|
extern char __vsyscall_0;
|
|
|
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
|
|
|
+ extern char __vvar_page;
|
|
|
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
|
|
|
|
|
|
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
|
|
|
__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
|
|
|
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
|
|
|
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
|
|
|
}
|
|
|
|
|
|
static int __init vsyscall_init(void)
|
|
|
{
|
|
|
- BUG_ON(((unsigned long) &vgettimeofday !=
|
|
|
- VSYSCALL_ADDR(__NR_vgettimeofday)));
|
|
|
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
|
|
|
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
|
|
|
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
|
|
|
-#ifdef CONFIG_SYSCTL
|
|
|
- register_sysctl_table(kernel_root_table2);
|
|
|
-#endif
|
|
|
+ BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
|
|
|
+
|
|
|
on_each_cpu(cpu_vsyscall_init, NULL, 1);
|
|
|
/* notifier priority > KVM */
|
|
|
hotcpu_notifier(cpu_vsyscall_notifier, 30);
|
|
|
+
|
|
|
return 0;
|
|
|
}
|
|
|
-
|
|
|
__initcall(vsyscall_init);
|