Diffstat (limited to 'i386')
-rw-r--r--  i386/Makefrag.am          |  47
-rw-r--r--  i386/configfrag.ac        |  12
-rw-r--r--  i386/i386/debug_trace.S   |   1
-rw-r--r--  i386/i386/fpu.c           |  15
-rw-r--r--  i386/i386/gdt.c           |  27
-rw-r--r--  i386/i386/gdt.h           |   4
-rw-r--r--  i386/i386/i386asm.sym     |  15
-rw-r--r--  i386/i386/idt.c           |   5
-rw-r--r--  i386/i386/idt_inittab.S   |  16
-rw-r--r--  i386/i386/ktss.c          |   7
-rw-r--r--  i386/i386/ldt.c           |  15
-rw-r--r--  i386/i386/locore.S        |  50
-rw-r--r--  i386/i386/mp_desc.c       |   5
-rw-r--r--  i386/i386/pcb.c           |  31
-rw-r--r--  i386/i386/phys.c          |   7
-rw-r--r--  i386/i386/proc_reg.h      |  22
-rw-r--r--  i386/i386/seg.h           |  18
-rw-r--r--  i386/i386/spl.S           |  18
-rw-r--r--  i386/i386/trap.c          |   5
-rw-r--r--  i386/i386/user_ldt.c      |  27
-rw-r--r--  i386/i386/user_ldt.h      |   3
-rw-r--r--  i386/i386/vm_param.h      |  25
-rw-r--r--  i386/i386/xen.h           | 357
-rw-r--r--  i386/i386at/conf.c        |  21
-rw-r--r--  i386/i386at/cons_conf.c   |   8
-rw-r--r--  i386/i386at/model_dep.c   | 180
-rw-r--r--  i386/intel/pmap.c         | 388
-rw-r--r--  i386/intel/pmap.h         |  17
-rw-r--r--  i386/xen/Makefrag.am      |  33
-rw-r--r--  i386/xen/xen.c            |  77
-rw-r--r--  i386/xen/xen_boothdr.S    | 167
-rw-r--r--  i386/xen/xen_locore.S     | 110
32 files changed, 1685 insertions(+), 48 deletions(-)
diff --git a/i386/Makefrag.am b/i386/Makefrag.am
index bad0ce9..876761c 100644
--- a/i386/Makefrag.am
+++ b/i386/Makefrag.am
@@ -19,33 +19,37 @@
libkernel_a_SOURCES += \
i386/i386at/autoconf.c \
+ i386/i386at/conf.c \
+ i386/i386at/cons_conf.c \
+ i386/i386at/idt.h \
+ i386/i386at/kd_event.c \
+ i386/i386at/kd_event.h \
+ i386/i386at/kd_queue.c \
+ i386/i386at/kd_queue.h \
+ i386/i386at/model_dep.c \
+ i386/include/mach/sa/stdarg.h
+
+if PLATFORM_at
+libkernel_a_SOURCES += \
i386/i386at/boothdr.S \
i386/i386at/com.c \
i386/i386at/comreg.h \
- i386/i386at/conf.c \
- i386/i386at/cons_conf.c \
i386/i386at/cram.h \
i386/i386at/disk.h \
i386/i386at/i8250.h \
- i386/i386at/idt.h \
i386/i386at/immc.c \
i386/i386at/int_init.c \
i386/i386at/interrupt.S \
i386/i386at/kd.c \
i386/i386at/kd.h \
- i386/i386at/kd_event.c \
- i386/i386at/kd_event.h \
i386/i386at/kd_mouse.c \
i386/i386at/kd_mouse.h \
- i386/i386at/kd_queue.c \
- i386/i386at/kd_queue.h \
i386/i386at/kdasm.S \
i386/i386at/kdsoft.h \
- i386/i386at/model_dep.c \
i386/i386at/pic_isa.c \
i386/i386at/rtc.c \
- i386/i386at/rtc.h \
- i386/include/mach/sa/stdarg.h
+ i386/i386at/rtc.h
+endif
#
# `lpr' device support.
@@ -80,11 +84,9 @@ libkernel_a_SOURCES += \
i386/i386/fpu.h \
i386/i386/gdt.c \
i386/i386/gdt.h \
- i386/i386/hardclock.c \
i386/i386/idt-gen.h \
i386/i386/idt.c \
i386/i386/idt_inittab.S \
- i386/i386/io_map.c \
i386/i386/io_perm.c \
i386/i386/io_perm.h \
i386/i386/ipl.h \
@@ -107,11 +109,7 @@ libkernel_a_SOURCES += \
i386/i386/pcb.c \
i386/i386/pcb.h \
i386/i386/phys.c \
- i386/i386/pic.c \
- i386/i386/pic.h \
i386/i386/pio.h \
- i386/i386/pit.c \
- i386/i386/pit.h \
i386/i386/pmap.h \
i386/i386/proc_reg.h \
i386/i386/sched_param.h \
@@ -139,6 +137,15 @@ libkernel_a_SOURCES += \
EXTRA_DIST += \
i386/i386/mach_i386.srv
+if PLATFORM_at
+libkernel_a_SOURCES += \
+ i386/i386/hardclock.c \
+ i386/i386/io_map.c \
+ i386/i386/pic.c \
+ i386/i386/pic.h \
+ i386/i386/pit.c \
+ i386/i386/pit.h
+endif
#
# KDB support.
@@ -225,3 +232,11 @@ EXTRA_DIST += \
# Instead of listing each file individually...
EXTRA_DIST += \
i386/include
+
+#
+# Platform specific parts.
+#
+
+if PLATFORM_xen
+include i386/xen/Makefrag.am
+endif
diff --git a/i386/configfrag.ac b/i386/configfrag.ac
index f95aa86..1132b69 100644
--- a/i386/configfrag.ac
+++ b/i386/configfrag.ac
@@ -51,6 +51,12 @@ case $host_platform:$host_cpu in
# i386/bogus/platforms.h]
AC_DEFINE([AT386], [1], [AT386])[;;
+ xen:i?86)
+ # TODO. That should probably not be needed.
+ ncom=1
+ # TODO. That should probably not be needed.
+ # i386/bogus/platforms.h]
+ AC_DEFINE([AT386], [1], [AT386])[;;
*)
:;;
esac]
@@ -105,9 +111,11 @@ if [ x"$enable_lpr" = xyes ]; then]
AC_ARG_ENABLE([pae],
- AS_HELP_STRING([--enable-pae], [PAE feature (ix86-only); disabled by
- default]))
+ AS_HELP_STRING([--enable-pae], [PAE support (ix86-only); on ix86-at disabled
+ by default, on ix86-xen enabled by default]))
[case $host_platform:$host_cpu in
+ xen:i?86)
+ enable_pae=${enable_pae-yes};;
*:i?86)
:;;
*)
diff --git a/i386/i386/debug_trace.S b/i386/i386/debug_trace.S
index e741516..f275e1b 100644
--- a/i386/i386/debug_trace.S
+++ b/i386/i386/debug_trace.S
@@ -24,6 +24,7 @@
#ifdef DEBUG
#include <mach/machine/asm.h>
+#include <i386/xen.h>
#include "debug.h"
diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c
index 109d0d7..2a4b9c0 100644
--- a/i386/i386/fpu.c
+++ b/i386/i386/fpu.c
@@ -109,6 +109,10 @@ void
init_fpu()
{
unsigned short status, control;
+
+#ifdef MACH_HYP
+ clear_ts();
+#else /* MACH_HYP */
unsigned int native = 0;
if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486)
@@ -120,6 +124,7 @@ init_fpu()
* the control and status registers.
*/
set_cr0((get_cr0() & ~(CR0_EM|CR0_TS)) | native); /* allow use of FPU */
+#endif /* MACH_HYP */
fninit();
status = fnstsw();
@@ -153,8 +158,10 @@ init_fpu()
struct i386_xfp_save save;
unsigned long mask;
fp_kind = FP_387X;
+#ifndef MACH_HYP
printf("Enabling FXSR\n");
set_cr4(get_cr4() | CR4_OSFXSR);
+#endif /* MACH_HYP */
fxsave(&save);
mask = save.fp_mxcsr_mask;
if (!mask)
@@ -163,10 +170,14 @@ init_fpu()
} else
fp_kind = FP_387;
}
+#ifdef MACH_HYP
+ set_ts();
+#else /* MACH_HYP */
/*
* Trap wait instructions. Turn off FPU for now.
*/
set_cr0(get_cr0() | CR0_TS | CR0_MP);
+#endif /* MACH_HYP */
}
else {
/*
@@ -675,6 +686,7 @@ fpexterrflt()
/*NOTREACHED*/
}
+#ifndef MACH_XEN
/*
* FPU error. Called by AST.
*/
@@ -731,6 +743,7 @@ ASSERT_IPL(SPL0);
thread->pcb->ims.ifps->fp_save_state.fp_status);
/*NOTREACHED*/
}
+#endif /* MACH_XEN */
/*
* Save FPU state.
@@ -846,7 +859,7 @@ fp_state_alloc()
}
}
-#if AT386
+#if AT386 && !defined(MACH_XEN)
/*
* Handle a coprocessor error interrupt on the AT386.
* This comes in on line 5 of the slave PIC at SPL1.
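The MACH_HYP branches above exist because a paravirtualized guest cannot write CR0 directly; CR0.TS is instead toggled through the fpu_taskswitch hypercall, wrapped by set_ts()/clear_ts() in the proc_reg.h diff below. Leaving TS set at the end of init_fpu keeps lazy FPU switching working: the first FPU instruction a thread executes raises #NM, and the handler loads that thread's state. A minimal sketch of that flow, with the trap plumbing simplified (fpnoextflt and fp_load are the existing routines in fpu.c):

    /* Sketch of lazy FPU switching as relied upon above. */
    void fpnoextflt(void)               /* #NM, "device not available" */
    {
            clear_ts();                 /* a hypercall under MACH_HYP */
            fp_load(current_thread());  /* restore this thread's FPU state */
    }
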
diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c
index 845e7c6..b5fb033 100644
--- a/i386/i386/gdt.c
+++ b/i386/i386/gdt.c
@@ -31,11 +31,18 @@
* Global descriptor table.
*/
#include <mach/machine/vm_types.h>
+#include <mach/xen.h>
+
+#include <intel/pmap.h>
#include "vm_param.h"
#include "seg.h"
#include "gdt.h"
+#ifdef MACH_XEN
+/* It is actually defined in xen_boothdr.S */
+extern
+#endif /* MACH_XEN */
struct real_descriptor gdt[GDTSZ];
void
@@ -50,11 +57,21 @@ gdt_init()
LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS,
LINEAR_MAX_KERNEL_ADDRESS - (LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) - 1,
ACC_PL_K|ACC_DATA_W, SZ_32);
+#ifndef MACH_HYP
fill_gdt_descriptor(LINEAR_DS,
0,
0xffffffff,
ACC_PL_K|ACC_DATA_W, SZ_32);
+#endif /* MACH_HYP */
+#ifdef MACH_XEN
+ unsigned long frame = kv_to_mfn(gdt);
+ pmap_set_page_readonly(gdt);
+ if (hyp_set_gdt(kv_to_la(&frame), GDTSZ))
+ panic("couldn't set gdt\n");
+ if (hyp_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments))
+ panic("couldn't set 4gb segments vm assist");
+#else /* MACH_XEN */
/* Load the new GDT. */
{
struct pseudo_descriptor pdesc;
@@ -63,6 +80,7 @@ gdt_init()
pdesc.linear_base = kvtolin(&gdt);
lgdt(&pdesc);
}
+#endif /* MACH_XEN */
/* Reload all the segment registers from the new GDT.
We must load ds and es with 0 before loading them with KERNEL_DS
@@ -79,5 +97,14 @@ gdt_init()
"movw %w1,%%es\n"
"movw %w1,%%ss\n"
: : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0));
+#ifdef MACH_XEN
+#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS
+ /* things now get shifted */
+#ifdef MACH_PSEUDO_PHYS
+ pfn_list = (void*) pfn_list + VM_MIN_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS;
+#endif /* MACH_PSEUDO_PHYS */
+ la_shift += LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
+#endif
+#endif /* MACH_XEN */
}
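Under Xen the GDT page must be made read-only to the guest before hyp_set_gdt() will accept it, since the hypervisor validates every descriptor and maps the frames itself; the 4gb_segments VM assist then emulates the full-limit segments Xen would otherwise truncate below the hypervisor hole. A hypothetical sketch of the general, multi-page case (the code above is the one-frame special case, since GDTSZ descriptors fit in a single page; NGDTPAGES is an invented name for illustration):

    /* Hypothetical: registering a GDT spanning NGDTPAGES pages. */
    unsigned long frames[NGDTPAGES];
    int i;
    for (i = 0; i < NGDTPAGES; i++) {
            pmap_set_page_readonly((char *) gdt + i * PAGE_SIZE);
            frames[i] = kv_to_mfn((char *) gdt + i * PAGE_SIZE);
    }
    if (hyp_set_gdt(kv_to_la(frames), GDTSZ))   /* entry count, not frames */
            panic("couldn't set gdt");
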
diff --git a/i386/i386/gdt.h b/i386/i386/gdt.h
index 50e01e6..41ace79 100644
--- a/i386/i386/gdt.h
+++ b/i386/i386/gdt.h
@@ -40,12 +40,16 @@
*/
#define KERNEL_CS (0x08 | KERNEL_RING) /* kernel code */
#define KERNEL_DS (0x10 | KERNEL_RING) /* kernel data */
+#ifndef MACH_XEN
#define KERNEL_LDT 0x18 /* master LDT */
+#endif /* MACH_XEN */
#define KERNEL_TSS 0x20 /* master TSS (uniprocessor) */
#define USER_LDT 0x28 /* place for per-thread LDT */
#define USER_TSS 0x30 /* place for per-thread TSS
that holds IO bitmap */
+#ifndef MACH_HYP
#define LINEAR_DS 0x38 /* linear mapping */
+#endif /* MACH_HYP */
/* 0x40 was USER_FPREGS, now free */
#define USER_GDT 0x48 /* user-defined GDT entries */
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index 868bf09..b1670e8 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -45,6 +45,7 @@
#include <i386/gdt.h>
#include <i386/ldt.h>
#include <i386/mp_desc.h>
+#include <i386/xen.h>
offset thread th pcb
@@ -90,6 +91,9 @@ expr VM_MIN_ADDRESS
expr VM_MAX_ADDRESS
expr VM_MIN_KERNEL_ADDRESS KERNELBASE
expr KERNEL_STACK_SIZE
+#if VM_MIN_KERNEL_ADDRESS == LINEAR_MIN_KERNEL_ADDRESS
+expr PFN_LIST pfn_list
+#endif
#if PAE
expr PDPSHIFT
@@ -117,7 +121,9 @@ expr KERNEL_RING
expr KERNEL_CS
expr KERNEL_DS
expr KERNEL_TSS
+#ifndef MACH_XEN
expr KERNEL_LDT
+#endif /* MACH_XEN */
expr (VM_MIN_KERNEL_ADDRESS>>PDESHIFT)*sizeof(pt_entry_t) KERNELBASEPDE
@@ -135,3 +141,12 @@ expr TIMER_HIGH_UNIT
offset thread th system_timer
offset thread th user_timer
#endif
+
+#ifdef MACH_XEN
+offset shared_info si vcpu_info[0].evtchn_upcall_mask CPU_CLI
+offset shared_info si vcpu_info[0].evtchn_upcall_pending CPU_PENDING
+offset shared_info si vcpu_info[0].evtchn_pending_sel CPU_PENDING_SEL
+offset shared_info si evtchn_pending PENDING
+offset shared_info si evtchn_mask EVTMASK
+offset shared_info si vcpu_info[0].arch.cr2 CR2
+#endif /* MACH_XEN */
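
The offset lines added above feed the gensym machinery: each one becomes an assembler-visible constant holding the field's offset within struct shared_info, which is how the assembly-side cli/sti macros in i386/i386/xen.h (new file below) can poke the event mask without any privileged instruction. A sketch of the generated definition and its use:

    /* Roughly what the CPU_CLI line generates for assembly files:
     *   #define CPU_CLI  <offsetof(struct shared_info,
     *                              vcpu_info[0].evtchn_upcall_mask)>
     * letting xen.h define interrupts-off as a plain store:
     *   #define cli  movb $0xff, hyp_shared_info+CPU_CLI
     */
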
diff --git a/i386/i386/idt.c b/i386/i386/idt.c
index 1a8f917..b5e3d08 100644
--- a/i386/i386/idt.c
+++ b/i386/i386/idt.c
@@ -38,6 +38,10 @@ extern struct idt_init_entry idt_inittab[];
void idt_init()
{
+#ifdef MACH_HYP
+ if (hyp_set_trap_table(kvtolin(idt_inittab)))
+ panic("couldn't set trap table\n");
+#else /* MACH_HYP */
struct idt_init_entry *iie = idt_inittab;
/* Initialize the exception vectors from the idt_inittab. */
@@ -55,5 +59,6 @@ void idt_init()
pdesc.linear_base = kvtolin(&idt);
lidt(&pdesc);
}
+#endif /* MACH_HYP */
}
diff --git a/i386/i386/idt_inittab.S b/i386/i386/idt_inittab.S
index 7718568..4dcad8d 100644
--- a/i386/i386/idt_inittab.S
+++ b/i386/i386/idt_inittab.S
@@ -25,7 +25,8 @@
*/
#include <mach/machine/asm.h>
-#include "seg.h"
+#include <i386/seg.h>
+#include <i386/i386asm.h>
/* We'll be using macros to fill in a table in data hunk 2
@@ -38,12 +39,22 @@ ENTRY(idt_inittab)
/*
* Interrupt descriptor table and code vectors for it.
*/
+#ifdef MACH_XEN
+#define IDT_ENTRY(n,entry,type) \
+ .data 2 ;\
+ .byte n ;\
+ .byte (((type)&ACC_PL)>>5)|((((type)&(ACC_TYPE|ACC_A))==ACC_INTR_GATE)<<2) ;\
+ .word KERNEL_CS ;\
+ .long entry ;\
+ .text
+#else /* MACH_XEN */
#define IDT_ENTRY(n,entry,type) \
.data 2 ;\
.long entry ;\
.word n ;\
.word type ;\
.text
+#endif /* MACH_XEN */
/*
* No error code. Clear error code and push trap number.
@@ -118,4 +129,7 @@ EXCEPTION(0x1f,t_trap_1f)
/* Terminator */
.data 2
.long 0
+#ifdef MACH_XEN
+ .long 0
+#endif /* MACH_XEN */
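
The MACH_XEN variant of IDT_ENTRY does not emit hardware gate descriptors; it emits entries in the layout of Xen's public struct trap_info, because idt.c (above) hands the whole table to hyp_set_trap_table() instead of executing lidt. The .byte/.byte/.word/.long sequence encodes, field for field:

    /* Layout encoded by the MACH_XEN IDT_ENTRY macro
     * (Xen's public struct trap_info, x86-32): */
    struct trap_info {
            uint8_t       vector;   /* exception/interrupt number */
            uint8_t       flags;    /* bits 0-1: privilege level; bit 2:
                                       disable event delivery on entry
                                       (interrupt-gate semantics) */
            uint16_t      cs;       /* code selector (KERNEL_CS) */
            unsigned long address;  /* handler entry point */
    };
    /* The table ends with an all-zero entry, hence the extra
     * .long 0 terminator added under MACH_XEN above. */
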
diff --git a/i386/i386/ktss.c b/i386/i386/ktss.c
index 03d9a04..66432f3 100644
--- a/i386/i386/ktss.c
+++ b/i386/i386/ktss.c
@@ -45,6 +45,12 @@ ktss_init()
/* XXX temporary exception stack */
static int exception_stack[1024];
+#ifdef MACH_XEN
+ /* Xen won't allow us to do any I/O by default anyway, just register
+ * exception stack */
+ if (hyp_stack_switch(KERNEL_DS, (unsigned)(exception_stack+1024)))
+ panic("couldn't register exception stack\n");
+#else /* MACH_XEN */
/* Initialize the master TSS descriptor. */
fill_gdt_descriptor(KERNEL_TSS,
kvtolin(&ktss), sizeof(struct task_tss) - 1,
@@ -59,5 +65,6 @@ ktss_init()
/* Load the TSS. */
ltr(KERNEL_TSS);
+#endif /* MACH_XEN */
}
diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c
index 7299377..0ef7a8c 100644
--- a/i386/i386/ldt.c
+++ b/i386/i386/ldt.c
@@ -28,6 +28,9 @@
* same LDT.
*/
#include <mach/machine/vm_types.h>
+#include <mach/xen.h>
+
+#include <intel/pmap.h>
#include "vm_param.h"
#include "seg.h"
@@ -36,15 +39,23 @@
extern int syscall();
+#ifdef MACH_XEN
+/* It is actually defined in xen_boothdr.S */
+extern
+#endif /* MACH_XEN */
struct real_descriptor ldt[LDTSZ];
void
ldt_init()
{
+#ifdef MACH_XEN
+ pmap_set_page_readwrite(ldt);
+#else /* MACH_XEN */
/* Initialize the master LDT descriptor in the GDT. */
fill_gdt_descriptor(KERNEL_LDT,
kvtolin(&ldt), sizeof(ldt)-1,
ACC_PL_K|ACC_LDT, 0);
+#endif /* MACH_XEN */
/* Initialize the LDT descriptors. */
fill_ldt_gate(USER_SCALL,
@@ -61,5 +72,9 @@ ldt_init()
ACC_PL_U|ACC_DATA_W, SZ_32);
/* Activate the LDT. */
+#ifdef MACH_HYP
+ hyp_set_ldt(&ldt, LDTSZ);
+#else /* MACH_HYP */
lldt(KERNEL_LDT);
+#endif /* MACH_HYP */
}
diff --git a/i386/i386/locore.S b/i386/i386/locore.S
index 13a44d9..663db43 100644
--- a/i386/i386/locore.S
+++ b/i386/i386/locore.S
@@ -36,6 +36,7 @@
#include <i386/ldt.h>
#include <i386/i386asm.h>
#include <i386/cpu_number.h>
+#include <i386/xen.h>
/*
* Fault recovery.
@@ -323,8 +324,9 @@ ENTRY(t_segnp)
trap_check_kernel_exit:
testl $(EFL_VM),16(%esp) /* is trap from V86 mode? */
jnz EXT(alltraps) /* isn`t kernel trap if so */
- testl $3,12(%esp) /* is trap from kernel mode? */
- jne EXT(alltraps) /* if so: */
+ /* Note: handling KERNEL_RING value by hand */
+ testl $2,12(%esp) /* is trap from kernel mode? */
+ jnz EXT(alltraps) /* if so: */
/* check for the kernel exit sequence */
cmpl $_kret_iret,8(%esp) /* on IRET? */
je fault_iret
@@ -410,7 +412,8 @@ push_segregs:
ENTRY(t_debug)
testl $(EFL_VM),8(%esp) /* is trap from V86 mode? */
jnz 0f /* isn`t kernel trap if so */
- testl $3,4(%esp) /* is trap from kernel mode? */
+ /* Note: handling KERNEL_RING value by hand */
+ testl $2,4(%esp) /* is trap from kernel mode? */
jnz 0f /* if so: */
cmpl $syscall_entry,(%esp) /* system call entry? */
jne 0f /* if so: */
@@ -429,7 +432,11 @@ ENTRY(t_debug)
ENTRY(t_page_fault)
pushl $(T_PAGE_FAULT) /* mark a page fault trap */
pusha /* save the general registers */
+#ifdef MACH_XEN
+ movl %ss:hyp_shared_info+CR2,%eax
+#else /* MACH_XEN */
movl %cr2,%eax /* get the faulting address */
+#endif /* MACH_XEN */
movl %eax,12(%esp) /* save in esp save slot */
jmp trap_push_segs /* continue fault */
@@ -465,7 +472,8 @@ trap_set_segs:
cld /* clear direction flag */
testl $(EFL_VM),R_EFLAGS(%esp) /* in V86 mode? */
jnz trap_from_user /* user mode trap if so */
- testb $3,R_CS(%esp) /* user mode trap? */
+ /* Note: handling KERNEL_RING value by hand */
+ testb $2,R_CS(%esp) /* user mode trap? */
jz trap_from_kernel /* kernel trap if not */
trap_from_user:
@@ -679,7 +687,8 @@ LEXT(return_to_iret) /* ( label for kdb_kintr and hardclock) */
testl $(EFL_VM),I_EFL(%esp) /* if in V86 */
jnz 0f /* or */
- testb $3,I_CS(%esp) /* user mode, */
+ /* Note: handling KERNEL_RING value by hand */
+ testb $2,I_CS(%esp) /* user mode, */
jz 1f /* check for ASTs */
0:
cmpl $0,CX(EXT(need_ast),%edx)
@@ -1156,9 +1165,14 @@ ENTRY(discover_x86_cpu_type)
movl %esp,%ebp /* Save stack pointer */
and $~0x3,%esp /* Align stack pointer */
+#ifdef MACH_HYP
+#warning Assuming not Cyrix CPU
+#else /* MACH_HYP */
inb $0xe8,%al /* Enable ID flag for Cyrix CPU ... */
andb $0x80,%al /* ... in CCR4 reg bit7 */
outb %al,$0xe8
+#endif /* MACH_HYP */
+
pushfl /* Fetch flags ... */
popl %eax /* ... into eax */
movl %eax,%ecx /* Save original flags for return */
@@ -1266,13 +1280,24 @@ Entry(copyoutmsg)
* XXX only have to do this on 386's.
*/
copyout_retry:
+#ifdef MACH_HYP
+ movl cr3,%ecx /* point to page directory */
+#else /* MACH_HYP */
movl %cr3,%ecx /* point to page directory */
+#endif /* MACH_HYP */
#if PAE
movl %edi,%eax /* get page directory pointer bits */
shrl $(PDPSHIFT),%eax /* from user address */
movl KERNELBASE(%ecx,%eax,PTE_SIZE),%ecx
/* get page directory pointer */
+#ifdef MACH_PSEUDO_PHYS
+ shrl $(PTESHIFT),%ecx
+ movl pfn_list,%eax
+ movl (%eax,%ecx,4),%ecx /* mfn_to_pfn */
+ shll $(PTESHIFT),%ecx
+#else /* MACH_PSEUDO_PHYS */
andl $(PTE_PFN),%ecx /* isolate page frame address */
+#endif /* MACH_PSEUDO_PHYS */
#endif /* PAE */
movl %edi,%eax /* get page directory bits */
shrl $(PDESHIFT),%eax /* from user address */
@@ -1283,7 +1308,14 @@ copyout_retry:
/* get page directory pointer */
testl $(PTE_V),%ecx /* present? */
jz 0f /* if not, fault is OK */
+#ifdef MACH_PSEUDO_PHYS
+ shrl $(PTESHIFT),%ecx
+ movl pfn_list,%eax
+ movl (%eax,%ecx,4),%ecx /* mfn_to_pfn */
+ shll $(PTESHIFT),%ecx
+#else /* MACH_PSEUDO_PHYS */
andl $(PTE_PFN),%ecx /* isolate page frame address */
+#endif /* MACH_PSEUDO_PHYS */
movl %edi,%eax /* get page table bits */
shrl $(PTESHIFT),%eax
andl $(PTEMASK),%eax /* from user address */
@@ -1297,9 +1329,17 @@ copyout_retry:
/*
* Not writable - must fake a fault. Turn off access to the page.
*/
+#ifdef MACH_HYP
+ pushl %edx
+ pushl %ecx
+ call hyp_invalidate_pte
+ popl %ecx
+ popl %edx
+#else /* MACH_HYP */
andl $(PTE_INVALID),(%ecx) /* turn off valid bit */
movl %cr3,%eax /* invalidate TLB */
movl %eax,%cr3
+#endif /* MACH_HYP */
0:
/*
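
The repeated testl $3 to testl $2 changes are the "handling KERNEL_RING value by hand" the comments refer to: under Xen the kernel runs in ring 1 (KERNEL_RING becomes 1 in the seg.h diff below), so the low two bits of a trapped CS no longer compare cleanly against 0. Bit 1 alone still separates the two cases on both configurations; in C terms:

    /* What testl $2, CS checks (sketch):
     *   native: kernel = ring 0, user = ring 3  ->  cs & 3 would do
     *   Xen:    kernel = ring 1, user = ring 3  ->  cs & 3 misfires
     * but bit 1 distinguishes rings {0,1} from {2,3} either way: */
    int trap_from_user(unsigned cs)
    {
            return (cs & 2) != 0;   /* 0 for ring 0/1, non-0 for ring 2/3 */
    }
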
diff --git a/i386/i386/mp_desc.c b/i386/i386/mp_desc.c
index 54660d5..2fd5ec2 100644
--- a/i386/i386/mp_desc.c
+++ b/i386/i386/mp_desc.c
@@ -31,6 +31,7 @@
#include <kern/cpu_number.h>
#include <kern/debug.h>
#include <mach/machine.h>
+#include <mach/xen.h>
#include <vm/vm_kern.h>
#include <i386/mp_desc.h>
@@ -149,6 +150,9 @@ mp_desc_init(mycpu)
* Fix up the entries in the GDT to point to
* this LDT and this TSS.
*/
+#ifdef MACH_HYP
+ panic("TODO %s:%d\n",__FILE__,__LINE__);
+#else /* MACH_HYP */
fill_descriptor(&mpt->gdt[sel_idx(KERNEL_LDT)],
(unsigned)&mpt->ldt,
LDTSZ * sizeof(struct real_descriptor) - 1,
@@ -161,6 +165,7 @@ mp_desc_init(mycpu)
mpt->ktss.tss.ss0 = KERNEL_DS;
mpt->ktss.tss.io_bit_map_offset = IOPB_INVAL;
mpt->ktss.barrier = 0xFF;
+#endif /* MACH_HYP */
return mpt;
}
diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
index 3226195..b9c52dd 100644
--- a/i386/i386/pcb.c
+++ b/i386/i386/pcb.c
@@ -31,6 +31,7 @@
#include <mach/kern_return.h>
#include <mach/thread_status.h>
#include <mach/exec/exec.h>
+#include <mach/xen.h>
#include "vm_param.h"
#include <kern/counters.h>
@@ -152,7 +153,12 @@ void switch_ktss(pcb)
? (int) (&pcb->iss + 1)
: (int) (&pcb->iss.v86_segs);
+#ifdef MACH_XEN
+ /* No IO mask here */
+ hyp_stack_switch(KERNEL_DS, pcb_stack_top);
+#else /* MACH_XEN */
curr_ktss(mycpu)->tss.esp0 = pcb_stack_top;
+#endif /* MACH_XEN */
}
{
@@ -164,22 +170,47 @@ void switch_ktss(pcb)
/*
* Use system LDT.
*/
+#ifdef MACH_HYP
+ hyp_set_ldt(&ldt, LDTSZ);
+#else /* MACH_HYP */
set_ldt(KERNEL_LDT);
+#endif /* MACH_HYP */
}
else {
/*
* Thread has its own LDT.
*/
+#ifdef MACH_HYP
+ hyp_set_ldt(tldt->ldt,
+ (tldt->desc.limit_low|(tldt->desc.limit_high<<16)) /
+ sizeof(struct real_descriptor));
+#else /* MACH_HYP */
*gdt_desc_p(mycpu,USER_LDT) = tldt->desc;
set_ldt(USER_LDT);
+#endif /* MACH_HYP */
}
}
+#ifdef MACH_XEN
+ {
+ int i;
+ for (i=0; i < USER_GDT_SLOTS; i++) {
+ if (memcmp(gdt_desc_p (mycpu, USER_GDT + (i << 3)),
+ &pcb->ims.user_gdt[i], sizeof pcb->ims.user_gdt[i])) {
+ if (hyp_do_update_descriptor(kv_to_ma(gdt_desc_p (mycpu, USER_GDT + (i << 3))),
+ *(unsigned long long *) &pcb->ims.user_gdt[i]))
+ panic("couldn't set user gdt %d\n",i);
+ }
+ }
+ }
+#else /* MACH_XEN */
+
/* Copy in the per-thread GDT slots. No reloading is necessary
because just restoring the segment registers on the way back to
user mode reloads the shadow registers from the in-memory GDT. */
memcpy (gdt_desc_p (mycpu, USER_GDT),
pcb->ims.user_gdt, sizeof pcb->ims.user_gdt);
+#endif /* MACH_XEN */
/*
* Load the floating-point context, if necessary.
diff --git a/i386/i386/phys.c b/i386/i386/phys.c
index 2c30f17..925593b 100644
--- a/i386/i386/phys.c
+++ b/i386/i386/phys.c
@@ -27,6 +27,7 @@
#include <string.h>
#include <mach/boolean.h>
+#include <mach/xen.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <vm/vm_map.h>
@@ -104,5 +105,9 @@ vm_offset_t addr;
if ((pte = pmap_pte(kernel_pmap, addr)) == PT_ENTRY_NULL)
return 0;
- return i386_trunc_page(*pte) | (addr & INTEL_OFFMASK);
+ return i386_trunc_page(
+#ifdef MACH_PSEUDO_PHYS
+ ma_to_pa
+#endif /* MACH_PSEUDO_PHYS */
+ (*pte)) | (addr & INTEL_OFFMASK);
}
diff --git a/i386/i386/proc_reg.h b/i386/i386/proc_reg.h
index d9f32bc..64d8c43 100644
--- a/i386/i386/proc_reg.h
+++ b/i386/i386/proc_reg.h
@@ -72,8 +72,10 @@
#ifndef __ASSEMBLER__
#ifdef __GNUC__
+#ifndef MACH_HYP
#include <i386/gdt.h>
#include <i386/ldt.h>
+#endif /* MACH_HYP */
static inline unsigned
get_eflags(void)
@@ -122,6 +124,16 @@ set_eflags(unsigned eflags)
_temp__; \
})
+#ifdef MACH_HYP
+extern unsigned long cr3;
+#define get_cr3() (cr3)
+#define set_cr3(value) \
+ ({ \
+ cr3 = (value); \
+ if (!hyp_set_cr3(value)) \
+ panic("set_cr3"); \
+ })
+#else /* MACH_HYP */
#define get_cr3() \
({ \
register unsigned int _temp__; \
@@ -134,9 +146,11 @@ set_eflags(unsigned eflags)
register unsigned int _temp__ = (value); \
asm volatile("mov %0, %%cr3" : : "r" (_temp__)); \
})
+#endif /* MACH_HYP */
#define flush_tlb() set_cr3(get_cr3())
+#ifndef MACH_HYP
#define invlpg(addr) \
({ \
asm volatile("invlpg (%0)" : : "r" (addr)); \
@@ -164,6 +178,7 @@ set_eflags(unsigned eflags)
: "+r" (var) : "r" (end), \
"q" (LINEAR_DS), "q" (KERNEL_DS), "i" (PAGE_SIZE)); \
})
+#endif /* MACH_HYP */
#define get_cr4() \
({ \
@@ -179,11 +194,18 @@ set_eflags(unsigned eflags)
})
+#ifdef MACH_HYP
+#define set_ts() \
+ hyp_fpu_taskswitch(1)
+#define clear_ts() \
+ hyp_fpu_taskswitch(0)
+#else /* MACH_HYP */
#define set_ts() \
set_cr0(get_cr0() | CR0_TS)
#define clear_ts() \
asm volatile("clts")
+#endif /* MACH_HYP */
#define get_tr() \
({ \
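
Since a mov from %cr3 would trap in ring 1, the MACH_HYP build keeps a shadow variable and routes writes through the hypervisor; get_cr3() just reads the cached copy, so flush_tlb(), defined as set_cr3(get_cr3()), keeps working unchanged. A sketch of typical use, where new_pdir_pa is a placeholder for a page-directory physical address:

    unsigned long old = get_cr3();  /* no trap: reads the shadow copy */
    set_cr3(new_pdir_pa);           /* caches the value, then issues
                                       MMUEXT_NEW_BASEPTR via hyp_set_cr3() */
    set_cr3(old);                   /* same path flushes the TLB natively
                                       and under Xen alike */
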
diff --git a/i386/i386/seg.h b/i386/i386/seg.h
index 9a09af5..01b1a2e 100644
--- a/i386/i386/seg.h
+++ b/i386/i386/seg.h
@@ -37,7 +37,12 @@
* i386 segmentation.
*/
+/* Note: the value of KERNEL_RING is handled by hand in locore.S */
+#ifdef MACH_HYP
+#define KERNEL_RING 1
+#else /* MACH_HYP */
#define KERNEL_RING 0
+#endif /* MACH_HYP */
#ifndef __ASSEMBLER__
@@ -118,6 +123,7 @@ struct real_gate {
#ifndef __ASSEMBLER__
#include <mach/inline.h>
+#include <mach/xen.h>
/* Format of a "pseudo-descriptor", used for loading the IDT and GDT. */
@@ -152,9 +158,15 @@ MACH_INLINE void lldt(unsigned short ldt_selector)
/* Fill a segment descriptor. */
MACH_INLINE void
-fill_descriptor(struct real_descriptor *desc, unsigned base, unsigned limit,
+fill_descriptor(struct real_descriptor *_desc, unsigned base, unsigned limit,
unsigned char access, unsigned char sizebits)
{
+ /* TODO: when !MACH_XEN, setting desc and just memcpy isn't simpler actually */
+#ifdef MACH_XEN
+ struct real_descriptor __desc, *desc = &__desc;
+#else /* MACH_XEN */
+ struct real_descriptor *desc = _desc;
+#endif /* MACH_XEN */
if (limit > 0xfffff)
{
limit >>= 12;
@@ -167,6 +179,10 @@ fill_descriptor(struct real_descriptor *desc, unsigned base, unsigned limit,
desc->limit_high = limit >> 16;
desc->granularity = sizebits;
desc->base_high = base >> 24;
+#ifdef MACH_XEN
+ if (hyp_do_update_descriptor(kv_to_ma(_desc), *(unsigned long long*)desc))
+ panic("couldn't update descriptor(%p to %08lx%08lx)\n", kv_to_ma(_desc), *(((unsigned long*)desc)+1), *(unsigned long *)desc);
+#endif /* MACH_XEN */
}
/* Fill a gate with particular values. */
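
fill_descriptor now builds the descriptor in a local under MACH_XEN and pushes the finished 8-byte value through the update_descriptor hypercall, because descriptor-table pages are read-only to the guest. As a worked example of what gets packed, the LINEAR_DS entry from gdt.c (base 0, limit 0xffffffff, ACC_PL_K|ACC_DATA_W, SZ_32) comes out as:

    /* Worked example (fill_descriptor also ORs ACC_P into access): */
    desc.limit_low   = 0xffff;   /* limit >>= 12 leaves 0xfffff */
    desc.base_low    = 0;
    desc.base_med    = 0;
    desc.access      = 0x92;     /* ACC_P | ACC_PL_K | ACC_DATA_W */
    desc.limit_high  = 0xf;
    desc.granularity = 0xc;      /* SZ_32 | SZ_G, 4 KiB granularity */
    desc.base_high   = 0;
    /* i.e. the classic flat data descriptor 0x00cf92000000ffff */
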
diff --git a/i386/i386/spl.S b/i386/i386/spl.S
index f77b556..f1d4b45 100644
--- a/i386/i386/spl.S
+++ b/i386/i386/spl.S
@@ -20,6 +20,8 @@
#include <mach/machine/asm.h>
#include <i386/ipl.h>
#include <i386/pic.h>
+#include <i386/i386asm.h>
+#include <i386/xen.h>
/*
* Set IPL to the specified value.
@@ -42,6 +44,7 @@
/*
* Program PICs with mask in %eax.
*/
+#ifndef MACH_XEN
#define SETMASK() \
cmpl EXT(curr_pic_mask),%eax; \
je 9f; \
@@ -50,6 +53,21 @@
movb %ah,%al; \
outb %al,$(PIC_SLAVE_OCW); \
9:
+#else /* MACH_XEN */
+#define pic_mask int_mask
+#define SETMASK() \
+ pushl %ebx; \
+ movl %eax,%ebx; \
+ xchgl %eax,hyp_shared_info+EVTMASK; \
+ notl %ebx; \
+ andl %eax,%ebx; /* Get unmasked events */ \
+ testl hyp_shared_info+PENDING, %ebx; \
+ popl %ebx; \
+ jz 9f; /* Check whether there was some pending */ \
+lock orl $1,hyp_shared_info+CPU_PENDING_SEL; /* Yes, activate it */ \
+ movb $1,hyp_shared_info+CPU_PENDING; \
+9:
+#endif /* MACH_XEN */
ENTRY(spl0)
movl EXT(curr_ipl),%eax /* save current ipl */
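
The Xen SETMASK() above replaces PIC reprogramming with an update of the shared-info event mask, with one subtlety: if an event arrived while masked, unmasking must retrigger the upcall by hand. The same logic in C, where new_mask stands for the mask handed in via %eax and only the first word of 32 event channels (the one the asm touches) is considered:

    /* C rendering of the MACH_XEN SETMASK() sequence: */
    unsigned long old = xchgl(&hyp_shared_info.evtchn_mask[0], new_mask);
    unsigned long unmasked = old & ~new_mask;    /* newly unmasked events */
    if (unmasked & hyp_shared_info.evtchn_pending[0]) {
            /* a masked event was already pending: mark it deliverable */
            hyp_shared_info.vcpu_info[0].evtchn_pending_sel |= 1;
            hyp_shared_info.vcpu_info[0].evtchn_upcall_pending = 1;
    }
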
diff --git a/i386/i386/trap.c b/i386/i386/trap.c
index 4361fcd..28a9e0c 100644
--- a/i386/i386/trap.c
+++ b/i386/i386/trap.c
@@ -585,6 +585,7 @@ i386_astintr()
int mycpu = cpu_number();
(void) splsched(); /* block interrupts to check reasons */
+#ifndef MACH_XEN
if (need_ast[mycpu] & AST_I386_FP) {
/*
* AST was for delayed floating-point exception -
@@ -596,7 +597,9 @@ i386_astintr()
fpastintr();
}
- else {
+ else
+#endif /* MACH_XEN */
+ {
/*
* Not an FPU trap. Handle the AST.
* Interrupts are still blocked.
diff --git a/i386/i386/user_ldt.c b/i386/i386/user_ldt.c
index 942ad07..dfe6b1e 100644
--- a/i386/i386/user_ldt.c
+++ b/i386/i386/user_ldt.c
@@ -39,6 +39,7 @@
#include <i386/seg.h>
#include <i386/thread.h>
#include <i386/user_ldt.h>
+#include <stddef.h>
#include "ldt.h"
#include "vm_param.h"
@@ -195,9 +196,17 @@ i386_set_ldt(thread, first_selector, desc_list, count, desc_list_inline)
if (new_ldt == 0) {
simple_unlock(&pcb->lock);
+#ifdef MACH_XEN
+ /* LDT needs to be aligned on a page */
+ vm_offset_t alloc = kalloc(ldt_size_needed + PAGE_SIZE + offsetof(struct user_ldt, ldt));
+ new_ldt = (user_ldt_t) (round_page((alloc + offsetof(struct user_ldt, ldt))) - offsetof(struct user_ldt, ldt));
+ new_ldt->alloc = alloc;
+
+#else /* MACH_XEN */
new_ldt = (user_ldt_t)
kalloc(ldt_size_needed
+ sizeof(struct real_descriptor));
+#endif /* MACH_XEN */
/*
* Build a descriptor that describes the
* LDT itself
@@ -263,9 +272,19 @@ i386_set_ldt(thread, first_selector, desc_list, count, desc_list_inline)
simple_unlock(&pcb->lock);
if (new_ldt)
+#ifdef MACH_XEN
+ {
+ int i;
+ for (i=0; i<(new_ldt->desc.limit_low + 1)/sizeof(struct real_descriptor); i+=PAGE_SIZE/sizeof(struct real_descriptor))
+ pmap_set_page_readwrite(&new_ldt->ldt[i]);
+ kfree(new_ldt->alloc, new_ldt->desc.limit_low + 1
+ + PAGE_SIZE + offsetof(struct user_ldt, ldt));
+ }
+#else /* MACH_XEN */
kfree((vm_offset_t)new_ldt,
new_ldt->desc.limit_low + 1
+ sizeof(struct real_descriptor));
+#endif /* MACH_XEN */
/*
* Free the descriptor list, if it was
@@ -398,9 +417,17 @@ void
user_ldt_free(user_ldt)
user_ldt_t user_ldt;
{
+#ifdef MACH_XEN
+ int i;
+ for (i=0; i<(user_ldt->desc.limit_low + 1)/sizeof(struct real_descriptor); i+=PAGE_SIZE/sizeof(struct real_descriptor))
+ pmap_set_page_readwrite(&user_ldt->ldt[i]);
+ kfree(user_ldt->alloc, user_ldt->desc.limit_low + 1
+ + PAGE_SIZE + offsetof(struct user_ldt, ldt));
+#else /* MACH_XEN */
kfree((vm_offset_t)user_ldt,
user_ldt->desc.limit_low + 1
+ sizeof(struct real_descriptor));
+#endif /* MACH_XEN */
}
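
Xen requires LDT pages to be page-aligned and registered read-only, so the MACH_XEN allocation above over-allocates by a page plus the header size, places ldt[] on the next page boundary, and records the raw kalloc() address in the new alloc field for the matching kfree(). A worked example of the arithmetic, assuming offsetof(struct user_ldt, ldt) == 12 (4-byte alloc plus 8-byte desc) and PAGE_SIZE == 0x1000:

    /* kalloc() returns               0x12345
     * 0x12345 + 12                 = 0x12351
     * round_page(0x12351)          = 0x13000  -> &new_ldt->ldt[0]
     * 0x13000 - 12                 = 0x12ff4  -> new_ldt itself
     * The header lands just below the page boundary, inside the same
     * allocation, and ldt[] is aligned as Xen requires. */
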
diff --git a/i386/i386/user_ldt.h b/i386/i386/user_ldt.h
index dd3ad4b..8d16ed8 100644
--- a/i386/i386/user_ldt.h
+++ b/i386/i386/user_ldt.h
@@ -36,6 +36,9 @@
#include <i386/seg.h>
struct user_ldt {
+#ifdef MACH_XEN
+ vm_offset_t alloc; /* allocation before alignment */
+#endif /* MACH_XEN */
struct real_descriptor desc; /* descriptor for self */
struct real_descriptor ldt[1]; /* descriptor table (variable) */
};
diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h
index 8e92e79..95df604 100644
--- a/i386/i386/vm_param.h
+++ b/i386/i386/vm_param.h
@@ -25,10 +25,25 @@
/* XXX use xu/vm_param.h */
#include <mach/vm_param.h>
+#include <xen/public/xen.h>
/* The kernel address space is 1GB, starting at virtual address 0. */
-#define VM_MIN_KERNEL_ADDRESS (0x00000000)
-#define VM_MAX_KERNEL_ADDRESS ((LINEAR_MAX_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS))
+#ifdef MACH_XEN
+#define VM_MIN_KERNEL_ADDRESS 0x20000000UL
+#else /* MACH_XEN */
+#define VM_MIN_KERNEL_ADDRESS 0x00000000UL
+#endif /* MACH_XEN */
+
+#ifdef MACH_XEN
+#if PAE
+#define HYP_VIRT_START HYPERVISOR_VIRT_START_PAE
+#else /* PAE */
+#define HYP_VIRT_START HYPERVISOR_VIRT_START_NONPAE
+#endif /* PAE */
+#define VM_MAX_KERNEL_ADDRESS (HYP_VIRT_START - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS)
+#else /* MACH_XEN */
+#define VM_MAX_KERNEL_ADDRESS (LINEAR_MAX_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS)
+#endif /* MACH_XEN */
/* The kernel virtual address space is actually located
at high linear addresses.
@@ -36,8 +51,14 @@
#define LINEAR_MIN_KERNEL_ADDRESS (VM_MAX_ADDRESS)
#define LINEAR_MAX_KERNEL_ADDRESS (0xffffffffUL)
+#ifdef MACH_XEN
+/* need room for mmu updates (2*8bytes) */
+#define KERNEL_STACK_SIZE (4*I386_PGBYTES)
+#define INTSTACK_SIZE (4*I386_PGBYTES)
+#else /* MACH_XEN */
#define KERNEL_STACK_SIZE (1*I386_PGBYTES)
#define INTSTACK_SIZE (1*I386_PGBYTES)
+#endif /* MACH_XEN */
/* interrupt stack size */
/*
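
Moving VM_MIN_KERNEL_ADDRESS to 0x20000000 and capping VM_MAX_KERNEL_ADDRESS at the hypervisor hole shrinks the kernel's virtual window under Xen. Working the non-PAE numbers, assuming HYPERVISOR_VIRT_START_NONPAE == 0xFC000000 and LINEAR_MIN_KERNEL_ADDRESS == VM_MAX_ADDRESS == 0xC0000000:

    /* VM_MAX_KERNEL_ADDRESS = HYP_VIRT_START
     *                         - LINEAR_MIN_KERNEL_ADDRESS
     *                         + VM_MIN_KERNEL_ADDRESS
     *                       = 0xFC000000 - 0xC0000000 + 0x20000000
     *                       = 0x5C000000
     * i.e. about 960 MiB of kernel virtual space, ending well below
     * the region Xen reserves for itself. */
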
diff --git a/i386/i386/xen.h b/i386/i386/xen.h
new file mode 100644
index 0000000..a7fb641
--- /dev/null
+++ b/i386/i386/xen.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef XEN_HYPCALL_H
+#define XEN_HYPCALL_H
+
+#ifdef MACH_XEN
+#ifndef __ASSEMBLER__
+#include <kern/printf.h>
+#include <mach/machine/vm_types.h>
+#include <mach/vm_param.h>
+#include <mach/inline.h>
+#include <machine/vm_param.h>
+#include <intel/pmap.h>
+#include <kern/debug.h>
+#include <xen/public/xen.h>
+
+/* TODO: this should be moved in appropriate non-Xen place. */
+#define barrier() __asm__ __volatile__ ("": : :"memory")
+#define mb() __asm__ __volatile__("lock; addl $0,0(%esp)")
+#define rmb() mb()
+#define wmb() mb()
+MACH_INLINE unsigned long xchgl(volatile unsigned long *ptr, unsigned long x)
+{
+ __asm__ __volatile__("xchgl %0, %1"
+ : "=r" (x)
+ : "m" (*(ptr)), "0" (x): "memory");
+ return x;
+}
+#define _TOSTR(x) #x
+#define TOSTR(x) _TOSTR (x)
+
+
+
+/* x86-specific hypercall interface. */
+#define _hypcall0(type, name) \
+MACH_INLINE type hyp_##name(void) \
+{ \
+ long __ret; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret) \
+ : : "memory"); \
+ return __ret; \
+}
+
+#define _hypcall1(type, name, type1, arg1) \
+MACH_INLINE type hyp_##name(type1 arg1) \
+{ \
+ long __ret; \
+ long foo1; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret), \
+ "=b" (foo1) \
+ : "1" ((long)arg1) \
+ : "memory"); \
+ return __ret; \
+}
+
+#define _hypcall2(type, name, type1, arg1, type2, arg2) \
+MACH_INLINE type hyp_##name(type1 arg1, type2 arg2) \
+{ \
+ long __ret; \
+ long foo1, foo2; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret), \
+ "=b" (foo1), \
+ "=c" (foo2) \
+ : "1" ((long)arg1), \
+ "2" ((long)arg2) \
+ : "memory"); \
+ return __ret; \
+}
+
+#define _hypcall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3) \
+{ \
+ long __ret; \
+ long foo1, foo2, foo3; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret), \
+ "=b" (foo1), \
+ "=c" (foo2), \
+ "=d" (foo3) \
+ : "1" ((long)arg1), \
+ "2" ((long)arg2), \
+ "3" ((long)arg3) \
+ : "memory"); \
+ return __ret; \
+}
+
+#define _hypcall4(type, name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
+{ \
+ long __ret; \
+ long foo1, foo2, foo3, foo4; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret), \
+ "=b" (foo1), \
+ "=c" (foo2), \
+ "=d" (foo3), \
+ "=S" (foo4) \
+ : "1" ((long)arg1), \
+ "2" ((long)arg2), \
+ "3" ((long)arg3), \
+ "4" ((long)arg4) \
+ : "memory"); \
+ return __ret; \
+}
+
+#define _hypcall5(type, name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) \
+MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
+{ \
+ long __ret; \
+ long foo1, foo2, foo3, foo4, foo5; \
+ asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \
+ : "=a" (__ret), \
+ "=b" (foo1), \
+ "=c" (foo2), \
+ "=d" (foo3), \
+ "=S" (foo4), \
+ "=D" (foo5) \
+ : "1" ((long)arg1), \
+ "2" ((long)arg2), \
+ "3" ((long)arg3), \
+ "4" ((long)arg4), \
+ "5" ((long)arg5) \
+ : "memory"); \
+ return __ret; \
+}
+
+/* x86 Hypercalls */
+
+/* Note: since Hypervisor uses flat memory model, remember to always use
+ * kvtolin when giving pointers as parameters for the hypercall to read data
+ * at. Use kv_to_la when they may be used before GDT got set up. */
+
+_hypcall1(long, set_trap_table, vm_offset_t /* struct trap_info * */, traps);
+
+_hypcall4(int, mmu_update, vm_offset_t /* struct mmu_update * */, req, int, count, vm_offset_t /* int * */, success_count, domid_t, domid)
+MACH_INLINE int hyp_mmu_update_pte(unsigned long pte, unsigned long long val)
+{
+ struct mmu_update update =
+ {
+ .ptr = pte,
+ .val = val,
+ };
+ int count;
+ hyp_mmu_update(kv_to_la(&update), 1, kv_to_la(&count), DOMID_SELF);
+ return count;
+}
+/* Note: make sure this fits in KERNEL_STACK_SIZE */
+#define HYP_BATCH_MMU_UPDATES 256
+
+#define hyp_mmu_update_la(la, val) hyp_mmu_update_pte( \
+ (unsigned long)(((pt_entry_t*)(kernel_pmap->dirbase[lin2pdenum((unsigned long)la)] & INTEL_PTE_PFN)) \
+ + ptenum((unsigned long)la)), val)
+
+_hypcall2(long, set_gdt, vm_offset_t /* unsigned long * */, frame_list, unsigned int, entries)
+
+_hypcall2(long, stack_switch, unsigned long, ss, unsigned long, esp);
+
+_hypcall4(long, set_callbacks, unsigned long, es, void *, ea,
+ unsigned long, fss, void *, fsa);
+_hypcall1(long, fpu_taskswitch, int, set);
+
+_hypcall4(long, update_descriptor, unsigned long, ma_lo, unsigned long, ma_hi, unsigned long, desc_lo, unsigned long, desc_hi);
+#define hyp_do_update_descriptor(ma, desc) ({ \
+ unsigned long long __desc = (desc); \
+ hyp_update_descriptor(ma, 0, __desc, __desc >> 32); \
+})
+
+#include <xen/public/memory.h>
+_hypcall2(long, memory_op, unsigned long, cmd, vm_offset_t /* void * */, arg);
+MACH_INLINE void hyp_free_mfn(unsigned long mfn)
+{
+ struct xen_memory_reservation reservation;
+ reservation.extent_start = (void*) kvtolin(&mfn);
+ reservation.nr_extents = 1;
+ reservation.extent_order = 0;
+ reservation.address_bits = 0;
+ reservation.domid = DOMID_SELF;
+ if (hyp_memory_op(XENMEM_decrease_reservation, kvtolin(&reservation)) != 1)
+ panic("couldn't free page %d\n", mfn);
+}
+
+_hypcall4(int, update_va_mapping, unsigned long, va, unsigned long, val_lo, unsigned long, val_hi, unsigned long, flags);
+#define hyp_do_update_va_mapping(va, val, flags) ({ \
+ unsigned long long __val = (val); \
+ hyp_update_va_mapping(va, __val & 0xffffffffU, __val >> 32, flags); \
+})
+
+MACH_INLINE void hyp_free_page(unsigned long pfn, void *va)
+{
+ /* save mfn */
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* remove from mappings */
+ if (hyp_do_update_va_mapping(kvtolin(va), 0, UVMF_INVLPG|UVMF_ALL))
+ panic("couldn't clear page %d at %p\n", pfn, va);
+
+#ifdef MACH_PSEUDO_PHYS
+ /* drop machine page */
+ mfn_list[pfn] = ~0;
+#endif /* MACH_PSEUDO_PHYS */
+
+ /* and free from Xen */
+ hyp_free_mfn(mfn);
+}
+
+_hypcall4(int, mmuext_op, vm_offset_t /* struct mmuext_op * */, op, int, count, vm_offset_t /* int * */, success_count, domid_t, domid);
+MACH_INLINE int hyp_mmuext_op_void(unsigned int cmd)
+{
+ struct mmuext_op op = {
+ .cmd = cmd,
+ };
+ int count;
+ hyp_mmuext_op(kv_to_la(&op), 1, kv_to_la(&count), DOMID_SELF);
+ return count;
+}
+MACH_INLINE int hyp_mmuext_op_mfn(unsigned int cmd, unsigned long mfn)
+{
+ struct mmuext_op op = {
+ .cmd = cmd,
+ .arg1.mfn = mfn,
+ };
+ int count;
+ hyp_mmuext_op(kv_to_la(&op), 1, kv_to_la(&count), DOMID_SELF);
+ return count;
+}
+MACH_INLINE void hyp_set_ldt(void *ldt, unsigned long nbentries) {
+ struct mmuext_op op = {
+ .cmd = MMUEXT_SET_LDT,
+ .arg1.linear_addr = kvtolin(ldt),
+ .arg2.nr_ents = nbentries,
+ };
+ int count;
+ if (((unsigned long)ldt) & PAGE_MASK)
+ panic("ldt %p is not aligned on a page\n", ldt);
+ for (count=0; count<nbentries; count+= PAGE_SIZE/8)
+ pmap_set_page_readonly(ldt+count*8);
+ hyp_mmuext_op(kvtolin(&op), 1, kvtolin(&count), DOMID_SELF);
+ if (!count)
+ panic("couldn't set LDT\n");
+}
+/* TODO: use xen_pfn_to_cr3/xen_cr3_to_pfn to cope with pdp above 4GB */
+#define hyp_set_cr3(value) hyp_mmuext_op_mfn(MMUEXT_NEW_BASEPTR, pa_to_mfn(value))
+MACH_INLINE void hyp_invlpg(vm_offset_t lin) {
+ struct mmuext_op ops;
+ int n;
+ ops.cmd = MMUEXT_INVLPG_ALL;
+ ops.arg1.linear_addr = lin;
+ hyp_mmuext_op(kvtolin(&ops), 1, kvtolin(&n), DOMID_SELF);
+ if (n < 1)
+ panic("couldn't invlpg\n");
+}
+
+_hypcall2(long, set_timer_op, unsigned long, absolute_lo, unsigned long, absolute_hi);
+#define hyp_do_set_timer_op(absolute_nsec) ({ \
+ unsigned long long __absolute = (absolute_nsec); \
+ hyp_set_timer_op(__absolute, __absolute >> 32); \
+})
+
+#include <xen/public/event_channel.h>
+_hypcall1(int, event_channel_op, vm_offset_t /* evtchn_op_t * */, op);
+MACH_INLINE int hyp_event_channel_send(evtchn_port_t port) {
+ evtchn_op_t op = {
+ .cmd = EVTCHNOP_send,
+ .u.send.port = port,
+ };
+ return hyp_event_channel_op(kvtolin(&op));
+}
+MACH_INLINE evtchn_port_t hyp_event_channel_alloc(domid_t domid) {
+ evtchn_op_t op = {
+ .cmd = EVTCHNOP_alloc_unbound,
+ .u.alloc_unbound.dom = DOMID_SELF,
+ .u.alloc_unbound.remote_dom = domid,
+ };
+ if (hyp_event_channel_op(kvtolin(&op)))
+ panic("couldn't allocate event channel");
+ return op.u.alloc_unbound.port;
+}
+MACH_INLINE evtchn_port_t hyp_event_channel_bind_virq(uint32_t virq, uint32_t vcpu) {
+ evtchn_op_t op = { .cmd = EVTCHNOP_bind_virq, .u.bind_virq = { .virq = virq, .vcpu = vcpu }};
+ if (hyp_event_channel_op(kvtolin(&op)))
+ panic("can't bind virq %d\n",virq);
+ return op.u.bind_virq.port;
+}
+
+_hypcall3(int, console_io, int, cmd, int, count, vm_offset_t /* const char * */, buffer);
+
+_hypcall3(long, grant_table_op, unsigned int, cmd, vm_offset_t /* void * */, uop, unsigned int, count);
+
+_hypcall2(long, vm_assist, unsigned int, cmd, unsigned int, type);
+
+_hypcall0(long, iret);
+
+#include <xen/public/sched.h>
+_hypcall2(long, sched_op, int, cmd, vm_offset_t /* void* */, arg)
+#define hyp_yield() hyp_sched_op(SCHEDOP_yield, 0)
+#define hyp_block() hyp_sched_op(SCHEDOP_block, 0)
+MACH_INLINE void __attribute__((noreturn)) hyp_crash(void)
+{
+ unsigned int shut = SHUTDOWN_crash;
+ hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut));
+ /* really shouldn't return */
+ printf("uh, shutdown returned?!\n");
+ for(;;);
+}
+
+MACH_INLINE void __attribute__((noreturn)) hyp_halt(void)
+{
+ unsigned int shut = SHUTDOWN_poweroff;
+ hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut));
+ /* really shouldn't return */
+ printf("uh, shutdown returned?!\n");
+ for(;;);
+}
+
+MACH_INLINE void __attribute__((noreturn)) hyp_reboot(void)
+{
+ unsigned int shut = SHUTDOWN_reboot;
+ hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut));
+ /* really shouldn't return */
+ printf("uh, reboot returned?!\n");
+ for(;;);
+}
+
+/* x86-specific */
+MACH_INLINE unsigned64_t hyp_cpu_clock(void) {
+ unsigned64_t tsc;
+ asm volatile("rdtsc":"=A"(tsc));
+ return tsc;
+}
+
+#else /* __ASSEMBLER__ */
+/* TODO: SMP */
+#define cli movb $0xff,hyp_shared_info+CPU_CLI
+#define sti call hyp_sti
+#endif /* ASSEMBLER */
+#endif /* MACH_XEN */
+
+#endif /* XEN_HYPCALL_H */
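
The _hypcallN macros implement Xen's x86-32 calling convention: arguments in %ebx/%ecx/%edx/%esi/%edi, result in %eax, and the call lands in a hypercall transfer page ("hypcalls", 32 bytes per entry, assumed here to be reserved by xen_boothdr.S and filled in by the hypervisor at boot). TOSTR expands the __HYPERVISOR_* constant before stringizing, so the asm template ends up with a literal offset. As a concrete instance, the stack_switch wrapper declared above expands to roughly:

    /* Approximate expansion of
     * _hypcall2(long, stack_switch, unsigned long, ss, unsigned long, esp),
     * with __HYPERVISOR_stack_switch stringized to its value, 3: */
    MACH_INLINE long hyp_stack_switch(unsigned long ss, unsigned long esp)
    {
            long __ret;
            long foo1, foo2;
            asm volatile ("call hypcalls+(3*32)"      /* hypercall no. 3 */
                          : "=a" (__ret), "=b" (foo1), "=c" (foo2)
                          : "1" ((long) ss), "2" ((long) esp)
                          : "memory");
            return __ret;
    }
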
diff --git a/i386/i386at/conf.c b/i386/i386at/conf.c
index 23c2a6f..f5ab36c 100644
--- a/i386/i386at/conf.c
+++ b/i386/i386at/conf.c
@@ -34,6 +34,7 @@ extern int timeopen(), timeclose();
extern vm_offset_t timemmap();
#define timename "time"
+#ifndef MACH_HYP
extern int kdopen(), kdclose(), kdread(), kdwrite();
extern int kdgetstat(), kdsetstat(), kdportdeath();
extern vm_offset_t kdmmap();
@@ -50,17 +51,26 @@ extern int lpropen(), lprclose(), lprread(), lprwrite();
extern int lprgetstat(), lprsetstat(), lprportdeath();
#define lprname "lpr"
#endif /* NLPR > 0 */
+#endif /* MACH_HYP */
extern int kbdopen(), kbdclose(), kbdread();
extern int kbdgetstat(), kbdsetstat();
#define kbdname "kbd"
+#ifndef MACH_HYP
extern int mouseopen(), mouseclose(), mouseread(), mousegetstat();
#define mousename "mouse"
+#endif /* MACH_HYP */
extern int kmsgopen(), kmsgclose(), kmsgread(), kmsggetstat();
#define kmsgname "kmsg"
+#ifdef MACH_HYP
+extern int hypcnopen(), hypcnclose(), hypcnread(), hypcnwrite();
+extern int hypcngetstat(), hypcnsetstat(), hypcnportdeath();
+#define hypcnname "hyp"
+#endif /* MACH_HYP */
+
/*
* List of devices - console must be at slot 0
*/
@@ -79,16 +89,19 @@ struct dev_ops dev_name_list[] =
nodev, nulldev, nulldev, 0,
nodev },
+#ifndef MACH_HYP
{ kdname, kdopen, kdclose, kdread,
kdwrite, kdgetstat, kdsetstat, kdmmap,
nodev, nulldev, kdportdeath, 0,
nodev },
+#endif /* MACH_HYP */
{ timename, timeopen, timeclose, nulldev,
nulldev, nulldev, nulldev, timemmap,
nodev, nulldev, nulldev, 0,
nodev },
+#ifndef MACH_HYP
#if NCOM > 0
{ comname, comopen, comclose, comread,
comwrite, comgetstat, comsetstat, nomap,
@@ -107,6 +120,7 @@ struct dev_ops dev_name_list[] =
nodev, mousegetstat, nulldev, nomap,
nodev, nulldev, nulldev, 0,
nodev },
+#endif /* MACH_HYP */
{ kbdname, kbdopen, kbdclose, kbdread,
nodev, kbdgetstat, kbdsetstat, nomap,
@@ -120,6 +134,13 @@ struct dev_ops dev_name_list[] =
nodev },
#endif
+#ifdef MACH_HYP
+ { hypcnname, hypcnopen, hypcnclose, hypcnread,
+ hypcnwrite, hypcngetstat, hypcnsetstat, nomap,
+ nodev, nulldev, hypcnportdeath, 0,
+ nodev },
+#endif /* MACH_HYP */
+
};
int dev_name_count = sizeof(dev_name_list)/sizeof(dev_name_list[0]);
diff --git a/i386/i386at/cons_conf.c b/i386/i386at/cons_conf.c
index 8784ed9..ea8ccb5 100644
--- a/i386/i386at/cons_conf.c
+++ b/i386/i386at/cons_conf.c
@@ -30,19 +30,27 @@
#include <sys/types.h>
#include <device/cons.h>
+#ifdef MACH_HYP
+extern int hypcnprobe(), hypcninit(), hypcngetc(), hypcnputc();
+#else /* MACH_HYP */
extern int kdcnprobe(), kdcninit(), kdcngetc(), kdcnputc();
#if NCOM > 0 && RCLINE >= 0
extern int comcnprobe(), comcninit(), comcngetc(), comcnputc();
#endif
+#endif /* MACH_HYP */
/*
* The rest of the consdev fields are filled in by the respective
* cnprobe routine.
*/
struct consdev constab[] = {
+#ifdef MACH_HYP
+ {"hyp", hypcnprobe, hypcninit, hypcngetc, hypcnputc},
+#else /* MACH_HYP */
{"kd", kdcnprobe, kdcninit, kdcngetc, kdcnputc},
#if NCOM > 0 && RCLINE >= 0 && 1
{"com", comcnprobe, comcninit, comcngetc, comcnputc},
#endif
+#endif /* MACH_HYP */
{0}
};
diff --git a/i386/i386at/model_dep.c b/i386/i386at/model_dep.c
index 3ebe2e6..61605a1 100644
--- a/i386/i386at/model_dep.c
+++ b/i386/i386at/model_dep.c
@@ -40,6 +40,7 @@
#include <mach/vm_prot.h>
#include <mach/machine.h>
#include <mach/machine/multiboot.h>
+#include <mach/xen.h>
#include <i386/vm_param.h>
#include <kern/assert.h>
@@ -48,6 +49,7 @@
#include <kern/mach_clock.h>
#include <kern/printf.h>
#include <sys/time.h>
+#include <sys/types.h>
#include <vm/vm_page.h>
#include <i386/fpu.h>
#include <i386/gdt.h>
@@ -65,6 +67,12 @@
#include <i386at/int_init.h>
#include <i386at/kd.h>
#include <i386at/rtc.h>
+#ifdef MACH_XEN
+#include <xen/console.h>
+#include <xen/store.h>
+#include <xen/evt.h>
+#include <xen/xen.h>
+#endif /* MACH_XEN */
/* Location of the kernel's symbol table.
Both of these are 0 if none is available. */
@@ -81,7 +89,20 @@ vm_offset_t phys_first_addr = 0;
vm_offset_t phys_last_addr;
/* A copy of the multiboot info structure passed by the boot loader. */
+#ifdef MACH_XEN
+struct start_info boot_info;
+#ifdef MACH_PSEUDO_PHYS
+unsigned long *mfn_list;
+#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS
+unsigned long *pfn_list = (void*) PFN_LIST;
+#endif
+#endif /* MACH_PSEUDO_PHYS */
+#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS
+unsigned long la_shift = VM_MIN_KERNEL_ADDRESS;
+#endif
+#else /* MACH_XEN */
struct multiboot_info boot_info;
+#endif /* MACH_XEN */
/* Command line supplied to kernel. */
char *kernel_cmdline = "";
@@ -90,7 +111,11 @@ char *kernel_cmdline = "";
it gets bumped up through physical memory
that exists and is not occupied by boot gunk.
It is not necessarily page-aligned. */
-static vm_offset_t avail_next = 0x1000; /* XX end of BIOS data area */
+static vm_offset_t avail_next
+#ifndef MACH_HYP
+ = 0x1000 /* XX end of BIOS data area */
+#endif /* MACH_HYP */
+ ;
/* Possibly overestimated amount of available memory
still remaining to be handed to the VM system. */
@@ -135,6 +160,9 @@ void machine_init(void)
*/
init_fpu();
+#ifdef MACH_HYP
+ hyp_init();
+#else /* MACH_HYP */
#ifdef LINUX_DEV
/*
* Initialize Linux drivers.
@@ -146,16 +174,19 @@ void machine_init(void)
* Find the devices
*/
probeio();
+#endif /* MACH_HYP */
/*
* Get the time
*/
inittodr();
+#ifndef MACH_HYP
/*
* Tell the BIOS not to clear and test memory.
*/
*(unsigned short *)phystokv(0x472) = 0x1234;
+#endif /* MACH_HYP */
/*
* Unmap page 0 to trap NULL references.
@@ -166,8 +197,17 @@ void machine_init(void)
/* Conserve power on processor CPU. */
void machine_idle (int cpu)
{
+#ifdef MACH_HYP
+ hyp_idle();
+#else /* MACH_HYP */
assert (cpu == cpu_number ());
asm volatile ("hlt" : : : "memory");
+#endif /* MACH_HYP */
+}
+
+void machine_relax ()
+{
+ asm volatile ("rep; nop" : : : "memory");
}
/*
@@ -175,9 +215,13 @@ void machine_idle (int cpu)
*/
void halt_cpu(void)
{
+#ifdef MACH_HYP
+ hyp_halt();
+#else /* MACH_HYP */
asm volatile("cli");
while (TRUE)
machine_idle (cpu_number ());
+#endif /* MACH_HYP */
}
/*
@@ -187,10 +231,16 @@ void halt_all_cpus(reboot)
boolean_t reboot;
{
if (reboot) {
+#ifdef MACH_HYP
+ hyp_reboot();
+#endif /* MACH_HYP */
kdreboot();
}
else {
rebootflag = 1;
+#ifdef MACH_HYP
+ hyp_halt();
+#endif /* MACH_HYP */
printf("In tight loop: hit ctl-alt-del to reboot\n");
(void) spl0();
}
@@ -215,22 +265,26 @@ void db_reset_cpu(void)
void
mem_size_init(void)
{
- vm_size_t phys_last_kb;
-
/* Physical memory on all PCs starts at physical address 0.
XX make it a constant. */
phys_first_addr = 0;
- phys_last_kb = 0x400 + boot_info.mem_upper;
+#ifdef MACH_HYP
+ if (boot_info.nr_pages >= 0x100000) {
+ printf("Truncating memory size to 4GiB\n");
+ phys_last_addr = 0xffffffffU;
+ } else
+ phys_last_addr = boot_info.nr_pages * 0x1000;
+#else /* MACH_HYP */
+ /* TODO: support mmap */
+ vm_size_t phys_last_kb = 0x400 + boot_info.mem_upper;
/* Avoid 4GiB overflow. */
if (phys_last_kb < 0x400 || phys_last_kb >= 0x400000) {
printf("Truncating memory size to 4GiB\n");
- phys_last_kb = 0x400000 - 1;
- }
-
- /* TODO: support mmap */
-
- phys_last_addr = phys_last_kb * 0x400;
+ phys_last_addr = 0xffffffffU;
+ } else
+ phys_last_addr = phys_last_kb * 0x400;
+#endif /* MACH_HYP */
printf("AT386 boot: physical memory from 0x%x to 0x%x\n",
phys_first_addr, phys_last_addr);
@@ -240,14 +294,20 @@ mem_size_init(void)
if (phys_last_addr > ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 6) * 5) {
phys_last_addr = ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 6) * 5;
printf("Truncating memory size to %dMiB\n", (phys_last_addr - phys_first_addr) / (1024 * 1024));
+ /* TODO Xen: free lost memory */
}
phys_first_addr = round_page(phys_first_addr);
phys_last_addr = trunc_page(phys_last_addr);
+#ifdef MACH_HYP
+ /* Memory is just contiguous */
+ avail_remaining = phys_last_addr;
+#else /* MACH_HYP */
avail_remaining
= phys_last_addr - (0x100000 - (boot_info.mem_lower * 0x400)
- 0x1000);
+#endif /* MACH_HYP */
}
/*
@@ -263,13 +323,20 @@ i386at_init(void)
/*
* Initialize the PIC prior to any possible call to an spl.
*/
+#ifndef MACH_HYP
picinit();
+#else /* MACH_HYP */
+ hyp_intrinit();
+#endif /* MACH_HYP */
/*
* Find memory size parameters.
*/
mem_size_init();
+#ifdef MACH_XEN
+ kernel_cmdline = (char*) boot_info.cmd_line;
+#else /* MACH_XEN */
/* Copy content pointed by boot_info before losing access to it when it
* is too far in physical memory. */
if (boot_info.flags & MULTIBOOT_CMDLINE) {
@@ -304,6 +371,7 @@ i386at_init(void)
m[i].string = addr;
}
}
+#endif /* MACH_XEN */
/*
* Initialize kernel physical map, mapping the
@@ -325,19 +393,42 @@ i386at_init(void)
kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)] =
kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)];
#if PAE
+ /* PAE page tables are 2MB only */
kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1] =
kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS) + 1];
+ kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2] =
+ kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS) + 2];
+#endif /* PAE */
+#ifdef MACH_XEN
+ {
+ int i;
+ for (i = 0; i < PDPNUM; i++)
+ pmap_set_page_readonly_init((void*) kernel_page_dir + i * INTEL_PGBYTES);
+#if PAE
+ pmap_set_page_readonly_init(kernel_pmap->pdpbase);
+#endif /* PAE */
+ }
+#endif /* MACH_XEN */
+#if PAE
set_cr3((unsigned)_kvtophys(kernel_pmap->pdpbase));
+#ifndef MACH_HYP
if (!CPU_HAS_FEATURE(CPU_FEATURE_PAE))
panic("CPU doesn't have support for PAE.");
set_cr4(get_cr4() | CR4_PAE);
+#endif /* MACH_HYP */
#else
set_cr3((unsigned)_kvtophys(kernel_page_dir));
#endif /* PAE */
+#ifndef MACH_HYP
if (CPU_HAS_FEATURE(CPU_FEATURE_PGE))
set_cr4(get_cr4() | CR4_PGE);
+ /* already set by Hypervisor */
set_cr0(get_cr0() | CR0_PG | CR0_WP);
+#endif /* MACH_HYP */
flush_instr_queue();
+#ifdef MACH_XEN
+ pmap_clear_bootstrap_pagetable((void *)boot_info.pt_base);
+#endif /* MACH_XEN */
/* Interrupt stacks are allocated in physical memory,
while kernel stacks are allocated in kernel virtual memory,
@@ -349,18 +440,47 @@ i386at_init(void)
*/
gdt_init();
idt_init();
+#ifndef MACH_HYP
int_init();
+#endif /* MACH_HYP */
ldt_init();
ktss_init();
/* Get rid of the temporary direct mapping and flush it out of the TLB. */
+#ifdef MACH_XEN
+#ifdef MACH_PSEUDO_PHYS
+ if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)]), 0))
+#else /* MACH_PSEUDO_PHYS */
+ if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS, 0, UVMF_INVLPG | UVMF_ALL))
+#endif /* MACH_PSEUDO_PHYS */
+ printf("couldn't unmap frame 0\n");
+#if PAE
+#ifdef MACH_PSEUDO_PHYS
+ if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1]), 0))
+#else /* MACH_PSEUDO_PHYS */
+ if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS + INTEL_PGBYTES, 0, UVMF_INVLPG | UVMF_ALL))
+#endif /* MACH_PSEUDO_PHYS */
+ printf("couldn't unmap frame 1\n");
+#ifdef MACH_PSEUDO_PHYS
+ if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2]), 0))
+#else /* MACH_PSEUDO_PHYS */
+ if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS + 2*INTEL_PGBYTES, 0, UVMF_INVLPG | UVMF_ALL))
+#endif /* MACH_PSEUDO_PHYS */
+ printf("couldn't unmap frame 2\n");
+#endif /* PAE */
+ hyp_free_page(0, (void*) VM_MIN_KERNEL_ADDRESS);
+#else /* MACH_XEN */
kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)] = 0;
#if PAE
kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1] = 0;
+ kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2] = 0;
#endif /* PAE */
+#endif /* MACH_XEN */
flush_tlb();
-
+#ifdef MACH_XEN
+ hyp_p2m_init();
+#endif /* MACH_XEN */
/* XXX We'll just use the initialization stack we're already running on
as the interrupt stack for now. Later this will have to change,
@@ -384,6 +504,15 @@ void c_boot_entry(vm_offset_t bi)
printf(version);
printf("\n");
+#ifdef MACH_XEN
+ printf("Running on %s.\n", boot_info.magic);
+ if (boot_info.flags & SIF_PRIVILEGED)
+ panic("Mach can't run as dom0.");
+#ifdef MACH_PSEUDO_PHYS
+ mfn_list = (void*)boot_info.mfn_list;
+#endif
+#else /* MACH_XEN */
+
#if MACH_KDB
/*
* Locate the kernel's symbol table, if the boot loader provided it.
@@ -405,6 +534,7 @@ void c_boot_entry(vm_offset_t bi)
symtab_size, strtab_size);
}
#endif /* MACH_KDB */
+#endif /* MACH_XEN */
cpu_type = discover_x86_cpu_type ();
@@ -525,6 +655,12 @@ boolean_t
init_alloc_aligned(vm_size_t size, vm_offset_t *addrp)
{
vm_offset_t addr;
+
+#ifdef MACH_HYP
+ /* There is none */
+ if (!avail_next)
+ avail_next = _kvtophys(boot_info.pt_base) + (boot_info.nr_pt_frames + 3) * 0x1000;
+#else /* MACH_HYP */
extern char start[], end[];
int i;
static int wrapped = 0;
@@ -543,11 +679,14 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp)
: 0;
retry:
+#endif /* MACH_HYP */
/* Page-align the start address. */
avail_next = round_page(avail_next);
+#ifndef MACH_HYP
/* Start with memory above 16MB, reserving the low memory for later. */
+ /* Don't care on Xen */
if (!wrapped && phys_last_addr > 16 * 1024*1024)
{
if (avail_next < 16 * 1024*1024)
@@ -563,9 +702,15 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp)
wrapped = 1;
}
}
+#endif /* MACH_HYP */
/* Check if we have reached the end of memory. */
- if (avail_next == (wrapped ? 16 * 1024*1024 : phys_last_addr))
+ if (avail_next ==
+ (
+#ifndef MACH_HYP
+ wrapped ? 16 * 1024*1024 :
+#endif /* MACH_HYP */
+ phys_last_addr))
return FALSE;
/* Tentatively assign the current location to the caller. */
@@ -575,6 +720,7 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp)
and see where that puts us. */
avail_next += size;
+#ifndef MACH_HYP
/* Skip past the I/O and ROM area. */
if ((avail_next > (boot_info.mem_lower * 0x400)) && (addr < 0x100000))
{
@@ -620,6 +766,7 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp)
/* XXX string */
}
}
+#endif /* MACH_HYP */
avail_remaining -= size;
@@ -649,6 +796,11 @@ boolean_t pmap_valid_page(x)
vm_offset_t x;
{
/* XXX is this OK? What does it matter for? */
- return (((phys_first_addr <= x) && (x < phys_last_addr)) &&
- !(((boot_info.mem_lower * 1024) <= x) && (x < 1024*1024)));
+ return (((phys_first_addr <= x) && (x < phys_last_addr))
+#ifndef MACH_HYP
+ && !(
+ ((boot_info.mem_lower * 1024) <= x) &&
+ (x < 1024*1024))
+#endif /* MACH_HYP */
+ );
}
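Under MACH_XEN, boot_info is Xen's start_info page rather than a Multiboot structure; that is where the code above gets its memory size (nr_pages), command line, bootstrap page-table location (pt_base/nr_pt_frames), and pfn-to-mfn table. An abridged sketch of the fields used, per xen/public/xen.h (exact layout is defined by the Xen headers):

    struct start_info {
            char          magic[32];      /* "xen-<version>-<platform>" */
            unsigned long nr_pages;       /* pages granted to the domain */
            unsigned long shared_info;    /* machine address of shared_info */
            uint32_t      flags;          /* SIF_PRIVILEGED, ... */
            /* store/console fields omitted in this sketch */
            unsigned long pt_base;        /* VA of bootstrap page directory */
            unsigned long nr_pt_frames;   /* frames in bootstrap page tables */
            unsigned long mfn_list;       /* VA of the pfn -> mfn table */
            /* module fields omitted in this sketch */
            int8_t        cmd_line[1024]; /* kernel command line */
    };
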
diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c
index c633fd9..ee19c4b 100644
--- a/i386/intel/pmap.c
+++ b/i386/intel/pmap.c
@@ -77,13 +77,18 @@
#include <vm/vm_user.h>
#include <mach/machine/vm_param.h>
+#include <mach/xen.h>
#include <machine/thread.h>
#include <i386/cpu_number.h>
#include <i386/proc_reg.h>
#include <i386/locore.h>
#include <i386/model_dep.h>
+#ifdef MACH_PSEUDO_PHYS
+#define WRITE_PTE(pte_p, pte_entry) *(pte_p) = pte_entry?pa_to_ma(pte_entry):0;
+#else /* MACH_PSEUDO_PHYS */
#define WRITE_PTE(pte_p, pte_entry) *(pte_p) = (pte_entry);
+#endif /* MACH_PSEUDO_PHYS */
/*
* Private data structures.
@@ -325,6 +330,19 @@ lock_data_t pmap_system_lock;
#define MAX_TBIS_SIZE 32 /* > this -> TBIA */ /* XXX */
+#ifdef MACH_HYP
+#if 1
+#define INVALIDATE_TLB(pmap, s, e) hyp_mmuext_op_void(MMUEXT_TLB_FLUSH_LOCAL)
+#else
+#define INVALIDATE_TLB(pmap, s, e) do { \
+ if (__builtin_constant_p((e) - (s)) \
+ && (e) - (s) == PAGE_SIZE) \
+ hyp_invlpg((pmap) == kernel_pmap ? kvtolin(s) : (s)); \
+ else \
+ hyp_mmuext_op_void(MMUEXT_TLB_FLUSH_LOCAL); \
+} while(0)
+#endif
+#else /* MACH_HYP */
#if 0
/* It is hard to know when a TLB flush becomes less expensive than a bunch of
* invlpgs. But it surely is more expensive than just one invlpg. */
@@ -338,6 +356,7 @@ lock_data_t pmap_system_lock;
#else
#define INVALIDATE_TLB(pmap, s, e) flush_tlb()
#endif
+#endif /* MACH_HYP */
#if NCPUS > 1
@@ -507,6 +526,10 @@ vm_offset_t pmap_map_bd(virt, start, end, prot)
register pt_entry_t template;
register pt_entry_t *pte;
int spl;
+#ifdef MACH_XEN
+ int n, i = 0;
+ struct mmu_update update[HYP_BATCH_MMU_UPDATES];
+#endif /* MACH_XEN */
template = pa_to_pte(start)
| INTEL_PTE_NCACHE|INTEL_PTE_WTHRU
@@ -521,11 +544,30 @@ vm_offset_t pmap_map_bd(virt, start, end, prot)
pte = pmap_pte(kernel_pmap, virt);
if (pte == PT_ENTRY_NULL)
panic("pmap_map_bd: Invalid kernel address\n");
+#ifdef MACH_XEN
+ update[i].ptr = kv_to_ma(pte);
+ update[i].val = pa_to_ma(template);
+ i++;
+ if (i == HYP_BATCH_MMU_UPDATES) {
+ hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+ if (n != i)
+ panic("couldn't pmap_map_bd\n");
+ i = 0;
+ }
+#else /* MACH_XEN */
WRITE_PTE(pte, template)
+#endif /* MACH_XEN */
pte_increment_pa(template);
virt += PAGE_SIZE;
start += PAGE_SIZE;
}
+#ifdef MACH_XEN
+ if (i > HYP_BATCH_MMU_UPDATES)
+ panic("overflowed array in pmap_map_bd");
+ hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+ if (n != i)
+ panic("couldn't pmap_map_bd\n");
+#endif /* MACH_XEN */
PMAP_READ_UNLOCK(pmap, spl);
return(virt);
}
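
Every PTE write under Xen costs a hypercall, so pmap_map_bd batches them: updates are queued in an on-stack array of HYP_BATCH_MMU_UPDATES entries (sized to fit KERNEL_STACK_SIZE, per the note in xen.h) and flushed in one hyp_mmu_update() whenever the array fills, with a final flush after the loop. The pattern, reduced to a sketch:

    /* Sketch of the batching pattern used above, per PTE to write: */
    update[i].ptr = kv_to_ma(pte);        /* machine address of the PTE */
    update[i].val = pa_to_ma(template);   /* machine view of the value  */
    if (++i == HYP_BATCH_MMU_UPDATES) {
            hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
            if (n != i)                   /* n = entries Xen applied */
                    panic("couldn't pmap_map_bd");
            i = 0;                        /* start the next batch */
    }
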
@@ -583,6 +625,8 @@ void pmap_bootstrap()
/*
* Allocate and clear a kernel page directory.
*/
+	/* Note: the initial Xen mapping provides at least 512kB of free,
+	 * already-mapped pages.  We use those to build our linear mapping
+	 * directly. */
#if PAE
{
vm_offset_t addr;
@@ -604,6 +648,53 @@ void pmap_bootstrap()
kernel_pmap->dirbase[i] = 0;
}
+#ifdef MACH_XEN
+ /*
+	 * Xen may provide as little as 512kB of extra bootstrap linear memory,
+	 * which is far from enough to map all available memory, so we need to
+	 * map more bootstrap linear memory ourselves.  Here we map 1 extra L1
+	 * table (resp. 4 for PAE), i.e. 4MiB of extra mapping (resp. 8MiB),
+	 * which is enough to hold the page tables that map 4GiB.
+ */
+#ifdef PAE
+#define NSUP_L1 4
+#else
+#define NSUP_L1 1
+#endif
+ pt_entry_t *l1_map[NSUP_L1];
+ {
+ pt_entry_t *base = (pt_entry_t*) boot_info.pt_base;
+ int i;
+ int n_l1map;
+#ifdef PAE
+ pt_entry_t *l2_map = (pt_entry_t*) phystokv(pte_to_pa(base[0]));
+#else /* PAE */
+ pt_entry_t *l2_map = base;
+#endif /* PAE */
+ for (n_l1map = 0, i = lin2pdenum(VM_MIN_KERNEL_ADDRESS); i < NPTES; i++) {
+ if (!(l2_map[i] & INTEL_PTE_VALID)) {
+ struct mmu_update update;
+ int j, n;
+
+ l1_map[n_l1map] = (pt_entry_t*) phystokv(pmap_grab_page());
+ for (j = 0; j < NPTES; j++)
+ l1_map[n_l1map][j] = intel_ptob(pfn_to_mfn((i - lin2pdenum(VM_MIN_KERNEL_ADDRESS)) * NPTES + j)) | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+ pmap_set_page_readonly_init(l1_map[n_l1map]);
+ if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn (l1_map[n_l1map])))
+ panic("couldn't pin page %p(%p)", l1_map[n_l1map], kv_to_ma (l1_map[n_l1map]));
+ update.ptr = kv_to_ma(&l2_map[i]);
+ update.val = kv_to_ma(l1_map[n_l1map]) | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+ hyp_mmu_update(kv_to_la(&update), 1, kv_to_la(&n), DOMID_SELF);
+ if (n != 1)
+ panic("couldn't complete bootstrap map");
+ /* added the last L1 table, can stop */
+ if (++n_l1map >= NSUP_L1)
+ break;
+ }
+ }
+ }
+#endif /* MACH_XEN */
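For reference, the NSUP_L1 arithmetic, worked out (4KiB pages):

    /* non-PAE: one L1 table = 1024 4-byte PTEs, so it maps 4MiB, and the
     *          page tables covering 4GiB take (4GiB/4KiB)*4B = 4MiB
     *          -> 1 extra L1 table suffices.
     * PAE:     one L1 table = 512 8-byte PTEs, so it maps 2MiB, and the
     *          page tables covering 4GiB take (4GiB/4KiB)*8B = 8MiB
     *          -> 4 extra L1 tables are needed. */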
+
/*
* Allocate and set up the kernel page tables.
*/
@@ -640,19 +731,42 @@ void pmap_bootstrap()
WRITE_PTE(pte, 0);
}
else
+#ifdef MACH_XEN
+ if (va == (vm_offset_t) &hyp_shared_info)
+ {
+ *pte = boot_info.shared_info | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+ va += INTEL_PGBYTES;
+ }
+ else
+#endif /* MACH_XEN */
{
extern char _start[], etext[];
- if ((va >= (vm_offset_t)_start)
+ if (((va >= (vm_offset_t) _start)
&& (va + INTEL_PGBYTES <= (vm_offset_t)etext))
+#ifdef MACH_XEN
+ || (va >= (vm_offset_t) boot_info.pt_base
+ && (va + INTEL_PGBYTES <=
+ (vm_offset_t) ptable + INTEL_PGBYTES))
+#endif /* MACH_XEN */
+ )
{
WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
| INTEL_PTE_VALID | global);
}
else
{
- WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
- | INTEL_PTE_VALID | INTEL_PTE_WRITE | global);
+#ifdef MACH_XEN
+			  int i;
+			  for (i = 0; i < NSUP_L1; i++)
+			    if (va == (vm_offset_t) l1_map[i]) {
+			      /* Bootstrap L1 tables must stay read-only. */
+			      WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
+				| INTEL_PTE_VALID | global);
+			      break;
+			    }
+			  if (i == NSUP_L1)
+#endif /* MACH_XEN */
+ WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
+ | INTEL_PTE_VALID | INTEL_PTE_WRITE | global)
+
}
va += INTEL_PGBYTES;
}
@@ -662,6 +776,11 @@ void pmap_bootstrap()
WRITE_PTE(pte, 0);
va += INTEL_PGBYTES;
}
+#ifdef MACH_XEN
+ pmap_set_page_readonly_init(ptable);
+ if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn (ptable)))
+ panic("couldn't pin page %p(%p)\n", ptable, kv_to_ma (ptable));
+#endif /* MACH_XEN */
}
}
@@ -669,6 +788,100 @@ void pmap_bootstrap()
soon after we return from here. */
}
+#ifdef MACH_XEN
+/* These are only required because of Xen security policies */
+
+/* Set back a page read write */
+void pmap_set_page_readwrite(void *_vaddr) {
+ vm_offset_t vaddr = (vm_offset_t) _vaddr;
+ vm_offset_t paddr = kvtophys(vaddr);
+ vm_offset_t canon_vaddr = phystokv(paddr);
+ if (hyp_do_update_va_mapping (kvtolin(vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID | INTEL_PTE_WRITE, UVMF_NONE))
+ panic("couldn't set hiMMU readwrite for addr %p(%p)\n", vaddr, pa_to_ma (paddr));
+ if (canon_vaddr != vaddr)
+ if (hyp_do_update_va_mapping (kvtolin(canon_vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID | INTEL_PTE_WRITE, UVMF_NONE))
+ panic("couldn't set hiMMU readwrite for paddr %p(%p)\n", canon_vaddr, pa_to_ma (paddr));
+}
+
+/* Set a page read only (so as to pin it for instance) */
+void pmap_set_page_readonly(void *_vaddr) {
+ vm_offset_t vaddr = (vm_offset_t) _vaddr;
+ vm_offset_t paddr = kvtophys(vaddr);
+ vm_offset_t canon_vaddr = phystokv(paddr);
+ if (*pmap_pde(kernel_pmap, vaddr) & INTEL_PTE_VALID) {
+ if (hyp_do_update_va_mapping (kvtolin(vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID, UVMF_NONE))
+ panic("couldn't set hiMMU readonly for vaddr %p(%p)\n", vaddr, pa_to_ma (paddr));
+ }
+ if (canon_vaddr != vaddr &&
+ *pmap_pde(kernel_pmap, canon_vaddr) & INTEL_PTE_VALID) {
+ if (hyp_do_update_va_mapping (kvtolin(canon_vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID, UVMF_NONE))
+ panic("couldn't set hiMMU readonly for vaddr %p canon_vaddr %p paddr %p (%p)\n", vaddr, canon_vaddr, paddr, pa_to_ma (paddr));
+ }
+}
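Xen refuses to pin a frame as a page table while any writable mapping of it remains, which is why every pin in this file is preceded by one of these calls. The canonical sequence, as used in pmap_map_mfn and pmap_enter below (a sketch; the helper name is illustrative):

    static pt_entry_t *alloc_pinned_l1_sketch(void)
    {
    	vm_offset_t ptp = phystokv(pmap_page_table_page_alloc());
    	pmap_set_page_readonly((void *) ptp);	/* drop writable mappings */
    	if (!hyp_mmuext_op_mfn(MMUEXT_PIN_L1_TABLE, kv_to_mfn(ptp)))
    		panic("couldn't pin page table %p", (void *) ptp);
    	return (pt_entry_t *) ptp;
    }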
+
+/* This needs to be called instead of pmap_set_page_readonly as long as CR3
+ * still points to the bootstrap dirbase. */
+void pmap_set_page_readonly_init(void *_vaddr) {
+ vm_offset_t vaddr = (vm_offset_t) _vaddr;
+#if PAE
+ pt_entry_t *pdpbase = (void*) boot_info.pt_base;
+ vm_offset_t dirbase = ptetokv(pdpbase[0]);
+#else
+ vm_offset_t dirbase = boot_info.pt_base;
+#endif
+ struct pmap linear_pmap = {
+ .dirbase = (void*) dirbase,
+ };
+ /* Modify our future kernel map (can't use update_va_mapping for this)... */
+ if (*pmap_pde(kernel_pmap, vaddr) & INTEL_PTE_VALID)
+ if (!hyp_mmu_update_la (kvtolin(vaddr), pa_to_pte (kv_to_ma(vaddr)) | INTEL_PTE_VALID))
+ panic("couldn't set hiMMU readonly for vaddr %p(%p)\n", vaddr, kv_to_ma (vaddr));
+ /* ... and the bootstrap map. */
+ if (*pmap_pde(&linear_pmap, vaddr) & INTEL_PTE_VALID)
+ if (hyp_do_update_va_mapping (vaddr, pa_to_pte (kv_to_ma(vaddr)) | INTEL_PTE_VALID, UVMF_NONE))
+ panic("couldn't set MMU readonly for vaddr %p(%p)\n", vaddr, kv_to_ma (vaddr));
+}
+
+void pmap_clear_bootstrap_pagetable(pt_entry_t *base) {
+ int i;
+ pt_entry_t *dir;
+ vm_offset_t va = 0;
+#if PAE
+ int j;
+#endif /* PAE */
+ if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(base)))
+ panic("pmap_clear_bootstrap_pagetable: couldn't unpin page %p(%p)\n", base, kv_to_ma(base));
+#if PAE
+ for (j = 0; j < PDPNUM; j++)
+ {
+ pt_entry_t pdpe = base[j];
+ if (pdpe & INTEL_PTE_VALID) {
+ dir = (pt_entry_t *) phystokv(pte_to_pa(pdpe));
+#else /* PAE */
+ dir = base;
+#endif /* PAE */
+ for (i = 0; i < NPTES; i++) {
+ pt_entry_t pde = dir[i];
+ unsigned long pfn = mfn_to_pfn(atop(pde));
+ void *pgt = (void*) phystokv(ptoa(pfn));
+ if (pde & INTEL_PTE_VALID)
+ hyp_free_page(pfn, pgt);
+ va += NPTES * INTEL_PGBYTES;
+ if (va >= HYP_VIRT_START)
+ break;
+ }
+#if PAE
+ hyp_free_page(atop(_kvtophys(dir)), dir);
+ } else
+ va += NPTES * NPTES * INTEL_PGBYTES;
+ if (va >= HYP_VIRT_START)
+ break;
+ }
+#endif /* PAE */
+ hyp_free_page(atop(_kvtophys(base)), base);
+}
+#endif /* MACH_XEN */
+
void pmap_virtual_space(startp, endp)
vm_offset_t *startp;
vm_offset_t *endp;
@@ -823,6 +1036,29 @@ pmap_page_table_page_alloc()
return pa;
}
+#ifdef MACH_XEN
+void pmap_map_mfn(void *_addr, unsigned long mfn) {
+ vm_offset_t addr = (vm_offset_t) _addr;
+ pt_entry_t *pte, *pdp;
+ vm_offset_t ptp;
+ if ((pte = pmap_pte(kernel_pmap, addr)) == PT_ENTRY_NULL) {
+ ptp = phystokv(pmap_page_table_page_alloc());
+ pmap_set_page_readonly((void*) ptp);
+	if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn(ptp)))
+ panic("couldn't pin page %p(%p)\n",ptp,kv_to_ma(ptp));
+ pdp = pmap_pde(kernel_pmap, addr);
+ if (!hyp_mmu_update_pte(kv_to_ma(pdp),
+ pa_to_pte(kv_to_ma(ptp)) | INTEL_PTE_VALID
+ | INTEL_PTE_USER
+ | INTEL_PTE_WRITE))
+ panic("%s:%d could not set pde %p(%p) to %p(%p)\n",__FILE__,__LINE__,kvtophys((vm_offset_t)pdp),kv_to_ma(pdp), ptp, pa_to_ma(ptp));
+ pte = pmap_pte(kernel_pmap, addr);
+ }
+ if (!hyp_mmu_update_pte(kv_to_ma(pte), ptoa(mfn) | INTEL_PTE_VALID | INTEL_PTE_WRITE))
+ panic("%s:%d could not set pte %p(%p) to %p(%p)\n",__FILE__,__LINE__,pte,kv_to_ma(pte), ptoa(mfn), pa_to_ma(ptoa(mfn)));
+}
+#endif /* MACH_XEN */
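pmap_map_mfn is for frames that have no pseudo-physical address of their own, such as the console and xenstore rings whose machine frames are advertised in the start info. A hypothetical use (console_page is illustrative, and the mfn field name follows the later Xen 3.x start_info layout, so treat both as assumptions):

    static char console_page[PAGE_SIZE]
    	__attribute__((aligned(PAGE_SIZE)));

    void console_map_sketch(void)
    {
    	/* Make the domU console ring visible at a fixed kernel page. */
    	pmap_map_mfn(console_page, boot_info.console.domU.mfn);
    }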
+
/*
* Deallocate a page-table page.
* The page-table page must have all mappings removed,
@@ -884,6 +1120,13 @@ pmap_t pmap_create(size)
panic("pmap_create");
memcpy(p->dirbase, kernel_page_dir, PDPNUM * INTEL_PGBYTES);
+#ifdef MACH_XEN
+ {
+ int i;
+ for (i = 0; i < PDPNUM; i++)
+ pmap_set_page_readonly((void*) p->dirbase + i * INTEL_PGBYTES);
+ }
+#endif /* MACH_XEN */
#if PAE
if (kmem_alloc_wired(kernel_map,
@@ -895,6 +1138,9 @@ pmap_t pmap_create(size)
for (i = 0; i < PDPNUM; i++)
WRITE_PTE(&p->pdpbase[i], pa_to_pte(kvtophys((vm_offset_t) p->dirbase + i * INTEL_PGBYTES)) | INTEL_PTE_VALID);
}
+#ifdef MACH_XEN
+ pmap_set_page_readonly(p->pdpbase);
+#endif /* MACH_XEN */
#endif /* PAE */
p->ref_count = 1;
@@ -954,14 +1200,29 @@ void pmap_destroy(p)
if (m == VM_PAGE_NULL)
panic("pmap_destroy: pte page not in object");
vm_page_lock_queues();
+#ifdef MACH_XEN
+ if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa)))
+ panic("pmap_destroy: couldn't unpin page %p(%p)\n", pa, kv_to_ma(pa));
+ pmap_set_page_readwrite((void*) phystokv(pa));
+#endif /* MACH_XEN */
vm_page_free(m);
inuse_ptepages_count--;
vm_page_unlock_queues();
vm_object_unlock(pmap_object);
}
}
+#ifdef MACH_XEN
+ {
+ int i;
+ for (i = 0; i < PDPNUM; i++)
+ pmap_set_page_readwrite((void*) p->dirbase + i * INTEL_PGBYTES);
+ }
+#endif /* MACH_XEN */
kmem_free(kernel_map, (vm_offset_t)p->dirbase, PDPNUM * INTEL_PGBYTES);
#if PAE
+#ifdef MACH_XEN
+ pmap_set_page_readwrite(p->pdpbase);
+#endif /* MACH_XEN */
kmem_free(kernel_map, (vm_offset_t)p->pdpbase, INTEL_PGBYTES);
#endif /* PAE */
zfree(pmap_zone, (vm_offset_t) p);
@@ -1007,6 +1268,10 @@ void pmap_remove_range(pmap, va, spte, epte)
int num_removed, num_unwired;
int pai;
vm_offset_t pa;
+#ifdef MACH_XEN
+ int n, ii = 0;
+ struct mmu_update update[HYP_BATCH_MMU_UPDATES];
+#endif /* MACH_XEN */
#if DEBUG_PTE_PAGE
if (pmap != kernel_pmap)
@@ -1035,7 +1300,19 @@ void pmap_remove_range(pmap, va, spte, epte)
register int i = ptes_per_vm_page;
register pt_entry_t *lpte = cpte;
do {
+#ifdef MACH_XEN
+ update[ii].ptr = kv_to_ma(lpte);
+ update[ii].val = 0;
+ ii++;
+ if (ii == HYP_BATCH_MMU_UPDATES) {
+ hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+ if (n != ii)
+ panic("couldn't pmap_remove_range\n");
+ ii = 0;
+ }
+#else /* MACH_XEN */
*lpte = 0;
+#endif /* MACH_XEN */
lpte++;
} while (--i > 0);
continue;
@@ -1056,7 +1333,19 @@ void pmap_remove_range(pmap, va, spte, epte)
do {
pmap_phys_attributes[pai] |=
*lpte & (PHYS_MODIFIED|PHYS_REFERENCED);
+#ifdef MACH_XEN
+ update[ii].ptr = kv_to_ma(lpte);
+ update[ii].val = 0;
+ ii++;
+ if (ii == HYP_BATCH_MMU_UPDATES) {
+ hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+ if (n != ii)
+ panic("couldn't pmap_remove_range\n");
+ ii = 0;
+ }
+#else /* MACH_XEN */
*lpte = 0;
+#endif /* MACH_XEN */
lpte++;
} while (--i > 0);
}
@@ -1102,6 +1391,14 @@ void pmap_remove_range(pmap, va, spte, epte)
}
}
+#ifdef MACH_XEN
+ if (ii > HYP_BATCH_MMU_UPDATES)
+ panic("overflowed array in pmap_remove_range");
+ hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+ if (n != ii)
+ panic("couldn't pmap_remove_range\n");
+#endif /* MACH_XEN */
+
/*
* Update the counts
*/
@@ -1246,7 +1543,12 @@ void pmap_page_protect(phys, prot)
do {
pmap_phys_attributes[pai] |=
*pte & (PHYS_MODIFIED|PHYS_REFERENCED);
+#ifdef MACH_XEN
+ if (!hyp_mmu_update_pte(kv_to_ma(pte++), 0))
+ panic("%s:%d could not clear pte %p\n",__FILE__,__LINE__,pte-1);
+#else /* MACH_XEN */
*pte++ = 0;
+#endif /* MACH_XEN */
} while (--i > 0);
}
@@ -1276,7 +1578,12 @@ void pmap_page_protect(phys, prot)
register int i = ptes_per_vm_page;
do {
+#ifdef MACH_XEN
+ if (!hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~INTEL_PTE_WRITE))
+	      panic("%s:%d could not disable write on pte %p\n",__FILE__,__LINE__,pte);
+#else /* MACH_XEN */
*pte &= ~INTEL_PTE_WRITE;
+#endif /* MACH_XEN */
pte++;
} while (--i > 0);
@@ -1365,11 +1672,36 @@ void pmap_protect(map, s, e, prot)
spte = &spte[ptenum(s)];
epte = &spte[intel_btop(l-s)];
+#ifdef MACH_XEN
+ int n, i = 0;
+ struct mmu_update update[HYP_BATCH_MMU_UPDATES];
+#endif /* MACH_XEN */
+
while (spte < epte) {
- if (*spte & INTEL_PTE_VALID)
+ if (*spte & INTEL_PTE_VALID) {
+#ifdef MACH_XEN
+ update[i].ptr = kv_to_ma(spte);
+ update[i].val = *spte & ~INTEL_PTE_WRITE;
+ i++;
+ if (i == HYP_BATCH_MMU_UPDATES) {
+ hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+ if (n != i)
+ panic("couldn't pmap_protect\n");
+ i = 0;
+ }
+#else /* MACH_XEN */
*spte &= ~INTEL_PTE_WRITE;
+#endif /* MACH_XEN */
+ }
spte++;
}
+#ifdef MACH_XEN
+ if (i > HYP_BATCH_MMU_UPDATES)
+ panic("overflowed array in pmap_protect");
+ hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+ if (n != i)
+ panic("couldn't pmap_protect\n");
+#endif /* MACH_XEN */
}
s = l;
pde++;
@@ -1412,6 +1744,8 @@ if (pmap_debug) printf("pmap(%x, %x)\n", v, pa);
if (pmap == PMAP_NULL)
return;
+ if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end))
+ panic("pmap_enter(%p, %p) falls in physical memory area!\n", v, pa);
if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0
&& !wired /* hack for io_wire */ ) {
/*
@@ -1502,9 +1836,20 @@ Retry:
/*XX pdp = &pmap->dirbase[pdenum(v) & ~(i-1)];*/
pdp = pmap_pde(pmap, v);
do {
+#ifdef MACH_XEN
+ pmap_set_page_readonly((void *) ptp);
+ if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn(ptp)))
+ panic("couldn't pin page %p(%p)\n",ptp,kv_to_ma(ptp));
+ if (!hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdp)),
+ pa_to_pte(pa_to_ma(kvtophys(ptp))) | INTEL_PTE_VALID
+ | INTEL_PTE_USER
+ | INTEL_PTE_WRITE))
+ panic("%s:%d could not set pde %p(%p,%p) to %p(%p,%p) %p\n",__FILE__,__LINE__, pdp, kvtophys((vm_offset_t)pdp), pa_to_ma(kvtophys((vm_offset_t)pdp)), ptp, kvtophys(ptp), pa_to_ma(kvtophys(ptp)), pa_to_pte(kv_to_ma(ptp)));
+#else /* MACH_XEN */
*pdp = pa_to_pte(ptp) | INTEL_PTE_VALID
| INTEL_PTE_USER
| INTEL_PTE_WRITE;
+#endif /* MACH_XEN */
pdp++;
ptp += INTEL_PGBYTES;
} while (--i > 0);
@@ -1544,7 +1889,12 @@ Retry:
do {
if (*pte & INTEL_PTE_MOD)
template |= INTEL_PTE_MOD;
+#ifdef MACH_XEN
+ if (!hyp_mmu_update_pte(kv_to_ma(pte), pa_to_ma(template)))
+ panic("%s:%d could not set pte %p to %p\n",__FILE__,__LINE__,pte,template);
+#else /* MACH_XEN */
WRITE_PTE(pte, template)
+#endif /* MACH_XEN */
pte++;
pte_increment_pa(template);
} while (--i > 0);
@@ -1649,7 +1999,12 @@ Retry:
template |= INTEL_PTE_WIRED;
i = ptes_per_vm_page;
do {
+#ifdef MACH_XEN
+ if (!(hyp_mmu_update_pte(kv_to_ma(pte), pa_to_ma(template))))
+ panic("%s:%d could not set pte %p to %p\n",__FILE__,__LINE__,pte,template);
+#else /* MACH_XEN */
WRITE_PTE(pte, template)
+#endif /* MACH_XEN */
pte++;
pte_increment_pa(template);
} while (--i > 0);
@@ -1704,7 +2059,12 @@ void pmap_change_wiring(map, v, wired)
map->stats.wired_count--;
i = ptes_per_vm_page;
do {
+#ifdef MACH_XEN
+ if (!(hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~INTEL_PTE_WIRED)))
+	      panic("%s:%d could not unwire pte %p\n",__FILE__,__LINE__,pte);
+#else /* MACH_XEN */
*pte &= ~INTEL_PTE_WIRED;
+#endif /* MACH_XEN */
pte++;
} while (--i > 0);
}
@@ -1835,7 +2195,17 @@ void pmap_collect(p)
register int i = ptes_per_vm_page;
register pt_entry_t *pdep = pdp;
do {
+#ifdef MACH_XEN
+ unsigned long pte = *pdep;
+ void *ptable = (void*) ptetokv(pte);
+ if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0)))
+ panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1);
+ if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable)))
+ panic("couldn't unpin page %p(%p)\n", ptable, pa_to_ma(kvtophys((vm_offset_t)ptable)));
+ pmap_set_page_readwrite(ptable);
+#else /* MACH_XEN */
*pdep++ = 0;
+#endif /* MACH_XEN */
} while (--i > 0);
}
@@ -2052,7 +2422,12 @@ phys_attribute_clear(phys, bits)
{
register int i = ptes_per_vm_page;
do {
+#ifdef MACH_XEN
+ if (!(hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~bits)))
+ panic("%s:%d could not clear bits %lx from pte %p\n",__FILE__,__LINE__,bits,pte);
+#else /* MACH_XEN */
*pte &= ~bits;
+#endif /* MACH_XEN */
} while (--i > 0);
}
PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
@@ -2413,7 +2788,12 @@ pmap_unmap_page_zero ()
if (!pte)
return;
assert (pte);
+#ifdef MACH_XEN
+ if (!hyp_mmu_update_pte(kv_to_ma(pte), 0))
+ printf("couldn't unmap page 0\n");
+#else /* MACH_XEN */
*pte = 0;
INVALIDATE_TLB(kernel_pmap, 0, PAGE_SIZE);
+#endif /* MACH_XEN */
}
#endif /* i386 */
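Taken together, the pmap.c changes follow one rule: once a table is pinned it is read-only to the guest and every update must go through the hypervisor, while a table still under construction may be written directly. As a sketch (the pinned flag is illustrative; the real code knows statically which case applies):

    static void set_pte_sketch(pt_entry_t *pte, pt_entry_t val,
    			       boolean_t pinned)
    {
    	if (pinned) {
    		if (!hyp_mmu_update_pte(kv_to_ma(pte), pa_to_ma(val)))
    			panic("pte update refused by hypervisor");
    	} else {
    		WRITE_PTE(pte, val);	/* translates under MACH_PSEUDO_PHYS */
    	}
    }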
diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
index 7354a0f..a2b6442 100644
--- a/i386/intel/pmap.h
+++ b/i386/intel/pmap.h
@@ -126,12 +126,21 @@ typedef unsigned int pt_entry_t;
#define INTEL_PTE_NCACHE 0x00000010
#define INTEL_PTE_REF 0x00000020
#define INTEL_PTE_MOD 0x00000040
+#ifdef MACH_XEN
+/* Global mappings are not supported by Xen */
+#define INTEL_PTE_GLOBAL 0x00000000
+#else /* MACH_XEN */
#define INTEL_PTE_GLOBAL 0x00000100
+#endif /* MACH_XEN */
#define INTEL_PTE_WIRED 0x00000200
#define INTEL_PTE_PFN 0xfffff000
#define pa_to_pte(a) ((a) & INTEL_PTE_PFN)
+#ifdef MACH_PSEUDO_PHYS
+#define pte_to_pa(p) ma_to_pa((p) & INTEL_PTE_PFN)
+#else /* MACH_PSEUDO_PHYS */
#define pte_to_pa(p) ((p) & INTEL_PTE_PFN)
+#endif /* MACH_PSEUDO_PHYS */
#define pte_increment_pa(p) ((p) += INTEL_OFFMASK+1)
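With these definitions the machine/pseudo-physical split stays confined to the PTE boundary: pa_to_ma is applied when a PTE is written (see WRITE_PTE in pmap.c) and ma_to_pa when one is read back, so the rest of the pmap code only ever handles pseudo-physical addresses. Assuming, as the Xen interface provides, that the two translations are inverses that preserve the page offset:

    /* Invariant sketch under MACH_PSEUDO_PHYS: */
    assert(ma_to_pa(pa_to_ma(pa)) == pa);
    assert(pte_to_pa(pa_to_ma(pa_to_pte(pa))) == pa_to_pte(pa));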
/*
@@ -159,6 +168,14 @@ typedef struct pmap *pmap_t;
#define PMAP_NULL ((pmap_t) 0)
+#ifdef MACH_XEN
+extern void pmap_set_page_readwrite(void *addr);
+extern void pmap_set_page_readonly(void *addr);
+extern void pmap_set_page_readonly_init(void *addr);
+extern void pmap_map_mfn(void *addr, unsigned long mfn);
+extern void pmap_clear_bootstrap_pagetable(pt_entry_t *addr);
+#endif /* MACH_XEN */
+
#if PAE
#define set_pmap(pmap) set_cr3(kvtophys((vm_offset_t)(pmap)->pdpbase))
#else /* PAE */
diff --git a/i386/xen/Makefrag.am b/i386/xen/Makefrag.am
new file mode 100644
index 0000000..b15b7db
--- /dev/null
+++ b/i386/xen/Makefrag.am
@@ -0,0 +1,33 @@
+# Makefile fragment for the ix86-specific part of the Xen platform.
+
+# Copyright (C) 2007 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+#
+# Xen support.
+#
+
+libkernel_a_SOURCES += \
+ i386/xen/xen.c \
+ i386/xen/xen_locore.S \
+ i386/xen/xen_boothdr.S
+
+
+if PLATFORM_xen
+gnumach_LINKFLAGS += \
+ --defsym _START=0x20000000 \
+ -T '$(srcdir)'/i386/ldscript
+endif
diff --git a/i386/xen/xen.c b/i386/xen/xen.c
new file mode 100644
index 0000000..aa3c2cc
--- /dev/null
+++ b/i386/xen/xen.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <kern/printf.h>
+#include <kern/debug.h>
+
+#include <mach/machine/eflags.h>
+#include <machine/thread.h>
+#include <machine/ipl.h>
+
+#include <machine/model_dep.h>
+
+unsigned long cr3;
+
+struct failsafe_callback_regs {
+ unsigned int ds;
+ unsigned int es;
+ unsigned int fs;
+ unsigned int gs;
+ unsigned int ip;
+ unsigned int cs_and_mask;
+ unsigned int flags;
+};
+
+void hyp_failsafe_c_callback(struct failsafe_callback_regs *regs) {
+ printf("Fail-Safe callback!\n");
+ printf("IP: %08X CS: %4X DS: %4X ES: %4X FS: %4X GS: %4X FLAGS %08X MASK %04X\n", regs->ip, regs->cs_and_mask & 0xffff, regs->ds, regs->es, regs->fs, regs->gs, regs->flags, regs->cs_and_mask >> 16);
+ panic("failsafe");
+}
+
+extern void clock_interrupt();
+extern void return_to_iret;
+
+void hypclock_machine_intr(int old_ipl, void *ret_addr, struct i386_interrupt_state *regs, unsigned64_t delta) {
+ if (ret_addr == &return_to_iret) {
+ clock_interrupt(delta/1000, /* usec per tick */
+			(regs->efl & EFL_VM) ||		/* V86 mode, or */
+			((regs->cs & 0x02) != 0),	/* RPL 2 or 3: user mode */
+ old_ipl == SPL0); /* base priority */
+ } else
+ clock_interrupt(delta/1000, FALSE, FALSE);
+}
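The cs test deserves a note: under Xen the kernel runs in ring 1 and user code in ring 3, so bit 1 of the requested privilege level in cs is set exactly when the tick interrupted user mode. Equivalently (sketch):

    /* User mode iff V86, or RPL is 2 or 3 (the kernel itself is ring 1). */
    #define INTERRUPTED_USER_MODE(efl, cs) \
    	(((efl) & EFL_VM) || (((cs) & 0x03) > 1))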
+
+void hyp_p2m_init(void) {
+ unsigned long nb_pfns = atop(phys_last_addr);
+#ifdef MACH_PSEUDO_PHYS
+#define P2M_PAGE_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+ unsigned long *l3 = (unsigned long *)phystokv(pmap_grab_page()), *l2 = NULL;
+ unsigned long i;
+
+ for (i = 0; i < (nb_pfns + P2M_PAGE_ENTRIES) / P2M_PAGE_ENTRIES; i++) {
+ if (!(i % P2M_PAGE_ENTRIES)) {
+ l2 = (unsigned long *) phystokv(pmap_grab_page());
+ l3[i / P2M_PAGE_ENTRIES] = kv_to_mfn(l2);
+ }
+ l2[i % P2M_PAGE_ENTRIES] = kv_to_mfn(&mfn_list[i * P2M_PAGE_ENTRIES]);
+ }
+
+ hyp_shared_info.arch.pfn_to_mfn_frame_list_list = kv_to_mfn(l3);
+#endif
+ hyp_shared_info.arch.max_pfn = nb_pfns;
+}
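The frame list built here is the usual three-level structure: l3 holds the mfns of the l2 pages, each l2 entry holds the mfn of one page of mfn_list, and each such page covers P2M_PAGE_ENTRIES pfns. Capacity check, worked out for i386:

    /* 4-byte entries and 4KiB pages give P2M_PAGE_ENTRIES = 1024:
     * one l2 page -> 1024 mfn_list pages -> 1024*1024 pfns = 4GiB of
     * guest memory, and the single l3 page could reference 1024 such
     * l2 pages, so one level-2 page is ample here. */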
diff --git a/i386/xen/xen_boothdr.S b/i386/xen/xen_boothdr.S
new file mode 100644
index 0000000..3d84e0c
--- /dev/null
+++ b/i386/xen/xen_boothdr.S
@@ -0,0 +1,167 @@
+/*
+ * Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <xen/public/elfnote.h>
+
+.section __xen_guest
+ .ascii "GUEST_OS=GNU Mach"
+ .ascii ",GUEST_VERSION=1.3"
+ .ascii ",XEN_VER=xen-3.0"
+ .ascii ",VIRT_BASE=0x20000000"
+ .ascii ",ELF_PADDR_OFFSET=0x20000000"
+ .ascii ",HYPERCALL_PAGE=0x2"
+#if PAE
+ .ascii ",PAE=yes"
+#else
+ .ascii ",PAE=no"
+#endif
+ .ascii ",LOADER=generic"
+#ifndef MACH_PSEUDO_PHYS
+ .ascii ",FEATURES=!auto_translated_physmap"
+#endif
+ .byte 0
+
+/* Macro taken from linux/include/linux/elfnote.h */
+#define ELFNOTE(name, type, desctype, descdata) \
+.pushsection .note.name ; \
+ .align 4 ; \
+ .long 2f - 1f /* namesz */ ; \
+ .long 4f - 3f /* descsz */ ; \
+ .long type ; \
+1:.asciz #name ; \
+2:.align 4 ; \
+3:desctype descdata ; \
+4:.align 4 ; \
+.popsection ;
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "GNU Mach")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "1.3")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0")
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, _START)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, _START)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, start)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypcalls)
+#if PAE
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes")
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no")
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, ""
+#ifndef MACH_PSEUDO_PHYS
+ "!auto_translated_physmap"
+#endif
+ )
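Each ELFNOTE above emits one standard ELF note record into a .note.Xen section, which is what the Xen domain builder parses on recent hypervisors (the legacy __xen_guest string above is kept for older ones). In C terms one record looks like (sketch):

    struct elfnote_sketch {
    	unsigned int namesz;	/* strlen("Xen") + 1 = 4 */
    	unsigned int descsz;	/* size of the descriptor */
    	unsigned int type;	/* one of the XEN_ELFNOTE_* values */
    	/* then "Xen\0" padded to 4 bytes, then the descriptor,
    	 * itself padded to 4 bytes */
    };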
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386/i386asm.h>
+
+ .text
+ .globl gdt, ldt
+	.globl	start, _start
+start:
+_start:
+
+ /* Switch to our own interrupt stack. */
+ movl $(_intstack+INTSTACK_SIZE),%eax
+ movl %eax,%esp
+
+ /* Reset EFLAGS to a known state. */
+ pushl $0
+ popf
+
+ /* Push the start_info pointer to be the second argument. */
+ subl $KERNELBASE,%esi
+ pushl %esi
+
+ /* Jump into C code. */
+ call EXT(c_boot_entry)
+
+/* These need to be aligned on page boundaries. */
+.global hyp_shared_info, hypcalls
+
+ .org (start + 0x1000)
+hyp_shared_info:
+ .org hyp_shared_info + 0x1000
+
+/* Labels just for debuggers */
+#define hypcall(name, n) \
+ .org hypcalls + n*32 ; \
+__hyp_##name:
+
+hypcalls:
+ hypcall(set_trap_table, 0)
+ hypcall(mmu_update, 1)
+ hypcall(set_gdt, 2)
+ hypcall(stack_switch, 3)
+ hypcall(set_callbacks, 4)
+ hypcall(fpu_taskswitch, 5)
+ hypcall(sched_op_compat, 6)
+ hypcall(platform_op, 7)
+ hypcall(set_debugreg, 8)
+ hypcall(get_debugreg, 9)
+ hypcall(update_descriptor, 10)
+ hypcall(memory_op, 12)
+ hypcall(multicall, 13)
+ hypcall(update_va_mapping, 14)
+ hypcall(set_timer_op, 15)
+ hypcall(event_channel_op_compat, 16)
+ hypcall(xen_version, 17)
+ hypcall(console_io, 18)
+ hypcall(physdev_op_compat, 19)
+ hypcall(grant_table_op, 20)
+ hypcall(vm_assist, 21)
+ hypcall(update_va_mapping_otherdomain, 22)
+ hypcall(iret, 23)
+ hypcall(vcpu_op, 24)
+ hypcall(set_segment_base, 25)
+ hypcall(mmuext_op, 26)
+ hypcall(acm_op, 27)
+ hypcall(nmi_op, 28)
+ hypcall(sched_op, 29)
+ hypcall(callback_op, 30)
+ hypcall(xenoprof_op, 31)
+ hypcall(event_channel_op, 32)
+ hypcall(physdev_op, 33)
+ hypcall(hvm_op, 34)
+ hypcall(sysctl, 35)
+ hypcall(domctl, 36)
+ hypcall(kexec_op, 37)
+
+ hypcall(arch_0, 48)
+ hypcall(arch_1, 49)
+ hypcall(arch_2, 50)
+ hypcall(arch_3, 51)
+ hypcall(arch_4, 52)
+ hypcall(arch_5, 53)
+ hypcall(arch_6, 54)
+ hypcall(arch_7, 55)
+
+ .org hypcalls + 0x1000
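At boot Xen overwrites this page with one 32-byte call stub per hypercall, so the kernel reaches hypercall n by calling hypcalls + n*32 with arguments in registers. A hedged sketch of a three-argument wrapper following the 32-bit ABI (arguments in ebx/ecx/edx, result in eax); the kernel's real wrappers live in its xen headers:

    extern char hypcalls[];

    static inline long _hypcall3_sketch(int n, long a1, long a2, long a3)
    {
    	long ret;
    	asm volatile ("call *%[stub]"
    		      : "=a" (ret), "+b" (a1), "+c" (a2), "+d" (a3)
    		      : [stub] "r" (hypcalls + n * 32)
    		      : "memory");
    	return ret;
    }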
+
+gdt:
+ .org gdt + 0x1000
+
+ldt:
+ .org ldt + 0x1000
+
+/* Initial (esp, ss) pair: 0xe021 is Xen's flat ring-1 stack selector
+ * (FLAT_RING1_SS). */
+stack:
+	.long	_intstack+INTSTACK_SIZE,0xe021
+ .comm _intstack,INTSTACK_SIZE
+
diff --git a/i386/xen/xen_locore.S b/i386/xen/xen_locore.S
new file mode 100644
index 0000000..51f823f
--- /dev/null
+++ b/i386/xen/xen_locore.S
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386asm.h>
+#include <i386/cpu_number.h>
+#include <i386/xen.h>
+
+ .data 2
+int_active:
+ .long 0
+
+
+ .text
+ .globl hyp_callback, hyp_failsafe_callback
+ P2ALIGN(TEXT_ALIGN)
+hyp_callback:
+ pushl %eax
+ jmp EXT(all_intrs)
+
+ENTRY(interrupt)
+ incl int_active /* currently handling interrupts */
+ call EXT(hyp_c_callback) /* call generic interrupt routine */
+ decl int_active /* stopped handling interrupts */
+ sti
+ ret
+
+/* FIXME: if we're _very_ unlucky, we may be re-interrupted and fill up the
+ * stack.
+ *
+ * Far from trivial, see mini-os. That said, maybe we could just, before
+ * popping everything (which is _not_ destructive), save sp into a known
+ * place and use it+jmp back?
+ *
+ * Mmm, there seems to be an iret hypcall that does exactly what we want:
+ * perform iret, and if IF is set, clear the interrupt mask.
+ */
+
+/* Pfff, we have to check pending interrupts ourselves. Some other DomUs
+ * just make a hypercall to retrigger the irq. Not sure that is really
+ * easier/faster. */
+ENTRY(hyp_sti)
+ pushl %ebp
+ movl %esp, %ebp
+_hyp_sti:
+ movb $0,hyp_shared_info+CPU_CLI /* Enable interrupts */
+ cmpl $0,int_active /* Check whether we were already checking pending interrupts */
+ jz 0f
+ popl %ebp
+ ret /* Already active, just return */
+0:
+ /* Not active, check pending interrupts by hand */
+ /* no memory barrier needed on x86 */
+ cmpb $0,hyp_shared_info+CPU_PENDING
+ jne 0f
+ popl %ebp
+ ret
+0:
+ movb $0xff,hyp_shared_info+CPU_CLI
+1:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ incl int_active /* currently handling interrupts */
+
+ pushl $0
+ pushl $0
+ call EXT(hyp_c_callback)
+ popl %edx
+ popl %edx
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ decl int_active /* stopped handling interrupts */
+ cmpb $0,hyp_shared_info+CPU_PENDING
+ jne 1b
+ jmp _hyp_sti
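In C terms the routine above is roughly the following, where cli and pending stand for the per-CPU evtchn_upcall_mask and evtchn_upcall_pending bytes that CPU_CLI and CPU_PENDING address (a sketch; the declarations are stand-ins, not the real types):

    extern volatile unsigned char cli, pending;
    extern volatile int int_active;
    extern void hyp_c_callback(void *ret_addr, void *regs);

    void hyp_sti_sketch(void)
    {
    	for (;;) {
    		cli = 0;			/* enable event delivery */
    		if (int_active || !pending)
    			return;
    		cli = 0xff;			/* mask while handling */
    		do {
    			int_active++;
    			hyp_c_callback(0, 0);	/* two zero args, as pushed */
    			int_active--;
    		} while (pending);
    		/* unmask and re-check from the top */
    	}
    }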
+
+/* Hypervisor failed to reload segments. Dump them. */
+hyp_failsafe_callback:
+#if 1
+ /* load sane segments */
+ mov %ss, %ax
+ mov %ax, %ds
+ mov %ax, %es
+ mov %ax, %fs
+ mov %ax, %gs
+ push %esp
+ call EXT(hyp_failsafe_c_callback)
+#else
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ iret
+#endif