Diffstat (limited to 'i386')
-rw-r--r--  i386/Makefrag.am         |  47
-rw-r--r--  i386/configfrag.ac       |  12
-rw-r--r--  i386/i386/debug_trace.S  |   1
-rw-r--r--  i386/i386/fpu.c          |  15
-rw-r--r--  i386/i386/gdt.c          |  27
-rw-r--r--  i386/i386/gdt.h          |   4
-rw-r--r--  i386/i386/i386asm.sym    |  15
-rw-r--r--  i386/i386/idt.c          |   5
-rw-r--r--  i386/i386/idt_inittab.S  |  16
-rw-r--r--  i386/i386/ktss.c         |   7
-rw-r--r--  i386/i386/ldt.c          |  15
-rw-r--r--  i386/i386/locore.S       |  50
-rw-r--r--  i386/i386/mp_desc.c      |   5
-rw-r--r--  i386/i386/pcb.c          |  31
-rw-r--r--  i386/i386/phys.c         |   7
-rw-r--r--  i386/i386/proc_reg.h     |  22
-rw-r--r--  i386/i386/seg.h          |  18
-rw-r--r--  i386/i386/spl.S          |  18
-rw-r--r--  i386/i386/trap.c         |   5
-rw-r--r--  i386/i386/user_ldt.c     |  27
-rw-r--r--  i386/i386/user_ldt.h     |   3
-rw-r--r--  i386/i386/vm_param.h     |  25
-rw-r--r--  i386/i386/xen.h          | 357
-rw-r--r--  i386/i386at/conf.c       |  21
-rw-r--r--  i386/i386at/cons_conf.c  |   8
-rw-r--r--  i386/i386at/model_dep.c  | 180
-rw-r--r--  i386/intel/pmap.c        | 388
-rw-r--r--  i386/intel/pmap.h        |  17
-rw-r--r--  i386/xen/Makefrag.am     |  33
-rw-r--r--  i386/xen/xen.c           |  77
-rw-r--r--  i386/xen/xen_boothdr.S   | 167
-rw-r--r--  i386/xen/xen_locore.S    | 110
32 files changed, 1685 insertions, 48 deletions
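Most of what follows replaces privileged i386 operations (control-register writes, lgdt/lidt/lldt, PIC programming) with Xen hypercalls under the new MACH_HYP/MACH_XEN conditionals. The hypercall wrappers are generated by the _hypcallN macros in the new i386/i386/xen.h (included in full below); as an orienting sketch, here is roughly what the two-argument case expands to for stack_switch, assuming __HYPERVISOR_stack_switch is 3 as in Xen's public interface:

/* Approximate expansion of
 *   _hypcall2(long, stack_switch, unsigned long, ss, unsigned long, esp)
 * Arguments travel in %ebx/%ecx, the result comes back in %eax, and the
 * call lands on a fixed 32-byte-stride stub in the hypercall page
 * (labelled `hypcalls', presumably set up by xen_boothdr.S). */
MACH_INLINE long hyp_stack_switch(unsigned long ss, unsigned long esp)
{
	long __ret;
	long foo1, foo2;
	asm volatile ("call hypcalls+(3*32)"  /* __HYPERVISOR_stack_switch */
		      : "=a" (__ret), "=b" (foo1), "=c" (foo2)
		      : "1" ((long) ss), "2" ((long) esp)
		      : "memory");
	return __ret;
}

The dummy "foo" outputs tell GCC that the argument registers are clobbered by the call, since i386 hypercalls pass their arguments in registers rather than on the stack.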
diff --git a/i386/Makefrag.am b/i386/Makefrag.am index bad0ce9..876761c 100644 --- a/i386/Makefrag.am +++ b/i386/Makefrag.am @@ -19,33 +19,37 @@ libkernel_a_SOURCES += \ i386/i386at/autoconf.c \ + i386/i386at/conf.c \ + i386/i386at/cons_conf.c \ + i386/i386at/idt.h \ + i386/i386at/kd_event.c \ + i386/i386at/kd_event.h \ + i386/i386at/kd_queue.c \ + i386/i386at/kd_queue.h \ + i386/i386at/model_dep.c \ + i386/include/mach/sa/stdarg.h + +if PLATFORM_at +libkernel_a_SOURCES += \ i386/i386at/boothdr.S \ i386/i386at/com.c \ i386/i386at/comreg.h \ - i386/i386at/conf.c \ - i386/i386at/cons_conf.c \ i386/i386at/cram.h \ i386/i386at/disk.h \ i386/i386at/i8250.h \ - i386/i386at/idt.h \ i386/i386at/immc.c \ i386/i386at/int_init.c \ i386/i386at/interrupt.S \ i386/i386at/kd.c \ i386/i386at/kd.h \ - i386/i386at/kd_event.c \ - i386/i386at/kd_event.h \ i386/i386at/kd_mouse.c \ i386/i386at/kd_mouse.h \ - i386/i386at/kd_queue.c \ - i386/i386at/kd_queue.h \ i386/i386at/kdasm.S \ i386/i386at/kdsoft.h \ - i386/i386at/model_dep.c \ i386/i386at/pic_isa.c \ i386/i386at/rtc.c \ - i386/i386at/rtc.h \ - i386/include/mach/sa/stdarg.h + i386/i386at/rtc.h +endif # # `lpr' device support. @@ -80,11 +84,9 @@ libkernel_a_SOURCES += \ i386/i386/fpu.h \ i386/i386/gdt.c \ i386/i386/gdt.h \ - i386/i386/hardclock.c \ i386/i386/idt-gen.h \ i386/i386/idt.c \ i386/i386/idt_inittab.S \ - i386/i386/io_map.c \ i386/i386/io_perm.c \ i386/i386/io_perm.h \ i386/i386/ipl.h \ @@ -107,11 +109,7 @@ libkernel_a_SOURCES += \ i386/i386/pcb.c \ i386/i386/pcb.h \ i386/i386/phys.c \ - i386/i386/pic.c \ - i386/i386/pic.h \ i386/i386/pio.h \ - i386/i386/pit.c \ - i386/i386/pit.h \ i386/i386/pmap.h \ i386/i386/proc_reg.h \ i386/i386/sched_param.h \ @@ -139,6 +137,15 @@ libkernel_a_SOURCES += \ EXTRA_DIST += \ i386/i386/mach_i386.srv +if PLATFORM_at +libkernel_a_SOURCES += \ + i386/i386/hardclock.c \ + i386/i386/io_map.c \ + i386/i386/pic.c \ + i386/i386/pic.h \ + i386/i386/pit.c \ + i386/i386/pit.h +endif # # KDB support. @@ -225,3 +232,11 @@ EXTRA_DIST += \ # Instead of listing each file individually... EXTRA_DIST += \ i386/include + +# +# Platform specific parts. +# + +if PLATFORM_xen +include i386/xen/Makefrag.am +endif diff --git a/i386/configfrag.ac b/i386/configfrag.ac index f95aa86..1132b69 100644 --- a/i386/configfrag.ac +++ b/i386/configfrag.ac @@ -51,6 +51,12 @@ case $host_platform:$host_cpu in # i386/bogus/platforms.h] AC_DEFINE([AT386], [1], [AT386])[;; + xen:i?86) + # TODO. That should probably not be needed. + ncom=1 + # TODO. That should probably not be needed. 
+ # i386/bogus/platforms.h] + AC_DEFINE([AT386], [1], [AT386])[;; *) :;; esac] @@ -105,9 +111,11 @@ if [ x"$enable_lpr" = xyes ]; then] AC_ARG_ENABLE([pae], - AS_HELP_STRING([--enable-pae], [PAE feature (ix86-only); disabled by - default])) + AS_HELP_STRING([--enable-pae], [PAE support (ix86-only); on ix86-at disabled + by default, on ix86-xen enabled by default])) [case $host_platform:$host_cpu in + xen:i?86) + enable_pae=${enable_pae-yes};; *:i?86) :;; *) diff --git a/i386/i386/debug_trace.S b/i386/i386/debug_trace.S index e741516..f275e1b 100644 --- a/i386/i386/debug_trace.S +++ b/i386/i386/debug_trace.S @@ -24,6 +24,7 @@ #ifdef DEBUG #include <mach/machine/asm.h> +#include <i386/xen.h> #include "debug.h" diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c index 109d0d7..2a4b9c0 100644 --- a/i386/i386/fpu.c +++ b/i386/i386/fpu.c @@ -109,6 +109,10 @@ void init_fpu() { unsigned short status, control; + +#ifdef MACH_HYP + clear_ts(); +#else /* MACH_HYP */ unsigned int native = 0; if (machine_slot[cpu_number()].cpu_type >= CPU_TYPE_I486) @@ -120,6 +124,7 @@ init_fpu() * the control and status registers. */ set_cr0((get_cr0() & ~(CR0_EM|CR0_TS)) | native); /* allow use of FPU */ +#endif /* MACH_HYP */ fninit(); status = fnstsw(); @@ -153,8 +158,10 @@ init_fpu() struct i386_xfp_save save; unsigned long mask; fp_kind = FP_387X; +#ifndef MACH_HYP printf("Enabling FXSR\n"); set_cr4(get_cr4() | CR4_OSFXSR); +#endif /* MACH_HYP */ fxsave(&save); mask = save.fp_mxcsr_mask; if (!mask) @@ -163,10 +170,14 @@ init_fpu() } else fp_kind = FP_387; } +#ifdef MACH_HYP + set_ts(); +#else /* MACH_HYP */ /* * Trap wait instructions. Turn off FPU for now. */ set_cr0(get_cr0() | CR0_TS | CR0_MP); +#endif /* MACH_HYP */ } else { /* @@ -675,6 +686,7 @@ fpexterrflt() /*NOTREACHED*/ } +#ifndef MACH_XEN /* * FPU error. Called by AST. */ @@ -731,6 +743,7 @@ ASSERT_IPL(SPL0); thread->pcb->ims.ifps->fp_save_state.fp_status); /*NOTREACHED*/ } +#endif /* MACH_XEN */ /* * Save FPU state. @@ -846,7 +859,7 @@ fp_state_alloc() } } -#if AT386 +#if AT386 && !defined(MACH_XEN) /* * Handle a coprocessor error interrupt on the AT386. * This comes in on line 5 of the slave PIC at SPL1. diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c index 845e7c6..b5fb033 100644 --- a/i386/i386/gdt.c +++ b/i386/i386/gdt.c @@ -31,11 +31,18 @@ * Global descriptor table. */ #include <mach/machine/vm_types.h> +#include <mach/xen.h> + +#include <intel/pmap.h> #include "vm_param.h" #include "seg.h" #include "gdt.h" +#ifdef MACH_XEN +/* It is actually defined in xen_boothdr.S */ +extern +#endif /* MACH_XEN */ struct real_descriptor gdt[GDTSZ]; void @@ -50,11 +57,21 @@ gdt_init() LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, LINEAR_MAX_KERNEL_ADDRESS - (LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) - 1, ACC_PL_K|ACC_DATA_W, SZ_32); +#ifndef MACH_HYP fill_gdt_descriptor(LINEAR_DS, 0, 0xffffffff, ACC_PL_K|ACC_DATA_W, SZ_32); +#endif /* MACH_HYP */ +#ifdef MACH_XEN + unsigned long frame = kv_to_mfn(gdt); + pmap_set_page_readonly(gdt); + if (hyp_set_gdt(kv_to_la(&frame), GDTSZ)) + panic("couldn't set gdt\n"); + if (hyp_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments)) + panic("couldn't set 4gb segments vm assist"); +#else /* MACH_XEN */ /* Load the new GDT. */ { struct pseudo_descriptor pdesc; @@ -63,6 +80,7 @@ gdt_init() pdesc.linear_base = kvtolin(&gdt); lgdt(&pdesc); } +#endif /* MACH_XEN */ /* Reload all the segment registers from the new GDT. 
We must load ds and es with 0 before loading them with KERNEL_DS @@ -79,5 +97,14 @@ gdt_init() "movw %w1,%%es\n" "movw %w1,%%ss\n" : : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0)); +#ifdef MACH_XEN +#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS + /* things now get shifted */ +#ifdef MACH_PSEUDO_PHYS + pfn_list = (void*) pfn_list + VM_MIN_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS; +#endif /* MACH_PSEUDO_PHYS */ + la_shift += LINEAR_MIN_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; +#endif +#endif /* MACH_XEN */ } diff --git a/i386/i386/gdt.h b/i386/i386/gdt.h index 50e01e6..41ace79 100644 --- a/i386/i386/gdt.h +++ b/i386/i386/gdt.h @@ -40,12 +40,16 @@ */ #define KERNEL_CS (0x08 | KERNEL_RING) /* kernel code */ #define KERNEL_DS (0x10 | KERNEL_RING) /* kernel data */ +#ifndef MACH_XEN #define KERNEL_LDT 0x18 /* master LDT */ +#endif /* MACH_XEN */ #define KERNEL_TSS 0x20 /* master TSS (uniprocessor) */ #define USER_LDT 0x28 /* place for per-thread LDT */ #define USER_TSS 0x30 /* place for per-thread TSS that holds IO bitmap */ +#ifndef MACH_HYP #define LINEAR_DS 0x38 /* linear mapping */ +#endif /* MACH_HYP */ /* 0x40 was USER_FPREGS, now free */ #define USER_GDT 0x48 /* user-defined GDT entries */ diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym index 868bf09..b1670e8 100644 --- a/i386/i386/i386asm.sym +++ b/i386/i386/i386asm.sym @@ -45,6 +45,7 @@ #include <i386/gdt.h> #include <i386/ldt.h> #include <i386/mp_desc.h> +#include <i386/xen.h> offset thread th pcb @@ -90,6 +91,9 @@ expr VM_MIN_ADDRESS expr VM_MAX_ADDRESS expr VM_MIN_KERNEL_ADDRESS KERNELBASE expr KERNEL_STACK_SIZE +#if VM_MIN_KERNEL_ADDRESS == LINEAR_MIN_KERNEL_ADDRESS +expr PFN_LIST pfn_list +#endif #if PAE expr PDPSHIFT @@ -117,7 +121,9 @@ expr KERNEL_RING expr KERNEL_CS expr KERNEL_DS expr KERNEL_TSS +#ifndef MACH_XEN expr KERNEL_LDT +#endif /* MACH_XEN */ expr (VM_MIN_KERNEL_ADDRESS>>PDESHIFT)*sizeof(pt_entry_t) KERNELBASEPDE @@ -135,3 +141,12 @@ expr TIMER_HIGH_UNIT offset thread th system_timer offset thread th user_timer #endif + +#ifdef MACH_XEN +offset shared_info si vcpu_info[0].evtchn_upcall_mask CPU_CLI +offset shared_info si vcpu_info[0].evtchn_upcall_pending CPU_PENDING +offset shared_info si vcpu_info[0].evtchn_pending_sel CPU_PENDING_SEL +offset shared_info si evtchn_pending PENDING +offset shared_info si evtchn_mask EVTMASK +offset shared_info si vcpu_info[0].arch.cr2 CR2 +#endif /* MACH_XEN */ diff --git a/i386/i386/idt.c b/i386/i386/idt.c index 1a8f917..b5e3d08 100644 --- a/i386/i386/idt.c +++ b/i386/i386/idt.c @@ -38,6 +38,10 @@ extern struct idt_init_entry idt_inittab[]; void idt_init() { +#ifdef MACH_HYP + if (hyp_set_trap_table(kvtolin(idt_inittab))) + panic("couldn't set trap table\n"); +#else /* MACH_HYP */ struct idt_init_entry *iie = idt_inittab; /* Initialize the exception vectors from the idt_inittab. */ @@ -55,5 +59,6 @@ void idt_init() pdesc.linear_base = kvtolin(&idt); lidt(&pdesc); } +#endif /* MACH_HYP */ } diff --git a/i386/i386/idt_inittab.S b/i386/i386/idt_inittab.S index 7718568..4dcad8d 100644 --- a/i386/i386/idt_inittab.S +++ b/i386/i386/idt_inittab.S @@ -25,7 +25,8 @@ */ #include <mach/machine/asm.h> -#include "seg.h" +#include <i386/seg.h> +#include <i386/i386asm.h> /* We'll be using macros to fill in a table in data hunk 2 @@ -38,12 +39,22 @@ ENTRY(idt_inittab) /* * Interrupt descriptor table and code vectors for it. 
*/ +#ifdef MACH_XEN +#define IDT_ENTRY(n,entry,type) \ + .data 2 ;\ + .byte n ;\ + .byte (((type)&ACC_PL)>>5)|((((type)&(ACC_TYPE|ACC_A))==ACC_INTR_GATE)<<2) ;\ + .word KERNEL_CS ;\ + .long entry ;\ + .text +#else /* MACH_XEN */ #define IDT_ENTRY(n,entry,type) \ .data 2 ;\ .long entry ;\ .word n ;\ .word type ;\ .text +#endif /* MACH_XEN */ /* * No error code. Clear error code and push trap number. @@ -118,4 +129,7 @@ EXCEPTION(0x1f,t_trap_1f) /* Terminator */ .data 2 .long 0 +#ifdef MACH_XEN + .long 0 +#endif /* MACH_XEN */ diff --git a/i386/i386/ktss.c b/i386/i386/ktss.c index 03d9a04..66432f3 100644 --- a/i386/i386/ktss.c +++ b/i386/i386/ktss.c @@ -45,6 +45,12 @@ ktss_init() /* XXX temporary exception stack */ static int exception_stack[1024]; +#ifdef MACH_XEN + /* Xen won't allow us to do any I/O by default anyway, just register + * exception stack */ + if (hyp_stack_switch(KERNEL_DS, (unsigned)(exception_stack+1024))) + panic("couldn't register exception stack\n"); +#else /* MACH_XEN */ /* Initialize the master TSS descriptor. */ fill_gdt_descriptor(KERNEL_TSS, kvtolin(&ktss), sizeof(struct task_tss) - 1, @@ -59,5 +65,6 @@ ktss_init() /* Load the TSS. */ ltr(KERNEL_TSS); +#endif /* MACH_XEN */ } diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c index 7299377..0ef7a8c 100644 --- a/i386/i386/ldt.c +++ b/i386/i386/ldt.c @@ -28,6 +28,9 @@ * same LDT. */ #include <mach/machine/vm_types.h> +#include <mach/xen.h> + +#include <intel/pmap.h> #include "vm_param.h" #include "seg.h" @@ -36,15 +39,23 @@ extern int syscall(); +#ifdef MACH_XEN +/* It is actually defined in xen_boothdr.S */ +extern +#endif /* MACH_XEN */ struct real_descriptor ldt[LDTSZ]; void ldt_init() { +#ifdef MACH_XEN + pmap_set_page_readwrite(ldt); +#else /* MACH_XEN */ /* Initialize the master LDT descriptor in the GDT. */ fill_gdt_descriptor(KERNEL_LDT, kvtolin(&ldt), sizeof(ldt)-1, ACC_PL_K|ACC_LDT, 0); +#endif /* MACH_XEN */ /* Initialize the LDT descriptors. */ fill_ldt_gate(USER_SCALL, @@ -61,5 +72,9 @@ ldt_init() ACC_PL_U|ACC_DATA_W, SZ_32); /* Activate the LDT. */ +#ifdef MACH_HYP + hyp_set_ldt(&ldt, LDTSZ); +#else /* MACH_HYP */ lldt(KERNEL_LDT); +#endif /* MACH_HYP */ } diff --git a/i386/i386/locore.S b/i386/i386/locore.S index 13a44d9..663db43 100644 --- a/i386/i386/locore.S +++ b/i386/i386/locore.S @@ -36,6 +36,7 @@ #include <i386/ldt.h> #include <i386/i386asm.h> #include <i386/cpu_number.h> +#include <i386/xen.h> /* * Fault recovery. @@ -323,8 +324,9 @@ ENTRY(t_segnp) trap_check_kernel_exit: testl $(EFL_VM),16(%esp) /* is trap from V86 mode? */ jnz EXT(alltraps) /* isn`t kernel trap if so */ - testl $3,12(%esp) /* is trap from kernel mode? */ - jne EXT(alltraps) /* if so: */ + /* Note: handling KERNEL_RING value by hand */ + testl $2,12(%esp) /* is trap from kernel mode? */ + jnz EXT(alltraps) /* if so: */ /* check for the kernel exit sequence */ cmpl $_kret_iret,8(%esp) /* on IRET? */ je fault_iret @@ -410,7 +412,8 @@ push_segregs: ENTRY(t_debug) testl $(EFL_VM),8(%esp) /* is trap from V86 mode? */ jnz 0f /* isn`t kernel trap if so */ - testl $3,4(%esp) /* is trap from kernel mode? */ + /* Note: handling KERNEL_RING value by hand */ + testl $2,4(%esp) /* is trap from kernel mode? */ jnz 0f /* if so: */ cmpl $syscall_entry,(%esp) /* system call entry? 
*/ jne 0f /* if so: */ @@ -429,7 +432,11 @@ ENTRY(t_debug) ENTRY(t_page_fault) pushl $(T_PAGE_FAULT) /* mark a page fault trap */ pusha /* save the general registers */ +#ifdef MACH_XEN + movl %ss:hyp_shared_info+CR2,%eax +#else /* MACH_XEN */ movl %cr2,%eax /* get the faulting address */ +#endif /* MACH_XEN */ movl %eax,12(%esp) /* save in esp save slot */ jmp trap_push_segs /* continue fault */ @@ -465,7 +472,8 @@ trap_set_segs: cld /* clear direction flag */ testl $(EFL_VM),R_EFLAGS(%esp) /* in V86 mode? */ jnz trap_from_user /* user mode trap if so */ - testb $3,R_CS(%esp) /* user mode trap? */ + /* Note: handling KERNEL_RING value by hand */ + testb $2,R_CS(%esp) /* user mode trap? */ jz trap_from_kernel /* kernel trap if not */ trap_from_user: @@ -679,7 +687,8 @@ LEXT(return_to_iret) /* ( label for kdb_kintr and hardclock) */ testl $(EFL_VM),I_EFL(%esp) /* if in V86 */ jnz 0f /* or */ - testb $3,I_CS(%esp) /* user mode, */ + /* Note: handling KERNEL_RING value by hand */ + testb $2,I_CS(%esp) /* user mode, */ jz 1f /* check for ASTs */ 0: cmpl $0,CX(EXT(need_ast),%edx) @@ -1156,9 +1165,14 @@ ENTRY(discover_x86_cpu_type) movl %esp,%ebp /* Save stack pointer */ and $~0x3,%esp /* Align stack pointer */ +#ifdef MACH_HYP +#warning Assuming not Cyrix CPU +#else /* MACH_HYP */ inb $0xe8,%al /* Enable ID flag for Cyrix CPU ... */ andb $0x80,%al /* ... in CCR4 reg bit7 */ outb %al,$0xe8 +#endif /* MACH_HYP */ + pushfl /* Fetch flags ... */ popl %eax /* ... into eax */ movl %eax,%ecx /* Save original flags for return */ @@ -1266,13 +1280,24 @@ Entry(copyoutmsg) * XXX only have to do this on 386's. */ copyout_retry: +#ifdef MACH_HYP + movl cr3,%ecx /* point to page directory */ +#else /* MACH_HYP */ movl %cr3,%ecx /* point to page directory */ +#endif /* MACH_HYP */ #if PAE movl %edi,%eax /* get page directory pointer bits */ shrl $(PDPSHIFT),%eax /* from user address */ movl KERNELBASE(%ecx,%eax,PTE_SIZE),%ecx /* get page directory pointer */ +#ifdef MACH_PSEUDO_PHYS + shrl $(PTESHIFT),%ecx + movl pfn_list,%eax + movl (%eax,%ecx,4),%ecx /* mfn_to_pfn */ + shll $(PTESHIFT),%ecx +#else /* MACH_PSEUDO_PHYS */ andl $(PTE_PFN),%ecx /* isolate page frame address */ +#endif /* MACH_PSEUDO_PHYS */ #endif /* PAE */ movl %edi,%eax /* get page directory bits */ shrl $(PDESHIFT),%eax /* from user address */ @@ -1283,7 +1308,14 @@ copyout_retry: /* get page directory pointer */ testl $(PTE_V),%ecx /* present? */ jz 0f /* if not, fault is OK */ +#ifdef MACH_PSEUDO_PHYS + shrl $(PTESHIFT),%ecx + movl pfn_list,%eax + movl (%eax,%ecx,4),%ecx /* mfn_to_pfn */ + shll $(PTESHIFT),%ecx +#else /* MACH_PSEUDO_PHYS */ andl $(PTE_PFN),%ecx /* isolate page frame address */ +#endif /* MACH_PSEUDO_PHYS */ movl %edi,%eax /* get page table bits */ shrl $(PTESHIFT),%eax andl $(PTEMASK),%eax /* from user address */ @@ -1297,9 +1329,17 @@ copyout_retry: /* * Not writable - must fake a fault. Turn off access to the page. 
*/ +#ifdef MACH_HYP + pushl %edx + pushl %ecx + call hyp_invalidate_pte + popl %ecx + popl %edx +#else /* MACH_HYP */ andl $(PTE_INVALID),(%ecx) /* turn off valid bit */ movl %cr3,%eax /* invalidate TLB */ movl %eax,%cr3 +#endif /* MACH_HYP */ 0: /* diff --git a/i386/i386/mp_desc.c b/i386/i386/mp_desc.c index 54660d5..2fd5ec2 100644 --- a/i386/i386/mp_desc.c +++ b/i386/i386/mp_desc.c @@ -31,6 +31,7 @@ #include <kern/cpu_number.h> #include <kern/debug.h> #include <mach/machine.h> +#include <mach/xen.h> #include <vm/vm_kern.h> #include <i386/mp_desc.h> @@ -149,6 +150,9 @@ mp_desc_init(mycpu) * Fix up the entries in the GDT to point to * this LDT and this TSS. */ +#ifdef MACH_HYP + panic("TODO %s:%d\n",__FILE__,__LINE__); +#else /* MACH_HYP */ fill_descriptor(&mpt->gdt[sel_idx(KERNEL_LDT)], (unsigned)&mpt->ldt, LDTSZ * sizeof(struct real_descriptor) - 1, @@ -161,6 +165,7 @@ mp_desc_init(mycpu) mpt->ktss.tss.ss0 = KERNEL_DS; mpt->ktss.tss.io_bit_map_offset = IOPB_INVAL; mpt->ktss.barrier = 0xFF; +#endif /* MACH_HYP */ return mpt; } diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c index 3226195..b9c52dd 100644 --- a/i386/i386/pcb.c +++ b/i386/i386/pcb.c @@ -31,6 +31,7 @@ #include <mach/kern_return.h> #include <mach/thread_status.h> #include <mach/exec/exec.h> +#include <mach/xen.h> #include "vm_param.h" #include <kern/counters.h> @@ -152,7 +153,12 @@ void switch_ktss(pcb) ? (int) (&pcb->iss + 1) : (int) (&pcb->iss.v86_segs); +#ifdef MACH_XEN + /* No IO mask here */ + hyp_stack_switch(KERNEL_DS, pcb_stack_top); +#else /* MACH_XEN */ curr_ktss(mycpu)->tss.esp0 = pcb_stack_top; +#endif /* MACH_XEN */ } { @@ -164,22 +170,47 @@ void switch_ktss(pcb) /* * Use system LDT. */ +#ifdef MACH_HYP + hyp_set_ldt(&ldt, LDTSZ); +#else /* MACH_HYP */ set_ldt(KERNEL_LDT); +#endif /* MACH_HYP */ } else { /* * Thread has its own LDT. */ +#ifdef MACH_HYP + hyp_set_ldt(tldt->ldt, + (tldt->desc.limit_low|(tldt->desc.limit_high<<16)) / + sizeof(struct real_descriptor)); +#else /* MACH_HYP */ *gdt_desc_p(mycpu,USER_LDT) = tldt->desc; set_ldt(USER_LDT); +#endif /* MACH_HYP */ } } +#ifdef MACH_XEN + { + int i; + for (i=0; i < USER_GDT_SLOTS; i++) { + if (memcmp(gdt_desc_p (mycpu, USER_GDT + (i << 3)), + &pcb->ims.user_gdt[i], sizeof pcb->ims.user_gdt[i])) { + if (hyp_do_update_descriptor(kv_to_ma(gdt_desc_p (mycpu, USER_GDT + (i << 3))), + *(unsigned long long *) &pcb->ims.user_gdt[i])) + panic("couldn't set user gdt %d\n",i); + } + } + } +#else /* MACH_XEN */ + /* Copy in the per-thread GDT slots. No reloading is necessary because just restoring the segment registers on the way back to user mode reloads the shadow registers from the in-memory GDT. */ memcpy (gdt_desc_p (mycpu, USER_GDT), pcb->ims.user_gdt, sizeof pcb->ims.user_gdt); +#endif /* MACH_XEN */ /* * Load the floating-point context, if necessary. 
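The MACH_PSEUDO_PHYS translations that keep appearing (mfn_to_pfn in the copyoutmsg path above, ma_to_pa in phys.c just below) come down to two lookup tables. A minimal sketch, using the names from the diff; the exact definitions here are assumptions, not the committed code:

/* A Xen PV guest sees contiguous "pseudo-physical" frames (pfns) while
 * its actual machine frames (mfns) are scattered; two tables convert
 * between the two views. */
extern unsigned long *mfn_list;	/* pfn -> mfn, from Xen's start_info */
extern unsigned long *pfn_list;	/* mfn -> pfn, machine-to-phys table */

#define pfn_to_mfn(pfn)	(mfn_list[(pfn)])
#define mfn_to_pfn(mfn)	(pfn_list[(mfn)])

/* Full addresses translate the frame number and keep the page offset,
 * which is what ma_to_pa does for kvtophys() in phys.c below: */
#define ma_to_pa(ma) \
	((mfn_to_pfn(((vm_offset_t) (ma)) >> PAGE_SHIFT) << PAGE_SHIFT) \
	 | (((vm_offset_t) (ma)) & PAGE_MASK))

This is also why WRITE_PTE in pmap.c (further below) applies pa_to_ma before storing an entry: page tables hold machine frame numbers, not the pseudo-physical ones the rest of the VM system tracks.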
diff --git a/i386/i386/phys.c b/i386/i386/phys.c index 2c30f17..925593b 100644 --- a/i386/i386/phys.c +++ b/i386/i386/phys.c @@ -27,6 +27,7 @@ #include <string.h> #include <mach/boolean.h> +#include <mach/xen.h> #include <kern/task.h> #include <kern/thread.h> #include <vm/vm_map.h> @@ -104,5 +105,9 @@ vm_offset_t addr; if ((pte = pmap_pte(kernel_pmap, addr)) == PT_ENTRY_NULL) return 0; - return i386_trunc_page(*pte) | (addr & INTEL_OFFMASK); + return i386_trunc_page( +#ifdef MACH_PSEUDO_PHYS + ma_to_pa +#endif /* MACH_PSEUDO_PHYS */ + (*pte)) | (addr & INTEL_OFFMASK); } diff --git a/i386/i386/proc_reg.h b/i386/i386/proc_reg.h index d9f32bc..64d8c43 100644 --- a/i386/i386/proc_reg.h +++ b/i386/i386/proc_reg.h @@ -72,8 +72,10 @@ #ifndef __ASSEMBLER__ #ifdef __GNUC__ +#ifndef MACH_HYP #include <i386/gdt.h> #include <i386/ldt.h> +#endif /* MACH_HYP */ static inline unsigned get_eflags(void) @@ -122,6 +124,16 @@ set_eflags(unsigned eflags) _temp__; \ }) +#ifdef MACH_HYP +extern unsigned long cr3; +#define get_cr3() (cr3) +#define set_cr3(value) \ + ({ \ + cr3 = (value); \ + if (!hyp_set_cr3(value)) \ + panic("set_cr3"); \ + }) +#else /* MACH_HYP */ #define get_cr3() \ ({ \ register unsigned int _temp__; \ @@ -134,9 +146,11 @@ set_eflags(unsigned eflags) register unsigned int _temp__ = (value); \ asm volatile("mov %0, %%cr3" : : "r" (_temp__)); \ }) +#endif /* MACH_HYP */ #define flush_tlb() set_cr3(get_cr3()) +#ifndef MACH_HYP #define invlpg(addr) \ ({ \ asm volatile("invlpg (%0)" : : "r" (addr)); \ @@ -164,6 +178,7 @@ set_eflags(unsigned eflags) : "+r" (var) : "r" (end), \ "q" (LINEAR_DS), "q" (KERNEL_DS), "i" (PAGE_SIZE)); \ }) +#endif /* MACH_HYP */ #define get_cr4() \ ({ \ @@ -179,11 +194,18 @@ set_eflags(unsigned eflags) }) +#ifdef MACH_HYP +#define set_ts() \ + hyp_fpu_taskswitch(1) +#define clear_ts() \ + hyp_fpu_taskswitch(0) +#else /* MACH_HYP */ #define set_ts() \ set_cr0(get_cr0() | CR0_TS) #define clear_ts() \ asm volatile("clts") +#endif /* MACH_HYP */ #define get_tr() \ ({ \ diff --git a/i386/i386/seg.h b/i386/i386/seg.h index 9a09af5..01b1a2e 100644 --- a/i386/i386/seg.h +++ b/i386/i386/seg.h @@ -37,7 +37,12 @@ * i386 segmentation. */ +/* Note: the value of KERNEL_RING is handled by hand in locore.S */ +#ifdef MACH_HYP +#define KERNEL_RING 1 +#else /* MACH_HYP */ #define KERNEL_RING 0 +#endif /* MACH_HYP */ #ifndef __ASSEMBLER__ @@ -118,6 +123,7 @@ struct real_gate { #ifndef __ASSEMBLER__ #include <mach/inline.h> +#include <mach/xen.h> /* Format of a "pseudo-descriptor", used for loading the IDT and GDT. */ @@ -152,9 +158,15 @@ MACH_INLINE void lldt(unsigned short ldt_selector) /* Fill a segment descriptor. 
*/ MACH_INLINE void -fill_descriptor(struct real_descriptor *desc, unsigned base, unsigned limit, +fill_descriptor(struct real_descriptor *_desc, unsigned base, unsigned limit, unsigned char access, unsigned char sizebits) { + /* TODO: when !MACH_XEN, setting desc and just memcpy isn't simpler actually */ +#ifdef MACH_XEN + struct real_descriptor __desc, *desc = &__desc; +#else /* MACH_XEN */ + struct real_descriptor *desc = _desc; +#endif /* MACH_XEN */ if (limit > 0xfffff) { limit >>= 12; @@ -167,6 +179,10 @@ fill_descriptor(struct real_descriptor *desc, unsigned base, unsigned limit, desc->limit_high = limit >> 16; desc->granularity = sizebits; desc->base_high = base >> 24; +#ifdef MACH_XEN + if (hyp_do_update_descriptor(kv_to_ma(_desc), *(unsigned long long*)desc)) + panic("couldn't update descriptor(%p to %08lx%08lx)\n", kv_to_ma(_desc), *(((unsigned long*)desc)+1), *(unsigned long *)desc); +#endif /* MACH_XEN */ } /* Fill a gate with particular values. */ diff --git a/i386/i386/spl.S b/i386/i386/spl.S index f77b556..f1d4b45 100644 --- a/i386/i386/spl.S +++ b/i386/i386/spl.S @@ -20,6 +20,8 @@ #include <mach/machine/asm.h> #include <i386/ipl.h> #include <i386/pic.h> +#include <i386/i386asm.h> +#include <i386/xen.h> /* * Set IPL to the specified value. @@ -42,6 +44,7 @@ /* * Program PICs with mask in %eax. */ +#ifndef MACH_XEN #define SETMASK() \ cmpl EXT(curr_pic_mask),%eax; \ je 9f; \ @@ -50,6 +53,21 @@ movb %ah,%al; \ outb %al,$(PIC_SLAVE_OCW); \ 9: +#else /* MACH_XEN */ +#define pic_mask int_mask +#define SETMASK() \ + pushl %ebx; \ + movl %eax,%ebx; \ + xchgl %eax,hyp_shared_info+EVTMASK; \ + notl %ebx; \ + andl %eax,%ebx; /* Get unmasked events */ \ + testl hyp_shared_info+PENDING, %ebx; \ + popl %ebx; \ + jz 9f; /* Check whether there was some pending */ \ +lock orl $1,hyp_shared_info+CPU_PENDING_SEL; /* Yes, activate it */ \ + movb $1,hyp_shared_info+CPU_PENDING; \ +9: +#endif /* MACH_XEN */ ENTRY(spl0) movl EXT(curr_ipl),%eax /* save current ipl */ diff --git a/i386/i386/trap.c b/i386/i386/trap.c index 4361fcd..28a9e0c 100644 --- a/i386/i386/trap.c +++ b/i386/i386/trap.c @@ -585,6 +585,7 @@ i386_astintr() int mycpu = cpu_number(); (void) splsched(); /* block interrupts to check reasons */ +#ifndef MACH_XEN if (need_ast[mycpu] & AST_I386_FP) { /* * AST was for delayed floating-point exception - @@ -596,7 +597,9 @@ i386_astintr() fpastintr(); } - else { + else +#endif /* MACH_XEN */ + { /* * Not an FPU trap. Handle the AST. * Interrupts are still blocked. 
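The Xen variant of SETMASK() in spl.S above is dense; here is the same logic in C, as a sketch only (xen_setmask is a hypothetical name, xchgl() is the inline from the new xen.h, and hyp_shared_info is the shared-info page that pmap_bootstrap() maps further below):

/* Changing spl swaps in a new event-channel mask instead of programming
 * the PICs.  Xen does not retrigger an upcall when a pending event gets
 * unmasked, so the code must notice that case itself and re-assert the
 * vcpu's pending flags. */
static void xen_setmask(unsigned long new_mask)
{
	unsigned long old_mask, unmasked;

	old_mask = xchgl(&hyp_shared_info.evtchn_mask[0], new_mask);
	unmasked = old_mask & ~new_mask;  /* events enabled by this change */

	if (unmasked & hyp_shared_info.evtchn_pending[0]) {
		/* mirrors: lock orl $1,...+CPU_PENDING_SEL and
		 * movb $1,...+CPU_PENDING in the assembly */
		__sync_fetch_and_or(&hyp_shared_info.vcpu_info[0].evtchn_pending_sel, 1UL);
		hyp_shared_info.vcpu_info[0].evtchn_upcall_pending = 1;
	}
}

(The sketch only handles word 0 of the pending/mask arrays, matching the 32 event channels the assembly macro touches.)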
diff --git a/i386/i386/user_ldt.c b/i386/i386/user_ldt.c index 942ad07..dfe6b1e 100644 --- a/i386/i386/user_ldt.c +++ b/i386/i386/user_ldt.c @@ -39,6 +39,7 @@ #include <i386/seg.h> #include <i386/thread.h> #include <i386/user_ldt.h> +#include <stddef.h> #include "ldt.h" #include "vm_param.h" @@ -195,9 +196,17 @@ i386_set_ldt(thread, first_selector, desc_list, count, desc_list_inline) if (new_ldt == 0) { simple_unlock(&pcb->lock); +#ifdef MACH_XEN + /* LDT needs to be aligned on a page */ + vm_offset_t alloc = kalloc(ldt_size_needed + PAGE_SIZE + offsetof(struct user_ldt, ldt)); + new_ldt = (user_ldt_t) (round_page((alloc + offsetof(struct user_ldt, ldt))) - offsetof(struct user_ldt, ldt)); + new_ldt->alloc = alloc; + +#else /* MACH_XEN */ new_ldt = (user_ldt_t) kalloc(ldt_size_needed + sizeof(struct real_descriptor)); +#endif /* MACH_XEN */ /* * Build a descriptor that describes the * LDT itself @@ -263,9 +272,19 @@ i386_set_ldt(thread, first_selector, desc_list, count, desc_list_inline) simple_unlock(&pcb->lock); if (new_ldt) +#ifdef MACH_XEN + { + int i; + for (i=0; i<(new_ldt->desc.limit_low + 1)/sizeof(struct real_descriptor); i+=PAGE_SIZE/sizeof(struct real_descriptor)) + pmap_set_page_readwrite(&new_ldt->ldt[i]); + kfree(new_ldt->alloc, new_ldt->desc.limit_low + 1 + + PAGE_SIZE + offsetof(struct user_ldt, ldt)); + } +#else /* MACH_XEN */ kfree((vm_offset_t)new_ldt, new_ldt->desc.limit_low + 1 + sizeof(struct real_descriptor)); +#endif /* MACH_XEN */ /* * Free the descriptor list, if it was @@ -398,9 +417,17 @@ void user_ldt_free(user_ldt) user_ldt_t user_ldt; { +#ifdef MACH_XEN + int i; + for (i=0; i<(user_ldt->desc.limit_low + 1)/sizeof(struct real_descriptor); i+=PAGE_SIZE/sizeof(struct real_descriptor)) + pmap_set_page_readwrite(&user_ldt->ldt[i]); + kfree(user_ldt->alloc, user_ldt->desc.limit_low + 1 + + PAGE_SIZE + offsetof(struct user_ldt, ldt)); +#else /* MACH_XEN */ kfree((vm_offset_t)user_ldt, user_ldt->desc.limit_low + 1 + sizeof(struct real_descriptor)); +#endif /* MACH_XEN */ } diff --git a/i386/i386/user_ldt.h b/i386/i386/user_ldt.h index dd3ad4b..8d16ed8 100644 --- a/i386/i386/user_ldt.h +++ b/i386/i386/user_ldt.h @@ -36,6 +36,9 @@ #include <i386/seg.h> struct user_ldt { +#ifdef MACH_XEN + vm_offset_t alloc; /* allocation before alignment */ +#endif /* MACH_XEN */ struct real_descriptor desc; /* descriptor for self */ struct real_descriptor ldt[1]; /* descriptor table (variable) */ }; diff --git a/i386/i386/vm_param.h b/i386/i386/vm_param.h index 8e92e79..95df604 100644 --- a/i386/i386/vm_param.h +++ b/i386/i386/vm_param.h @@ -25,10 +25,25 @@ /* XXX use xu/vm_param.h */ #include <mach/vm_param.h> +#include <xen/public/xen.h> /* The kernel address space is 1GB, starting at virtual address 0. 
*/ -#define VM_MIN_KERNEL_ADDRESS (0x00000000) -#define VM_MAX_KERNEL_ADDRESS ((LINEAR_MAX_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS)) +#ifdef MACH_XEN +#define VM_MIN_KERNEL_ADDRESS 0x20000000UL +#else /* MACH_XEN */ +#define VM_MIN_KERNEL_ADDRESS 0x00000000UL +#endif /* MACH_XEN */ + +#ifdef MACH_XEN +#if PAE +#define HYP_VIRT_START HYPERVISOR_VIRT_START_PAE +#else /* PAE */ +#define HYP_VIRT_START HYPERVISOR_VIRT_START_NONPAE +#endif /* PAE */ +#define VM_MAX_KERNEL_ADDRESS (HYP_VIRT_START - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS) +#else /* MACH_XEN */ +#define VM_MAX_KERNEL_ADDRESS (LINEAR_MAX_KERNEL_ADDRESS - LINEAR_MIN_KERNEL_ADDRESS + VM_MIN_KERNEL_ADDRESS) +#endif /* MACH_XEN */ /* The kernel virtual address space is actually located at high linear addresses. @@ -36,8 +51,14 @@ #define LINEAR_MIN_KERNEL_ADDRESS (VM_MAX_ADDRESS) #define LINEAR_MAX_KERNEL_ADDRESS (0xffffffffUL) +#ifdef MACH_XEN +/* need room for mmu updates (2*8bytes) */ +#define KERNEL_STACK_SIZE (4*I386_PGBYTES) +#define INTSTACK_SIZE (4*I386_PGBYTES) +#else /* MACH_XEN */ #define KERNEL_STACK_SIZE (1*I386_PGBYTES) #define INTSTACK_SIZE (1*I386_PGBYTES) +#endif /* MACH_XEN */ /* interrupt stack size */ /* diff --git a/i386/i386/xen.h b/i386/i386/xen.h new file mode 100644 index 0000000..a7fb641 --- /dev/null +++ b/i386/i386/xen.h @@ -0,0 +1,357 @@ +/* + * Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org> + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef XEN_HYPCALL_H +#define XEN_HYPCALL_H + +#ifdef MACH_XEN +#ifndef __ASSEMBLER__ +#include <kern/printf.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_param.h> +#include <mach/inline.h> +#include <machine/vm_param.h> +#include <intel/pmap.h> +#include <kern/debug.h> +#include <xen/public/xen.h> + +/* TODO: this should be moved in appropriate non-Xen place. */ +#define barrier() __asm__ __volatile__ ("": : :"memory") +#define mb() __asm__ __volatile__("lock; addl $0,0(%esp)") +#define rmb() mb() +#define wmb() mb() +MACH_INLINE unsigned long xchgl(volatile unsigned long *ptr, unsigned long x) +{ + __asm__ __volatile__("xchgl %0, %1" + : "=r" (x) + : "m" (*(ptr)), "0" (x): "memory"); + return x; +} +#define _TOSTR(x) #x +#define TOSTR(x) _TOSTR (x) + + + +/* x86-specific hypercall interface. 
*/ +#define _hypcall0(type, name) \ +MACH_INLINE type hyp_##name(void) \ +{ \ + long __ret; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret) \ + : : "memory"); \ + return __ret; \ +} + +#define _hypcall1(type, name, type1, arg1) \ +MACH_INLINE type hyp_##name(type1 arg1) \ +{ \ + long __ret; \ + long foo1; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret), \ + "=b" (foo1) \ + : "1" ((long)arg1) \ + : "memory"); \ + return __ret; \ +} + +#define _hypcall2(type, name, type1, arg1, type2, arg2) \ +MACH_INLINE type hyp_##name(type1 arg1, type2 arg2) \ +{ \ + long __ret; \ + long foo1, foo2; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret), \ + "=b" (foo1), \ + "=c" (foo2) \ + : "1" ((long)arg1), \ + "2" ((long)arg2) \ + : "memory"); \ + return __ret; \ +} + +#define _hypcall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ +MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3) \ +{ \ + long __ret; \ + long foo1, foo2, foo3; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret), \ + "=b" (foo1), \ + "=c" (foo2), \ + "=d" (foo3) \ + : "1" ((long)arg1), \ + "2" ((long)arg2), \ + "3" ((long)arg3) \ + : "memory"); \ + return __ret; \ +} + +#define _hypcall4(type, name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ +MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) \ +{ \ + long __ret; \ + long foo1, foo2, foo3, foo4; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret), \ + "=b" (foo1), \ + "=c" (foo2), \ + "=d" (foo3), \ + "=S" (foo4) \ + : "1" ((long)arg1), \ + "2" ((long)arg2), \ + "3" ((long)arg3), \ + "4" ((long)arg4) \ + : "memory"); \ + return __ret; \ +} + +#define _hypcall5(type, name, type1, arg1, type2, arg2, type3, arg3, type4, arg4, type5, arg5) \ +MACH_INLINE type hyp_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ +{ \ + long __ret; \ + long foo1, foo2, foo3, foo4, foo5; \ + asm volatile ("call hypcalls+("TOSTR(__HYPERVISOR_##name)"*32)" \ + : "=a" (__ret), \ + "=b" (foo1), \ + "=c" (foo2), \ + "=d" (foo3), \ + "=S" (foo4), \ + "=D" (foo5) \ + : "1" ((long)arg1), \ + "2" ((long)arg2), \ + "3" ((long)arg3), \ + "4" ((long)arg4), \ + "5" ((long)arg5) \ + : "memory"); \ + return __ret; \ +} + +/* x86 Hypercalls */ + +/* Note: since Hypervisor uses flat memory model, remember to always use + * kvtolin when giving pointers as parameters for the hypercall to read data + * at. Use kv_to_la when they may be used before GDT got set up. 
*/ + +_hypcall1(long, set_trap_table, vm_offset_t /* struct trap_info * */, traps); + +_hypcall4(int, mmu_update, vm_offset_t /* struct mmu_update * */, req, int, count, vm_offset_t /* int * */, success_count, domid_t, domid) +MACH_INLINE int hyp_mmu_update_pte(unsigned long pte, unsigned long long val) +{ + struct mmu_update update = + { + .ptr = pte, + .val = val, + }; + int count; + hyp_mmu_update(kv_to_la(&update), 1, kv_to_la(&count), DOMID_SELF); + return count; +} +/* Note: make sure this fits in KERNEL_STACK_SIZE */ +#define HYP_BATCH_MMU_UPDATES 256 + +#define hyp_mmu_update_la(la, val) hyp_mmu_update_pte( \ + (unsigned long)(((pt_entry_t*)(kernel_pmap->dirbase[lin2pdenum((unsigned long)la)] & INTEL_PTE_PFN)) \ + + ptenum((unsigned long)la)), val) + +_hypcall2(long, set_gdt, vm_offset_t /* unsigned long * */, frame_list, unsigned int, entries) + +_hypcall2(long, stack_switch, unsigned long, ss, unsigned long, esp); + +_hypcall4(long, set_callbacks, unsigned long, es, void *, ea, + unsigned long, fss, void *, fsa); +_hypcall1(long, fpu_taskswitch, int, set); + +_hypcall4(long, update_descriptor, unsigned long, ma_lo, unsigned long, ma_hi, unsigned long, desc_lo, unsigned long, desc_hi); +#define hyp_do_update_descriptor(ma, desc) ({ \ + unsigned long long __desc = (desc); \ + hyp_update_descriptor(ma, 0, __desc, __desc >> 32); \ +}) + +#include <xen/public/memory.h> +_hypcall2(long, memory_op, unsigned long, cmd, vm_offset_t /* void * */, arg); +MACH_INLINE void hyp_free_mfn(unsigned long mfn) +{ + struct xen_memory_reservation reservation; + reservation.extent_start = (void*) kvtolin(&mfn); + reservation.nr_extents = 1; + reservation.extent_order = 0; + reservation.address_bits = 0; + reservation.domid = DOMID_SELF; + if (hyp_memory_op(XENMEM_decrease_reservation, kvtolin(&reservation)) != 1) + panic("couldn't free page %d\n", mfn); +} + +_hypcall4(int, update_va_mapping, unsigned long, va, unsigned long, val_lo, unsigned long, val_hi, unsigned long, flags); +#define hyp_do_update_va_mapping(va, val, flags) ({ \ + unsigned long long __val = (val); \ + hyp_update_va_mapping(va, __val & 0xffffffffU, __val >> 32, flags); \ +}) + +MACH_INLINE void hyp_free_page(unsigned long pfn, void *va) +{ + /* save mfn */ + unsigned long mfn = pfn_to_mfn(pfn); + + /* remove from mappings */ + if (hyp_do_update_va_mapping(kvtolin(va), 0, UVMF_INVLPG|UVMF_ALL)) + panic("couldn't clear page %d at %p\n", pfn, va); + +#ifdef MACH_PSEUDO_PHYS + /* drop machine page */ + mfn_list[pfn] = ~0; +#endif /* MACH_PSEUDO_PHYS */ + + /* and free from Xen */ + hyp_free_mfn(mfn); +} + +_hypcall4(int, mmuext_op, vm_offset_t /* struct mmuext_op * */, op, int, count, vm_offset_t /* int * */, success_count, domid_t, domid); +MACH_INLINE int hyp_mmuext_op_void(unsigned int cmd) +{ + struct mmuext_op op = { + .cmd = cmd, + }; + int count; + hyp_mmuext_op(kv_to_la(&op), 1, kv_to_la(&count), DOMID_SELF); + return count; +} +MACH_INLINE int hyp_mmuext_op_mfn(unsigned int cmd, unsigned long mfn) +{ + struct mmuext_op op = { + .cmd = cmd, + .arg1.mfn = mfn, + }; + int count; + hyp_mmuext_op(kv_to_la(&op), 1, kv_to_la(&count), DOMID_SELF); + return count; +} +MACH_INLINE void hyp_set_ldt(void *ldt, unsigned long nbentries) { + struct mmuext_op op = { + .cmd = MMUEXT_SET_LDT, + .arg1.linear_addr = kvtolin(ldt), + .arg2.nr_ents = nbentries, + }; + int count; + if (((unsigned long)ldt) & PAGE_MASK) + panic("ldt %p is not aligned on a page\n", ldt); + for (count=0; count<nbentries; count+= PAGE_SIZE/8) + 
pmap_set_page_readonly(ldt+count*8); + hyp_mmuext_op(kvtolin(&op), 1, kvtolin(&count), DOMID_SELF); + if (!count) + panic("couldn't set LDT\n"); +} +/* TODO: use xen_pfn_to_cr3/xen_cr3_to_pfn to cope with pdp above 4GB */ +#define hyp_set_cr3(value) hyp_mmuext_op_mfn(MMUEXT_NEW_BASEPTR, pa_to_mfn(value)) +MACH_INLINE void hyp_invlpg(vm_offset_t lin) { + struct mmuext_op ops; + int n; + ops.cmd = MMUEXT_INVLPG_ALL; + ops.arg1.linear_addr = lin; + hyp_mmuext_op(kvtolin(&ops), 1, kvtolin(&n), DOMID_SELF); + if (n < 1) + panic("couldn't invlpg\n"); +} + +_hypcall2(long, set_timer_op, unsigned long, absolute_lo, unsigned long, absolute_hi); +#define hyp_do_set_timer_op(absolute_nsec) ({ \ + unsigned long long __absolute = (absolute_nsec); \ + hyp_set_timer_op(__absolute, __absolute >> 32); \ +}) + +#include <xen/public/event_channel.h> +_hypcall1(int, event_channel_op, vm_offset_t /* evtchn_op_t * */, op); +MACH_INLINE int hyp_event_channel_send(evtchn_port_t port) { + evtchn_op_t op = { + .cmd = EVTCHNOP_send, + .u.send.port = port, + }; + return hyp_event_channel_op(kvtolin(&op)); +} +MACH_INLINE evtchn_port_t hyp_event_channel_alloc(domid_t domid) { + evtchn_op_t op = { + .cmd = EVTCHNOP_alloc_unbound, + .u.alloc_unbound.dom = DOMID_SELF, + .u.alloc_unbound.remote_dom = domid, + }; + if (hyp_event_channel_op(kvtolin(&op))) + panic("couldn't allocate event channel"); + return op.u.alloc_unbound.port; +} +MACH_INLINE evtchn_port_t hyp_event_channel_bind_virq(uint32_t virq, uint32_t vcpu) { + evtchn_op_t op = { .cmd = EVTCHNOP_bind_virq, .u.bind_virq = { .virq = virq, .vcpu = vcpu }}; + if (hyp_event_channel_op(kvtolin(&op))) + panic("can't bind virq %d\n",virq); + return op.u.bind_virq.port; +} + +_hypcall3(int, console_io, int, cmd, int, count, vm_offset_t /* const char * */, buffer); + +_hypcall3(long, grant_table_op, unsigned int, cmd, vm_offset_t /* void * */, uop, unsigned int, count); + +_hypcall2(long, vm_assist, unsigned int, cmd, unsigned int, type); + +_hypcall0(long, iret); + +#include <xen/public/sched.h> +_hypcall2(long, sched_op, int, cmd, vm_offset_t /* void* */, arg) +#define hyp_yield() hyp_sched_op(SCHEDOP_yield, 0) +#define hyp_block() hyp_sched_op(SCHEDOP_block, 0) +MACH_INLINE void __attribute__((noreturn)) hyp_crash(void) +{ + unsigned int shut = SHUTDOWN_crash; + hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut)); + /* really shouldn't return */ + printf("uh, shutdown returned?!\n"); + for(;;); +} + +MACH_INLINE void __attribute__((noreturn)) hyp_halt(void) +{ + unsigned int shut = SHUTDOWN_poweroff; + hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut)); + /* really shouldn't return */ + printf("uh, shutdown returned?!\n"); + for(;;); +} + +MACH_INLINE void __attribute__((noreturn)) hyp_reboot(void) +{ + unsigned int shut = SHUTDOWN_reboot; + hyp_sched_op(SCHEDOP_shutdown, kvtolin(&shut)); + /* really shouldn't return */ + printf("uh, reboot returned?!\n"); + for(;;); +} + +/* x86-specific */ +MACH_INLINE unsigned64_t hyp_cpu_clock(void) { + unsigned64_t tsc; + asm volatile("rdtsc":"=A"(tsc)); + return tsc; +} + +#else /* __ASSEMBLER__ */ +/* TODO: SMP */ +#define cli movb $0xff,hyp_shared_info+CPU_CLI +#define sti call hyp_sti +#endif /* ASSEMBLER */ +#endif /* MACH_XEN */ + +#endif /* XEN_HYPCALL_H */ diff --git a/i386/i386at/conf.c b/i386/i386at/conf.c index 23c2a6f..f5ab36c 100644 --- a/i386/i386at/conf.c +++ b/i386/i386at/conf.c @@ -34,6 +34,7 @@ extern int timeopen(), timeclose(); extern vm_offset_t timemmap(); #define timename "time" +#ifndef MACH_HYP extern int kdopen(), 
kdclose(), kdread(), kdwrite(); extern int kdgetstat(), kdsetstat(), kdportdeath(); extern vm_offset_t kdmmap(); @@ -50,17 +51,26 @@ extern int lpropen(), lprclose(), lprread(), lprwrite(); extern int lprgetstat(), lprsetstat(), lprportdeath(); #define lprname "lpr" #endif /* NLPR > 0 */ +#endif /* MACH_HYP */ extern int kbdopen(), kbdclose(), kbdread(); extern int kbdgetstat(), kbdsetstat(); #define kbdname "kbd" +#ifndef MACH_HYP extern int mouseopen(), mouseclose(), mouseread(), mousegetstat(); #define mousename "mouse" +#endif /* MACH_HYP */ extern int kmsgopen(), kmsgclose(), kmsgread(), kmsggetstat(); #define kmsgname "kmsg" +#ifdef MACH_HYP +extern int hypcnopen(), hypcnclose(), hypcnread(), hypcnwrite(); +extern int hypcngetstat(), hypcnsetstat(), hypcnportdeath(); +#define hypcnname "hyp" +#endif /* MACH_HYP */ + /* * List of devices - console must be at slot 0 */ @@ -79,16 +89,19 @@ struct dev_ops dev_name_list[] = nodev, nulldev, nulldev, 0, nodev }, +#ifndef MACH_HYP { kdname, kdopen, kdclose, kdread, kdwrite, kdgetstat, kdsetstat, kdmmap, nodev, nulldev, kdportdeath, 0, nodev }, +#endif /* MACH_HYP */ { timename, timeopen, timeclose, nulldev, nulldev, nulldev, nulldev, timemmap, nodev, nulldev, nulldev, 0, nodev }, +#ifndef MACH_HYP #if NCOM > 0 { comname, comopen, comclose, comread, comwrite, comgetstat, comsetstat, nomap, @@ -107,6 +120,7 @@ struct dev_ops dev_name_list[] = nodev, mousegetstat, nulldev, nomap, nodev, nulldev, nulldev, 0, nodev }, +#endif /* MACH_HYP */ { kbdname, kbdopen, kbdclose, kbdread, nodev, kbdgetstat, kbdsetstat, nomap, @@ -120,6 +134,13 @@ struct dev_ops dev_name_list[] = nodev }, #endif +#ifdef MACH_HYP + { hypcnname, hypcnopen, hypcnclose, hypcnread, + hypcnwrite, hypcngetstat, hypcnsetstat, nomap, + nodev, nulldev, hypcnportdeath, 0, + nodev }, +#endif /* MACH_HYP */ + }; int dev_name_count = sizeof(dev_name_list)/sizeof(dev_name_list[0]); diff --git a/i386/i386at/cons_conf.c b/i386/i386at/cons_conf.c index 8784ed9..ea8ccb5 100644 --- a/i386/i386at/cons_conf.c +++ b/i386/i386at/cons_conf.c @@ -30,19 +30,27 @@ #include <sys/types.h> #include <device/cons.h> +#ifdef MACH_HYP +extern int hypcnprobe(), hypcninit(), hypcngetc(), hypcnputc(); +#else /* MACH_HYP */ extern int kdcnprobe(), kdcninit(), kdcngetc(), kdcnputc(); #if NCOM > 0 && RCLINE >= 0 extern int comcnprobe(), comcninit(), comcngetc(), comcnputc(); #endif +#endif /* MACH_HYP */ /* * The rest of the consdev fields are filled in by the respective * cnprobe routine. 
*/ struct consdev constab[] = { +#ifdef MACH_HYP + {"hyp", hypcnprobe, hypcninit, hypcngetc, hypcnputc}, +#else /* MACH_HYP */ {"kd", kdcnprobe, kdcninit, kdcngetc, kdcnputc}, #if NCOM > 0 && RCLINE >= 0 && 1 {"com", comcnprobe, comcninit, comcngetc, comcnputc}, #endif +#endif /* MACH_HYP */ {0} }; diff --git a/i386/i386at/model_dep.c b/i386/i386at/model_dep.c index 3ebe2e6..61605a1 100644 --- a/i386/i386at/model_dep.c +++ b/i386/i386at/model_dep.c @@ -40,6 +40,7 @@ #include <mach/vm_prot.h> #include <mach/machine.h> #include <mach/machine/multiboot.h> +#include <mach/xen.h> #include <i386/vm_param.h> #include <kern/assert.h> @@ -48,6 +49,7 @@ #include <kern/mach_clock.h> #include <kern/printf.h> #include <sys/time.h> +#include <sys/types.h> #include <vm/vm_page.h> #include <i386/fpu.h> #include <i386/gdt.h> @@ -65,6 +67,12 @@ #include <i386at/int_init.h> #include <i386at/kd.h> #include <i386at/rtc.h> +#ifdef MACH_XEN +#include <xen/console.h> +#include <xen/store.h> +#include <xen/evt.h> +#include <xen/xen.h> +#endif /* MACH_XEN */ /* Location of the kernel's symbol table. Both of these are 0 if none is available. */ @@ -81,7 +89,20 @@ vm_offset_t phys_first_addr = 0; vm_offset_t phys_last_addr; /* A copy of the multiboot info structure passed by the boot loader. */ +#ifdef MACH_XEN +struct start_info boot_info; +#ifdef MACH_PSEUDO_PHYS +unsigned long *mfn_list; +#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS +unsigned long *pfn_list = (void*) PFN_LIST; +#endif +#endif /* MACH_PSEUDO_PHYS */ +#if VM_MIN_KERNEL_ADDRESS != LINEAR_MIN_KERNEL_ADDRESS +unsigned long la_shift = VM_MIN_KERNEL_ADDRESS; +#endif +#else /* MACH_XEN */ struct multiboot_info boot_info; +#endif /* MACH_XEN */ /* Command line supplied to kernel. */ char *kernel_cmdline = ""; @@ -90,7 +111,11 @@ char *kernel_cmdline = ""; it gets bumped up through physical memory that exists and is not occupied by boot gunk. It is not necessarily page-aligned. */ -static vm_offset_t avail_next = 0x1000; /* XX end of BIOS data area */ +static vm_offset_t avail_next +#ifndef MACH_HYP + = 0x1000 /* XX end of BIOS data area */ +#endif /* MACH_HYP */ + ; /* Possibly overestimated amount of available memory still remaining to be handed to the VM system. */ @@ -135,6 +160,9 @@ void machine_init(void) */ init_fpu(); +#ifdef MACH_HYP + hyp_init(); +#else /* MACH_HYP */ #ifdef LINUX_DEV /* * Initialize Linux drivers. @@ -146,16 +174,19 @@ void machine_init(void) * Find the devices */ probeio(); +#endif /* MACH_HYP */ /* * Get the time */ inittodr(); +#ifndef MACH_HYP /* * Tell the BIOS not to clear and test memory. */ *(unsigned short *)phystokv(0x472) = 0x1234; +#endif /* MACH_HYP */ /* * Unmap page 0 to trap NULL references. @@ -166,8 +197,17 @@ void machine_init(void) /* Conserve power on processor CPU. 
*/ void machine_idle (int cpu) { +#ifdef MACH_HYP + hyp_idle(); +#else /* MACH_HYP */ assert (cpu == cpu_number ()); asm volatile ("hlt" : : : "memory"); +#endif /* MACH_HYP */ +} + +void machine_relax () +{ + asm volatile ("rep; nop" : : : "memory"); } /* @@ -175,9 +215,13 @@ void machine_idle (int cpu) */ void halt_cpu(void) { +#ifdef MACH_HYP + hyp_halt(); +#else /* MACH_HYP */ asm volatile("cli"); while (TRUE) machine_idle (cpu_number ()); +#endif /* MACH_HYP */ } /* @@ -187,10 +231,16 @@ void halt_all_cpus(reboot) boolean_t reboot; { if (reboot) { +#ifdef MACH_HYP + hyp_reboot(); +#endif /* MACH_HYP */ kdreboot(); } else { rebootflag = 1; +#ifdef MACH_HYP + hyp_halt(); +#endif /* MACH_HYP */ printf("In tight loop: hit ctl-alt-del to reboot\n"); (void) spl0(); } @@ -215,22 +265,26 @@ void db_reset_cpu(void) void mem_size_init(void) { - vm_size_t phys_last_kb; - /* Physical memory on all PCs starts at physical address 0. XX make it a constant. */ phys_first_addr = 0; - phys_last_kb = 0x400 + boot_info.mem_upper; +#ifdef MACH_HYP + if (boot_info.nr_pages >= 0x100000) { + printf("Truncating memory size to 4GiB\n"); + phys_last_addr = 0xffffffffU; + } else + phys_last_addr = boot_info.nr_pages * 0x1000; +#else /* MACH_HYP */ + /* TODO: support mmap */ + vm_size_t phys_last_kb = 0x400 + boot_info.mem_upper; /* Avoid 4GiB overflow. */ if (phys_last_kb < 0x400 || phys_last_kb >= 0x400000) { printf("Truncating memory size to 4GiB\n"); - phys_last_kb = 0x400000 - 1; - } - - /* TODO: support mmap */ - - phys_last_addr = phys_last_kb * 0x400; + phys_last_addr = 0xffffffffU; + } else + phys_last_addr = phys_last_kb * 0x400; +#endif /* MACH_HYP */ printf("AT386 boot: physical memory from 0x%x to 0x%x\n", phys_first_addr, phys_last_addr); @@ -240,14 +294,20 @@ mem_size_init(void) if (phys_last_addr > ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 6) * 5) { phys_last_addr = ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 6) * 5; printf("Truncating memory size to %dMiB\n", (phys_last_addr - phys_first_addr) / (1024 * 1024)); + /* TODO Xen: free lost memory */ } phys_first_addr = round_page(phys_first_addr); phys_last_addr = trunc_page(phys_last_addr); +#ifdef MACH_HYP + /* Memory is just contiguous */ + avail_remaining = phys_last_addr; +#else /* MACH_HYP */ avail_remaining = phys_last_addr - (0x100000 - (boot_info.mem_lower * 0x400) - 0x1000); +#endif /* MACH_HYP */ } /* @@ -263,13 +323,20 @@ i386at_init(void) /* * Initialize the PIC prior to any possible call to an spl. */ +#ifndef MACH_HYP picinit(); +#else /* MACH_HYP */ + hyp_intrinit(); +#endif /* MACH_HYP */ /* * Find memory size parameters. */ mem_size_init(); +#ifdef MACH_XEN + kernel_cmdline = (char*) boot_info.cmd_line; +#else /* MACH_XEN */ /* Copy content pointed by boot_info before losing access to it when it * is too far in physical memory. 
*/ if (boot_info.flags & MULTIBOOT_CMDLINE) { @@ -304,6 +371,7 @@ i386at_init(void) m[i].string = addr; } } +#endif /* MACH_XEN */ /* * Initialize kernel physical map, mapping the @@ -325,19 +393,42 @@ i386at_init(void) kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)] = kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)]; #if PAE + /* PAE page tables are 2MB only */ kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1] = kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS) + 1]; + kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2] = + kernel_page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS) + 2]; +#endif /* PAE */ +#ifdef MACH_XEN + { + int i; + for (i = 0; i < PDPNUM; i++) + pmap_set_page_readonly_init((void*) kernel_page_dir + i * INTEL_PGBYTES); +#if PAE + pmap_set_page_readonly_init(kernel_pmap->pdpbase); +#endif /* PAE */ + } +#endif /* MACH_XEN */ +#if PAE set_cr3((unsigned)_kvtophys(kernel_pmap->pdpbase)); +#ifndef MACH_HYP if (!CPU_HAS_FEATURE(CPU_FEATURE_PAE)) panic("CPU doesn't have support for PAE."); set_cr4(get_cr4() | CR4_PAE); +#endif /* MACH_HYP */ #else set_cr3((unsigned)_kvtophys(kernel_page_dir)); #endif /* PAE */ +#ifndef MACH_HYP if (CPU_HAS_FEATURE(CPU_FEATURE_PGE)) set_cr4(get_cr4() | CR4_PGE); + /* already set by Hypervisor */ set_cr0(get_cr0() | CR0_PG | CR0_WP); +#endif /* MACH_HYP */ flush_instr_queue(); +#ifdef MACH_XEN + pmap_clear_bootstrap_pagetable((void *)boot_info.pt_base); +#endif /* MACH_XEN */ /* Interrupt stacks are allocated in physical memory, while kernel stacks are allocated in kernel virtual memory, @@ -349,18 +440,47 @@ i386at_init(void) */ gdt_init(); idt_init(); +#ifndef MACH_HYP int_init(); +#endif /* MACH_HYP */ ldt_init(); ktss_init(); /* Get rid of the temporary direct mapping and flush it out of the TLB. */ +#ifdef MACH_XEN +#ifdef MACH_PSEUDO_PHYS + if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)]), 0)) +#else /* MACH_PSEUDO_PHYS */ + if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS, 0, UVMF_INVLPG | UVMF_ALL)) +#endif /* MACH_PSEUDO_PHYS */ + printf("couldn't unmap frame 0\n"); +#if PAE +#ifdef MACH_PSEUDO_PHYS + if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1]), 0)) +#else /* MACH_PSEUDO_PHYS */ + if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS + INTEL_PGBYTES, 0, UVMF_INVLPG | UVMF_ALL)) +#endif /* MACH_PSEUDO_PHYS */ + printf("couldn't unmap frame 1\n"); +#ifdef MACH_PSEUDO_PHYS + if (!hyp_mmu_update_pte(kv_to_ma(&kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2]), 0)) +#else /* MACH_PSEUDO_PHYS */ + if (hyp_do_update_va_mapping(VM_MIN_KERNEL_ADDRESS + 2*INTEL_PGBYTES, 0, UVMF_INVLPG | UVMF_ALL)) +#endif /* MACH_PSEUDO_PHYS */ + printf("couldn't unmap frame 2\n"); +#endif /* PAE */ + hyp_free_page(0, (void*) VM_MIN_KERNEL_ADDRESS); +#else /* MACH_XEN */ kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS)] = 0; #if PAE kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 1] = 0; + kernel_page_dir[lin2pdenum(VM_MIN_KERNEL_ADDRESS) + 2] = 0; #endif /* PAE */ +#endif /* MACH_XEN */ flush_tlb(); - +#ifdef MACH_XEN + hyp_p2m_init(); +#endif /* MACH_XEN */ /* XXX We'll just use the initialization stack we're already running on as the interrupt stack for now. 
Later this will have to change, @@ -384,6 +504,15 @@ void c_boot_entry(vm_offset_t bi) printf(version); printf("\n"); +#ifdef MACH_XEN + printf("Running on %s.\n", boot_info.magic); + if (boot_info.flags & SIF_PRIVILEGED) + panic("Mach can't run as dom0."); +#ifdef MACH_PSEUDO_PHYS + mfn_list = (void*)boot_info.mfn_list; +#endif +#else /* MACH_XEN */ + #if MACH_KDB /* * Locate the kernel's symbol table, if the boot loader provided it. @@ -405,6 +534,7 @@ void c_boot_entry(vm_offset_t bi) symtab_size, strtab_size); } #endif /* MACH_KDB */ +#endif /* MACH_XEN */ cpu_type = discover_x86_cpu_type (); @@ -525,6 +655,12 @@ boolean_t init_alloc_aligned(vm_size_t size, vm_offset_t *addrp) { vm_offset_t addr; + +#ifdef MACH_HYP + /* There is none */ + if (!avail_next) + avail_next = _kvtophys(boot_info.pt_base) + (boot_info.nr_pt_frames + 3) * 0x1000; +#else /* MACH_HYP */ extern char start[], end[]; int i; static int wrapped = 0; @@ -543,11 +679,14 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp) : 0; retry: +#endif /* MACH_HYP */ /* Page-align the start address. */ avail_next = round_page(avail_next); +#ifndef MACH_HYP /* Start with memory above 16MB, reserving the low memory for later. */ + /* Don't care on Xen */ if (!wrapped && phys_last_addr > 16 * 1024*1024) { if (avail_next < 16 * 1024*1024) @@ -563,9 +702,15 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp) wrapped = 1; } } +#endif /* MACH_HYP */ /* Check if we have reached the end of memory. */ - if (avail_next == (wrapped ? 16 * 1024*1024 : phys_last_addr)) + if (avail_next == + ( +#ifndef MACH_HYP + wrapped ? 16 * 1024*1024 : +#endif /* MACH_HYP */ + phys_last_addr)) return FALSE; /* Tentatively assign the current location to the caller. */ @@ -575,6 +720,7 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp) and see where that puts us. */ avail_next += size; +#ifndef MACH_HYP /* Skip past the I/O and ROM area. */ if ((avail_next > (boot_info.mem_lower * 0x400)) && (addr < 0x100000)) { @@ -620,6 +766,7 @@ init_alloc_aligned(vm_size_t size, vm_offset_t *addrp) /* XXX string */ } } +#endif /* MACH_HYP */ avail_remaining -= size; @@ -649,6 +796,11 @@ boolean_t pmap_valid_page(x) vm_offset_t x; { /* XXX is this OK? What does it matter for? */ - return (((phys_first_addr <= x) && (x < phys_last_addr)) && - !(((boot_info.mem_lower * 1024) <= x) && (x < 1024*1024))); + return (((phys_first_addr <= x) && (x < phys_last_addr)) +#ifndef MACH_HYP + && !( + ((boot_info.mem_lower * 1024) <= x) && + (x < 1024*1024)) +#endif /* MACH_HYP */ + ); } diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c index c633fd9..ee19c4b 100644 --- a/i386/intel/pmap.c +++ b/i386/intel/pmap.c @@ -77,13 +77,18 @@ #include <vm/vm_user.h> #include <mach/machine/vm_param.h> +#include <mach/xen.h> #include <machine/thread.h> #include <i386/cpu_number.h> #include <i386/proc_reg.h> #include <i386/locore.h> #include <i386/model_dep.h> +#ifdef MACH_PSEUDO_PHYS +#define WRITE_PTE(pte_p, pte_entry) *(pte_p) = pte_entry?pa_to_ma(pte_entry):0; +#else /* MACH_PSEUDO_PHYS */ #define WRITE_PTE(pte_p, pte_entry) *(pte_p) = (pte_entry); +#endif /* MACH_PSEUDO_PHYS */ /* * Private data structures. @@ -325,6 +330,19 @@ lock_data_t pmap_system_lock; #define MAX_TBIS_SIZE 32 /* > this -> TBIA */ /* XXX */ +#ifdef MACH_HYP +#if 1 +#define INVALIDATE_TLB(pmap, s, e) hyp_mmuext_op_void(MMUEXT_TLB_FLUSH_LOCAL) +#else +#define INVALIDATE_TLB(pmap, s, e) do { \ + if (__builtin_constant_p((e) - (s)) \ + && (e) - (s) == PAGE_SIZE) \ + hyp_invlpg((pmap) == kernel_pmap ? 
kvtolin(s) : (s)); \ + else \ + hyp_mmuext_op_void(MMUEXT_TLB_FLUSH_LOCAL); \ +} while(0) +#endif +#else /* MACH_HYP */ #if 0 /* It is hard to know when a TLB flush becomes less expensive than a bunch of * invlpgs. But it surely is more expensive than just one invlpg. */ @@ -338,6 +356,7 @@ lock_data_t pmap_system_lock; #else #define INVALIDATE_TLB(pmap, s, e) flush_tlb() #endif +#endif /* MACH_HYP */ #if NCPUS > 1 @@ -507,6 +526,10 @@ vm_offset_t pmap_map_bd(virt, start, end, prot) register pt_entry_t template; register pt_entry_t *pte; int spl; +#ifdef MACH_XEN + int n, i = 0; + struct mmu_update update[HYP_BATCH_MMU_UPDATES]; +#endif /* MACH_XEN */ template = pa_to_pte(start) | INTEL_PTE_NCACHE|INTEL_PTE_WTHRU @@ -521,11 +544,30 @@ vm_offset_t pmap_map_bd(virt, start, end, prot) pte = pmap_pte(kernel_pmap, virt); if (pte == PT_ENTRY_NULL) panic("pmap_map_bd: Invalid kernel address\n"); +#ifdef MACH_XEN + update[i].ptr = kv_to_ma(pte); + update[i].val = pa_to_ma(template); + i++; + if (i == HYP_BATCH_MMU_UPDATES) { + hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF); + if (n != i) + panic("couldn't pmap_map_bd\n"); + i = 0; + } +#else /* MACH_XEN */ WRITE_PTE(pte, template) +#endif /* MACH_XEN */ pte_increment_pa(template); virt += PAGE_SIZE; start += PAGE_SIZE; } +#ifdef MACH_XEN + if (i > HYP_BATCH_MMU_UPDATES) + panic("overflowed array in pmap_map_bd"); + hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF); + if (n != i) + panic("couldn't pmap_map_bd\n"); +#endif /* MACH_XEN */ PMAP_READ_UNLOCK(pmap, spl); return(virt); } @@ -583,6 +625,8 @@ void pmap_bootstrap() /* * Allocate and clear a kernel page directory. */ + /* Note: initial Xen mapping holds at least 512kB free mapped page. + * We use that for directly building our linear mapping. */ #if PAE { vm_offset_t addr; @@ -604,6 +648,53 @@ void pmap_bootstrap() kernel_pmap->dirbase[i] = 0; } +#ifdef MACH_XEN + /* + * Xen may only provide as few as 512KB extra bootstrap linear memory, + * which is far from enough to map all available memory, so we need to + * map more bootstrap linear memory. We here map 1 (resp. 4 for PAE) + * other L1 table(s), thus 4MiB extra memory (resp. 8MiB), which is + * enough for a pagetable mapping 4GiB. 
@@ -604,6 +648,53 @@ void pmap_bootstrap()
 			kernel_pmap->dirbase[i] = 0;
 	}
 
+#ifdef	MACH_XEN
+	/*
+	 * Xen may provide as little as 512 KB of extra bootstrap linear
+	 * memory, which is far from enough to map all available memory.
+	 * We therefore map one extra L1 table (four with PAE), i.e. 4 MiB
+	 * (resp. 8 MiB) of extra linear memory: enough to hold page tables
+	 * mapping a full 4 GiB.
+	 */
+#ifdef PAE
+#define NSUP_L1 4
+#else
+#define NSUP_L1 1
+#endif
+	pt_entry_t *l1_map[NSUP_L1];
+	{
+		pt_entry_t *base = (pt_entry_t*) boot_info.pt_base;
+		int i;
+		int n_l1map;
+#ifdef	PAE
+		pt_entry_t *l2_map = (pt_entry_t*) phystokv(pte_to_pa(base[0]));
+#else	/* PAE */
+		pt_entry_t *l2_map = base;
+#endif	/* PAE */
+		for (n_l1map = 0, i = lin2pdenum(VM_MIN_KERNEL_ADDRESS); i < NPTES; i++) {
+			if (!(l2_map[i] & INTEL_PTE_VALID)) {
+				struct mmu_update update;
+				int j, n;
+
+				l1_map[n_l1map] = (pt_entry_t*) phystokv(pmap_grab_page());
+				for (j = 0; j < NPTES; j++)
+					l1_map[n_l1map][j] = intel_ptob(pfn_to_mfn((i - lin2pdenum(VM_MIN_KERNEL_ADDRESS)) * NPTES + j)) | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+				pmap_set_page_readonly_init(l1_map[n_l1map]);
+				if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn (l1_map[n_l1map])))
+					panic("couldn't pin page %p(%p)", l1_map[n_l1map], kv_to_ma (l1_map[n_l1map]));
+				update.ptr = kv_to_ma(&l2_map[i]);
+				update.val = kv_to_ma(l1_map[n_l1map]) | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+				hyp_mmu_update(kv_to_la(&update), 1, kv_to_la(&n), DOMID_SELF);
+				if (n != 1)
+					panic("couldn't complete bootstrap map");
+				/* added the last L1 table, can stop */
+				if (++n_l1map >= NSUP_L1)
+					break;
+			}
+		}
+	}
+#endif	/* MACH_XEN */
+
 	/*
 	 * Allocate and set up the kernel page tables.
 	 */
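
The sizing claim in the comment above (one extra non-PAE L1 table is enough for page tables mapping 4 GiB) can be checked mechanically. A self-contained sketch of the arithmetic for the non-PAE case, with the constants restated locally rather than taken from the Mach headers:

#include <assert.h>

#define XEN_EX_PAGE_SIZE 4096UL
#define XEN_EX_NPTES     (XEN_EX_PAGE_SIZE / 4)	/* 1024 4-byte PTEs per table */

int main(void)
{
	/* One extra L1 table maps NPTES pages of linear space: 4 MiB. */
	unsigned long extra_linear = XEN_EX_NPTES * XEN_EX_PAGE_SIZE;
	/* 4 MiB of linear space holds 1024 page-table pages... */
	unsigned long tables = extra_linear / XEN_EX_PAGE_SIZE;
	/* ...which together map 1024 * 4 MiB = 4 GiB. */
	unsigned long long mapped =
		(unsigned long long) tables * XEN_EX_NPTES * XEN_EX_PAGE_SIZE;

	assert(mapped == 1ULL << 32);
	return 0;
}
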
@@ -640,19 +731,42 @@ void pmap_bootstrap()
 				WRITE_PTE(pte, 0);
 			}
 			else
+#ifdef	MACH_XEN
+			if (va == (vm_offset_t) &hyp_shared_info)
+			{
+				*pte = boot_info.shared_info | INTEL_PTE_VALID | INTEL_PTE_WRITE;
+				va += INTEL_PGBYTES;
+			}
+			else
+#endif	/* MACH_XEN */
 			{
 				extern char _start[], etext[];
 
-				if ((va >= (vm_offset_t)_start)
+				if (((va >= (vm_offset_t) _start)
 				    && (va + INTEL_PGBYTES <= (vm_offset_t)etext))
+#ifdef	MACH_XEN
+				    || (va >= (vm_offset_t) boot_info.pt_base
+				    && (va + INTEL_PGBYTES <=
+					(vm_offset_t) ptable + INTEL_PGBYTES))
+#endif	/* MACH_XEN */
+					)
 				{
 					WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
 						| INTEL_PTE_VALID | global);
 				}
 				else
 				{
-					WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
-						| INTEL_PTE_VALID | INTEL_PTE_WRITE | global);
+#ifdef	MACH_XEN
+					int i;
+					for (i = 0; i < NSUP_L1; i++)
+						if (va == (vm_offset_t) l1_map[i])
+							WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
+								| INTEL_PTE_VALID | global);
+					if (i == NSUP_L1)
+#endif	/* MACH_XEN */
+						WRITE_PTE(pte, pa_to_pte(_kvtophys(va))
+							| INTEL_PTE_VALID | INTEL_PTE_WRITE | global)
+
 				}
 				va += INTEL_PGBYTES;
 			}
@@ -662,6 +776,11 @@ void pmap_bootstrap()
 				WRITE_PTE(pte, 0);
 				va += INTEL_PGBYTES;
 			}
+#ifdef	MACH_XEN
+			pmap_set_page_readonly_init(ptable);
+			if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn (ptable)))
+				panic("couldn't pin page %p(%p)\n", ptable, kv_to_ma (ptable));
+#endif	/* MACH_XEN */
 		}
 	}
 
@@ -669,6 +788,100 @@ void pmap_bootstrap()
 	   soon after we return from here.  */
 }
 
+#ifdef	MACH_XEN
+/* These are only required because of Xen security policies */
+
+/* Set a page back to read-write */
+void pmap_set_page_readwrite(void *_vaddr) {
+	vm_offset_t vaddr = (vm_offset_t) _vaddr;
+	vm_offset_t paddr = kvtophys(vaddr);
+	vm_offset_t canon_vaddr = phystokv(paddr);
+	if (hyp_do_update_va_mapping (kvtolin(vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID | INTEL_PTE_WRITE, UVMF_NONE))
+		panic("couldn't set hiMMU readwrite for addr %p(%p)\n", vaddr, pa_to_ma (paddr));
+	if (canon_vaddr != vaddr)
+		if (hyp_do_update_va_mapping (kvtolin(canon_vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID | INTEL_PTE_WRITE, UVMF_NONE))
+			panic("couldn't set hiMMU readwrite for paddr %p(%p)\n", canon_vaddr, pa_to_ma (paddr));
+}
+
+/* Set a page read-only (so that it can be pinned, for instance) */
+void pmap_set_page_readonly(void *_vaddr) {
+	vm_offset_t vaddr = (vm_offset_t) _vaddr;
+	vm_offset_t paddr = kvtophys(vaddr);
+	vm_offset_t canon_vaddr = phystokv(paddr);
+	if (*pmap_pde(kernel_pmap, vaddr) & INTEL_PTE_VALID) {
+		if (hyp_do_update_va_mapping (kvtolin(vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID, UVMF_NONE))
+			panic("couldn't set hiMMU readonly for vaddr %p(%p)\n", vaddr, pa_to_ma (paddr));
+	}
+	if (canon_vaddr != vaddr &&
+		*pmap_pde(kernel_pmap, canon_vaddr) & INTEL_PTE_VALID) {
+		if (hyp_do_update_va_mapping (kvtolin(canon_vaddr), pa_to_pte (pa_to_ma(paddr)) | INTEL_PTE_VALID, UVMF_NONE))
+			panic("couldn't set hiMMU readonly for vaddr %p canon_vaddr %p paddr %p (%p)\n", vaddr, canon_vaddr, paddr, pa_to_ma (paddr));
+	}
+}
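
pmap_set_page_readwrite and pmap_set_page_readonly exist because Xen refuses to pin a frame as a page table while any writable mapping of it remains, and refuses direct writes to it while it is pinned. Every page-table page in this port therefore goes through the same lifecycle. A hedged sketch of that lifecycle; the xen_pagetable_page_* helpers are illustrative only, the diff open-codes these steps in pmap_create, pmap_enter, and pmap_destroy:

/* Sketch of the pin/unpin lifecycle; error handling reduced to panics,
 * as in the surrounding diff. */
static pt_entry_t *xen_pagetable_page_create(void)
{
	pt_entry_t *pt = (pt_entry_t *) phystokv(pmap_page_table_page_alloc());

	pmap_set_page_readonly(pt);		/* drop all writable mappings */
	if (!hyp_mmuext_op_mfn(MMUEXT_PIN_L1_TABLE, kv_to_mfn(pt)))
		panic("couldn't pin page table %p", pt);
	return pt;		/* now only writable through mmu_update */
}

static void xen_pagetable_page_destroy(pt_entry_t *pt)
{
	if (!hyp_mmuext_op_mfn(MMUEXT_UNPIN_TABLE, kv_to_mfn(pt)))
		panic("couldn't unpin page table %p", pt);
	pmap_set_page_readwrite(pt);	/* safe to write directly again */
}
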
+
+/* This needs to be called instead of pmap_set_page_readonly as long as CR3
+ * still points to the bootstrap dirbase. */
+void pmap_set_page_readonly_init(void *_vaddr) {
+	vm_offset_t vaddr = (vm_offset_t) _vaddr;
+#if PAE
+	pt_entry_t *pdpbase = (void*) boot_info.pt_base;
+	vm_offset_t dirbase = ptetokv(pdpbase[0]);
+#else
+	vm_offset_t dirbase = boot_info.pt_base;
+#endif
+	struct pmap linear_pmap = {
+		.dirbase = (void*) dirbase,
+	};
+	/* Modify our future kernel map (can't use update_va_mapping for this)... */
+	if (*pmap_pde(kernel_pmap, vaddr) & INTEL_PTE_VALID)
+		if (!hyp_mmu_update_la (kvtolin(vaddr), pa_to_pte (kv_to_ma(vaddr)) | INTEL_PTE_VALID))
+			panic("couldn't set hiMMU readonly for vaddr %p(%p)\n", vaddr, kv_to_ma (vaddr));
+	/* ... and the bootstrap map.  */
+	if (*pmap_pde(&linear_pmap, vaddr) & INTEL_PTE_VALID)
+		if (hyp_do_update_va_mapping (vaddr, pa_to_pte (kv_to_ma(vaddr)) | INTEL_PTE_VALID, UVMF_NONE))
+			panic("couldn't set MMU readonly for vaddr %p(%p)\n", vaddr, kv_to_ma (vaddr));
+}
+
+void pmap_clear_bootstrap_pagetable(pt_entry_t *base) {
+	int i;
+	pt_entry_t *dir;
+	vm_offset_t va = 0;
+#if PAE
+	int j;
+#endif	/* PAE */
+	if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(base)))
+		panic("pmap_clear_bootstrap_pagetable: couldn't unpin page %p(%p)\n", base, kv_to_ma(base));
+#if PAE
+	for (j = 0; j < PDPNUM; j++)
+	{
+		pt_entry_t pdpe = base[j];
+		if (pdpe & INTEL_PTE_VALID) {
+			dir = (pt_entry_t *) phystokv(pte_to_pa(pdpe));
+#else	/* PAE */
+			dir = base;
+#endif	/* PAE */
+			for (i = 0; i < NPTES; i++) {
+				pt_entry_t pde = dir[i];
+				unsigned long pfn = mfn_to_pfn(atop(pde));
+				void *pgt = (void*) phystokv(ptoa(pfn));
+				if (pde & INTEL_PTE_VALID)
+					hyp_free_page(pfn, pgt);
+				va += NPTES * INTEL_PGBYTES;
+				if (va >= HYP_VIRT_START)
+					break;
+			}
+#if PAE
+			hyp_free_page(atop(_kvtophys(dir)), dir);
+		} else
+			va += NPTES * NPTES * INTEL_PGBYTES;
+		if (va >= HYP_VIRT_START)
+			break;
+	}
+#endif	/* PAE */
+	hyp_free_page(atop(_kvtophys(base)), base);
+}
+#endif	/* MACH_XEN */
+
 void pmap_virtual_space(startp, endp)
 	vm_offset_t *startp;
 	vm_offset_t *endp;
@@ -823,6 +1036,29 @@ pmap_page_table_page_alloc()
 	return pa;
 }
 
+#ifdef	MACH_XEN
+void pmap_map_mfn(void *_addr, unsigned long mfn) {
+	vm_offset_t addr = (vm_offset_t) _addr;
+	pt_entry_t *pte, *pdp;
+	vm_offset_t ptp;
+	if ((pte = pmap_pte(kernel_pmap, addr)) == PT_ENTRY_NULL) {
+		ptp = phystokv(pmap_page_table_page_alloc());
+		pmap_set_page_readonly((void*) ptp);
+		if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, pa_to_mfn(ptp)))
+			panic("couldn't pin page %p(%p)\n",ptp,kv_to_ma(ptp));
+		pdp = pmap_pde(kernel_pmap, addr);
+		if (!hyp_mmu_update_pte(kv_to_ma(pdp),
+			pa_to_pte(kv_to_ma(ptp)) | INTEL_PTE_VALID
+					      | INTEL_PTE_USER
+					      | INTEL_PTE_WRITE))
+			panic("%s:%d could not set pde %p(%p) to %p(%p)\n",__FILE__,__LINE__,kvtophys((vm_offset_t)pdp),kv_to_ma(pdp), ptp, pa_to_ma(ptp));
+		pte = pmap_pte(kernel_pmap, addr);
+	}
+	if (!hyp_mmu_update_pte(kv_to_ma(pte), ptoa(mfn) | INTEL_PTE_VALID | INTEL_PTE_WRITE))
+		panic("%s:%d could not set pte %p(%p) to %p(%p)\n",__FILE__,__LINE__,pte,kv_to_ma(pte), ptoa(mfn), pa_to_ma(ptoa(mfn)));
+}
+#endif	/* MACH_XEN */
+
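pmap_map_mfn is the hook for mapping a frame that has a machine frame number but no pseudo-physical address of its own; the shared info page wired up in pmap_bootstrap above is the canonical case. A usage sketch, where example_ring_mfn is a made-up MFN standing in for one obtained from the hypervisor (in reality, e.g., the console or store fields of start_info):

/* Hypothetical caller: make a hypervisor-provided frame appear as ordinary
 * kernel memory at a reserved kernel virtual address. */
extern unsigned long example_ring_mfn;	/* assumption: an MFN from start_info */

static volatile char *example_ring;

static void map_example_ring(vm_offset_t va)
{
	pmap_map_mfn((void *) va, example_ring_mfn);
	example_ring = (volatile char *) va;	/* frame now mapped read/write */
}
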
 /*
  *	Deallocate a page-table page.
  *	The page-table page must have all mappings removed,
@@ -884,6 +1120,13 @@ pmap_t pmap_create(size)
 		panic("pmap_create");
 
 	memcpy(p->dirbase, kernel_page_dir, PDPNUM * INTEL_PGBYTES);
+#ifdef	MACH_XEN
+	{
+		int i;
+		for (i = 0; i < PDPNUM; i++)
+			pmap_set_page_readonly((void*) p->dirbase + i * INTEL_PGBYTES);
+	}
+#endif	/* MACH_XEN */
 
 #if PAE
 	if (kmem_alloc_wired(kernel_map,
@@ -895,6 +1138,9 @@ pmap_t pmap_create(size)
 		for (i = 0; i < PDPNUM; i++)
 			WRITE_PTE(&p->pdpbase[i], pa_to_pte(kvtophys((vm_offset_t) p->dirbase + i * INTEL_PGBYTES)) | INTEL_PTE_VALID);
 	}
+#ifdef	MACH_XEN
+	pmap_set_page_readonly(p->pdpbase);
+#endif	/* MACH_XEN */
 #endif	/* PAE */
 
 	p->ref_count = 1;
@@ -954,14 +1200,29 @@ void pmap_destroy(p)
 			if (m == VM_PAGE_NULL)
 				panic("pmap_destroy: pte page not in object");
 			vm_page_lock_queues();
+#ifdef	MACH_XEN
+			if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa)))
+				panic("pmap_destroy: couldn't unpin page %p(%p)\n", pa, kv_to_ma(pa));
+			pmap_set_page_readwrite((void*) phystokv(pa));
+#endif	/* MACH_XEN */
 			vm_page_free(m);
 			inuse_ptepages_count--;
 			vm_page_unlock_queues();
 			vm_object_unlock(pmap_object);
 		}
 	}
+#ifdef	MACH_XEN
+	{
+		int i;
+		for (i = 0; i < PDPNUM; i++)
+			pmap_set_page_readwrite((void*) p->dirbase + i * INTEL_PGBYTES);
+	}
+#endif	/* MACH_XEN */
 	kmem_free(kernel_map, (vm_offset_t)p->dirbase, PDPNUM * INTEL_PGBYTES);
 #if PAE
+#ifdef	MACH_XEN
+	pmap_set_page_readwrite(p->pdpbase);
+#endif	/* MACH_XEN */
 	kmem_free(kernel_map, (vm_offset_t)p->pdpbase, INTEL_PGBYTES);
 #endif	/* PAE */
 	zfree(pmap_zone, (vm_offset_t) p);
@@ -1007,6 +1268,10 @@ void pmap_remove_range(pmap, va, spte, epte)
 	int			num_removed, num_unwired;
 	int			pai;
 	vm_offset_t		pa;
+#ifdef	MACH_XEN
+	int n, ii = 0;
+	struct mmu_update update[HYP_BATCH_MMU_UPDATES];
+#endif	/* MACH_XEN */
 
 #if	DEBUG_PTE_PAGE
 	if (pmap != kernel_pmap)
@@ -1035,7 +1300,19 @@ void pmap_remove_range(pmap, va, spte, epte)
 		register int	i = ptes_per_vm_page;
 		register pt_entry_t	*lpte = cpte;
 		do {
+#ifdef	MACH_XEN
+		    update[ii].ptr = kv_to_ma(lpte);
+		    update[ii].val = 0;
+		    ii++;
+		    if (ii == HYP_BATCH_MMU_UPDATES) {
+			hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+			if (n != ii)
+			    panic("couldn't pmap_remove_range\n");
+			ii = 0;
+		    }
+#else	/* MACH_XEN */
 		    *lpte = 0;
+#endif	/* MACH_XEN */
 		    lpte++;
 		} while (--i > 0);
 		continue;
@@ -1056,7 +1333,19 @@ void pmap_remove_range(pmap, va, spte, epte)
 	    do {
 		pmap_phys_attributes[pai] |=
 		    *lpte & (PHYS_MODIFIED|PHYS_REFERENCED);
+#ifdef	MACH_XEN
+		update[ii].ptr = kv_to_ma(lpte);
+		update[ii].val = 0;
+		ii++;
+		if (ii == HYP_BATCH_MMU_UPDATES) {
+			hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+			if (n != ii)
+				panic("couldn't pmap_remove_range\n");
+			ii = 0;
+		}
+#else	/* MACH_XEN */
 		*lpte = 0;
+#endif	/* MACH_XEN */
 		lpte++;
 	    } while (--i > 0);
 	}
@@ -1102,6 +1391,14 @@ void pmap_remove_range(pmap, va, spte, epte)
 	    }
 	}
 
+#ifdef	MACH_XEN
+	if (ii > HYP_BATCH_MMU_UPDATES)
+		panic("overflowed array in pmap_remove_range");
+	hyp_mmu_update(kvtolin(&update), ii, kvtolin(&n), DOMID_SELF);
+	if (n != ii)
+		panic("couldn't pmap_remove_range\n");
+#endif	/* MACH_XEN */
+
 	/*
 	 *	Update the counts
 	 */
@@ -1246,7 +1543,12 @@ void pmap_page_protect(phys, prot)
 		    do {
 			pmap_phys_attributes[pai] |=
 			    *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
+#ifdef	MACH_XEN
+			if (!hyp_mmu_update_pte(kv_to_ma(pte++), 0))
+			    panic("%s:%d could not clear pte %p\n",__FILE__,__LINE__,pte-1);
+#else	/* MACH_XEN */
 			*pte++ = 0;
+#endif	/* MACH_XEN */
 		    } while (--i > 0);
 		}
 
@@ -1276,7 +1578,12 @@ void pmap_page_protect(phys, prot)
 		    register int i = ptes_per_vm_page;
 
 		    do {
+#ifdef	MACH_XEN
+			if (!hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~INTEL_PTE_WRITE))
+			    panic("%s:%d could not enable write on pte %p\n",__FILE__,__LINE__,pte);
+#else	/* MACH_XEN */
 			*pte &= ~INTEL_PTE_WRITE;
+#endif	/* MACH_XEN */
 			pte++;
 		    } while (--i > 0);
 
@@ -1365,11 +1672,36 @@ void pmap_protect(map, s, e, prot)
 		spte = &spte[ptenum(s)];
 		epte = &spte[intel_btop(l-s)];
 
+#ifdef	MACH_XEN
+		int n, i = 0;
+		struct mmu_update update[HYP_BATCH_MMU_UPDATES];
+#endif	/* MACH_XEN */
+
 		while (spte < epte) {
-		    if (*spte & INTEL_PTE_VALID)
+		    if (*spte & INTEL_PTE_VALID) {
+#ifdef	MACH_XEN
+			update[i].ptr = kv_to_ma(spte);
+			update[i].val = *spte & ~INTEL_PTE_WRITE;
+			i++;
+			if (i == HYP_BATCH_MMU_UPDATES) {
+			    hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+			    if (n != i)
+				panic("couldn't pmap_protect\n");
+			    i = 0;
+			}
+#else	/* MACH_XEN */
 			*spte &= ~INTEL_PTE_WRITE;
+#endif	/* MACH_XEN */
+		    }
 		    spte++;
 		}
+#ifdef	MACH_XEN
+		if (i > HYP_BATCH_MMU_UPDATES)
+		    panic("overflowed array in pmap_protect");
+		hyp_mmu_update(kvtolin(&update), i, kvtolin(&n), DOMID_SELF);
+		if (n != i)
+		    panic("couldn't pmap_protect\n");
+#endif	/* MACH_XEN */
 	    }
 	    s = l;
 	    pde++;
@@ -1412,6 +1744,8 @@ if (pmap_debug) printf("pmap(%x, %x)\n", v, pa);
 	if (pmap == PMAP_NULL)
 		return;
 
+	if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end))
+		panic("pmap_enter(%p, %p) falls in physical memory area!\n", v, pa);
 	if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0
 	    && !wired /* hack for io_wire */ ) {
 	/*
@@ -1502,9 +1836,20 @@ Retry:
 	    /*XX pdp = &pmap->dirbase[pdenum(v) & ~(i-1)];*/
 	    pdp = pmap_pde(pmap, v);
 	    do {
+#ifdef	MACH_XEN
+		pmap_set_page_readonly((void *) ptp);
+		if (!hyp_mmuext_op_mfn (MMUEXT_PIN_L1_TABLE, kv_to_mfn(ptp)))
+			panic("couldn't pin page %p(%p)\n",ptp,kv_to_ma(ptp));
+		if (!hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdp)),
+			pa_to_pte(pa_to_ma(kvtophys(ptp))) | INTEL_PTE_VALID
+					      | INTEL_PTE_USER
+					      | INTEL_PTE_WRITE))
+			panic("%s:%d could not set pde %p(%p,%p) to %p(%p,%p) %p\n",__FILE__,__LINE__, pdp, kvtophys((vm_offset_t)pdp), pa_to_ma(kvtophys((vm_offset_t)pdp)), ptp, kvtophys(ptp), pa_to_ma(kvtophys(ptp)), pa_to_pte(kv_to_ma(ptp)));
+#else	/* MACH_XEN */
 		*pdp = pa_to_pte(ptp) | INTEL_PTE_VALID
 				      | INTEL_PTE_USER
 				      | INTEL_PTE_WRITE;
+#endif	/* MACH_XEN */
 		pdp++;
 		ptp += INTEL_PGBYTES;
 	    } while (--i > 0);
@@ -1544,7 +1889,12 @@ Retry:
 	    do {
 		if (*pte & INTEL_PTE_MOD)
 		    template |= INTEL_PTE_MOD;
+#ifdef	MACH_XEN
+		if (!hyp_mmu_update_pte(kv_to_ma(pte), pa_to_ma(template)))
+			panic("%s:%d could not set pte %p to %p\n",__FILE__,__LINE__,pte,template);
+#else	/* MACH_XEN */
 		WRITE_PTE(pte, template)
+#endif	/* MACH_XEN */
 		pte++;
 		pte_increment_pa(template);
 	    } while (--i > 0);
@@ -1649,7 +1999,12 @@ Retry:
 		template |= INTEL_PTE_WIRED;
 	    i = ptes_per_vm_page;
 	    do {
+#ifdef	MACH_XEN
+		if (!(hyp_mmu_update_pte(kv_to_ma(pte), pa_to_ma(template))))
+			panic("%s:%d could not set pte %p to %p\n",__FILE__,__LINE__,pte,template);
+#else	/* MACH_XEN */
 		WRITE_PTE(pte, template)
+#endif	/* MACH_XEN */
 		pte++;
 		pte_increment_pa(template);
 	    } while (--i > 0);
@@ -1704,7 +2059,12 @@ void pmap_change_wiring(map, v, wired)
 	    map->stats.wired_count--;
 	    i = ptes_per_vm_page;
 	    do {
+#ifdef	MACH_XEN
+		if (!(hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~INTEL_PTE_WIRED)))
+			panic("%s:%d could not wire down pte %p\n",__FILE__,__LINE__,pte);
+#else	/* MACH_XEN */
 		*pte &= ~INTEL_PTE_WIRED;
+#endif	/* MACH_XEN */
 		pte++;
 	    } while (--i > 0);
 	}
@@ -1835,7 +2195,17 @@ void pmap_collect(p)
 		    register int i = ptes_per_vm_page;
 		    register pt_entry_t *pdep = pdp;
 		    do {
+#ifdef	MACH_XEN
+			unsigned long pte = *pdep;
+			void *ptable = (void*) ptetokv(pte);
+			if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0)))
+			    panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1);
+			if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable)))
+			    panic("couldn't unpin page %p(%p)\n", ptable, pa_to_ma(kvtophys((vm_offset_t)ptable)));
+			pmap_set_page_readwrite(ptable);
+#else	/* MACH_XEN */
 			*pdep++ = 0;
+#endif	/* MACH_XEN */
 		    } while (--i > 0);
 		}
 
@@ -2052,7 +2422,12 @@ phys_attribute_clear(phys, bits)
 	    {
 		register int	i = ptes_per_vm_page;
 		do {
+#ifdef	MACH_XEN
+		    if (!(hyp_mmu_update_pte(kv_to_ma(pte), *pte & ~bits)))
+			panic("%s:%d could not clear bits %lx from pte %p\n",__FILE__,__LINE__,bits,pte);
+#else	/* MACH_XEN */
 		    *pte &= ~bits;
+#endif	/* MACH_XEN */
 		} while (--i > 0);
 	    }
 	    PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
@@ -2413,7 +2788,12 @@ pmap_unmap_page_zero ()
 	if (!pte)
 		return;
 	assert (pte);
+#ifdef	MACH_XEN
+	if (!hyp_mmu_update_pte(kv_to_ma(pte), 0))
+		printf("couldn't unmap page 0\n");
+#else	/* MACH_XEN */
 	*pte = 0;
 	INVALIDATE_TLB(kernel_pmap, 0, PAGE_SIZE);
+#endif	/* MACH_XEN */
 }
 #endif	/* i386 */
diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h
index 7354a0f..a2b6442 100644
--- a/i386/intel/pmap.h
+++ b/i386/intel/pmap.h
@@ -126,12 +126,21 @@ typedef unsigned int	pt_entry_t;
 #define INTEL_PTE_NCACHE	0x00000010
 #define INTEL_PTE_REF		0x00000020
 #define INTEL_PTE_MOD		0x00000040
+#ifdef	MACH_XEN
+/* Not supported */
+#define INTEL_PTE_GLOBAL	0x00000000
+#else	/* MACH_XEN */
 #define INTEL_PTE_GLOBAL	0x00000100
+#endif	/* MACH_XEN */
 #define INTEL_PTE_WIRED		0x00000200
 #define INTEL_PTE_PFN		0xfffff000
 
 #define	pa_to_pte(a)		((a) & INTEL_PTE_PFN)
+#ifdef	MACH_PSEUDO_PHYS
+#define	pte_to_pa(p)		ma_to_pa((p) & INTEL_PTE_PFN)
+#else	/* MACH_PSEUDO_PHYS */
 #define	pte_to_pa(p)		((p) & INTEL_PTE_PFN)
+#endif	/* MACH_PSEUDO_PHYS */
 #define	pte_increment_pa(p)	((p) += INTEL_OFFMASK+1)
 
 /*
@@ -159,6 +168,14 @@ typedef struct pmap	*pmap_t;
 
 #define PMAP_NULL		((pmap_t) 0)
 
+#ifdef	MACH_XEN
+extern void pmap_set_page_readwrite(void *addr);
+extern void pmap_set_page_readonly(void *addr);
+extern void pmap_set_page_readonly_init(void *addr);
+extern void pmap_map_mfn(void *addr, unsigned long mfn);
+extern void pmap_clear_bootstrap_pagetable(pt_entry_t *addr);
+#endif	/* MACH_XEN */
+
 #if PAE
 #define	set_pmap(pmap)	set_cr3(kvtophys((vm_offset_t)(pmap)->pdpbase))
 #else	/* PAE */
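
The MACH_PSEUDO_PHYS halves of WRITE_PTE and pte_to_pa convert between pseudo-physical addresses (what Mach's VM layer sees) and machine addresses (what real PTEs must hold). The diff relies on pa_to_ma/ma_to_pa and pfn_to_mfn/mfn_to_pfn from the Xen headers it adds elsewhere; plausible shapes for them, stated as an assumption rather than the actual definitions, would be:

/* Assumed definitions, for illustration only.  mfn_list is the P2M table
 * picked up from start_info in c_boot_entry; machine_to_phys_mapping is
 * the global M2P table every Xen PV guest has mapped read-only. */
extern unsigned long *mfn_list;				/* pfn -> mfn */
extern unsigned long *machine_to_phys_mapping;		/* mfn -> pfn */

#define pfn_to_mfn(pfn)	(mfn_list[pfn])
#define mfn_to_pfn(mfn)	(machine_to_phys_mapping[mfn])

/* Translate a full address, preserving the offset within the page. */
#define pa_to_ma(pa)	(ptoa(pfn_to_mfn(atop(pa))) | ((pa) & (PAGE_SIZE - 1)))
#define ma_to_pa(ma)	(ptoa(mfn_to_pfn(atop(ma))) | ((ma) & (PAGE_SIZE - 1)))
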
diff --git a/i386/xen/Makefrag.am b/i386/xen/Makefrag.am
new file mode 100644
index 0000000..b15b7db
--- /dev/null
+++ b/i386/xen/Makefrag.am
@@ -0,0 +1,33 @@
+# Makefile fragment for the ix86 specific part of the Xen platform.
+
+# Copyright (C) 2007 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+#
+# Xen support.
+#
+
+libkernel_a_SOURCES += \
+	i386/xen/xen.c \
+	i386/xen/xen_locore.S \
+	i386/xen/xen_boothdr.S
+
+
+if PLATFORM_xen
+gnumach_LINKFLAGS += \
+	--defsym _START=0x20000000 \
+	-T '$(srcdir)'/i386/ldscript
+endif
diff --git a/i386/xen/xen.c b/i386/xen/xen.c
new file mode 100644
index 0000000..aa3c2cc
--- /dev/null
+++ b/i386/xen/xen.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <kern/printf.h>
+#include <kern/debug.h>
+
+#include <mach/machine/eflags.h>
+#include <machine/thread.h>
+#include <machine/ipl.h>
+
+#include <machine/model_dep.h>
+
+unsigned long cr3;
+
+struct failsafe_callback_regs {
+	unsigned int ds;
+	unsigned int es;
+	unsigned int fs;
+	unsigned int gs;
+	unsigned int ip;
+	unsigned int cs_and_mask;
+	unsigned int flags;
+};
+
+void hyp_failsafe_c_callback(struct failsafe_callback_regs *regs) {
+	printf("Fail-Safe callback!\n");
+	printf("IP: %08X CS: %4X DS: %4X ES: %4X FS: %4X GS: %4X FLAGS %08X MASK %04X\n", regs->ip, regs->cs_and_mask & 0xffff, regs->ds, regs->es, regs->fs, regs->gs, regs->flags, regs->cs_and_mask >> 16);
+	panic("failsafe");
+}
+
+extern void clock_interrupt();
+extern void return_to_iret;
+
+void hypclock_machine_intr(int old_ipl, void *ret_addr, struct i386_interrupt_state *regs, unsigned64_t delta) {
+	if (ret_addr == &return_to_iret) {
+		clock_interrupt(delta/1000,		/* usec per tick */
+			(regs->efl & EFL_VM) ||		/* user mode */
+			((regs->cs & 0x02) != 0),	/* user mode */
+			old_ipl == SPL0);		/* base priority */
+	} else
+		clock_interrupt(delta/1000, FALSE, FALSE);
+}
+
+void hyp_p2m_init(void) {
+	unsigned long nb_pfns = atop(phys_last_addr);
+#ifdef MACH_PSEUDO_PHYS
+#define P2M_PAGE_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+	unsigned long *l3 = (unsigned long *)phystokv(pmap_grab_page()), *l2 = NULL;
+	unsigned long i;
+
+	for (i = 0; i < (nb_pfns + P2M_PAGE_ENTRIES) / P2M_PAGE_ENTRIES; i++) {
+		if (!(i % P2M_PAGE_ENTRIES)) {
+			l2 = (unsigned long *) phystokv(pmap_grab_page());
+			l3[i / P2M_PAGE_ENTRIES] = kv_to_mfn(l2);
+		}
+		l2[i % P2M_PAGE_ENTRIES] = kv_to_mfn(&mfn_list[i * P2M_PAGE_ENTRIES]);
+	}
+
+	hyp_shared_info.arch.pfn_to_mfn_frame_list_list = kv_to_mfn(l3);
+#endif
+	hyp_shared_info.arch.max_pfn = nb_pfns;
+}
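
hyp_p2m_init publishes a three-level structure: an L3 frame holding the MFNs of L2 frames, whose entries are in turn the MFNs of the mfn_list pages themselves. Consumers such as the save/restore tools resolve a pfn by walking it top-down. A sketch of that walk under the same P2M_PAGE_ENTRIES geometry, where map_frame is a made-up helper that maps an MFN and returns a pointer to it:

/* Sketch: resolve pfn -> mfn the way an external consumer of
 * pfn_to_mfn_frame_list_list would; map_frame() is hypothetical. */
unsigned long p2m_lookup(unsigned long pfn)
{
	unsigned long *l3 = map_frame(hyp_shared_info.arch.pfn_to_mfn_frame_list_list);
	unsigned long *l2 = map_frame(l3[pfn / (P2M_PAGE_ENTRIES * P2M_PAGE_ENTRIES)]);
	unsigned long *p2m = map_frame(l2[(pfn / P2M_PAGE_ENTRIES) % P2M_PAGE_ENTRIES]);

	return p2m[pfn % P2M_PAGE_ENTRIES];	/* one entry per pfn */
}
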
diff --git a/i386/xen/xen_boothdr.S b/i386/xen/xen_boothdr.S
new file mode 100644
index 0000000..3d84e0c
--- /dev/null
+++ b/i386/xen/xen_boothdr.S
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <xen/public/elfnote.h>
+
+.section __xen_guest
+	.ascii	"GUEST_OS=GNU Mach"
+	.ascii	",GUEST_VERSION=1.3"
+	.ascii	",XEN_VER=xen-3.0"
+	.ascii	",VIRT_BASE=0x20000000"
+	.ascii	",ELF_PADDR_OFFSET=0x20000000"
+	.ascii	",HYPERCALL_PAGE=0x2"
+#if PAE
+	.ascii	",PAE=yes"
+#else
+	.ascii	",PAE=no"
+#endif
+	.ascii	",LOADER=generic"
+#ifndef MACH_PSEUDO_PHYS
+	.ascii	",FEATURES=!auto_translated_physmap"
+#endif
+	.byte	0
+
+/* Macro taken from linux/include/linux/elfnote.h */
+#define ELFNOTE(name, type, desctype, descdata) \
+.pushsection .note.name			; \
+  .align 4				; \
+  .long 2f - 1f		/* namesz */	; \
+  .long 4f - 3f		/* descsz */	; \
+  .long type				; \
+1:.asciz "name"				; \
+2:.align 4				; \
+3:desctype descdata			; \
+4:.align 4				; \
+.popsection				;
+
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,	.asciz, "GNU Mach")
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,	.asciz, "1.3")
+	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,	.asciz, "xen-3.0")
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,	.long,  _START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,	.long,  _START)
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,		.long,  start)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypcalls)
+#if PAE
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,	.asciz, "yes")
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,	.asciz, "no")
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,	.asciz, "generic")
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,	.asciz, ""
+#ifndef MACH_PSEUDO_PHYS
+		"!auto_translated_physmap"
+#endif
+		)
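
Each ELFNOTE above expands to a standard ELF note in a .note.Xen section: a namesz/descsz/type header, then the padded owner name, then the padded descriptor. For reference, the byte layout corresponds to this C view (field comments describe the GUEST_OS note; Elf32_Word comes from <elf.h>):

#include <elf.h>

/* C view of one emitted note.  The assembler computes namesz and descsz
 * from the local labels 1..4 in the macro above. */
struct xen_elf_note {
	Elf32_Word namesz;	/* strlen("Xen") + 1 = 4 */
	Elf32_Word descsz;	/* e.g. sizeof("GNU Mach") = 9, before padding */
	Elf32_Word type;	/* e.g. XEN_ELFNOTE_GUEST_OS */
	char       name[4];	/* "Xen\0", padded to a 4-byte boundary */
	/* descriptor bytes follow, also padded to a 4-byte boundary */
};
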
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386/i386asm.h>
+
+	.text
+	.globl	gdt, ldt
+	.globl	start, _start, gdt
+start:
+_start:
+
+	/* Switch to our own interrupt stack.  */
+	movl	$(_intstack+INTSTACK_SIZE),%eax
+	movl	%eax,%esp
+
+	/* Reset EFLAGS to a known state.  */
+	pushl	$0
+	popf
+
+	/* Push the start_info pointer to be the second argument.  */
+	subl	$KERNELBASE,%esi
+	pushl	%esi
+
+	/* Jump into C code.  */
+	call	EXT(c_boot_entry)
+
+/* These need to be aligned on page boundaries.  */
+.global hyp_shared_info, hypcalls
+
+	.org	(start + 0x1000)
+hyp_shared_info:
+	.org	hyp_shared_info + 0x1000
+
+/* Labels just for debuggers */
+#define hypcall(name, n) \
+	.org	hypcalls + n*32 ; \
+__hyp_##name:
+
+hypcalls:
+	hypcall(set_trap_table, 0)
+	hypcall(mmu_update, 1)
+	hypcall(set_gdt, 2)
+	hypcall(stack_switch, 3)
+	hypcall(set_callbacks, 4)
+	hypcall(fpu_taskswitch, 5)
+	hypcall(sched_op_compat, 6)
+	hypcall(platform_op, 7)
+	hypcall(set_debugreg, 8)
+	hypcall(get_debugreg, 9)
+	hypcall(update_descriptor, 10)
+	hypcall(memory_op, 12)
+	hypcall(multicall, 13)
+	hypcall(update_va_mapping, 14)
+	hypcall(set_timer_op, 15)
+	hypcall(event_channel_op_compat, 16)
+	hypcall(xen_version, 17)
+	hypcall(console_io, 18)
+	hypcall(physdev_op_compat, 19)
+	hypcall(grant_table_op, 20)
+	hypcall(vm_assist, 21)
+	hypcall(update_va_mapping_otherdomain, 22)
+	hypcall(iret, 23)
+	hypcall(vcpu_op, 24)
+	hypcall(set_segment_base, 25)
+	hypcall(mmuext_op, 26)
+	hypcall(acm_op, 27)
+	hypcall(nmi_op, 28)
+	hypcall(sched_op, 29)
+	hypcall(callback_op, 30)
+	hypcall(xenoprof_op, 31)
+	hypcall(event_channel_op, 32)
+	hypcall(physdev_op, 33)
+	hypcall(hvm_op, 34)
+	hypcall(sysctl, 35)
+	hypcall(domctl, 36)
+	hypcall(kexec_op, 37)
+
+	hypcall(arch_0, 48)
+	hypcall(arch_1, 49)
+	hypcall(arch_2, 50)
+	hypcall(arch_3, 51)
+	hypcall(arch_4, 52)
+	hypcall(arch_5, 53)
+	hypcall(arch_6, 54)
+	hypcall(arch_7, 55)
+
+	.org hypcalls + 0x1000
+
+gdt:
+	.org gdt + 0x1000
+
+ldt:
+	.org ldt + 0x1000
+
+stack:
+	.long _intstack+INTSTACK_SIZE,0xe021
+	.comm _intstack,INTSTACK_SIZE
+
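The hypcalls page is filled in by Xen at boot with one 32-byte trampoline per hypercall; the hypcall() labels above exist only so debuggers can symbolize the slots. Calling through a slot from C presumably looks like the sketch below (the real wrappers live in the Xen headers this commit adds elsewhere; on 32-bit Xen the arguments travel in %ebx, %ecx, ..., the result in %eax):

/* Sketch: invoke hypercall slot n with two arguments.  hypcall2 is a
 * hypothetical name; 32 is the slot stride fixed by the layout above. */
extern char hypcalls[];

static inline long hypcall2(int n, long arg1, long arg2)
{
	long ret;

	asm volatile ("call *%1"
		      : "=a" (ret)
		      : "r" (&hypcalls[n * 32]), "b" (arg1), "c" (arg2)
		      : "memory");
	return ret;
}

/* e.g. hypcall2(29, 0, 0) would be sched_op(SCHEDOP_yield, ...), going by
 * the slot numbers listed above. */
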
diff --git a/i386/xen/xen_locore.S b/i386/xen/xen_locore.S
new file mode 100644
index 0000000..51f823f
--- /dev/null
+++ b/i386/xen/xen_locore.S
@@ -0,0 +1,110 @@
+/*
+ *  Copyright (C) 2006 Samuel Thibault <samuel.thibault@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <mach/machine/asm.h>
+
+#include <i386/i386asm.h>
+#include <i386/cpu_number.h>
+#include <i386/xen.h>
+
+	.data	2
+int_active:
+	.long	0
+
+
+	.text
+	.globl	hyp_callback, hyp_failsafe_callback
+	P2ALIGN(TEXT_ALIGN)
+hyp_callback:
+	pushl	%eax
+	jmp	EXT(all_intrs)
+
+ENTRY(interrupt)
+	incl	int_active		/* currently handling interrupts */
+	call	EXT(hyp_c_callback)	/* call generic interrupt routine */
+	decl	int_active		/* stopped handling interrupts */
+	sti
+	ret
+
+/* FIXME: if we're _very_ unlucky, we may be re-interrupted, filling stack
+ *
+ * Far from trivial, see mini-os.  That said, maybe we could just, before
+ * popping everything (which is _not_ destructive), save sp into a known
+ * place and use it + jmp back?
+ *
+ * Mmm, there seems to be an iret hypcall that does exactly what we want:
+ * perform iret, and if IF is set, clear the interrupt mask.
+ */
+
+/* Pfff, we have to check pending interrupts ourselves.  Some other DomUs
+ * just make a hypercall to retrigger the irq.  Not sure it would really be
+ * easier or faster.  */
+ENTRY(hyp_sti)
+	pushl	%ebp
+	movl	%esp, %ebp
+_hyp_sti:
+	movb	$0,hyp_shared_info+CPU_CLI	/* Enable interrupts */
+	cmpl	$0,int_active	/* Check whether we were already checking pending interrupts */
+	jz	0f
+	popl	%ebp
+	ret			/* Already active, just return */
+0:
+	/* Not active, check pending interrupts by hand */
+	/* no memory barrier needed on x86 */
+	cmpb	$0,hyp_shared_info+CPU_PENDING
+	jne	0f
+	popl	%ebp
+	ret
+0:
+	movb	$0xff,hyp_shared_info+CPU_CLI
+1:
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	incl	int_active	/* currently handling interrupts */
+
+	pushl	$0
+	pushl	$0
+	call	EXT(hyp_c_callback)
+	popl	%edx
+	popl	%edx
+
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+	decl	int_active	/* stopped handling interrupts */
+	cmpb	$0,hyp_shared_info+CPU_PENDING
+	jne	1b
+	jmp	_hyp_sti
+
+/* Hypervisor failed to reload segments.  Dump them.  */
+hyp_failsafe_callback:
+#if 1
+	/* load sane segments */
+	mov	%ss, %ax
+	mov	%ax, %ds
+	mov	%ax, %es
+	mov	%ax, %fs
+	mov	%ax, %gs
+	push	%esp
+	call	EXT(hyp_failsafe_c_callback)
+#else
+	popl	%ds
+	popl	%es
+	popl	%fs
+	popl	%gs
+	iret
+#endif
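
For readability, the interrupt-window dance in hyp_sti corresponds to roughly the following C logic. CPU_CLI and CPU_PENDING are the offsets of the per-vcpu evtchn_upcall_mask and evtchn_upcall_pending bytes that the assembly reaches through hyp_shared_info; the mask/pending pointers below stand in for them, and the two-argument hyp_c_callback prototype merely mirrors the two zero pushes before the call (sketch only):

extern int int_active;		/* the counter defined in the .data section */
extern void hyp_c_callback(void *ret_addr, void *regs);

/* Hedged C rendering of the hyp_sti loop above. */
static void hyp_sti_sketch(volatile unsigned char *mask,
			   volatile unsigned char *pending)
{
	for (;;) {
		*mask = 0;		/* enable event delivery */
		if (int_active)
			return;		/* a handler is already draining events */
		if (!*pending)
			return;		/* nothing slipped in: we are done */
		*mask = 0xff;		/* re-mask and drain by hand */
		do {
			int_active++;
			hyp_c_callback(0, 0);
			int_active--;
		} while (*pending);
		/* loop: unmask again and re-check for a late arrival */
	}
}
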