From 9e608d32e9e85246decd868f326efde66108c7f3 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 28 Nov 2009 17:09:38 +0100 Subject: Add XMM FPU registers save/restore support. * i386/include/mach/i386/fp_reg.h (struct i386_fp_regs): Invert array indices. (struct i386_xfp_save): New structure. (FP_387X): New macro. * i386/i386/thread.h (struct i386_fpsave_state): Add xfp_save_state member, keep existing fp_save_state and fp_regs members in an unnamed union member. Move fp_valid member to the end of the structure. * i386/i386/fpu.h (fxsave, fxrstor): New macros. (fpu_save_context): Use fxsave() when FPU is FXSR-capable. * i386/i386/fpu.c: Include (mxcsr_feature_mask): New variable. (fp_save, fp_load): Add declaration. (init_fpu): Add FXSR-capable FPU detection. (fpu_module_init): Request 16-byte alignment to zinit() for i386_fpsave_state structures. (twd_i387_to_fxsr, twd_fxsr_to_i387): New functions. (fpu_set_state): Convert FPU state when FPU is FXSR-capable. (fpu_get_state): Convert FPU state when FPU is FXSR-capable. (fpexterrflt): Pass to i386_exception either xfp_save_state or fp_save_state according to FPU type. (fpastintr): Likewise. (fp_load): Likewise. Use fxrstor() when FPU is FXSR-capable. (fp_save): Use fxsave() when FPU is FXSR-capable. (fp_state_alloc): Add FXSR-aware initialization. --- i386/i386/fpu.c | 208 ++++++++++++++++++++++++++++----- i386/i386/fpu.h | 11 +- i386/i386/thread.h | 9 +- i386/include/mach/i386/fp_reg.h | 23 +++- i386/include/mach/i386/thread_status.h | 2 +- 5 files changed, 217 insertions(+), 36 deletions(-) diff --git a/i386/i386/fpu.c b/i386/i386/fpu.c index 8679816..109d0d7 100644 --- a/i386/i386/fpu.c +++ b/i386/i386/fpu.c @@ -23,6 +23,15 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. 
*/ + +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + */ + /* * Support for 80387 floating point or FP emulator. */ @@ -43,6 +52,7 @@ #include #include #include +#include #include "cpu_number.h" #if 0 @@ -63,6 +73,10 @@ extern void i386_exception(); int fp_kind = FP_387; /* 80387 present */ zone_t ifps_zone; /* zone for FPU save area */ +static unsigned long mxcsr_feature_mask = 0xffffffff; /* Always AND user-provided mxcsr with this security mask */ + +void fp_save(thread_t thread); +void fp_load(thread_t thread); #if NCPUS == 1 volatile thread_t fp_thread = THREAD_NULL; @@ -134,7 +148,20 @@ init_fpu() /* * We have a 387. */ - fp_kind = FP_387; + if (CPU_HAS_FEATURE(CPU_FEATURE_FXSR)) { + static /* because we _need_ alignment */ + struct i386_xfp_save save; + unsigned long mask; + fp_kind = FP_387X; + printf("Enabling FXSR\n"); + set_cr4(get_cr4() | CR4_OSFXSR); + fxsave(&save); + mask = save.fp_mxcsr_mask; + if (!mask) + mask = 0x0000ffbf; + mxcsr_feature_mask &= mask; + } else + fp_kind = FP_387; } /* * Trap wait instructions. Turn off FPU for now. @@ -155,7 +182,7 @@ init_fpu() void fpu_module_init() { - ifps_zone = zinit(sizeof(struct i386_fpsave_state), 0, + ifps_zone = zinit(sizeof(struct i386_fpsave_state), 16, THREAD_MAX * sizeof(struct i386_fpsave_state), THREAD_CHUNK * sizeof(struct i386_fpsave_state), 0, "i386 fpsave state"); @@ -186,6 +213,73 @@ ASSERT_IPL(SPL0); zfree(ifps_zone, (vm_offset_t) fps); } +/* The two following functions were stolen from Linux's i387.c */ +static inline unsigned short +twd_i387_to_fxsr (unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. 
*/ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long +twd_fxsr_to_i387 (struct i386_xfp_save *fxsave) +{ + struct { + unsigned short significand[4]; + unsigned short exponent; + unsigned short padding[3]; + } *st = NULL; + unsigned long tos = (fxsave->fp_status >> 11) & 7; + unsigned long twd = (unsigned long) fxsave->fp_tag; + unsigned long tag; + unsigned long ret = 0xffff0000u; + int i; + +#define FPREG_ADDR(f, n) ((void *)&(f)->fp_reg_word + (n) * 16); + + for (i = 0 ; i < 8 ; i++) { + if (twd & 0x1) { + st = FPREG_ADDR (fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + /* * Set the floating-point state for a thread. 
* If the thread is not the current thread, it is @@ -264,16 +358,30 @@ ASSERT_IPL(SPL0); */ memset(&ifps->fp_save_state, 0, sizeof(struct i386_fp_save)); - ifps->fp_save_state.fp_control = user_fp_state->fp_control; - ifps->fp_save_state.fp_status = user_fp_state->fp_status; - ifps->fp_save_state.fp_tag = user_fp_state->fp_tag; - ifps->fp_save_state.fp_eip = user_fp_state->fp_eip; - ifps->fp_save_state.fp_cs = user_fp_state->fp_cs; - ifps->fp_save_state.fp_opcode = user_fp_state->fp_opcode; - ifps->fp_save_state.fp_dp = user_fp_state->fp_dp; - ifps->fp_save_state.fp_ds = user_fp_state->fp_ds; - ifps->fp_regs = *user_fp_regs; - ifps->fp_valid = TRUE; + if (fp_kind == FP_387X) { + int i; + + ifps->xfp_save_state.fp_control = user_fp_state->fp_control; + ifps->xfp_save_state.fp_status = user_fp_state->fp_status; + ifps->xfp_save_state.fp_tag = twd_i387_to_fxsr(user_fp_state->fp_tag); + ifps->xfp_save_state.fp_eip = user_fp_state->fp_eip; + ifps->xfp_save_state.fp_cs = user_fp_state->fp_cs; + ifps->xfp_save_state.fp_opcode = user_fp_state->fp_opcode; + ifps->xfp_save_state.fp_dp = user_fp_state->fp_dp; + ifps->xfp_save_state.fp_ds = user_fp_state->fp_ds; + for (i=0; i<8; i++) /* copy each 80-bit register into its own 16-byte FXSR slot */ + memcpy(&ifps->xfp_save_state.fp_reg_word[i], &user_fp_regs->fp_reg_word[i], sizeof(user_fp_regs->fp_reg_word[i])); + } else { + ifps->fp_save_state.fp_control = user_fp_state->fp_control; + ifps->fp_save_state.fp_status = user_fp_state->fp_status; + ifps->fp_save_state.fp_tag = user_fp_state->fp_tag; + ifps->fp_save_state.fp_eip = user_fp_state->fp_eip; + ifps->fp_save_state.fp_cs = user_fp_state->fp_cs; + ifps->fp_save_state.fp_opcode = user_fp_state->fp_opcode; + ifps->fp_save_state.fp_dp = user_fp_state->fp_dp; + ifps->fp_save_state.fp_ds = user_fp_state->fp_ds; + ifps->fp_regs = *user_fp_regs; + } simple_unlock(&pcb->lock); if (new_ifps != 0) @@ -343,15 +451,30 @@ ASSERT_IPL(SPL0); */ memset(user_fp_state, 0, sizeof(struct i386_fp_save)); - user_fp_state->fp_control = ifps->fp_save_state.fp_control; - 
user_fp_state->fp_status = ifps->fp_save_state.fp_status; - user_fp_state->fp_tag = ifps->fp_save_state.fp_tag; - user_fp_state->fp_eip = ifps->fp_save_state.fp_eip; - user_fp_state->fp_cs = ifps->fp_save_state.fp_cs; - user_fp_state->fp_opcode = ifps->fp_save_state.fp_opcode; - user_fp_state->fp_dp = ifps->fp_save_state.fp_dp; - user_fp_state->fp_ds = ifps->fp_save_state.fp_ds; - *user_fp_regs = ifps->fp_regs; + if (fp_kind == FP_387X) { + int i; + + user_fp_state->fp_control = ifps->xfp_save_state.fp_control; + user_fp_state->fp_status = ifps->xfp_save_state.fp_status; + user_fp_state->fp_tag = twd_fxsr_to_i387(&ifps->xfp_save_state); + user_fp_state->fp_eip = ifps->xfp_save_state.fp_eip; + user_fp_state->fp_cs = ifps->xfp_save_state.fp_cs; + user_fp_state->fp_opcode = ifps->xfp_save_state.fp_opcode; + user_fp_state->fp_dp = ifps->xfp_save_state.fp_dp; + user_fp_state->fp_ds = ifps->xfp_save_state.fp_ds; + for (i=0; i<8; i++) /* copy only the 80-bit register out of each 16-byte FXSR slot */ + memcpy(&user_fp_regs->fp_reg_word[i], &ifps->xfp_save_state.fp_reg_word[i], sizeof(user_fp_regs->fp_reg_word[i])); + } else { + user_fp_state->fp_control = ifps->fp_save_state.fp_control; + user_fp_state->fp_status = ifps->fp_save_state.fp_status; + user_fp_state->fp_tag = ifps->fp_save_state.fp_tag; + user_fp_state->fp_eip = ifps->fp_save_state.fp_eip; + user_fp_state->fp_cs = ifps->fp_save_state.fp_cs; + user_fp_state->fp_opcode = ifps->fp_save_state.fp_opcode; + user_fp_state->fp_dp = ifps->fp_save_state.fp_dp; + user_fp_state->fp_ds = ifps->fp_save_state.fp_ds; + *user_fp_regs = ifps->fp_regs; + } } simple_unlock(&pcb->lock); @@ -546,7 +669,9 @@ fpexterrflt() */ i386_exception(EXC_ARITHMETIC, EXC_I386_EXTERR, - thread->pcb->ims.ifps->fp_save_state.fp_status); + fp_kind == FP_387X ? + thread->pcb->ims.ifps->xfp_save_state.fp_status : + thread->pcb->ims.ifps->fp_save_state.fp_status); /*NOTREACHED*/ } @@ -601,7 +726,9 @@ ASSERT_IPL(SPL0); */ i386_exception(EXC_ARITHMETIC, EXC_I386_EXTERR, - thread->pcb->ims.ifps->fp_save_state.fp_status); + fp_kind == FP_387X ? 
+ thread->pcb->ims.ifps->xfp_save_state.fp_status : + thread->pcb->ims.ifps->fp_save_state.fp_status); /*NOTREACHED*/ } @@ -623,7 +750,10 @@ fp_save(thread) if (ifps != 0 && !ifps->fp_valid) { /* registers are in FPU */ ifps->fp_valid = TRUE; - fnsave(&ifps->fp_save_state); + if (fp_kind == FP_387X) + fxsave(&ifps->xfp_save_state); + else + fnsave(&ifps->fp_save_state); } } @@ -664,14 +794,19 @@ ASSERT_IPL(SPL0); */ i386_exception(EXC_ARITHMETIC, EXC_I386_EXTERR, - thread->pcb->ims.ifps->fp_save_state.fp_status); + fp_kind == FP_387X ? + thread->pcb->ims.ifps->xfp_save_state.fp_status : + thread->pcb->ims.ifps->fp_save_state.fp_status); /*NOTREACHED*/ #endif } else if (! ifps->fp_valid) { printf("fp_load: invalid FPU state!\n"); fninit (); } else { - frstor(ifps->fp_save_state); + if (fp_kind == FP_387X) + fxrstor(ifps->xfp_save_state); + else + frstor(ifps->fp_save_state); } ifps->fp_valid = FALSE; /* in FPU */ } @@ -693,11 +828,22 @@ fp_state_alloc() pcb->ims.ifps = ifps; ifps->fp_valid = TRUE; - ifps->fp_save_state.fp_control = (0x037f - & ~(FPC_IM|FPC_ZM|FPC_OM|FPC_PC)) - | (FPC_PC_53|FPC_IC_AFF); - ifps->fp_save_state.fp_status = 0; - ifps->fp_save_state.fp_tag = 0xffff; /* all empty */ + + if (fp_kind == FP_387X) { + ifps->xfp_save_state.fp_control = (0x037f + & ~(FPC_IM|FPC_ZM|FPC_OM|FPC_PC)) + | (FPC_PC_53|FPC_IC_AFF); + ifps->xfp_save_state.fp_status = 0; + ifps->xfp_save_state.fp_tag = 0; /* all empty: FXSR tag is abridged, a set bit means non-empty */ + if (CPU_HAS_FEATURE(CPU_FEATURE_SSE)) + ifps->xfp_save_state.fp_mxcsr = 0x1f80; + } else { + ifps->fp_save_state.fp_control = (0x037f + & ~(FPC_IM|FPC_ZM|FPC_OM|FPC_PC)) + | (FPC_PC_53|FPC_IC_AFF); + ifps->fp_save_state.fp_status = 0; + ifps->fp_save_state.fp_tag = 0xffff; /* all empty */ + } } #if AT386 diff --git a/i386/i386/fpu.h b/i386/i386/fpu.h index 7efb7e2..1a1b61f 100644 --- a/i386/i386/fpu.h +++ b/i386/i386/fpu.h @@ -67,6 +67,12 @@ #define frstor(state) \ asm volatile("frstor %0" : : "m" (state)) +#define fxsave(state) \ + asm 
volatile("fxsave %0" : "=m" (*state)) + +#define fxrstor(state) \ + asm volatile("fxrstor %0" : : "m" (state)) + #define fwait() \ asm("fwait"); @@ -86,7 +92,10 @@ if (ifps != 0 && !ifps->fp_valid) { \ /* registers are in FPU - save to memory */ \ ifps->fp_valid = TRUE; \ - fnsave(&ifps->fp_save_state); \ + if (fp_kind == FP_387X) \ + fxsave(&ifps->xfp_save_state); \ + else \ + fnsave(&ifps->fp_save_state); \ set_ts(); \ } \ } diff --git a/i386/i386/thread.h b/i386/i386/thread.h index 76aa1ef..f2ae8bf 100644 --- a/i386/i386/thread.h +++ b/i386/i386/thread.h @@ -111,9 +111,14 @@ struct i386_kernel_state { */ struct i386_fpsave_state { + union { + struct { + struct i386_fp_save fp_save_state; + struct i386_fp_regs fp_regs; + }; + struct i386_xfp_save xfp_save_state; + }; boolean_t fp_valid; - struct i386_fp_save fp_save_state; - struct i386_fp_regs fp_regs; }; /* diff --git a/i386/include/mach/i386/fp_reg.h b/i386/include/mach/i386/fp_reg.h index 6fe7af5..5673055 100644 --- a/i386/include/mach/i386/fp_reg.h +++ b/i386/include/mach/i386/fp_reg.h @@ -46,10 +46,30 @@ struct i386_fp_save { }; struct i386_fp_regs { - unsigned short fp_reg_word[5][8]; + unsigned short fp_reg_word[8][5]; /* space for 8 80-bit FP registers */ }; +struct i386_xfp_save { + unsigned short fp_control; /* control */ + unsigned short fp_status; /* status */ + unsigned short fp_tag; /* register tags */ + unsigned short fp_opcode; /* opcode of failed instruction */ + unsigned int fp_eip; /* eip at failed instruction */ + unsigned short fp_cs; /* cs at failed instruction */ + unsigned short fp_unused_1; + unsigned int fp_dp; /* data address */ + unsigned short fp_ds; /* data segment */ + unsigned short fp_unused_2; + unsigned int fp_mxcsr; /* MXCSR */ + unsigned int fp_mxcsr_mask; /* MXCSR_MASK */ + unsigned char fp_reg_word[8][16]; + /* space for 8 128-bit FP registers */ + unsigned char fp_xreg_word[8][16]; + /* space for 8 128-bit XMM registers */ + unsigned int padding[56]; +} 
__attribute__((aligned(16))); + /* * Control register */ @@ -104,5 +124,6 @@ struct i386_fp_regs { #define FP_SOFT 1 /* software FP emulator */ #define FP_287 2 /* 80287 */ #define FP_387 3 /* 80387 or 80486 */ +#define FP_387X 4 /* FXSAVE/RSTOR-capable */ #endif /* _MACH_I386_FP_REG_H_ */ diff --git a/i386/include/mach/i386/thread_status.h b/i386/include/mach/i386/thread_status.h index cc3dc66..5f20355 100644 --- a/i386/include/mach/i386/thread_status.h +++ b/i386/include/mach/i386/thread_status.h @@ -111,7 +111,7 @@ struct i386_thread_state { (sizeof (struct i386_fp_save) + sizeof (struct i386_fp_regs)) struct i386_float_state { - int fpkind; /* FP_NO..FP_387 (readonly) */ + int fpkind; /* FP_NO..FP_387X (readonly) */ int initialized; unsigned char hw_state[FP_STATE_BYTES]; /* actual "hardware" state */ int exc_status; /* exception status (readonly) */ -- cgit v1.2.3