Skip to content

Commit

Permalink
Merge pull request FEX-Emu#4130 from pmatos/X87F64Simp
Browse files Browse the repository at this point in the history
X87 Code Simplification
  • Loading branch information
Sonicadvance1 authored Oct 24, 2024
2 parents caaacb6 + 11a87c2 commit 0190e1a
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 180 deletions.
6 changes: 3 additions & 3 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5583,7 +5583,7 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {
{OPD(0xDB, 0xD8), 8, &OpDispatchBuilder::X87FCMOV},
// E0 = Invalid
{OPD(0xDB, 0xE2), 1, &OpDispatchBuilder::NOPOp}, // FNCLEX
{OPD(0xDB, 0xE3), 1, &OpDispatchBuilder::FNINITF64},
{OPD(0xDB, 0xE3), 1, &OpDispatchBuilder::FNINIT},
// E4 = Invalid
{OPD(0xDB, 0xE8), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, 80, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>},
{OPD(0xDB, 0xF0), 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FCOMIF64, 80, false, OpDispatchBuilder::FCOMIFlags::FLAGS_RFLAGS, false>},
Expand Down Expand Up @@ -5621,10 +5621,10 @@ void InstallOpcodeHandlers(Context::OperatingMode Mode) {

{OPDReg(0xDD, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FSTF64, 64>},

{OPDReg(0xDD, 4) | 0x00, 8, &OpDispatchBuilder::X87FRSTORF64},
{OPDReg(0xDD, 4) | 0x00, 8, &OpDispatchBuilder::X87FRSTOR},

// 5 = Invalid
{OPDReg(0xDD, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSAVEF64},
{OPDReg(0xDD, 6) | 0x00, 8, &OpDispatchBuilder::X87FNSAVE},

{OPDReg(0xDD, 7) | 0x00, 8, &OpDispatchBuilder::X87FNSTSW},

Expand Down
3 changes: 0 additions & 3 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -770,16 +770,13 @@ class OpDispatchBuilder final : public IREmitter {
void FABSF64(OpcodeArgs);
void FTSTF64(OpcodeArgs);
void FRNDINTF64(OpcodeArgs);
void FNINITF64(OpcodeArgs);
void FSQRTF64(OpcodeArgs);
void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87SinCosF64(OpcodeArgs);
void X87FLDCWF64(OpcodeArgs);
void X87TANF64(OpcodeArgs);
void X87ATANF64(OpcodeArgs);
void X87FNSAVEF64(OpcodeArgs);
void X87FRSTORF64(OpcodeArgs);
void X87FXAMF64(OpcodeArgs);
void X87FXTRACTF64(OpcodeArgs);
void X87LDENVF64(OpcodeArgs);
Expand Down
45 changes: 34 additions & 11 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ void OpDispatchBuilder::X87LDENV(OpcodeArgs) {

void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {
_SyncStackToSlow();

// 14 bytes for 16bit
// 2 Bytes : FCW
// 2 Bytes : FSW
Expand All @@ -438,7 +439,6 @@ void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {
// 2 bytes : Opcode
// 4 bytes : data pointer offset
// 4 bytes : data pointer selector

const auto Size = GetDstSize(Op);
Ref Mem = MakeSegmentAddress(Op, Op->Dest);
Ref Top = GetX87Top();
Expand Down Expand Up @@ -478,14 +478,21 @@ void OpDispatchBuilder::X87FNSAVE(OpcodeArgs) {

auto OneConst = _Constant(1);
auto SevenConst = _Constant(7);
size_t LoadSize = ReducedPrecisionMode ? 8 : 16;
for (int i = 0; i < 7; ++i) {
auto data = _LoadContextIndexed(Top, 16, MMBaseOffset(), 16, FPRClass);
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), 16, FPRClass);
if (ReducedPrecisionMode) {
data = _F80CVTTo(data, 8);
}
_StoreMem(FPRClass, 16, data, Mem, _Constant((Size * 7) + (10 * i)), 1, MEM_OFFSET_SXTX, 1);
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
}

// The final st(7) needs a bit of special handling here
auto data = _LoadContextIndexed(Top, 16, MMBaseOffset(), 16, FPRClass);
Ref data = _LoadContextIndexed(Top, LoadSize, MMBaseOffset(), 16, FPRClass);
if (ReducedPrecisionMode) {
data = _F80CVTTo(data, 8);
}
// ST7 broken in to two parts
// Lower 64bits [63:0]
// upper 16 bits [79:64]
Expand All @@ -504,6 +511,16 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {

auto NewFCW = _LoadMem(GPRClass, 2, Mem, 2);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
if (ReducedPrecisionMode) {
// ignore the rounding precision, we're always 64-bit in F64.
// extract rounding mode
Ref roundingMode = NewFCW;
auto roundShift = _Constant(10);
auto roundMask = _Constant(3);
roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
_SetRoundingMode(roundingMode, false, roundingMode);
}

auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW);
Expand All @@ -519,13 +536,16 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
auto high = _Constant(0xFFFF);
Ref Mask = _VCastFromGPR(16, 8, low);
Mask = _VInsGPR(16, 8, 1, Mask, high);

size_t StoreSize = ReducedPrecisionMode ? 8 : 16;
for (int i = 0; i < 7; ++i) {
Ref Reg = _LoadMem(FPRClass, 16, Mem, _Constant((Size * 7) + (10 * i)), 1, MEM_OFFSET_SXTX, 1);
// Mask off the top bits
Reg = _VAnd(16, 16, Reg, Mask);

_StoreContextIndexed(Reg, Top, 16, MMBaseOffset(), 16, FPRClass);
if (ReducedPrecisionMode) {
// Convert to double precision
Reg = _F80CVT(8, Reg);
}
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), 16, FPRClass);

Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
}
Expand All @@ -537,7 +557,10 @@ void OpDispatchBuilder::X87FRSTOR(OpcodeArgs) {
Ref Reg = _LoadMem(FPRClass, 8, Mem, _Constant((Size * 7) + (10 * 7)), 1, MEM_OFFSET_SXTX, 1);
Ref RegHigh = _LoadMem(FPRClass, 2, Mem, _Constant((Size * 7) + (10 * 7) + 8), 1, MEM_OFFSET_SXTX, 1);
Reg = _VInsElement(16, 2, 4, 0, Reg, RegHigh);
_StoreContextIndexed(Reg, Top, 16, MMBaseOffset(), 16, FPRClass);
if (ReducedPrecisionMode) {
Reg = _F80CVT(8, Reg); // Convert to double precision
}
_StoreContextIndexed(Reg, Top, StoreSize, MMBaseOffset(), 16, FPRClass);
}

// Load / Store Control Word
Expand All @@ -546,7 +569,6 @@ void OpDispatchBuilder::X87FSTCW(OpcodeArgs) {
StoreResult(GPRClass, Op, FCW, -1);
}


void OpDispatchBuilder::X87FLDCW(OpcodeArgs) {
// FIXME: Because loading control flags will affect several instructions in fast path, we might have
// to switch for now to slow mode whenever these are manually changed.
Expand All @@ -556,7 +578,6 @@ void OpDispatchBuilder::X87FLDCW(OpcodeArgs) {
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
}


void OpDispatchBuilder::FXCH(OpcodeArgs) {
uint8_t Offset = Op->OP & 7;
// fxch st0, st0 is for us essentially a nop
Expand Down Expand Up @@ -675,7 +696,6 @@ void OpDispatchBuilder::X87ModifySTP(OpcodeArgs, bool Inc) {
// Optionally we can pass a pre calculated value for Top, otherwise we calculate it
// during the function runtime.
Ref OpDispatchBuilder::ReconstructFSW_Helper(Ref T) {

// Start with the top value
auto Top = T ? T : GetX87Top();
Ref FSW = _Lshl(OpSize::i64Bit, Top, _Constant(11));
Expand All @@ -700,7 +720,6 @@ Ref OpDispatchBuilder::ReconstructFSW_Helper(Ref T) {
// There's no load Status Word instruction but you can load it through frstor
// or fldenv.
void OpDispatchBuilder::X87FNSTSW(OpcodeArgs) {

Ref TopValue = _SyncStackToSlow();
Ref StatusWord = ReconstructFSW_Helper(TopValue);
StoreResult(GPRClass, Op, StatusWord, -1);
Expand All @@ -709,6 +728,10 @@ void OpDispatchBuilder::X87FNSTSW(OpcodeArgs) {
void OpDispatchBuilder::FNINIT(OpcodeArgs) {
auto Zero = _Constant(0);

if (ReducedPrecisionMode) {
_SetRoundingMode(Zero, false, Zero);
}

// Init FCW to 0x037F
auto NewFCW = _Constant(16, 0x037F);
_StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));
Expand Down
149 changes: 0 additions & 149 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,6 @@ class OrderedNode;

#define OpcodeArgs [[maybe_unused]] FEXCore::X86Tables::DecodedOp Op

// FNINIT handler for the F64 path: puts the host rounding mode back to zero
// and then defers to the generic FNINIT handler for everything else.
void OpDispatchBuilder::FNINITF64(OpcodeArgs) {
  // The same zero constant is passed for both node operands of _SetRoundingMode.
  auto RoundingModeZero = _Constant(0);
  _SetRoundingMode(RoundingModeZero, false, RoundingModeZero);

  // The remaining reset work is shared with the generic implementation.
  FNINIT(Op);
}

void OpDispatchBuilder::X87LDENVF64(OpcodeArgs) {
_StackForceSlow();

Expand Down Expand Up @@ -380,146 +371,6 @@ void OpDispatchBuilder::FCOMIF64(OpcodeArgs, size_t Width, bool Integer, OpDispa
}
}

// This function converts to F80 on save for compatibility
//
// F64 variant of FNSAVE. Writes a header of seven fields (each `Size` bytes
// wide, where `Size` is the destination operand size) to the destination
// memory, followed by the eight x87 stack registers at a 10-byte stride
// starting from the current top of stack. Each register is read from the
// context as an 8-byte value and converted with _F80CVTTo before being
// stored. Finishes by calling FNINITF64 to reset the x87 state.
void OpDispatchBuilder::X87FNSAVEF64(OpcodeArgs) {
// NOTE(review): presumably flushes the tracked x87 stack state back to the
// context before it is read below - confirm against _SyncStackToSlow.
_SyncStackToSlow();
// 14 bytes for 16bit
// 2 Bytes : FCW
// 2 Bytes : FSW
// 2 bytes : FTW
// 2 bytes : Instruction offset
// 2 bytes : Instruction CS selector
// 2 bytes : Data offset
// 2 bytes : Data selector

// 28 bytes for 32bit
// 4 bytes : FCW
// 4 bytes : FSW
// 4 bytes : FTW
// 4 bytes : Instruction pointer
// 2 bytes : instruction pointer selector
// 2 bytes : Opcode
// 4 bytes : data pointer offset
// 4 bytes : data pointer selector

// Size is the width of every header field; field k lives at offset Size * k.
const auto Size = GetDstSize(Op);
Ref Mem = MakeSegmentAddress(Op, Op->Dest);
Ref Top = GetX87Top();
{
// FCW: read the 2-byte control word from the context, store it at offset 0.
auto FCW = _LoadContext(2, GPRClass, offsetof(FEXCore::Core::CPUState, FCW));
_StoreMem(GPRClass, Size, Mem, FCW, Size);
}

// FSW: rebuilt by ReconstructFSW_Helper, stored at offset Size * 1.
{ _StoreMem(GPRClass, Size, ReconstructFSW_Helper(), Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1); }

// The four instruction/data pointer fields below are all written as zero.
auto ZeroConst = _Constant(0);

{
// FTW
_StoreMem(GPRClass, Size, GetX87FTW_Helper(), Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1);
}

{
// Instruction Offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 3), Size, MEM_OFFSET_SXTX, 1);
}

{
// Instruction CS selector (+ Opcode)
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 4), Size, MEM_OFFSET_SXTX, 1);
}

{
// Data pointer offset
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 5), Size, MEM_OFFSET_SXTX, 1);
}

{
// Data pointer selector
_StoreMem(GPRClass, Size, ZeroConst, Mem, _Constant(Size * 6), Size, MEM_OFFSET_SXTX, 1);
}

auto OneConst = _Constant(1);
auto SevenConst = _Constant(7);
// First seven registers: the register data starts after the seven header
// fields (offset Size * 7) and advances 10 bytes per register. Each store
// writes 16 bytes, so it spills 6 bytes into the following slot; the next
// iteration (or the st(7) stores below) writes that slot afterwards.
for (int i = 0; i < 7; ++i) {
Ref data = _LoadContextIndexed(Top, 8, MMBaseOffset(), 16, FPRClass);
data = _F80CVTTo(data, 8);
_StoreMem(FPRClass, 16, data, Mem, _Constant((Size * 7) + (i * 10)), 1, MEM_OFFSET_SXTX, 1);
// Advance to the next stack slot, wrapping modulo 8.
Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
}

// The final st(7) needs a bit of special handling here
// NOTE(review): presumably split into 8 + 2 byte stores so nothing is written
// past the end of the last 10-byte slot - confirm.
Ref data = _LoadContextIndexed(Top, 8, MMBaseOffset(), 16, FPRClass);
data = _F80CVTTo(data, 8);
// ST7 broken in to two parts
// Lower 64bits [63:0]
// upper 16 bits [79:64]
_StoreMem(FPRClass, 8, data, Mem, _Constant((Size * 7) + (7 * 10)), 1, MEM_OFFSET_SXTX, 1);
// Element 4 (2-byte elements) is bits [79:64]; it is duplicated so the 2-byte
// store below writes those upper 16 bits.
auto topBytes = _VDupElement(16, 2, data, 4);
_StoreMem(FPRClass, 2, topBytes, Mem, _Constant((Size * 7) + (7 * 10) + 8), 1, MEM_OFFSET_SXTX, 1);

// reset to default
FNINITF64(Op);
}

// This function converts from F80 on load for compatibility.
//
// F64 variant of FRSTOR. Reads a saved x87 image from the source memory
// operand: seven header fields (each `Size` bytes wide, where `Size` is the
// source operand size) followed by eight 10-byte register slots. The control
// word is loaded, its rounding-control field is applied via _SetRoundingMode,
// and FCW is written to the context. FSW and FTW are restored through their
// helpers. Every register is masked to 80 bits, converted with _F80CVT to an
// 8-byte value and stored into the context starting at the restored top of
// stack.
void OpDispatchBuilder::X87FRSTORF64(OpcodeArgs) {
  _StackForceSlow();
  const auto Size = GetSrcSize(Op);
  Ref Mem = MakeSegmentAddress(Op, Op->Src[0]);

  auto NewFCW = _LoadMem(GPRClass, 2, Mem, 2);
  // ignore the rounding precision, we're always 64-bit in F64.
  // extract rounding mode: (FCW >> 10) & 3
  Ref roundingMode = NewFCW;
  auto roundShift = _Constant(10);
  auto roundMask = _Constant(3);
  roundingMode = _Lshr(OpSize::i32Bit, roundingMode, roundShift);
  roundingMode = _And(OpSize::i32Bit, roundingMode, roundMask);
  _SetRoundingMode(roundingMode, false, roundingMode);
  // Store the control word once; a second identical store would only emit a
  // redundant duplicate write of the same value to the same context offset.
  _StoreContext(2, GPRClass, NewFCW, offsetof(FEXCore::Core::CPUState, FCW));

  // FSW sits at offset Size * 1; the helper returns the top-of-stack index
  // used for the register stores below.
  auto NewFSW = _LoadMem(GPRClass, Size, Mem, _Constant(Size * 1), Size, MEM_OFFSET_SXTX, 1);
  Ref Top = ReconstructX87StateFromFSW_Helper(NewFSW);

  {
    // FTW
    SetX87FTW(_LoadMem(GPRClass, Size, Mem, _Constant(Size * 2), Size, MEM_OFFSET_SXTX, 1));
  }

  auto OneConst = _Constant(1);
  auto SevenConst = _Constant(7);

  // 128-bit mask keeping only the low 80 bits: lower 64-bit lane all ones,
  // upper lane 0xFFFF.
  auto low = _Constant(~0ULL);
  auto high = _Constant(0xFFFF);
  Ref Mask = _VCastFromGPR(16, 8, low);
  Mask = _VInsGPR(16, 8, 1, Mask, high);

  // First seven registers: register data starts after the seven header fields
  // (offset Size * 7) with a 10-byte stride. Each load reads 16 bytes, so the
  // bytes belonging to the following slot are masked away.
  for (int i = 0; i < 7; ++i) {
    Ref Reg = _LoadMem(FPRClass, 16, Mem, _Constant((Size * 7) + (i * 10)), 1, MEM_OFFSET_SXTX, 1);
    // Mask off the top bits
    Reg = _VAnd(16, 16, Reg, Mask);
    // Convert to double precision
    Reg = _F80CVT(8, Reg);
    _StoreContextIndexed(Reg, Top, 8, MMBaseOffset(), 16, FPRClass);

    // Advance to the next stack slot, wrapping modulo 8.
    Top = _And(OpSize::i32Bit, _Add(OpSize::i32Bit, Top, OneConst), SevenConst);
  }

  // The final st(7) needs a bit of special handling here
  // ST7 broken in to two parts
  // Lower 64bits [63:0]
  // upper 16 bits [79:64]

  Ref Reg = _LoadMem(FPRClass, 8, Mem, _Constant((Size * 7) + (7 * 10)), 1, MEM_OFFSET_SXTX, 1);
  Ref RegHigh = _LoadMem(FPRClass, 2, Mem, _Constant((Size * 7) + (7 * 10) + 8), 1, MEM_OFFSET_SXTX, 1);
  // Insert the separately loaded upper 16 bits as 2-byte element 4 of Reg.
  Reg = _VInsElement(16, 2, 4, 0, Reg, RegHigh);
  Reg = _F80CVT(8, Reg); // Convert to double precision
  _StoreContextIndexed(Reg, Top, 8, MMBaseOffset(), 16, FPRClass);
}

void OpDispatchBuilder::X87FXTRACTF64(OpcodeArgs) {
// Split node into SIG and EXP while handling the special zero case.
// i.e. if val == 0.0, then sig = 0.0, exp = -inf
Expand Down
13 changes: 6 additions & 7 deletions unittests/InstructionCountCI/FlagM/x87_f64.json
Original file line number Diff line number Diff line change
Expand Up @@ -6247,23 +6247,22 @@
]
},
"frstor [rax]": {
"ExpectedInstructionCount": 317,
"ExpectedInstructionCount": 316,
"Comment": [
"0xdd !11b /4"
],
"ExpectedArm64ASM": [
"ldrh w20, [x4]",
"lsr w21, w20, #10",
"and w21, w21, #0x3",
"rbit w1, w21",
"strh w20, [x28, #1296]",
"lsr w20, w20, #10",
"and w20, w20, #0x3",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"strh w20, [x28, #1296]",
"strh w20, [x28, #1296]",
"ldr w20, [x4, #4]",
"ubfx w21, w20, #11, #3",
"strb w21, [x28, #1019]",
Expand Down
13 changes: 6 additions & 7 deletions unittests/InstructionCountCI/x87_f64.json
Original file line number Diff line number Diff line change
Expand Up @@ -6286,23 +6286,22 @@
]
},
"frstor [rax]": {
"ExpectedInstructionCount": 317,
"ExpectedInstructionCount": 316,
"Comment": [
"0xdd !11b /4"
],
"ExpectedArm64ASM": [
"ldrh w20, [x4]",
"lsr w21, w20, #10",
"and w21, w21, #0x3",
"rbit w1, w21",
"strh w20, [x28, #1296]",
"lsr w20, w20, #10",
"and w20, w20, #0x3",
"rbit w1, w20",
"lsr w1, w1, #30",
"mrs x0, fpcr",
"bfi x0, x1, #22, #2",
"lsr x1, x21, #2",
"lsr x1, x20, #2",
"bfi x0, x1, #24, #1",
"msr fpcr, x0",
"strh w20, [x28, #1296]",
"strh w20, [x28, #1296]",
"ldr w20, [x4, #4]",
"ubfx w21, w20, #11, #3",
"strb w21, [x28, #1019]",
Expand Down

0 comments on commit 0190e1a

Please sign in to comment.