Skip to content

Commit

Permalink
Merge pull request FEX-Emu#4265 from pmatos/RevertPredCache
Browse files Browse the repository at this point in the history
Revert pred cache
  • Loading branch information
Sonicadvance1 authored Jan 10, 2025
2 parents 8c94b78 + 1dce491 commit 8cfc016
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 45 deletions.
2 changes: 1 addition & 1 deletion FEXCore/Scripts/json_ir_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def parse_ops(ops):
(OpArg.Type == "GPR" or
OpArg.Type == "GPRPair" or
OpArg.Type == "FPR" or
OpArg.Type == "PRED")):
OpArg.Type == "PR")):
OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class")

OpArg.Name = ArgName
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
3 changes: 0 additions & 3 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ class OpDispatchBuilder final : public IREmitter {

// Need to clear any named constants that were cached.
ClearCachedNamedConstants();

// Clear predicate cache for x87 ldst
ResetInitPredicateCache();
}

IRPair<IROp_Jump> Jump() {
Expand Down
1 change: 0 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
34 changes: 1 addition & 33 deletions FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -10,9 +9,9 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down Expand Up @@ -46,37 +45,6 @@ class IREmitter {
}
void ResetWorkingList();

// Predicate Cache Implementation
// This lives here rather than OpcodeDispatcher because x87StackOptimization Pass
// also needs it.
// Lookup key for the predicate cache: one entry per (pattern, element size) pair.
// NOTE: field order is significant - InitPredicateCached() brace-initializes this
// aggregate as {Pattern, Size}.
struct PredicateKey {
// Predicate pattern requested by the caller.
ARMEmitter::PredicatePattern Pattern;
// Element size requested by the caller.
OpSize Size;
// Memberwise equality; two keys match only when both pattern and size match.
bool operator==(const PredicateKey& rhs) const = default;
};

// Hash functor for PredicateKey, used by the InitPredicateCache map.
struct PredicateKeyHash {
  // Combines both key fields into a single value: the size is scaled by the
  // numeric value of OpSize::iInvalid and the pattern is added on top.
  size_t operator()(const PredicateKey& key) const {
    const auto PatternBits = FEXCore::ToUnderlying(key.Pattern);
    const auto SizeBits = FEXCore::ToUnderlying(key.Size);
    const auto SizeStride = FEXCore::ToUnderlying(OpSize::iInvalid);
    return PatternBits + (SizeBits * SizeStride);
  }
};
// Predicate nodes already emitted, keyed by (Pattern, Size). Filled on demand by
// InitPredicateCached() and emptied by ResetInitPredicateCache().
fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;

/**
 * @brief Returns the predicate node for the given size/pattern, emitting it only once.
 *
 * The first request for a (Size, Pattern) pair emits an _InitPredicate op and
 * remembers the resulting node in InitPredicateCache; later requests for the
 * same pair return the remembered node without emitting anything.
 *
 * @param Size    Element size forwarded to _InitPredicate.
 * @param Pattern Predicate pattern; its underlying value is narrowed to uint8_t
 *                before being forwarded to _InitPredicate.
 *
 * @return The cached or newly emitted predicate node.
 */
Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) {
  // A single try_emplace both looks the key up and reserves its slot, so a miss
  // hashes the key once instead of twice (find + operator[]).
  auto [Entry, Inserted] = InitPredicateCache.try_emplace(PredicateKey {Pattern, Size});

  // Bind the slot by reference before emitting: element references stay valid
  // even if the map were to rehash, whereas iterators would not.
  Ref& Cached = Entry->second;
  if (Inserted) {
    Cached = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern)));
  }
  return Cached;
}

// Forgets every cached predicate node, so the next InitPredicateCached() call for
// any (Size, Pattern) pair emits a fresh _InitPredicate op.
void ResetInitPredicateCache() {
InitPredicateCache.clear();
}

/**
* @name IR allocation routines
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -824,7 +824,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
Expand Down
24 changes: 20 additions & 4 deletions unittests/InstructionCountCI/X87ldst-SVE.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
},
"2-store 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 24,
"ExpectedInstructionCount": 25,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]"
Expand All @@ -56,6 +56,7 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand All @@ -68,7 +69,7 @@
},
"8-store 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 90,
"ExpectedInstructionCount": 97,
"x86Insts": [
"fstp tword [rax]",
"fstp tword [rax+10]",
Expand Down Expand Up @@ -96,6 +97,7 @@
"add x21, x4, #0xa (10)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -107,6 +109,7 @@
"add x21, x4, #0x14 (20)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -118,6 +121,7 @@
"add x21, x4, #0x1e (30)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -129,6 +133,7 @@
"add x21, x4, #0x28 (40)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -140,6 +145,7 @@
"add x21, x4, #0x32 (50)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -151,6 +157,7 @@
"add x21, x4, #0x3c (60)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w23, w22, w20",
Expand All @@ -162,6 +169,7 @@
"add x21, x4, #0x46 (70)",
"add x0, x28, x20, lsl #4",
"ldr q2, [x0, #1040]",
"ptrue p2.h, vl5",
"st1h {z2.h}, p2, [x21]",
"ldrb w21, [x28, #1298]",
"lsl w22, w22, w20",
Expand Down Expand Up @@ -193,7 +201,7 @@
},
"2-load 80bit": {
"x86InstructionCount": 2,
"ExpectedInstructionCount": 21,
"ExpectedInstructionCount": 22,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]"
Expand All @@ -202,6 +210,7 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x2 (2)",
Expand All @@ -224,7 +233,7 @@
},
"8-load 80bit": {
"x86InstructionCount": 8,
"ExpectedInstructionCount": 52,
"ExpectedInstructionCount": 59,
"x86Insts": [
"fld tword [rax]",
"fld tword [rax+10]",
Expand All @@ -239,18 +248,25 @@
"ptrue p2.h, vl5",
"ld1h {z2.h}, p2/z, [x4]",
"add x20, x4, #0xa (10)",
"ptrue p2.h, vl5",
"ld1h {z3.h}, p2/z, [x20]",
"add x20, x4, #0x14 (20)",
"ptrue p2.h, vl5",
"ld1h {z4.h}, p2/z, [x20]",
"add x20, x4, #0x1e (30)",
"ptrue p2.h, vl5",
"ld1h {z5.h}, p2/z, [x20]",
"add x20, x4, #0x28 (40)",
"ptrue p2.h, vl5",
"ld1h {z6.h}, p2/z, [x20]",
"add x20, x4, #0x32 (50)",
"ptrue p2.h, vl5",
"ld1h {z7.h}, p2/z, [x20]",
"add x20, x4, #0x3c (60)",
"ptrue p2.h, vl5",
"ld1h {z8.h}, p2/z, [x20]",
"add x20, x4, #0x46 (70)",
"ptrue p2.h, vl5",
"ld1h {z9.h}, p2/z, [x20]",
"ldrb w20, [x28, #1019]",
"sub w20, w20, #0x8 (8)",
Expand Down

0 comments on commit 8cfc016

Please sign in to comment.