-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Enable inlining for late devirtualization #110827
base: main
Are you sure you want to change the base?
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
6173a81
to
e287f69
Compare
e287f69
to
686f64e
Compare
cc @AndyAyersMS |
Skimmed the changes and it seems promising. I'll take a deeper look soon. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this have any benefit for PGO, or is it just non-PGO?
It's mainly for non-PGO, but with PGO I'm also seeing some cases where late-devirtualized calls can be inlined earlier. A test case: using System.Runtime.CompilerServices;
// Repro driver: Main calls GetDistance once and discards the result.
class Program
{
static void Main()
{
_ = GetDistance();
}
// Marked NoInlining so GetDistance is not inlined into Main.
[MethodImpl(MethodImplOptions.NoInlining)]
static float GetDistance()
{
// All three locals are typed as the IVector interface, while GetVector constructs a Vector struct,
// so the Sub, X and Y accesses below go through the interface.
IVector p1 = Vector.GetVector(4, 2);
IVector p2 = Vector.GetVector(1, 6);
// dir = p2 - p1 = (1 - 4, 6 - 2) = (-3, 4).
IVector dir = p2.Sub(p1);
// Euclidean length of dir; for the constants above this is sqrt(9 + 16) = 5.
float dist = MathF.Sqrt(dir.X * dir.X + dir.Y * dir.Y);
return dist;
}
}
// Two-component float vector. It is a struct implementing IVector, and both methods below
// return it as IVector rather than as Vector.
struct Vector : IVector
{
public float X { get; set; }
public float Y { get; set; }
// Component-wise difference (this - other), built through GetVector and returned as IVector.
// 'other' is read only through the IVector interface (other.X, other.Y).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly IVector Sub(IVector other) => GetVector(X - other.X, Y - other.Y);
// Factory: creates a Vector with the given components and returns it as IVector.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static IVector GetVector(float x, float y) => new Vector { X = x, Y = y };
}
// Abstraction used by the repro: every vector access in GetDistance goes through this interface.
interface IVector
{
// Takes another IVector and returns an IVector; the exact semantics are defined by the implementing type.
IVector Sub(IVector other);
// Readable and writable float components.
float X { get; set; }
float Y { get; set; }
} With GDV we managed to devirtualize and inline all the callees, but the test branches prevent the value-class boxes from being stack allocated: ; Assembly listing for method Program:GetDistance():float (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier1 code
; optimized code
; rsp based frame
; partially interruptible
; 1 inlinees with PGO data; 17 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 loc0 [V00,T06] ( 3, 3 ) ref -> rsi class-hnd exact single-def <Vector>
; V01 loc1 [V01,T07] ( 3, 3 ) ref -> rax class-hnd <IVector>
; V02 OutArgs [V02 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V03 tmp1 [V03,T05] ( 2, 4 ) ref -> rax class-hnd exact single-def "spilling ret_expr" <Vector>
; V04 tmp2 [V04,T08] ( 2, 4 ) float -> mm1 "impAppendStmt"
; V05 tmp3 [V05,T09] ( 2, 4 ) float -> mm0 "impAppendStmt"
; V06 tmp4 [V06,T10] ( 2, 4 ) float -> mm2 "impAppendStmt"
;* V07 tmp5 [V07 ] ( 0, 0 ) ref -> zero-ref "guarded devirt return temp"
;* V08 tmp6 [V08 ] ( 0, 0 ) ref -> zero-ref class-hnd exact "guarded devirt this exact temp" <Vector>
;* V09 tmp7 [V09 ] ( 0, 0 ) ref -> zero-ref class-hnd exact "guarded devirt this exact temp" <Vector>
;* V10 tmp8 [V10 ] ( 0, 0 ) float -> zero-ref "guarded devirt return temp"
;* V11 tmp9 [V11 ] ( 0, 0 ) ref -> zero-ref class-hnd exact "guarded devirt this exact temp" <Vector>
;* V12 tmp10 [V12 ] ( 0, 0 ) ref -> zero-ref class-hnd exact "guarded devirt this exact temp" <Vector>
; V13 tmp11 [V13,T15] ( 2, 2 ) float -> mm1 "guarded devirt return temp"
;* V14 tmp12 [V14 ] ( 0, 0 ) ref -> zero-ref class-hnd exact "guarded devirt this exact temp" <Vector>
;* V15 tmp13 [V15 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
; V16 tmp14 [V16,T01] ( 3, 6 ) ref -> rsi class-hnd exact single-def "Single-def Box Helper" <Vector>
;* V17 tmp15 [V17 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
; V18 tmp16 [V18,T02] ( 3, 6 ) ref -> rax class-hnd exact single-def "Single-def Box Helper" <Vector>
; V19 tmp17 [V19,T03] ( 3, 6 ) byref -> rax single-def "Inlining Arg"
;* V20 tmp18 [V20 ] ( 0, 0 ) float -> zero-ref "impAppendStmt"
; V21 tmp19 [V21,T11] ( 2, 4 ) float -> mm0 "impAppendStmt"
;* V22 tmp20 [V22 ] ( 0, 0 ) float -> zero-ref "impAppendStmt"
;* V23 tmp21 [V23 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V24 tmp22 [V24 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V25 tmp23 [V25 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
; V26 tmp24 [V26,T12] ( 2, 4 ) float -> mm1 "Inlining Arg"
; V27 tmp25 [V27,T00] ( 4, 8 ) ref -> rax class-hnd exact single-def "Single-def Box Helper" <Vector>
;* V28 tmp26 [V28 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V29 tmp27 [V29 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V30 tmp28 [V30 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V31 tmp29 [V31 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V32 tmp30 [V32,T18] ( 0, 0 ) float -> zero-ref single-def "field V15.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V33 tmp31 [V33,T19] ( 0, 0 ) float -> zero-ref single-def "field V15.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
;* V34 tmp32 [V34,T20] ( 0, 0 ) float -> zero-ref single-def "field V17.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V35 tmp33 [V35,T21] ( 0, 0 ) float -> zero-ref single-def "field V17.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
; V36 tmp34 [V36,T16] ( 2, 2 ) float -> [rsp+0x24] spill-single-def "field V25.<X>k__BackingField (fldOffset=0x0)" P-INDEP
; V37 tmp35 [V37,T17] ( 2, 2 ) float -> [rsp+0x20] spill-single-def "field V25.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
; V38 cse0 [V38,T13] ( 3, 3 ) float -> mm0 "CSE #03: moderate"
; V39 cse1 [V39,T14] ( 3, 3 ) float -> mm1 "CSE #04: moderate"
; V40 cse2 [V40,T04] ( 4, 4 ) long -> rbx "CSE #01: aggressive"
;
; Lcl frame size = 40
G_M12138_IG01: ;; offset=0x0000
push rsi
push rbx
sub rsp, 40
;; size=6 bbWeight=1 PerfScore 2.25
G_M12138_IG02: ;; offset=0x0006
mov rbx, 0x7FFC3439EA88 ; Vector
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov rsi, rax
mov rcx, 0x4000000040800000
mov qword ptr [rsi+0x08], rcx
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov rcx, 0x40C000003F800000
mov qword ptr [rax+0x08], rcx
add rax, 8
vmovss xmm0, dword ptr [rax]
vsubss xmm0, xmm0, dword ptr [rsi+0x08]
vmovss xmm1, dword ptr [rax+0x04]
vsubss xmm1, xmm1, dword ptr [rsi+0x0C]
vmovss dword ptr [rsp+0x24], xmm0
vmovss dword ptr [rsp+0x20], xmm1
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
vmovss xmm0, dword ptr [rsp+0x24]
vmovss dword ptr [rax+0x08], xmm0
vmovss xmm1, dword ptr [rsp+0x20]
vmovss dword ptr [rax+0x0C], xmm1
vmovss xmm0, dword ptr [rax+0x08]
vmovaps xmm1, xmm0
vmulss xmm0, xmm1, xmm0
vmovss xmm1, dword ptr [rax+0x0C]
vmovaps xmm2, xmm1
vmulss xmm1, xmm2, xmm1
vaddss xmm0, xmm1, xmm0
vsqrtss xmm0, xmm0, xmm0
;; size=156 bbWeight=1 PerfScore 67.50
G_M12138_IG03: ;; offset=0x00A2
add rsp, 40
pop rbx
pop rsi
ret
;; size=7 bbWeight=1 PerfScore 2.25
; Total bytes of code 169, prolog size 6, PerfScore 72.00, instruction count 38, allocated bytes for code 169 (MethodHash=60b0d095) for method Program:GetDistance():float (Tier1)
; ============================================================ Whereas without GDV we can now fold the entire method into a constant: ; Assembly listing for method Program:GetDistance():float (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 18 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;* V00 loc0 [V00 ] ( 0, 0 ) long -> zero-ref class-hnd exact <Vector>
;* V01 loc1 [V01 ] ( 0, 0 ) long -> zero-ref class-hnd exact <Vector>
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V03 tmp1 [V03 ] ( 0, 0 ) float -> zero-ref "non-inline candidate call"
;* V04 tmp2 [V04 ] ( 0, 0 ) float -> zero-ref "non-inline candidate call"
;* V05 tmp3 [V05 ] ( 0, 0 ) float -> zero-ref "non-inline candidate call"
;* V06 tmp4 [V06 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
;* V07 tmp5 [V07 ] ( 0, 0 ) long -> zero-ref class-hnd exact "Single-def Box Helper" <Vector>
;* V08 tmp6 [V08 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
;* V09 tmp7 [V09 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op class-hnd exact "Single-def Box Helper" <Vector>
;* V10 tmp8 [V10 ] ( 0, 0 ) float -> zero-ref "impAppendStmt"
;* V11 tmp9 [V11 ] ( 0, 0 ) float -> zero-ref "impAppendStmt"
;* V12 tmp10 [V12 ] ( 0, 0 ) float -> zero-ref "impAppendStmt"
;* V13 tmp11 [V13 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V14 tmp12 [V14 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V15 tmp13 [V15 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <Vector>
;* V16 tmp14 [V16 ] ( 0, 0 ) float -> zero-ref "Inlining Arg"
;* V17 tmp15 [V17 ] ( 0, 0 ) long -> zero-ref class-hnd exact "Single-def Box Helper" <Vector>
;* V18 tmp16 [V18 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V19 tmp17 [V19 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V20 tmp18 [V20 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V21 tmp19 [V21 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
;* V22 tmp20 [V22 ] ( 0, 0 ) struct (16) zero-ref do-not-enreg[SF] "stack allocated boxed value class temp" <System.Runtime.CompilerServices.StackAllocatedBox`1[Vector]>
;* V23 tmp21 [V23 ] ( 0, 0 ) struct (16) zero-ref do-not-enreg[SF] "stack allocated boxed value class temp" <System.Runtime.CompilerServices.StackAllocatedBox`1[Vector]>
;* V24 tmp22 [V24 ] ( 0, 0 ) float -> zero-ref single-def "field V06.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V25 tmp23 [V25 ] ( 0, 0 ) float -> zero-ref single-def "field V06.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
;* V26 tmp24 [V26 ] ( 0, 0 ) float -> zero-ref single-def "field V08.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V27 tmp25 [V27 ] ( 0, 0 ) float -> zero-ref single-def "field V08.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
;* V28 tmp26 [V28 ] ( 0, 0 ) float -> zero-ref single-def "field V09.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V29 tmp27 [V29 ] ( 0, 0 ) float -> zero-ref single-def "field V09.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
;* V30 tmp28 [V30 ] ( 0, 0 ) float -> zero-ref single-def "field V15.<X>k__BackingField (fldOffset=0x0)" P-INDEP
;* V31 tmp29 [V31 ] ( 0, 0 ) float -> zero-ref single-def "field V15.<Y>k__BackingField (fldOffset=0x4)" P-INDEP
;* V32 tmp30 [V32 ] ( 0, 0 ) long -> zero-ref single-def "V22.[000..008)"
;* V33 tmp31 [V33 ] ( 0, 0 ) float -> zero-ref single-def "V22.[008..012)"
;* V34 tmp32 [V34 ] ( 0, 0 ) float -> zero-ref single-def "V22.[012..016)"
;* V35 tmp33 [V35 ] ( 0, 0 ) long -> zero-ref single-def "V23.[000..008)"
;* V36 tmp34 [V36 ] ( 0, 0 ) float -> zero-ref single-def "V23.[008..012)"
;* V37 tmp35 [V37 ] ( 0, 0 ) float -> zero-ref single-def "V23.[012..016)"
;
; Lcl frame size = 0
G_M12138_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M12138_IG02: ;; offset=0x0000
vmovss xmm0, dword ptr [reloc @RWD00]
;; size=8 bbWeight=1 PerfScore 3.00
G_M12138_IG03: ;; offset=0x0008
ret
;; size=1 bbWeight=1 PerfScore 1.00
RWD00 dd 40A00000h ; 5
; Total bytes of code 9, prolog size 0, PerfScore 4.00, instruction count 2, allocated bytes for code 9 (MethodHash=60b0d095) for method Program:GetDistance():float (FullOpts)
; ============================================================ |
295ac12
to
3b5b446
Compare
@MihuBot |
If we see a new inline candidate after late devirtualization, we can try marking and inlining it.
This unblocks inlining and stack allocation of arbitrary ref-class enumerators (unless the inliner considers `GetEnumerator` non-profitable). We might need to tune the inliner heuristics later to get more profitable inlining opportunities. There are some really nice diffs: https://gist.github.com/MihuBot/29f7c64533ac1f38494fbfab361ab505
/cc: @AndyAyersMS