From 8f4fd3f34e8e218cb90435b5a8c6ba9be23a1e1e Mon Sep 17 00:00:00 2001 From: Zheng Xu Date: Wed, 29 Aug 2018 14:55:03 +0800 Subject: [PATCH] build: support frame-pointer for arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supporting frame-pointer makes Linux's perf and other profilers much more useful because it lets them gather a stack trace efficiently on profiling events. Major changes include: 1. save FP on the word below where RSP is pointing to (proposed by Cherry and Austin) 2. adjust some specific offsets in runtime assembly and wrapper code 3. add support to FP in goroutine scheduler 4. adjust link stack overflow check to take the extra word into account 5. adjust nosplit test cases to enable frame sizes which are 16 bytes aligned Performance impacts on go1 benchmarks: Enable frame-pointer (by default) name old time/op new time/op delta BinaryTree17-46 5.94s ± 0% 6.00s ± 0% +1.03% (p=0.029 n=4+4) Fannkuch11-46 2.84s ± 1% 2.77s ± 0% -2.58% (p=0.008 n=5+5) FmtFprintfEmpty-46 55.0ns ± 1% 58.9ns ± 1% +7.06% (p=0.008 n=5+5) FmtFprintfString-46 102ns ± 0% 105ns ± 0% +2.94% (p=0.008 n=5+5) FmtFprintfInt-46 118ns ± 0% 117ns ± 1% -1.19% (p=0.000 n=4+5) FmtFprintfIntInt-46 181ns ± 0% 182ns ± 1% ~ (p=0.444 n=5+5) FmtFprintfPrefixedInt-46 215ns ± 1% 214ns ± 0% ~ (p=0.254 n=5+4) FmtFprintfFloat-46 292ns ± 0% 296ns ± 0% +1.46% (p=0.029 n=4+4) FmtManyArgs-46 720ns ± 0% 732ns ± 0% +1.72% (p=0.008 n=5+5) GobDecode-46 9.82ms ± 1% 10.03ms ± 2% +2.10% (p=0.008 n=5+5) GobEncode-46 8.14ms ± 0% 8.72ms ± 1% +7.14% (p=0.008 n=5+5) Gzip-46 420ms ± 0% 424ms ± 0% +0.92% (p=0.008 n=5+5) Gunzip-46 48.2ms ± 0% 48.4ms ± 0% +0.41% (p=0.008 n=5+5) HTTPClientServer-46 201µs ± 4% 201µs ± 0% ~ (p=0.730 n=5+4) JSONEncode-46 17.1ms ± 0% 17.7ms ± 1% +3.80% (p=0.008 n=5+5) JSONDecode-46 88.0ms ± 0% 90.1ms ± 0% +2.42% (p=0.008 n=5+5) Mandelbrot200-46 5.06ms ± 0% 5.07ms ± 0% ~ (p=0.310 n=5+5) GoParse-46 5.04ms ± 0% 5.12ms ± 0% +1.53% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 117ns ± 0% 117ns ± 0% ~ (all equal) RegexpMatchEasy0_1K-46 332ns ± 0% 329ns ± 0% -0.78% (p=0.008 n=5+5) RegexpMatchEasy1_32-46 104ns ± 0% 113ns ± 0% +8.65% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 563ns ± 0% 569ns ± 0% +1.10% (p=0.008 n=5+5) RegexpMatchMedium_32-46 167ns ± 2% 177ns ± 1% +5.74% (p=0.008 n=5+5) RegexpMatchMedium_1K-46 49.5µs ± 0% 53.4µs ± 0% +7.81% (p=0.008 n=5+5) RegexpMatchHard_32-46 2.56µs ± 1% 2.72µs ± 0% +6.01% (p=0.008 n=5+5) RegexpMatchHard_1K-46 77.0µs ± 0% 81.8µs ± 0% +6.24% (p=0.016 n=5+4) Revcomp-46 631ms ± 1% 627ms ± 1% ~ (p=0.095 n=5+5) Template-46 81.8ms ± 0% 86.3ms ± 0% +5.55% (p=0.008 n=5+5) TimeParse-46 423ns ± 0% 432ns ± 0% +2.32% (p=0.008 n=5+5) TimeFormat-46 478ns ± 2% 497ns ± 1% +3.89% (p=0.008 n=5+5) [Geo mean] 71.6µs 73.3µs +2.45% name old speed new speed delta GobDecode-46 78.1MB/s ± 1% 76.6MB/s ± 2% -2.04% (p=0.008 n=5+5) GobEncode-46 94.3MB/s ± 0% 88.0MB/s ± 1% -6.67% (p=0.008 n=5+5) Gzip-46 46.2MB/s ± 0% 45.8MB/s ± 0% -0.91% (p=0.008 n=5+5) Gunzip-46 403MB/s ± 0% 401MB/s ± 0% -0.41% (p=0.008 n=5+5) JSONEncode-46 114MB/s ± 0% 109MB/s ± 1% -3.66% (p=0.008 n=5+5) JSONDecode-46 22.0MB/s ± 0% 21.5MB/s ± 0% -2.35% (p=0.008 n=5+5) GoParse-46 11.5MB/s ± 0% 11.3MB/s ± 0% -1.51% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 272MB/s ± 0% 272MB/s ± 1% ~ (p=0.190 n=4+5) RegexpMatchEasy0_1K-46 3.08GB/s ± 0% 3.11GB/s ± 0% +0.77% (p=0.008 n=5+5) RegexpMatchEasy1_32-46 306MB/s ± 0% 283MB/s ± 0% -7.63% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 1.82GB/s ± 0% 1.80GB/s ± 0% -1.07% (p=0.008 n=5+5) RegexpMatchMedium_32-46 5.99MB/s ± 0% 5.64MB/s ± 1% -5.77% (p=0.016 n=4+5) RegexpMatchMedium_1K-46 20.7MB/s ± 0% 19.2MB/s ± 0% -7.25% (p=0.008 n=5+5) RegexpMatchHard_32-46 12.5MB/s ± 1% 11.8MB/s ± 0% -5.66% (p=0.008 n=5+5) RegexpMatchHard_1K-46 13.3MB/s ± 0% 12.5MB/s ± 1% -6.01% (p=0.008 n=5+5) Revcomp-46 402MB/s ± 1% 405MB/s ± 1% ~ (p=0.095 n=5+5) Template-46 23.7MB/s ± 0% 22.5MB/s ± 0% -5.25% (p=0.008 n=5+5) [Geo mean] 82.2MB/s 79.6MB/s -3.26% Disable frame-pointer (GOEXPERIMENT=noframepointer) name old time/op new time/op delta BinaryTree17-46 5.94s ± 0% 5.96s ± 0% +0.39% (p=0.029 n=4+4) Fannkuch11-46 2.84s ± 1% 2.79s ± 1% -1.68% (p=0.008 n=5+5) FmtFprintfEmpty-46 55.0ns ± 1% 55.2ns ± 3% ~ (p=0.794 n=5+5) FmtFprintfString-46 102ns ± 0% 103ns ± 0% +0.98% (p=0.016 n=5+4) FmtFprintfInt-46 118ns ± 0% 115ns ± 0% -2.54% (p=0.029 n=4+4) FmtFprintfIntInt-46 181ns ± 0% 179ns ± 0% -1.10% (p=0.000 n=5+4) FmtFprintfPrefixedInt-46 215ns ± 1% 213ns ± 0% ~ (p=0.143 n=5+4) FmtFprintfFloat-46 292ns ± 0% 300ns ± 0% +2.83% (p=0.029 n=4+4) FmtManyArgs-46 720ns ± 0% 739ns ± 0% +2.64% (p=0.008 n=5+5) GobDecode-46 9.82ms ± 1% 9.78ms ± 1% ~ (p=0.151 n=5+5) GobEncode-46 8.14ms ± 0% 8.12ms ± 1% ~ (p=0.690 n=5+5) Gzip-46 420ms ± 0% 420ms ± 0% ~ (p=0.548 n=5+5) Gunzip-46 48.2ms ± 0% 48.0ms ± 0% -0.33% (p=0.032 n=5+5) HTTPClientServer-46 201µs ± 4% 199µs ± 3% ~ (p=0.548 n=5+5) JSONEncode-46 17.1ms ± 0% 17.2ms ± 0% ~ (p=0.056 n=5+5) JSONDecode-46 88.0ms ± 0% 88.6ms ± 0% +0.64% (p=0.008 n=5+5) Mandelbrot200-46 5.06ms ± 0% 5.07ms ± 0% ~ (p=0.548 n=5+5) GoParse-46 5.04ms ± 0% 5.07ms ± 0% +0.65% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 117ns ± 0% 112ns ± 4% -4.27% (p=0.016 n=4+5) RegexpMatchEasy0_1K-46 332ns ± 0% 330ns ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-46 104ns ± 0% 110ns ± 1% +5.29% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 563ns ± 0% 567ns ± 2% ~ (p=0.151 n=5+5) RegexpMatchMedium_32-46 167ns ± 2% 166ns ± 0% ~ (p=0.333 n=5+4) RegexpMatchMedium_1K-46 49.5µs ± 0% 49.6µs ± 0% ~ (p=0.841 n=5+5) RegexpMatchHard_32-46 2.56µs ± 1% 2.49µs ± 0% -2.81% (p=0.008 n=5+5) RegexpMatchHard_1K-46 77.0µs ± 0% 75.8µs ± 0% -1.55% (p=0.008 n=5+5) Revcomp-46 631ms ± 1% 628ms ± 0% ~ (p=0.095 n=5+5) Template-46 81.8ms ± 0% 84.3ms ± 1% +3.05% (p=0.008 n=5+5) TimeParse-46 423ns ± 0% 425ns ± 0% +0.52% (p=0.008 n=5+5) TimeFormat-46 478ns ± 2% 478ns ± 1% ~ (p=1.000 n=5+5) [Geo mean] 71.6µs 71.6µs -0.01% name old speed new speed delta GobDecode-46 78.1MB/s ± 1% 78.5MB/s ± 1% ~ (p=0.151 n=5+5) GobEncode-46 94.3MB/s ± 0% 94.5MB/s ± 1% ~ (p=0.690 n=5+5) Gzip-46 46.2MB/s ± 0% 46.2MB/s ± 0% ~ (p=0.571 n=5+5) Gunzip-46 403MB/s ± 0% 404MB/s ± 0% +0.33% (p=0.032 n=5+5) JSONEncode-46 114MB/s ± 0% 113MB/s ± 0% ~ (p=0.056 n=5+5) JSONDecode-46 22.0MB/s ± 0% 21.9MB/s ± 0% -0.64% (p=0.008 n=5+5) GoParse-46 11.5MB/s ± 0% 11.4MB/s ± 0% -0.64% (p=0.008 n=5+5) RegexpMatchEasy0_32-46 272MB/s ± 0% 285MB/s ± 4% +4.74% (p=0.016 n=4+5) RegexpMatchEasy0_1K-46 3.08GB/s ± 0% 3.10GB/s ± 1% ~ (p=0.151 n=5+5) RegexpMatchEasy1_32-46 306MB/s ± 0% 290MB/s ± 1% -5.21% (p=0.029 n=4+4) RegexpMatchEasy1_1K-46 1.82GB/s ± 0% 1.81GB/s ± 2% ~ (p=0.151 n=5+5) RegexpMatchMedium_32-46 5.99MB/s ± 0% 6.02MB/s ± 1% ~ (p=0.063 n=4+5) RegexpMatchMedium_1K-46 20.7MB/s ± 0% 20.7MB/s ± 0% ~ (p=0.659 n=5+5) RegexpMatchHard_32-46 12.5MB/s ± 1% 12.8MB/s ± 0% +2.88% (p=0.008 n=5+5) RegexpMatchHard_1K-46 13.3MB/s ± 0% 13.5MB/s ± 0% +1.58% (p=0.008 n=5+5) Revcomp-46 402MB/s ± 1% 405MB/s ± 0% ~ (p=0.095 n=5+5) Template-46 23.7MB/s ± 0% 23.0MB/s ± 1% -2.95% (p=0.008 n=5+5) [Geo mean] 82.2MB/s 82.3MB/s +0.04% Frame-pointer is enabled on Linux by default but can be disabled by setting: GOEXPERIMENT=noframepointer. Fixes #10110 Change-Id: I1bfaca6dba29a63009d7c6ab04ed7a1413d9479e Reviewed-on: https://go-review.googlesource.com/61511 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- src/cmd/compile/internal/arm64/ggen.go | 8 +- src/cmd/compile/internal/gc/pgen.go | 6 +- src/cmd/internal/obj/arm64/asm7.go | 10 +- src/cmd/internal/obj/arm64/obj7.go | 183 ++++++++++++++++++++++--- src/cmd/internal/objabi/util.go | 2 +- src/cmd/link/internal/ld/lib.go | 4 + src/runtime/asm_arm64.s | 34 +++-- src/runtime/cgocall.go | 3 +- src/runtime/rt0_darwin_arm64.s | 2 + src/runtime/rt0_linux_arm64.s | 2 + src/runtime/sys_linux_arm64.s | 8 +- src/runtime/traceback.go | 2 +- test/codegen/stack.go | 10 +- test/nosplit.go | 26 ++-- 14 files changed, 243 insertions(+), 57 deletions(-) diff --git a/src/cmd/compile/internal/arm64/ggen.go b/src/cmd/compile/internal/arm64/ggen.go index f7b3851398f51c..204391fef1c778 100644 --- a/src/cmd/compile/internal/arm64/ggen.go +++ b/src/cmd/compile/internal/arm64/ggen.go @@ -14,10 +14,10 @@ import ( var darwin = objabi.GOOS == "darwin" func padframe(frame int64) int64 { - // arm64 requires that the frame size (not counting saved LR) - // be empty or be 8 mod 16. If not, pad it. - if frame != 0 && frame%16 != 8 { - frame += 8 + // arm64 requires that the frame size (not counting saved FP&LR) + // be 16 bytes aligned. If not, pad it. + if frame%16 != 0 { + frame += 16 - (frame % 16) } return frame } diff --git a/src/cmd/compile/internal/gc/pgen.go b/src/cmd/compile/internal/gc/pgen.go index 7f20643ab57feb..563eb9e966388b 100644 --- a/src/cmd/compile/internal/gc/pgen.go +++ b/src/cmd/compile/internal/gc/pgen.go @@ -427,7 +427,8 @@ func createSimpleVars(automDecls []*Node) ([]*Node, []*dwarf.Var, map[*Node]bool if Ctxt.FixedFrameSize() == 0 { offs -= int64(Widthptr) } - if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) || objabi.GOARCH == "arm64" { + // There is a word space for FP on ARM64 even if the frame pointer is disabled offs -= int64(Widthptr) } @@ -607,7 +608,8 @@ func stackOffset(slot ssa.LocalSlot) int32 { if Ctxt.FixedFrameSize() == 0 { base -= int64(Widthptr) } - if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) || objabi.GOARCH == "arm64" { + // There is a word space for FP on ARM64 even if the frame pointer is disabled base -= int64(Widthptr) } case PPARAM, PPARAMOUT: diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index ad4f1725446712..2abb8c2c773d93 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -49,6 +49,7 @@ type ctxt7 struct { blitrl *obj.Prog elitrl *obj.Prog autosize int32 + extrasize int32 instoffset int64 pc int64 pool struct { @@ -777,7 +778,8 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { ctxt.Diag("arm64 ops not initialized, call arm64.buildop first") } - c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset&0xffffffff) + 8} + c := ctxt7{ctxt: ctxt, newprog: newprog, cursym: cursym, autosize: int32(p.To.Offset & 0xffffffff), extrasize: int32(p.To.Offset >> 32)} + p.To.Offset &= 0xffffffff // extrasize is no longer needed bflag := 1 pc := int64(0) @@ -1436,7 +1438,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. a.Reg = obj.REG_NONE } - c.instoffset = int64(c.autosize) + a.Offset + // The frame top 8 or 16 bytes are for FP + c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) return autoclass(c.instoffset) case obj.NAME_PARAM: @@ -1536,7 +1539,8 @@ func (c *ctxt7) aclass(a *obj.Addr) int { // a.Offset is still relative to pseudo-SP. a.Reg = obj.REG_NONE } - c.instoffset = int64(c.autosize) + a.Offset + // The frame top 8 or 16 bytes are for FP + c.instoffset = int64(c.autosize) + a.Offset - int64(c.extrasize) case obj.NAME_PARAM: if a.Reg == REGSP { diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go index 0d832387d7f7dd..97b8f70c9b20e3 100644 --- a/src/cmd/internal/obj/arm64/obj7.go +++ b/src/cmd/internal/obj/arm64/obj7.go @@ -542,22 +542,28 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { c.autosize += 8 } - if c.autosize != 0 && c.autosize&(16-1) != 0 { - // The frame includes an LR. - // If the frame size is 8, it's only an LR, - // so there's no potential for breaking references to - // local variables by growing the frame size, - // because there are no local variables. - // But otherwise, if there is a non-empty locals section, - // the author of the code is responsible for making sure - // that the frame size is 8 mod 16. - if c.autosize == 8 { - c.autosize += 8 - c.cursym.Func.Locals += 8 + if c.autosize != 0 { + extrasize := int32(0) + if c.autosize%16 == 8 { + // Allocate extra 8 bytes on the frame top to save FP + extrasize = 8 + } else if c.autosize&(16-1) == 0 { + // Allocate extra 16 bytes to save FP for the old frame whose size is 8 mod 16 + extrasize = 16 } else { - c.ctxt.Diag("%v: unaligned frame size %d - must be 8 mod 16 (or 0)", p, c.autosize-8) + c.ctxt.Diag("%v: unaligned frame size %d - must be 16 aligned", p, c.autosize-8) } + c.autosize += extrasize + c.cursym.Func.Locals += extrasize + + // low 32 bits for autosize + // high 32 bits for extrasize + p.To.Offset = int64(c.autosize) | int64(extrasize)<<32 + } else { + // NOFRAME + p.To.Offset = 0 } + if c.autosize == 0 && c.cursym.Func.Text.Mark&LEAF == 0 { if c.ctxt.Debugvlog { c.ctxt.Logf("save suppressed in: %s\n", c.cursym.Func.Text.From.Sym.Name) @@ -565,9 +571,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { c.cursym.Func.Text.Mark |= LEAF } - // FP offsets need an updated p.To.Offset. - p.To.Offset = int64(c.autosize) - 8 - if cursym.Func.Text.Mark&LEAF != 0 { cursym.Set(obj.AttrLeaf, true) if p.From.Sym.NoFrame() { @@ -631,6 +634,26 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { q1.Spadj = aoffset } + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = AMOVD + q1.From.Type = obj.TYPE_REG + q1.From.Reg = REGFP + q1.To.Type = obj.TYPE_MEM + q1.To.Reg = REGSP + q1.To.Offset = -8 + + q1 = obj.Appendp(q1, c.newprog) + q1.Pos = p.Pos + q1.As = ASUB + q1.From.Type = obj.TYPE_CONST + q1.From.Offset = 8 + q1.Reg = REGSP + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REGFP + } + if c.cursym.Func.Text.From.Sym.Wrapper() { // if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame // @@ -753,9 +776,30 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.To.Type = obj.TYPE_REG p.To.Reg = REGSP p.Spadj = -c.autosize + + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + p = obj.Appendp(p, c.newprog) + p.As = ASUB + p.From.Type = obj.TYPE_CONST + p.From.Offset = 8 + p.Reg = REGSP + p.To.Type = obj.TYPE_REG + p.To.Reg = REGFP + } } } else { /* want write-back pre-indexed SP+autosize -> SP, loading REGLINK*/ + + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + p.As = AMOVD + p.From.Type = obj.TYPE_MEM + p.From.Reg = REGSP + p.From.Offset = -8 + p.To.Type = obj.TYPE_REG + p.To.Reg = REGFP + p = obj.Appendp(p, c.newprog) + } + aoffset := c.autosize if aoffset > 0xF0 { @@ -814,7 +858,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.Spadj = int32(+p.From.Offset) } } - break case obj.AGETCALLERPC: if cursym.Leaf() { @@ -828,6 +871,112 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { p.From.Type = obj.TYPE_MEM p.From.Reg = REGSP } + + case obj.ADUFFCOPY: + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + // ADR ret_addr, R27 + // STP (FP, R27), -24(SP) + // SUB 24, SP, FP + // DUFFCOPY + // ret_addr: + // SUB 8, SP, FP + + q1 := p + // copy DUFFCOPY from q1 to q4 + q4 := obj.Appendp(p, c.newprog) + q4.Pos = p.Pos + q4.As = obj.ADUFFCOPY + q4.To = p.To + + q1.As = AADR + q1.From.Type = obj.TYPE_BRANCH + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REG_R27 + + q2 := obj.Appendp(q1, c.newprog) + q2.Pos = p.Pos + q2.As = ASTP + q2.From.Type = obj.TYPE_REGREG + q2.From.Reg = REGFP + q2.From.Offset = int64(REG_R27) + q2.To.Type = obj.TYPE_MEM + q2.To.Reg = REGSP + q2.To.Offset = -24 + + // maintaine FP for DUFFCOPY + q3 := obj.Appendp(q2, c.newprog) + q3.Pos = p.Pos + q3.As = ASUB + q3.From.Type = obj.TYPE_CONST + q3.From.Offset = 24 + q3.Reg = REGSP + q3.To.Type = obj.TYPE_REG + q3.To.Reg = REGFP + + q5 := obj.Appendp(q4, c.newprog) + q5.Pos = p.Pos + q5.As = ASUB + q5.From.Type = obj.TYPE_CONST + q5.From.Offset = 8 + q5.Reg = REGSP + q5.To.Type = obj.TYPE_REG + q5.To.Reg = REGFP + q1.Pcond = q5 + p = q5 + } + + case obj.ADUFFZERO: + if objabi.Framepointer_enabled(objabi.GOOS, objabi.GOARCH) { + // ADR ret_addr, R27 + // STP (FP, R27), -24(SP) + // SUB 24, SP, FP + // DUFFZERO + // ret_addr: + // SUB 8, SP, FP + + q1 := p + // copy DUFFZERO from q1 to q4 + q4 := obj.Appendp(p, c.newprog) + q4.Pos = p.Pos + q4.As = obj.ADUFFZERO + q4.To = p.To + + q1.As = AADR + q1.From.Type = obj.TYPE_BRANCH + q1.To.Type = obj.TYPE_REG + q1.To.Reg = REG_R27 + + q2 := obj.Appendp(q1, c.newprog) + q2.Pos = p.Pos + q2.As = ASTP + q2.From.Type = obj.TYPE_REGREG + q2.From.Reg = REGFP + q2.From.Offset = int64(REG_R27) + q2.To.Type = obj.TYPE_MEM + q2.To.Reg = REGSP + q2.To.Offset = -24 + + // maintaine FP for DUFFZERO + q3 := obj.Appendp(q2, c.newprog) + q3.Pos = p.Pos + q3.As = ASUB + q3.From.Type = obj.TYPE_CONST + q3.From.Offset = 24 + q3.Reg = REGSP + q3.To.Type = obj.TYPE_REG + q3.To.Reg = REGFP + + q5 := obj.Appendp(q4, c.newprog) + q5.Pos = p.Pos + q5.As = ASUB + q5.From.Type = obj.TYPE_CONST + q5.From.Offset = 8 + q5.Reg = REGSP + q5.To.Type = obj.TYPE_REG + q5.To.Reg = REGFP + q1.Pcond = q5 + p = q5 + } } } } diff --git a/src/cmd/internal/objabi/util.go b/src/cmd/internal/objabi/util.go index a47e2f93a10e9f..ffd1c04d39fcd0 100644 --- a/src/cmd/internal/objabi/util.go +++ b/src/cmd/internal/objabi/util.go @@ -76,7 +76,7 @@ func init() { } func Framepointer_enabled(goos, goarch string) bool { - return framepointer_enabled != 0 && goarch == "amd64" && goos != "nacl" + return framepointer_enabled != 0 && (goarch == "amd64" && goos != "nacl" || goarch == "arm64" && goos == "linux") } func addexp(s string) { diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go index 1b6d5d17047129..ba03cb707b84f7 100644 --- a/src/cmd/link/internal/ld/lib.go +++ b/src/cmd/link/internal/ld/lib.go @@ -1815,6 +1815,10 @@ func (ctxt *Link) dostkcheck() { ch.up = nil ch.limit = objabi.StackLimit - callsize(ctxt) + if objabi.GOARCH == "arm64" { + // need extra 8 bytes below SP to save FP + ch.limit -= 8 + } // Check every function, but do the nosplit functions in a first pass, // to make the printed failure chains as short as possible. diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index af389be9fee336..6a6a699241dc58 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -39,10 +39,9 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0 #endif MOVD $setg_gcc<>(SB), R1 // arg 1: setg MOVD g, R0 // arg 0: G + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R12) - MOVD _cgo_init(SB), R12 - CMP $0, R12 - BEQ nocgo + ADD $16, RSP nocgo: // update stackguard after _cgo_init @@ -107,6 +106,7 @@ TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8 MOVD buf+0(FP), R3 MOVD RSP, R0 MOVD R0, gobuf_sp(R3) + MOVD R29, gobuf_bp(R3) MOVD LR, gobuf_pc(R3) MOVD g, gobuf_g(R3) MOVD ZR, gobuf_lr(R3) @@ -128,10 +128,12 @@ TEXT runtime·gogo(SB), NOSPLIT, $24-8 MOVD 0(g), R4 // make sure g is not nil MOVD gobuf_sp(R5), R0 MOVD R0, RSP + MOVD gobuf_bp(R5), R29 MOVD gobuf_lr(R5), LR MOVD gobuf_ret(R5), R0 MOVD gobuf_ctxt(R5), R26 MOVD $0, gobuf_sp(R5) + MOVD $0, gobuf_bp(R5) MOVD $0, gobuf_ret(R5) MOVD $0, gobuf_lr(R5) MOVD $0, gobuf_ctxt(R5) @@ -147,6 +149,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 // Save caller state in g->sched MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD LR, (g_sched+gobuf_pc)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD g, (g_sched+gobuf_g)(g) @@ -163,6 +166,7 @@ TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8 MOVD 0(R26), R4 // code pointer MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP // sp = m->g0->sched.sp + MOVD (g_sched+gobuf_bp)(g), R29 MOVD R3, -8(RSP) MOVD $0, -16(RSP) SUB $16, RSP @@ -211,6 +215,7 @@ switch: MOVD R6, (g_sched+gobuf_pc)(g) MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD g, (g_sched+gobuf_g)(g) @@ -224,6 +229,7 @@ switch: MOVD $runtime·mstart(SB), R4 MOVD R4, 0(R3) MOVD R3, RSP + MOVD (g_sched+gobuf_bp)(g), R29 // call target function MOVD 0(R26), R3 // code pointer @@ -235,7 +241,9 @@ switch: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD $0, (g_sched+gobuf_sp)(g) + MOVD $0, (g_sched+gobuf_bp)(g) RET noswitch: @@ -244,6 +252,7 @@ noswitch: // at an intermediate systemstack. MOVD 0(R26), R3 // code pointer MOVD.P 16(RSP), R30 // restore LR + SUB $8, RSP, R29 // restore FP B (R3) /* @@ -278,6 +287,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 // Set g->sched to context in f MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD LR, (g_sched+gobuf_pc)(g) MOVD R3, (g_sched+gobuf_lr)(g) MOVD R26, (g_sched+gobuf_ctxt)(g) @@ -294,6 +304,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD.W $0, -16(RSP) // create a call frame on g0 (saved LR; keep 16-aligned) BL runtime·newstack(SB) @@ -843,8 +854,9 @@ TEXT runtime·jmpdefer(SB), NOSPLIT|NOFRAME, $0-16 // Save state of caller into g->sched. Smashes R0. TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0 MOVD LR, (g_sched+gobuf_pc)(g) - MOVD RSP, R0 + MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(g) + MOVD R29, (g_sched+gobuf_bp)(g) MOVD $0, (g_sched+gobuf_lr)(g) MOVD $0, (g_sched+gobuf_ret)(g) // Assert ctxt is zero. See func save. @@ -885,6 +897,7 @@ TEXT ·asmcgocall(SB),NOSPLIT,$0-20 BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R0 MOVD R0, RSP + MOVD (g_sched+gobuf_bp)(g), R29 MOVD R9, R0 // Now on a scheduling stack (a pthread-created stack). @@ -996,6 +1009,7 @@ needm: MOVD m_g0(R8), R3 MOVD RSP, R0 MOVD R0, (g_sched+gobuf_sp)(R3) + MOVD R29, (g_sched+gobuf_bp)(R3) havem: // Now there's a valid m, and we're running on its m->g0. @@ -1003,7 +1017,7 @@ havem: // Save current sp in m->g0->sched.sp in preparation for // switch back to m->curg stack. // NOTE: unwindm knows that the saved g->sched.sp is at 16(RSP) aka savedsp-16(SP). - // Beware that the frame size is actually 32. + // Beware that the frame size is actually 32+16. MOVD m_g0(R8), R3 MOVD (g_sched+gobuf_sp)(R3), R4 MOVD R4, savedsp-16(SP) @@ -1030,10 +1044,12 @@ havem: BL runtime·save_g(SB) MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 MOVD (g_sched+gobuf_pc)(g), R5 - MOVD R5, -(24+8)(R4) + MOVD R5, -48(R4) + MOVD (g_sched+gobuf_bp)(g), R5 + MOVD R5, -56(R4) MOVD ctxt+24(FP), R0 - MOVD R0, -(16+8)(R4) - MOVD $-(24+8)(R4), R0 // maintain 16-byte SP alignment + MOVD R0, -40(R4) + MOVD $-48(R4), R0 // maintain 16-byte SP alignment MOVD R0, RSP BL runtime·cgocallbackg(SB) @@ -1041,7 +1057,7 @@ havem: MOVD 0(RSP), R5 MOVD R5, (g_sched+gobuf_pc)(g) MOVD RSP, R4 - ADD $(24+8), R4, R4 + ADD $48, R4, R4 MOVD R4, (g_sched+gobuf_sp)(g) // Switch back to m->g0's stack and restore m->g0->sched.sp. diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index c85033f4bccff6..86bd2fb01c0966 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -268,7 +268,8 @@ func cgocallbackg1(ctxt uintptr) { case "arm64": // On arm64, stack frame is four words and there's a saved LR between // SP and the stack frame and between the stack frame and the arguments. - cb = (*args)(unsafe.Pointer(sp + 5*sys.PtrSize)) + // Additional two words (16-byte alignment) are for saving FP. + cb = (*args)(unsafe.Pointer(sp + 7*sys.PtrSize)) case "amd64": // On amd64, stack frame is two words, plus caller PC. if framepointer_enabled { diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s index d039a8e0abab27..e3972f492429af 100644 --- a/src/runtime/rt0_darwin_arm64.s +++ b/src/runtime/rt0_darwin_arm64.s @@ -49,7 +49,9 @@ TEXT _rt0_arm64_darwin_lib(SB),NOSPLIT,$168 MOVD _cgo_sys_thread_create(SB), R4 MOVD $_rt0_arm64_darwin_lib_go(SB), R0 MOVD $0, R1 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) + ADD $16, RSP // Restore callee-save registers. MOVD 24(RSP), R19 diff --git a/src/runtime/rt0_linux_arm64.s b/src/runtime/rt0_linux_arm64.s index 458f082159dae9..a6bc99df569309 100644 --- a/src/runtime/rt0_linux_arm64.s +++ b/src/runtime/rt0_linux_arm64.s @@ -48,7 +48,9 @@ TEXT _rt0_arm64_linux_lib(SB),NOSPLIT,$184 BEQ nocgo MOVD $_rt0_arm64_linux_lib_go(SB), R0 MOVD $0, R1 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL (R4) + ADD $16, RSP B restore nocgo: diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s index c6afd76a657e27..1c8fce3db649b0 100644 --- a/src/runtime/sys_linux_arm64.s +++ b/src/runtime/sys_linux_arm64.s @@ -239,7 +239,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$24-8 MOVD (g_sched+gobuf_sp)(R3), R1 // Set RSP to g0 stack noswitch: - SUB $16, R1 + SUB $32, R1 BIC $15, R1 MOVD R1, RSP @@ -298,7 +298,9 @@ TEXT runtime·callCgoSigaction(SB),NOSPLIT,$0 MOVD new+8(FP), R1 MOVD old+16(FP), R2 MOVD _cgo_sigaction(SB), R3 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL R3 + ADD $16, RSP MOVW R0, ret+24(FP) RET @@ -361,7 +363,9 @@ TEXT runtime·callCgoMmap(SB),NOSPLIT,$0 MOVW fd+24(FP), R4 MOVW off+28(FP), R5 MOVD _cgo_mmap(SB), R9 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL R9 + ADD $16, RSP MOVD R0, ret+32(FP) RET @@ -382,7 +386,9 @@ TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 MOVD addr+0(FP), R0 MOVD n+8(FP), R1 MOVD _cgo_munmap(SB), R9 + SUB $16, RSP // reserve 16 bytes for sp-8 where fp may be saved. BL R9 + ADD $16, RSP RET TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 8370fd75937edd..4c2010493a76a8 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -285,7 +285,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in // If framepointer_enabled and there's a frame, then // there's a saved bp here. - if framepointer_enabled && GOARCH == "amd64" && frame.varp > frame.sp { + if frame.varp > frame.sp && (framepointer_enabled && GOARCH == "amd64" || GOARCH == "arm64") { frame.varp -= sys.RegSize } diff --git a/test/codegen/stack.go b/test/codegen/stack.go index 7e12dbc0eb6bbe..0f2f6178c7f4cb 100644 --- a/test/codegen/stack.go +++ b/test/codegen/stack.go @@ -16,7 +16,7 @@ import "runtime" // 386:"TEXT\t.*, [$]0-" // amd64:"TEXT\t.*, [$]0-" // arm:"TEXT\t.*, [$]-4-" -// arm64:"TEXT\t.*, [$]-8-" +// arm64:"TEXT\t.*, [$]0-" // mips:"TEXT\t.*, [$]-4-" // ppc64le:"TEXT\t.*, [$]0-" // s390x:"TEXT\t.*, [$]0-" @@ -35,7 +35,7 @@ type T struct { // 386:"TEXT\t.*, [$]0-" // amd64:"TEXT\t.*, [$]0-" // arm:"TEXT\t.*, [$]0-" (spills return address) -// arm64:"TEXT\t.*, [$]-8-" +// arm64:"TEXT\t.*, [$]0-" // mips:"TEXT\t.*, [$]-4-" // ppc64le:"TEXT\t.*, [$]0-" // s390x:"TEXT\t.*, [$]0-" @@ -50,7 +50,7 @@ func ZeroLargeStruct(x *T) { // - 386 fails due to spilling a register // amd64:"TEXT\t.*, [$]0-" // arm:"TEXT\t.*, [$]0-" (spills return address) -// arm64:"TEXT\t.*, [$]-8-" +// arm64:"TEXT\t.*, [$]0-" // ppc64le:"TEXT\t.*, [$]0-" // s390x:"TEXT\t.*, [$]0-" // Note: that 386 currently has to spill a register. @@ -64,7 +64,7 @@ func KeepWanted(t *T) { // - 386 fails due to spilling a register // - arm & mips fail due to softfloat calls // amd64:"TEXT\t.*, [$]0-" -// arm64:"TEXT\t.*, [$]-8-" +// arm64:"TEXT\t.*, [$]0-" // ppc64le:"TEXT\t.*, [$]0-" // s390x:"TEXT\t.*, [$]0-" func ArrayAdd64(a, b [4]float64) [4]float64 { @@ -76,7 +76,7 @@ func ArrayAdd64(a, b [4]float64) [4]float64 { // 386:"TEXT\t.*, [$]0-" // amd64:"TEXT\t.*, [$]0-" // arm:"TEXT\t.*, [$]0-" (spills return address) -// arm64:"TEXT\t.*, [$]-8-" +// arm64:"TEXT\t.*, [$]0-" // mips:"TEXT\t.*, [$]-4-" // ppc64le:"TEXT\t.*, [$]0-" // s390x:"TEXT\t.*, [$]0-" diff --git a/test/nosplit.go b/test/nosplit.go index e6cd04e563060d..8b61c9e96d1429 100644 --- a/test/nosplit.go +++ b/test/nosplit.go @@ -118,11 +118,11 @@ main 136 # (CallSize is 32 on ppc64, 8 on amd64 for frame pointer.) main 96 nosplit main 100 nosplit; REJECT ppc64 ppc64le -main 104 nosplit; REJECT ppc64 ppc64le +main 104 nosplit; REJECT ppc64 ppc64le arm64 main 108 nosplit; REJECT ppc64 ppc64le -main 112 nosplit; REJECT ppc64 ppc64le +main 112 nosplit; REJECT ppc64 ppc64le arm64 main 116 nosplit; REJECT ppc64 ppc64le -main 120 nosplit; REJECT ppc64 ppc64le amd64 +main 120 nosplit; REJECT ppc64 ppc64le amd64 arm64 main 124 nosplit; REJECT ppc64 ppc64le amd64 main 128 nosplit; REJECT main 132 nosplit; REJECT @@ -136,11 +136,11 @@ main 136 nosplit; REJECT # Because AMD64 uses frame pointer, it has 8 fewer bytes. main 96 nosplit call f; f 0 nosplit main 100 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le -main 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le +main 104 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le arm64 main 108 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le -main 112 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 +main 112 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 arm64 main 116 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 -main 120 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 +main 124 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 arm64 main 124 nosplit call f; f 0 nosplit; REJECT ppc64 ppc64le amd64 386 main 128 nosplit call f; f 0 nosplit; REJECT main 132 nosplit call f; f 0 nosplit; REJECT @@ -152,11 +152,11 @@ main 136 nosplit call f; f 0 nosplit; REJECT # Architectures differ in the same way as before. main 96 nosplit call f; f 0 call f main 100 nosplit call f; f 0 call f; REJECT ppc64 ppc64le -main 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 +main 104 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 main 108 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 -main 112 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 +main 112 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 arm64 main 116 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 -main 120 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 386 +main 120 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 386 arm64 main 124 nosplit call f; f 0 call f; REJECT ppc64 ppc64le amd64 386 main 128 nosplit call f; f 0 call f; REJECT main 132 nosplit call f; f 0 call f; REJECT @@ -165,11 +165,11 @@ main 136 nosplit call f; f 0 call f; REJECT # Indirect calls are assumed to be splitting functions. main 96 nosplit callind main 100 nosplit callind; REJECT ppc64 ppc64le -main 104 nosplit callind; REJECT ppc64 ppc64le amd64 +main 104 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 main 108 nosplit callind; REJECT ppc64 ppc64le amd64 -main 112 nosplit callind; REJECT ppc64 ppc64le amd64 +main 112 nosplit callind; REJECT ppc64 ppc64le amd64 arm64 main 116 nosplit callind; REJECT ppc64 ppc64le amd64 -main 120 nosplit callind; REJECT ppc64 ppc64le amd64 386 +main 120 nosplit callind; REJECT ppc64 ppc64le amd64 386 arm64 main 124 nosplit callind; REJECT ppc64 ppc64le amd64 386 main 128 nosplit callind; REJECT main 132 nosplit callind; REJECT @@ -319,7 +319,7 @@ TestCases: } } - if size%ptrSize == 4 || goarch == "arm64" && size != 0 && (size+8)%16 != 0 { + if size%ptrSize == 4 { continue TestCases } nosplit := m[3]