cmd/internal/obj/arm64: avoid unnecessary literal pool usage for moves
In a number of load and store cases, use of the literal pool can be
avoided entirely by simply adding the offset to, or subtracting it
from, the base register. This uses the same number of instructions,
while avoiding a load from memory and the need for the value to be
placed in the literal pool. Overall this reduces binary size slightly
and should have lower overhead.
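
For example, instead of materializing a medium offset such as 257 via
the literal pool and then using a register-offset load or store, the
assembler can now add the offset to the base register (or subtract it,
for negative offsets) and use a plain load or store through the
temporary register, as the new test cases below demonstrate.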

Updates #59615

Change-Id: I9cb6a403dc71e34a46af913f5db87dbf52f8688c
Reviewed-on: https://go-review.googlesource.com/c/go/+/512539
Reviewed-by: David Chase <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
Run-TryBot: Joel Sing <[email protected]>
4a6f656c committed Jul 31, 2023
1 parent 3313b39 commit 208fc13
Showing 2 changed files with 74 additions and 5 deletions.
src/cmd/asm/internal/asm/testdata/arm64.s: 32 additions & 1 deletion
@@ -557,7 +557,38 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
FMOVQ 65520(R10), F10 // 4afdff3d
FMOVQ 64(RSP), F11 // eb13c03d

// large aligned offset, use two instructions(add+ldr/store).
// medium offsets that either fit a single instruction or can use add+ldr/str
MOVD -4095(R17), R3 // 3bfe3fd1630340f9
MOVD -391(R17), R3 // 3b1e06d1630340f9
MOVD -257(R17), R3 // 3b0604d1630340f9
MOVD -256(R17), R3 // 230250f8
MOVD 255(R17), R3 // 23f24ff8
MOVD 256(R17), R3 // 238240f9
MOVD 257(R17), R3 // 3b060491630340f9
MOVD 391(R17), R3 // 3b1e0691630340f9
MOVD 4095(R17), R3 // 3bfe3f91630340f9

MOVD R0, -4095(R17) // 3bfe3fd1600300f9
MOVD R0, -391(R17) // 3b1e06d1600300f9
MOVD R0, -257(R17) // 3b0604d1600300f9
MOVD R0, -256(R17) // 200210f8
MOVD R0, 255(R17) // 20f20ff8
MOVD R0, 256(R17) // 208200f9
MOVD R0, 257(R17) // 3b060491600300f9
MOVD R0, 391(R17) // 3b1e0691600300f9
MOVD R0, 4095(R17) // 3bfe3f91600300f9
MOVD R0, 4096(R17) // 200208f9
MOVD R3, -4095(R17) // 3bfe3fd1630300f9
MOVD R3, -391(R17) // 3b1e06d1630300f9
MOVD R3, -257(R17) // 3b0604d1630300f9
MOVD R3, -256(R17) // 230210f8
MOVD R3, 255(R17) // 23f20ff8
MOVD R3, 256(R17) // 238200f9
MOVD R3, 257(R17) // 3b060491630300f9
MOVD R3, 391(R17) // 3b1e0691630300f9
MOVD R3, 4095(R17) // 3bfe3f91630300f9

// large aligned offset, use two instructions(add+ldr/str).
MOVB R1, 0x1001(R2) // MOVB R1, 4097(R2) // 5b04409161070039
MOVB R1, 0xffffff(R2) // MOVB R1, 16777215(R2) // 5bfc7f9161ff3f39
MOVH R1, 0x2002(R2) // MOVH R1, 8194(R2) // 5b08409161070079
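
The new MOVD cases above probe the encoding boundaries for 8-byte loads
and stores. The following sketch (illustrative only, not code from the
assembler; the helper name is made up) summarizes which form each
offset is expected to take:

	// Illustrative helper (not part of the toolchain): the expected
	// encoding of an 8-byte MOVD load/store offset v in the tests above.
	func movd8OffsetForm(v int64) string {
		switch {
		case v >= -256 && v <= 255:
			// fits the unscaled 9 bit signed immediate (LDUR/STUR)
			return "single instruction, 9 bit signed offset"
		case v >= 0 && v%8 == 0 && v <= 8*4095:
			// fits the scaled 12 bit unsigned immediate (LDR/STR)
			return "single instruction, scaled 12 bit unsigned offset"
		case v >= -4095 && v <= 4095:
			// new in this change: no literal pool needed
			return "ADD/SUB $offset, Rbase, REGTMP; load/store 0(REGTMP)"
		default:
			return "hi+lo split or literal pool"
		}
	}
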
src/cmd/internal/obj/arm64/asm7.go: 42 additions & 4 deletions
@@ -1959,6 +1959,10 @@ func (c *ctxt7) loadStoreClass(p *obj.Prog, lsc int, v int64) int {
}

needsPool := true
if v >= -4095 && v <= 4095 {
needsPool = false
}

switch p.As {
case AMOVB, AMOVBU:
if cmp(C_UAUTO4K, lsc) || cmp(C_UOREG4K, lsc) {
@@ -4015,10 +4019,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= uint32(p.From.Reg&31)<<5 | uint32(p.To.Reg&31)

case 30: /* movT R,L(R) -> strT */
// if offset L can be split into hi+lo, and both fit into instructions, do
// If offset L fits in a 12 bit unsigned immediate:
// add $L, R, Rtmp or sub $L, R, Rtmp
// str R, (Rtmp)
// Otherwise, if offset L can be split into hi+lo, and both fit into instructions:
// add $hi, R, Rtmp
// str R, lo(Rtmp)
// otherwise, use constant pool
// Otherwise, use constant pool:
// mov $L, Rtmp (from constant pool)
// str R, (R+Rtmp)
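// For example (taken from the new arm64.s tests), MOVD R0, 257(R17)
// takes the first form above and is assembled as (REGTMP is R27):
//	ADD $257, R17, REGTMP
//	MOVD R0, (REGTMP)
// while MOVD R0, -391(R17) uses SUB $391 instead of ADD.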
s := movesize(o.as)
@@ -4032,6 +4039,20 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}

v := c.regoff(&p.To)
if v >= -256 && v <= 256 {
c.ctxt.Diag("%v: bad type for offset %d (should be 9 bit signed immediate store)", p, v)
}
if v >= 0 && v <= 4095 && v&((1<<int32(s))-1) == 0 {
c.ctxt.Diag("%v: bad type for offset %d (should be 12 bit unsigned immediate store)", p, v)
}

// Handle smaller unaligned and negative offsets via addition or subtraction.
if v >= -4095 && v <= 4095 {
o1 = c.oaddi12(p, v, REGTMP, int16(r))
o2 = c.olsr12u(p, c.opstr(p, p.As), 0, REGTMP, p.From.Reg)
break
}

hi, lo, err := splitImm24uScaled(v, s)
if err != nil {
goto storeusepool
@@ -4054,10 +4075,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o2 = c.olsxrr(p, int32(c.opstrr(p, p.As, false)), int(p.From.Reg), int(r), REGTMP)

case 31: /* movT L(R), R -> ldrT */
// if offset L can be split into hi+lo, and both fit into instructions, do
// If offset L fits in a 12 bit unsigned immediate:
// add $L, R, Rtmp or sub $L, R, Rtmp
// ldr R, (Rtmp)
// Otherwise, if offset L can be split into hi+lo, and both fit into instructions:
// add $hi, R, Rtmp
// ldr lo(Rtmp), R
// otherwise, use constant pool
// Otherwise, use constant pool:
// mov $L, Rtmp (from constant pool)
// ldr (R+Rtmp), R
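// For example (taken from the new arm64.s tests), MOVD 257(R17), R3
// takes the first form above and is assembled as (REGTMP is R27):
//	ADD $257, R17, REGTMP
//	MOVD (REGTMP), R3
// and MOVD -4095(R17), R3 uses SUB $4095 instead of ADD.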
s := movesize(o.as)
@@ -4071,6 +4095,20 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}

v := c.regoff(&p.From)
if v >= -256 && v <= 256 {
c.ctxt.Diag("%v: bad type for offset %d (should be 9 bit signed immediate load)", p, v)
}
if v >= 0 && v <= 4095 && v&((1<<int32(s))-1) == 0 {
c.ctxt.Diag("%v: bad type for offset %d (should be 12 bit unsigned immediate load)", p, v)
}

// Handle smaller unaligned and negative offsets via addition or subtraction.
if v >= -4095 && v <= 4095 {
o1 = c.oaddi12(p, v, REGTMP, int16(r))
o2 = c.olsr12u(p, c.opldr(p, p.As), 0, REGTMP, p.To.Reg)
break
}

hi, lo, err := splitImm24uScaled(v, s)
if err != nil {
goto loadusepool
