Skip to content

Commit

Permalink
cmd/asm: add V[LD|ST][2-4] vector instructions on arm64
Browse files Browse the repository at this point in the history
This change adds VLD2, VLD3, VLD4, VST2, VST3, VST4 (multiple structures)
for image or multimedia optimization.

Change-Id: Iae3538ef4434e436e3fb2f19153c58f918f773af
Reviewed-on: https://go-review.googlesource.com/c/go/+/166518
Run-TryBot: Cherry Zhang <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Cherry Zhang <[email protected]>
  • Loading branch information
mengzhuo authored and cherrymui committed Aug 28, 2019
1 parent be452ce commit 8403d4e
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 9 deletions.
18 changes: 18 additions & 0 deletions src/cmd/asm/internal/asm/testdata/arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
VST1 [V0.S4, V1.S4], (R0) // 00a8004c
VLD1 (R30), [V15.S2, V16.S2] // cfab400c
VLD1.P 24(R30), [V3.S2,V4.S2,V5.S2] // c36bdf0c
VLD2 (R29), [V23.H8, V24.H8] // b787404c
VLD2.P 16(R0), [V18.B8, V19.B8] // 1280df0c
VLD2.P (R1)(R2), [V15.S2, V16.S2] // VLD2.P (R1)(R2*1), [V15.S2,V16.S2] // 2f88c20c
VLD3 (R27), [V11.S4, V12.S4, V13.S4] // 6b4b404c
VLD3.P 48(RSP), [V11.S4, V12.S4, V13.S4] // eb4bdf4c
VLD3.P (R30)(R2), [V14.D2, V15.D2, V16.D2] // VLD3.P (R30)(R2*1), [V14.D2,V15.D2,V16.D2] // ce4fc24c
VLD4 (R15), [V10.H4, V11.H4, V12.H4, V13.H4] // ea05400c
VLD4.P 32(R24), [V31.B8, V0.B8, V1.B8, V2.B8] // 1f03df0c
VLD4.P (R13)(R9), [V14.S2, V15.S2, V16.S2, V17.S2] // VLD4.P (R13)(R9*1), [V14.S2,V15.S2,V16.S2,V17.S2] // ae09c90c
VST1.P [V24.S2], 8(R2) // 58789f0c
VST1 [V29.S2, V30.S2], (R29) // bdab000c
VST1 [V14.H4, V15.H4, V16.H4], (R27) // 6e67000c
Expand All @@ -352,6 +361,15 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
VST1.P V4.D[1], 8(R0) // 04849f4d
VST1.P V4.D[1], (R0)(R1) // VST1.P V4.D[1], (R0)(R1*1) // 0484814d
VST1 V4.D[1], (R0) // 0484004d
VST2 [V22.H8, V23.H8], (R23) // f686004c
VST2.P [V14.H4, V15.H4], 16(R17) // 2e869f0c
VST2.P [V14.H4, V15.H4], (R3)(R17) // VST2.P [V14.H4,V15.H4], (R3)(R17*1) // 6e84910c
VST3 [V1.D2, V2.D2, V3.D2], (R11) // 614d004c
VST3.P [V18.S4, V19.S4, V20.S4], 48(R25) // 324b9f4c
VST3.P [V19.B8, V20.B8, V21.B8], (R3)(R7) // VST3.P [V19.B8, V20.B8, V21.B8], (R3)(R7*1) // 7340870c
VST4 [V22.D2, V23.D2, V24.D2, V25.D2], (R3) // 760c004c
VST4.P [V14.D2, V15.D2, V16.D2, V17.D2], 64(R15) // ee0d9f4c
VST4.P [V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23) // VST4.P [V24.B8, V25.B8, V26.B8, V27.B8], (R3)(R23*1) // 7800970c
FMOVS F20, (R0) // 140000bd
FMOVS.P F20, 4(R0) // 144400bc
FMOVS.W F20, 4(R0) // 144c00bc
Expand Down
6 changes: 6 additions & 0 deletions src/cmd/internal/obj/arm64/a.out.go
Original file line number Diff line number Diff line change
Expand Up @@ -953,10 +953,16 @@ const (
AVEOR
AVMOV
AVLD1
AVLD2
AVLD3
AVLD4
AVORR
AVREV32
AVREV64
AVST1
AVST2
AVST3
AVST4
AVDUP
AVADDV
AVMOVI
Expand Down
6 changes: 6 additions & 0 deletions src/cmd/internal/obj/arm64/anames.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

81 changes: 72 additions & 9 deletions src/cmd/internal/obj/arm64/asm7.go
Original file line number Diff line number Diff line change
Expand Up @@ -780,16 +780,34 @@ var optab = []Optab{
{ASTLXR, C_REG, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0}, // RegTo2=C_REG
{ASTXP, C_PAIR, C_NONE, C_NONE, C_ZOREG, 59, 4, 0, 0, 0},

/* VLD1/VST1 */
/* VLD[1-4]/VST[1-4] */
{AVLD1, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD1, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD2, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD2, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD2, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD3, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD3, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD3, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD4, C_ZOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD4, C_LOREG, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD4, C_ROFF, C_NONE, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
{AVLD1, C_ROFF, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, C_XPOST},
{AVLD1, C_LOREG, C_NONE, C_NONE, C_ELEM, 97, 4, 0, 0, 0},
{AVST1, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST1, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST1, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST2, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST2, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST2, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST3, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST3, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST3, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST4, C_LIST, C_NONE, C_NONE, C_ZOREG, 84, 4, 0, 0, 0},
{AVST4, C_LIST, C_NONE, C_NONE, C_LOREG, 84, 4, 0, 0, C_XPOST},
{AVST4, C_LIST, C_NONE, C_NONE, C_ROFF, 84, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_ROFF, 96, 4, 0, 0, C_XPOST},
{AVST1, C_ELEM, C_NONE, C_NONE, C_LOREG, 96, 4, 0, 0, 0},
Expand Down Expand Up @@ -2695,7 +2713,13 @@ func buildop(ctxt *obj.Link) {
AVCNT,
AVMOV,
AVLD1,
AVLD2,
AVLD3,
AVLD4,
AVST1,
AVST2,
AVST3,
AVST4,
AVTBL,
AVDUP,
AVMOVI,
Expand Down Expand Up @@ -2775,14 +2799,14 @@ func (c *ctxt7) checkindex(p *obj.Prog, index, maxindex int) {
}
}

/* checkoffset checks whether the immediate offset is valid for VLD1.P and VST1.P */
/* checkoffset checks whether the immediate offset is valid for VLD[1-4].P and VST[1-4].P */
func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
var offset, list, n int64
var offset, list, n, expect int64
switch as {
case AVLD1:
case AVLD1, AVLD2, AVLD3, AVLD4:
offset = p.From.Offset
list = p.To.Offset
case AVST1:
case AVST1, AVST2, AVST3, AVST4:
offset = p.To.Offset
list = p.From.Offset
default:
Expand All @@ -2808,6 +2832,21 @@ func (c *ctxt7) checkoffset(p *obj.Prog, as obj.As) {
if !(q == 0 && offset == n*8) && !(q == 1 && offset == n*16) {
c.ctxt.Diag("invalid post-increment offset: %v", p)
}

switch as {
case AVLD1, AVST1:
return
case AVLD2, AVST2:
expect = 2
case AVLD3, AVST3:
expect = 3
case AVLD4, AVST4:
expect = 4
}

if expect != n {
c.ctxt.Diag("expected %d registers, got %d: %v.", expect, n, p)
}
}

/* checkShiftAmount checks whether the index shift amount is valid */
Expand Down Expand Up @@ -4305,14 +4344,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31)

case 81: /* vld1 (Rn), [Vt1.<T>, Vt2.<T>, ...] */
case 81: /* vld[1-4] (Rn), [Vt1.<T>, Vt2.<T>, ...] */
c.checkoffset(p, p.As)
r := int(p.From.Reg)
o1 = 3<<26 | 1<<22
if o.scond == C_XPOST {
o1 |= 1 << 23
if p.From.Index == 0 {
// immediate offset variant
c.checkoffset(p, p.As)
o1 |= 0x1f << 16
} else {
// register offset variant
Expand All @@ -4323,6 +4362,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
}
o1 |= uint32(p.To.Offset)
// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
// add opcode(bit 12-15) for vld1, mask it off if it's not vld1
o1 = c.maskOpvldvst(p, o1)
o1 |= uint32(r&31) << 5

case 82: /* vmov Rn, Vd.<T> */
Expand Down Expand Up @@ -4410,14 +4452,14 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {

o1 |= (Q&1)<<30 | (size&3)<<22 | uint32(rf&31)<<5 | uint32(rt&31)

case 84: /* vst1 [Vt1.<T>, Vt2.<T>, ...], (Rn) */
case 84: /* vst[1-4] [Vt1.<T>, Vt2.<T>, ...], (Rn) */
c.checkoffset(p, p.As)
r := int(p.To.Reg)
o1 = 3 << 26
if o.scond == C_XPOST {
o1 |= 1 << 23
if p.To.Index == 0 {
// immediate offset variant
c.checkoffset(p, p.As)
o1 |= 0x1f << 16
} else {
// register offset variant
Expand All @@ -4428,6 +4470,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
}
o1 |= uint32(p.From.Offset)
// cmd/asm/internal/arch/arm64.go:ARM64RegisterListOffset
// add opcode(bit 12-15) for vst1, mask it off if it's not vst1
o1 = c.maskOpvldvst(p, o1)
o1 |= uint32(r&31) << 5

case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/
Expand Down Expand Up @@ -6727,6 +6772,24 @@ func (c *ctxt7) opldpstp(p *obj.Prog, o *Optab, vo int32, rbase, rl, rh, ldp uin
return ret
}

// maskOpvldvst returns o1 with its "opcode" field (bits 12-15) rewritten to
// match the multiple-structure load/store variant selected by p.As.
//
// VLD1/VST1 encodings are returned untouched. For every other instruction the
// field is cleared and then set to 8 for VLD2/VST2, 4 for VLD3/VST3, and left
// at 0 for VLD4/VST4. Any other instruction is reported through the context's
// diagnostics; in that case the value is still returned with the field cleared.
func (c *ctxt7) maskOpvldvst(p *obj.Prog, o1 uint32) uint32 {
	var opcode uint32
	switch p.As {
	case AVLD1, AVST1:
		// Keep whatever opcode bits the caller already placed in o1.
		return o1
	case AVLD2, AVST2:
		opcode = 8
	case AVLD3, AVST3:
		opcode = 4
	case AVLD4, AVST4:
		// The four-register form uses an all-zero opcode field.
		opcode = 0
	default:
		c.ctxt.Diag("unsupported instruction:%v\n", p.As)
	}
	// Clear bits 12-15, then install the selected opcode.
	return o1&^0xf000 | opcode<<12
}

/*
* size in log2(bytes)
*/
Expand Down

0 comments on commit 8403d4e

Please sign in to comment.