1 files changed, 135 insertions, 268 deletions
diff --git a/src/cmd/internal/obj/arm64/doc.go b/src/cmd/internal/obj/arm64/doc.go
index d06025d21c..d98b1b6f9e 100644
--- a/src/cmd/internal/obj/arm64/doc.go
+++ b/src/cmd/internal/obj/arm64/doc.go
@@ -1,334 +1,201 @@
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package arm64
-
 /*
+Package arm64 implements an ARM64 assembler. Go assembly syntax is different from GNU ARM64
+syntax, but we can still follow the general rules to map between them.
 
-Go Assembly for ARM64 Reference Manual
-
-1. Alphabetical list of basic instructions
-    // TODO
-
-    LDARB: Load-Acquire Register Byte
-      LDARB	(<Rn>), <Rd>
-        Loads a byte from memory, zero-extends it and writes it to Rd.
+Instruction mnemonics mapping rules
 
-    LDARH: Load-Acquire Register Halfword
-      LDARH	(<Rn>), <Rd>
-        Loads a halfword from memory, zero-extends it and writes it to Rd.
+1. Most instructions use width suffixes of instruction names to indicate operand width rather than
+using different register names.
 
-    LDAXP: Load-Acquire Exclusive Pair of Registers
-      LDAXP	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 64-bit doublewords from memory, and writes them to Rt1 and Rt2.
+  Examples:
+    ADC R24, R14, R12          <=>     adc x12, x14, x24
+    ADDW R26->24, R21, R15     <=>     add w15, w21, w26, asr #24
+    FCMPS F2, F3               <=>     fcmp s3, s2
+    FCMPD F2, F3               <=>     fcmp d3, d2
+    FCVTDH F2, F3              <=>     fcvt h3, d2
 
-    LDAXPW: Load-Acquire Exclusive Pair of Registers
-      LDAXPW	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 32-bit words from memory, and writes them to Rt1 and Rt2.
+2. Go uses .P and .W suffixes to indicate post-increment and pre-increment.
 
-    LDXP: 64-bit Load Exclusive Pair of Registers
-      LDXP	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 64-bit doublewords from memory, and writes them to Rt1 and Rt2.
+  Examples:
+    MOVD.P -8(R10), R8         <=>      ldr x8, [x10],#-8
+    MOVB.W 16(R16), R10        <=>      ldrsb x10, [x16,#16]!
 
-    LDXPW: 32-bit Load Exclusive Pair of Registers
-      LDXPW	(<Rn>), (<Rt1>, <Rt2>)
-        Loads two 32-bit words from memory, and writes them to Rt1 and Rt2.
+3. Go uses a series of MOV instructions as load and store.
 
-    MOVD|MOVW|MOVH|MOVHU|MOVB|MOVBU: Load Register (register offset)
-      MOVD	(Rn)(Rm.UXTW<<3), Rt
-      MOVD	(Rn)(Rm.SXTX), Rt
-      MOVD	(Rn)(Rm<<3), Rt
-      MOVD	(Rn)(Rm), Rt
-      MOVB|MOVBU	(Rn)(Rm.UXTW), Rt
+64-bit variant ldr, str, stur => MOVD;
+32-bit variant str, stur, ldrsw => MOVW;
+32-bit variant ldr => MOVWU;
+ldrb => MOVBU; ldrh => MOVHU;
+ldrsb, sturb, strb => MOVB;
+ldrsh, sturh, strh =>  MOVH.
 
-    MOVD|MOVW|MOVH|MOVB: Stote Register (register offset)
-      MOVD	Rt, (Rn)(Rm.UXTW<<3)
-      MOVD	Rt, (Rn)(Rm.SXTX)
-      MOVD	Rt, (Rn)(Rm)
+4. Go moves conditions into opcode suffix, like BLT.
 
-    PRFM: Prefetch Memory (immediate)
-      PRFM	imm(Rn), <prfop>
-        prfop is the prefetch operation and can have the following values:
-        PLDL1KEEP, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM,
-        PLIL1KEEP, PLIL1STRM, PLIL2KEEP, PLIL2STRM, PLIL3KEEP, PLIL3STRM,
-        PSTL1KEEP, PSTL1STRM, PSTL2KEEP, PSTL2STRM, PSTL3KEEP, PSTL3STRM.
-      PRFM	imm(Rn), $imm
-        $imm prefetch operation is encoded as an immediate.
+5. Go adds a V prefix for most floating-point and SIMD instructions, except cryptographic extension
+instructions and floating-point (scalar) instructions.
 
-    STLRB: Store-Release Register Byte
-      STLRB	<Rd>, (<Rn>)
-        Stores a byte from Rd to a memory location from Rn.
+  Examples:
+    VADD V5.H8, V18.H8, V9.H8         <=>      add v9.8h, v18.8h, v5.8h
+    VLD1.P (R6)(R11), [V31.D1]        <=>      ld1 {v31.1d}, [x6], x11
+    VFMLA V29.S2, V20.S2, V14.S2      <=>      fmla v14.2s, v20.2s, v29.2s
+    AESD V22.B16, V19.B16             <=>      aesd v19.16b, v22.16b
+    SCVTFWS R3, F16                   <=>      scvtf s16, w3
 
-    STLRH: Store-Release Register Halfword
-      STLRH	<Rd>, (<Rn>)
-        Stores a halfword from Rd to a memory location from Rn.
+Special Cases.
 
-    STLXP: 64-bit Store-Release Exclusive Pair of registers
-      STLXP	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 64-bit doublewords from Rt1 and Rt2 to a memory location from Rn,
-        and returns in Rs a status value of 0 if the store was successful, or of 1 if
-        no store was performed.
+(1) umov is written as VMOV.
 
-    STLXPW: 32-bit Store-Release Exclusive Pair of registers
-      STLXPW	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 32-bit words from Rt1 and Rt2 to a memory location from Rn, and
-        returns in Rs a status value of 0 if the store was successful, or of 1 if no
-        store was performed.
+(2) br is renamed JMP, blr is renamed CALL.
 
-    STXP: 64-bit Store Exclusive Pair of registers
-      STXP	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 64-bit doublewords from Rt1 and Rt2 to a memory location from Rn,
-        and returns in Rs a status value of 0 if the store was successful, or of 1 if
-        no store was performed.
+(3) No need to add "W" suffix: LDARB, LDARH, LDAXRB, LDAXRH, LDTRH, LDXRB, LDXRH.
 
-    STXPW: 32-bit Store Exclusive Pair of registers
-      STXPW	(<Rt1>, <Rt2>), (<Rn>), <Rs>
-        Stores two 32-bit words from Rt1 and Rt2 to a memory location from Rn, and returns in
-        a Rs a status value of 0 if the store was successful, or of 1 if no store was performed.
+  Examples:
+    VMOV V13.B[1], R20      <=>      umov w20, v13.b[1]
+    VMOV V13.H[1], R20      <=>      umov w20, v13.h[1]
+    JMP (R3)                <=>      br x3
+    CALL (R17)              <=>      blr x17
+    LDAXRB (R19), R16       <=>      ldaxrb w16, [x19]
 
-2. Alphabetical list of float-point instructions
-    // TODO
 
-    FMADDD: 64-bit floating-point fused Multiply-Add
-      FMADDD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        adds the product to <Fa>, and writes the result to <Fd>.
+Register mapping rules
 
-    FMADDS: 32-bit floating-point fused Multiply-Add
-      FMADDS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        adds the product to <Fa>, and writes the result to <Fd>.
+1. All basic register names are written as Rn.
 
-    FMSUBD: 64-bit floating-point fused Multiply-Subtract
-      FMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        adds the product to <Fa>, and writes the result to <Fd>.
+2. Go uses ZR as the zero register and RSP as the stack pointer.
 
-    FMSUBS: 32-bit floating-point fused Multiply-Subtract
-      FMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        adds the product to <Fa>, and writes the result to <Fd>.
+3. Bn, Hn, Dn, Sn and Qn registers are written as Fn in floating-point instructions and as Vn
+in SIMD instructions.
 
-    FNMADDD: 64-bit floating-point negated fused Multiply-Add
-      FNMADDD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
 
-    FNMADDS: 32-bit floating-point negated fused Multiply-Add
-      FNMADDS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>, negates the product,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
+Argument mapping rules
 
-    FNMSUBD: 64-bit floating-point negated fused Multiply-Subtract
-      FNMSUBD	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
+1. The operands appear in left-to-right assignment order.
 
-    FNMSUBS: 32-bit floating-point negated fused Multiply-Subtract
-      FNMSUBS	<Fm>, <Fa>, <Fn>, <Fd>
-        Multiplies the values of <Fm> and <Fn>,
-        subtracts the value of <Fa>, and writes the result to <Fd>.
+Go reverses the arguments of most instructions.
 
-3. Alphabetical list of SIMD instructions
-    VADD: Add (scalar)
-      VADD	<Vm>, <Vn>, <Vd>
-        Add corresponding low 64-bit elements in <Vm> and <Vn>,
-        place the result into low 64-bit element of <Vd>.
+    Examples:
+      ADD R11.SXTB<<1, RSP, R25      <=>      add x25, sp, w11, sxtb #1
+      VADD V16, V19, V14             <=>      add d14, d19, d16
 
-    VADD: Add (vector).
-      VADD	<Vm>.T, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
+Special Cases.
 
-    VADDP: Add Pairwise (vector)
-      VADDP	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
+(1) Argument order is the same as in the GNU ARM64 syntax: cbz, cbnz and some store instructions,
+such as str, stur, strb, sturb, strh, sturh, stlr, stlrb, stlrh, st1.
 
-    VADDV: Add across Vector.
-      VADDV	<Vn>.<T>, Vd
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S4
+  Examples:
+    MOVD R29, 384(R19)    <=>    str x29, [x19,#384]
+    MOVB.P R30, 30(R4)    <=>    strb w30, [x4],#30
+    STLRH R21, (R18)      <=>    stlrh w21, [x18]
 
-    VAND: Bitwise AND (vector)
-      VAND	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
+(2) MADD, MADDW, MSUB, MSUBW, SMADDL, SMSUBL, UMADDL, UMSUBL <Rm>, <Ra>, <Rn>, <Rd>
 
-    VCMEQ: Compare bitwise Equal (vector)
-      VCMEQ	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D2
+  Examples:
+    MADD R2, R30, R22, R6       <=>    madd x6, x22, x2, x30
+    SMSUBL R10, R3, R17, R27    <=>    smsubl x27, w17, w10, x3
 
-    VDUP: Duplicate vector element to vector or scalar.
-      VDUP	<Vn>.<Ts>[index], <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S2, S4, D2
-        <Ts> Is an element size specifier and can have the following values:
-        B, H, S, D
+  Examples (FMADDD, FMADDS, FMSUBD, FMSUBS, FNMADDD, FNMADDS, FNMSUBD, FNMSUBS <Fm>, <Fa>, <Fn>, <Fd>):
+    FMADDD F30, F20, F3, F29    <=>    fmadd d29, d3, d30, d20
+    FNMSUBS F7, F25, F7, F22    <=>    fnmsub s22, s7, s7, s25
 
-    VEOR: Bitwise exclusive OR (vector, register)
-      VEOR	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
+(4) BFI, BFXIL, SBFIZ, SBFX, UBFIZ, UBFX $<lsb>, <Rn>, $<width>, <Rd>
 
-    VFMLA: Floating-point fused Multiply-Add to accumulator (vector)
-      VFMLA	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        S2, S4, D2
+  Examples:
+    BFIW $16, R20, $6, R0      <=>    bfi w0, w20, #16, #6
+    UBFIZ $34, R26, $5, R20    <=>    ubfiz x20, x26, #34, #5
 
-    VFMLS: Floating-point fused Multiply-Subtract from accumulator (vector)
-      VFMLS	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        S2, S4, D2
+(5) FCCMPD, FCCMPS, FCCMPED, FCCMPES <cond>, <Fm>, <Fn>, $<nzcv>
 
-    VEXT:  Extracts vector elements from src SIMD registers to dst SIMD register
-      VEXT	$index, <Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> is an arrangment specifier and can be B8, B16
-        $index is the lowest numbered byte element to be exracted.
+  Examples:
+    FCCMPD AL, F8, F26, $0     <=>    fccmp d26, d8, #0x0, al
+    FCCMPS VS, F29, F4, $4     <=>    fccmp s4, s29, #0x4, vs
+    FCCMPED LE, F20, F5, $13   <=>    fccmpe d5, d20, #0xd, le
+    FCCMPES NE, F26, F10, $0   <=>    fccmpe s10, s26, #0x0, ne
 
-    VLD1: Load multiple single-element structures
-      VLD1	(Rn), [<Vt>.<T>, <Vt2>.<T> ...]     // no offset
-      VLD1.P	imm(Rn), [<Vt>.<T>, <Vt2>.<T> ...]  // immediate offset variant
-      VLD1.P	(Rn)(Rm), [<Vt>.<T>, <Vt2>.<T> ...] // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
+(6) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, $<imm>, $<nzcv>
 
-    VLD1: Load one single-element structure
-      VLD1	(Rn), <Vt>.<T>[index]     // no offset
-      VLD1.P	imm(Rn), <Vt>.<T>[index]  // immediate offset variant
-      VLD1.P	(Rn)(Rm), <Vt>.<T>[index] // register offset variant
-        <T> is an arrangement specifier and can have the following values:
-        B, H, S D
+  Examples:
+    CCMP MI, R22, $12, $13     <=>    ccmp x22, #0xc, #0xd, mi
+    CCMNW AL, R1, $11, $8      <=>    ccmn w1, #0xb, #0x8, al
 
-    VMOV: move
-      VMOV	<Vn>.<T>[index], Rd // Move vector element to general-purpose register.
-        <T> Is a source width specifier and can have the following values:
-        B, H, S (Wd)
-        D (Xd)
+(7) CCMN, CCMNW, CCMP, CCMPW <cond>, <Rn>, <Rm>, $<nzcv>
 
-      VMOV	Rn, <Vd>.<T> // Duplicate general-purpose register to vector.
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4 (Wn)
-        D2 (Xn)
+  Examples:
+    CCMN VS, R13, R22, $10     <=>    ccmn x13, x22, #0xa, vs
+    CCMPW HS, R18, R14, $11    <=>    ccmp w18, w14, #0xb, cs
 
-      VMOV	<Vn>.<T>, <Vd>.<T> // Move vector.
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
+(9) CSEL, CSELW, CSNEG, CSNEGW, CSINC, CSINCW <cond>, <Rn>, <Rm>, <Rd> ;
+FCSELD, FCSELS <cond>, <Fn>, <Fm>, <Fd>
 
-      VMOV	Rn, <Vd>.<T>[index] // Move general-purpose register to a vector element.
-        <T> Is a source width specifier and can have the following values:
-        B, H, S (Wd)
-        D (Xd)
+  Examples:
+    CSEL GT, R0, R19, R1        <=>    csel x1, x0, x19, gt
+    CSNEGW GT, R7, R17, R8      <=>    csneg w8, w7, w17, gt
+    FCSELD EQ, F15, F18, F16    <=>    fcsel d16, d15, d18, eq
 
-      VMOV	<Vn>.<T>[index], Vn  // Move vector element to scalar.
-        <T> Is an element size specifier and can have the following values:
-        B, H, S, D
+(10) TBNZ, TBZ $<imm>, <Rt>, <label>
 
-      VMOV	<Vn>.<T>[index], <Vd>.<T>[index] // Move vector element to another vector element.
-        <T> Is an element size specifier and can have the following values:
-        B, H, S, D
 
-    VMOVI: Move Immediate (vector).
-      VMOVI	$imm8, <Vd>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        8B, 16B
+(11) STLXR, STLXRW, STXR, STXRW, STLXRB, STLXRH, STXRB, STXRH  <Rf>, (<Rn|RSP>), <Rs>
 
-    VMOVS: Load SIMD&FP Register (immediate offset). ARMv8: LDR (immediate, SIMD&FP)
-      Store SIMD&FP register (immediate offset). ARMv8: STR (immediate, SIMD&FP)
-      VMOVS	(Rn), Vn
-      VMOVS.W	imm(Rn), Vn
-      VMOVS.P	imm(Rn), Vn
-      VMOVS	Vn, (Rn)
-      VMOVS.W	Vn, imm(Rn)
-      VMOVS.P	Vn, imm(Rn)
+  Examples:
+    STLXR ZR, (R15), R16    <=>    stlxr w16, xzr, [x15]
+    STXRB R9, (R21), R18    <=>    stxrb w18, w9, [x21]
 
-    VORR: Bitwise inclusive OR (vector, register)
-      VORR	<Vm>.<T>, <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16
+(12) STLXP, STLXPW, STXP, STXPW (<Rf1>, <Rf2>), (<Rn|RSP>), <Rs>
 
-    VRBIT: Reverse bit order (vector)
-      VRBIT	<Vn>.<T>, <Vd>.<T>
-        <T> is an arrangment specifier and can be B8, B16
+  Examples:
+    STLXP (R17, R18), (R4), R5      <=>    stlxp w5, x17, x18, [x4]
+    STXPW (R30, R25), (R22), R13    <=>    stxp w13, w30, w25, [x22]
 
-    VREV32: Reverse elements in 32-bit words (vector).
-      REV32 <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8
+2. Expressions for special arguments.
 
-    VREV64: Reverse elements in 64-bit words (vector).
-      REV64 <Vn>.<T>, <Vd>.<T>
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4
+#<immediate> is written as $<immediate>.
 
-    VSHL: Shift Left(immediate)
-      VSHL 	$shift, <Vn>.<T>, <Vd>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-        $shift Is the left shift amount
+Optionally-shifted immediate.
 
-    VST1: Store multiple single-element structures
-      VST1	[<Vt>.<T>, <Vt2>.<T> ...], (Rn)         // no offset
-      VST1.P	[<Vt>.<T>, <Vt2>.<T> ...], imm(Rn)      // immediate offset variant
-      VST1.P	[<Vt>.<T>, <Vt2>.<T> ...], (Rn)(Rm)     // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
+  Examples:
+    ADD $(3151<<12), R14, R20     <=>    add x20, x14, #0xc4f, lsl #12
+    ADDW $1864, R25, R6           <=>    add w6, w25, #0x748
 
-    VSUB: Sub (scalar)
-      VSUB	<Vm>, <Vn>, <Vd>
-        Subtract low 64-bit element in <Vm> from the corresponding element in <Vn>,
-        place the result into low 64-bit element of <Vd>.
+Optionally-shifted registers are written as <Rm>{<shift><amount>}.
+The <shift> can be <<(lsl), >>(lsr), ->(asr), @>(ror).
 
-    VUADDLV: Unsigned sum Long across Vector.
-      VUADDLV	<Vn>.<T>, Vd
-        <T> Is an arrangement specifier and can have the following values:
-        8B, 16B, H4, H8, S4
+  Examples:
+    ADD R19>>30, R10, R24     <=>    add x24, x10, x19, lsr #30
+    ADDW R26->24, R21, R15    <=>    add w15, w21, w26, asr #24
 
-    VST1: Store one single-element structure
-      VST1	<Vt>.<T>.<Index>, (Rn)         // no offset
-      VST1.P	<Vt>.<T>.<Index>, imm(Rn)      // immediate offset variant
-      VST1.P	<Vt>.<T>.<Index>, (Rn)(Rm)     // register offset variant
-        <T> Is an arrangement specifier and can have the following values:
-        B, H, S, D
+Extended registers are written as <Rm>{.<extend>{<<<amount>}}.
+<extend> can be UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW or SXTX.
 
-    VUSHR: Unsigned shift right(immediate)
-      VUSHR	$shift, <Vn>.<T>, <Vm>.<T>
-        <T> is an arrangement specifier and can have the following values:
-        B8, B16, H4, H8, S2, S4, D1, D2
-        $shift is the right shift amount
+  Examples:
+    ADDS R18.UXTB<<4, R9, R26     <=>    adds x26, x9, w18, uxtb #4
+    ADDSW R14.SXTX, R14, R6       <=>    adds w6, w14, w14, sxtx
 
+Memory references: [<Xn|SP>{,#0}] is written as (Rn|RSP); a base register and an immediate
+offset are written as imm(Rn|RSP); a base register and an offset register are written as (Rn|RSP)(Rm).
 
-4. Alphabetical list of cryptographic extension instructions
+  Examples:
+    LDAR (R22), R9                  <=>    ldar x9, [x22]
+    LDP 28(R17), (R15, R23)         <=>    ldp x15, x23, [x17,#28]
+    MOVWU (R4)(R12<<2), R8          <=>    ldr w8, [x4, x12, lsl #2]
+    MOVD (R7)(R11.UXTW<<3), R25     <=>    ldr x25, [x7,w11,uxtw #3]
+    MOVBU (R27)(R23), R14           <=>    ldrb w14, [x27,x23]
 
-    VPMULL{2}: Polynomial multiply long.
-      VPMULL{2}	<Vm>.<Tb>, <Vn>.<Tb>, <Vd>.<Ta>
-        VPMULL multiplies corresponding elements in the lower half of the
-        vectors of two source SIMD registers and VPMULL{2} operates in the upper half.
-        <Ta> is an arrangement specifier, it can be H8, Q1
-        <Tb> is an arrangement specifier, it can be B8, B16, D1, D2
+Register pairs are written as (Rt1, Rt2).
 
-    SHA1C, SHA1M, SHA1P: SHA1 hash update.
-      SHA1C	<Vm>.S4, Vn, Vd
-      SHA1M	<Vm>.S4, Vn, Vd
-      SHA1P	<Vm>.S4, Vn, Vd
+  Examples:
+    LDP.P -240(R11), (R12, R26)    <=>    ldp x12, x26, [x11],#-240
 
-    SHA1H: SHA1 fixed rotate.
-      SHA1H	Vn, Vd
-
-    SHA1SU0:   SHA1 schedule update 0.
-    SHA256SU1: SHA256 schedule update 1.
-      SHA1SU0	<Vm>.S4, <Vn>.S4, <Vd>.S4
-      SHA256SU1	<Vm>.S4, <Vn>.S4, <Vd>.S4
-
-    SHA1SU1:   SHA1 schedule update 1.
-    SHA256SU0: SHA256 schedule update 0.
-      SHA1SU1	<Vn>.S4, <Vd>.S4
-      SHA256SU0	<Vn>.S4, <Vd>.S4
-
-    SHA256H, SHA256H2: SHA256 hash update.
-      SHA256H	<Vm>.S4, Vn, Vd
-      SHA256H2	<Vm>.S4, Vn, Vd
+Register with arrangement and register with arrangement and index.
 
+  Examples:
+    VADD V5.H8, V18.H8, V9.H8                     <=>    add v9.8h, v18.8h, v5.8h
+    VLD1 (R2), [V21.B16]                          <=>    ld1 {v21.16b}, [x2]
+    VST1.P V9.S[1], (R16)(R21)                    <=>    st1 {v9.s}[1], [x16], x21
+    VST1.P [V13.H8, V14.H8, V15.H8], (R3)(R14)    <=>    st1 {v13.8h-v15.8h}, [x3], x14
+    VST1.P [V14.D1, V15.D1], (R7)(R23)            <=>    st1 {v14.1d, v15.1d}, [x7], x23
 */
+package arm64