Discussion:
[gem5-dev] Review Request 3547: cpu,
(too old to reply)
Fernando Endo
2016-07-10 18:45:42 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------

Review request for Default.


Repository: gem5


Description
-------

Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish memory accesses that use the FP register bank from those that use the INT register bank.


Diffs
-----

src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6

Diff: http://reviews.gem5.org/r/3547/diff/


Testing
-------

The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not be buggy.
Load and store instructions were also tested; each is listed below, followed by its #uops and opClass as reported in stats.txt:

asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead

asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead

asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead

asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead

asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead

asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead

asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead

asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead

asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead

asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead

asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead

asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite

asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite

asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite

asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite

asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite

asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite

asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite

asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite

asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite


Thanks,

Fernando Endo
Giacomo Gabrielli
2016-07-13 13:36:26 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8474
-----------------------------------------------------------


Thanks for this contribution, the Float/Simd split for AArch64 makes a lot of sense.
Overall the modifications look great, I only have a couple of comments:
1. I'm not sure whether MinorCPU would still work, given the additional opclasses (it might just ignore them). I'd suggest updating src/cpu/minor/MinorCPU.py as well to include the new "Float" opclasses in MinorDefaultFloatSimdFU; that should be enough to get it working, I believe.
2. I'm not too keen on the addition of FloatMem{Read/Write}. These might be useful for generating instruction distributions, but from a functional perspective all ARM loads/stores just deal with bytes and do not need to interpret their content, apart from endianness conversions (not sure about x86). I understand that in this context "Float" means that the destination is a Float register; in my view, however, the opclass in gem5 is mostly a way to steer instructions to functional units, and in this case plain Mem{Read/Write} and FloatMem{Read/Write} will always land on the same datapath...

- Giacomo Gabrielli
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 10, 2016, 6:45 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
Diffs
-----
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested, indeed the neon64.isa and fp64.isa files already separate the implementations, so it should not be buggy.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Fernando Endo
2016-07-16 16:57:03 UTC
Permalink
Post by Fernando Endo
Post by Giacomo Gabrielli
Thanks for this contribution, the Float/Simd split for AArch64 makes a lot of sense.
1. I'm not sure whether MinorCPU would still work, given the additional opclasses (it might just ignore them). I'd suggest to update src/cpu/minor/MinorCPU.py as well to include the new "Float" opclasses in MinorDefaultFloatSimdFU; that should be enough to get it working, I believe.
2. I'm not too keen on the addition of FloatMem{Read/Write}. These might be useful for generating instruction distributions, but from a functional perspective all ARM loads/stores just deal with bytes and do not need to interpret their content, apart from endianness conversions (not sure about x86). I understand that in this context "Float" means that the destination is a Float register, but in my view the opclass in gem5 is mostly a way to steer instructions to functional units, but in this case plain Mem{Read/Write} and FloatMem{Read/Write} will always land on the same datapath...
Hello Giacomo,

Thanks for your suggestions, I fixed and tested the Minor CPU config.
Regarding the FloatMem* opClass, I followed the gem5 spirit of having a highly configurable user interface. In my patch I purposefully put FloatMemWrite in the same "functional unit" (i.e., execution port) as MemWrite, and likewise FloatMemRead with MemRead, which is the usual arrangement. However, a user may want, for example, to have a longer latency for FP loads/stores, or to set a separate execution port for them.


- Fernando


-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8474
-----------------------------------------------------------
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested, indeed the neon64.isa and fp64.isa files already separate the implementations, so it should not be buggy.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Giacomo Gabrielli
2016-07-20 09:47:23 UTC
Permalink
Post by Fernando Endo
Post by Giacomo Gabrielli
Thanks for this contribution, the Float/Simd split for AArch64 makes a lot of sense.
1. I'm not sure whether MinorCPU would still work, given the additional opclasses (it might just ignore them). I'd suggest to update src/cpu/minor/MinorCPU.py as well to include the new "Float" opclasses in MinorDefaultFloatSimdFU; that should be enough to get it working, I believe.
2. I'm not too keen on the addition of FloatMem{Read/Write}. These might be useful for generating instruction distributions, but from a functional perspective all ARM loads/stores just deal with bytes and do not need to interpret their content, apart from endianness conversions (not sure about x86). I understand that in this context "Float" means that the destination is a Float register, but in my view the opclass in gem5 is mostly a way to steer instructions to functional units, but in this case plain Mem{Read/Write} and FloatMem{Read/Write} will always land on the same datapath...
Hello Giacomo,
Thanks for your suggestions, I fixed and tested the Minor CPU config.
Regarding the FloatMem* opClass, I took the gem5 spirit of having a highly configurable user interface. In my patch I purposefully put the FloatMemWrite in the same "functional unit" (i.e., execution port) as MemWrite, and idem for FloatMemRead, which is the usual. However, a user may want to have longer latency for FP loads/stores for example. Or set a separate execution port for them.
Overall I'm happy with the current status of the patch. Thanks!


- Giacomo


-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8474
-----------------------------------------------------------
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested, indeed the neon64.isa and fp64.isa files already separate the implementations, so it should not be buggy.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Fernando Endo
2016-07-16 16:44:38 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------

(Updated July 16, 2016, 4:44 p.m.)


Review request for Default.


Changes
-------

Upload new diff


Repository: gem5


Description
-------

Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish writes to the INT and FP register banks.


Diffs (updated)
-----

configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6

Diff: http://reviews.gem5.org/r/3547/diff/


Testing
-------

The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not be buggy.
The following load and store instructions were tested; each is followed by its #uops and opClass from stats.txt:

asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead

asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead

asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead

asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead

asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead

asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead

asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead

asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead

asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead

asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead

asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead

asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite

asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite

asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite

asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite

asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite

asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite

asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite

asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite

asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite


Thanks,

Fernando Endo
Jason Lowe-Power
2016-07-19 14:58:34 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8483
-----------------------------------------------------------


Seems reasonable to me except for the comment below. It would be good if one of the ARM folks had a look at this and signed off.

Also, does this affect the regression stats? I would imagine so.


configs/common/O3_ARM_v7a.py (line 65)
<http://reviews.gem5.org/r/3547/#comment7346>

Does this change the performance at all? Is there a need for this change?


- Jason Lowe-Power
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested, indeed the neon64.isa and fp64.isa files already separate the implementations, so it should not be buggy.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Giacomo Gabrielli
2016-07-20 09:43:53 UTC
Permalink
Post by Fernando Endo
configs/common/O3_ARM_v7a.py, line 65
<http://reviews.gem5.org/r/3547/diff/2/?file=57281#file57281line65>
Does this change the performance at all? Is there a need for this change?
I think there is definitely a need for this change - the single-cycle FMA looked like an oversight...
This will certainly affect the regression stats, but in a good way :)


- Giacomo


-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8483
-----------------------------------------------------------
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguishes writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested, indeed the neon64.isa and fp64.isa files already separate the implementations, so it should not be buggy.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Fernando Endo
2016-07-31 11:50:03 UTC
Permalink
Post by Giacomo Gabrielli
configs/common/O3_ARM_v7a.py, line 65
<http://reviews.gem5.org/r/3547/diff/2/?file=57281#file57281line65>
Does this change the performance at all? Is there a need for this change?
I think there is definitely a need for this change - the single-cycle FMA looked like an oversight...
This will certainly affect the regression stats, but in a good way :)
Briefly, in the Cortex-A72, the "latency" of an FMADD is 3 cycles if the next instruction is also an FMADD whose only dependency on it is from the destination to the augend; otherwise its latency is 7 cycles. Averaging the two, we get 5.

I'm currently working on a fix for this too; it is called late-forwarding.


- Fernando


-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8483
-----------------------------------------------------------
Post by Giacomo Gabrielli
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Jason Lowe-Power
2016-08-01 14:11:16 UTC
Permalink
Post by Giacomo Gabrielli
configs/common/O3_ARM_v7a.py, line 65
<http://reviews.gem5.org/r/3547/diff/2/?file=57281#file57281line65>
Does this change the performance at all? Is there a need for this change?
I think there is definitely a need for this change - the single-cycle FMA looked like an oversight...
This will certainly affect the regression stats, but in a good way :)
Briefly, in the Cortex-A72, the "latency" of an FMADD is 3 cycles if the next instruction is also an FMADD whose only dependency on it is from the destination to the augend; otherwise its latency is 7 cycles. Averaging the two, we get 5.
I'm currently working on a fix for this too; it is called late-forwarding.
Gotcha. Could you add something to the commit message about this? Also, the change will be easier to commit if you format the commit message according to the formatting guidelines: http://www.m5sim.org/Submitting_Contributions#Commit_Messages

Thanks!


- Jason


-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8483
-----------------------------------------------------------
Post by Giacomo Gabrielli
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Giacomo Gabrielli
2016-07-20 10:50:04 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8487
-----------------------------------------------------------

Ship it!


Ship It!

- Giacomo Gabrielli
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Jason Lowe-Power
2016-08-01 14:11:19 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8567
-----------------------------------------------------------

Ship it!


Ship It!

- Jason Lowe-Power
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated July 16, 2016, 4:44 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which distinguish writes to the INT and FP register banks.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Fernando Endo
2016-08-01 15:57:06 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------

(Updated Aug. 1, 2016, 3:57 p.m.)


Review request for Default.


Changes
-------

Format summary and description


Summary (updated)
-----------------

cpu, arm: Separate Float* from SimdFloat*, add FloatMem* opClass


Repository: gem5


Description (updated)
-------

Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to
Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which
distinguish writes to the INT and FP register banks.
Change the latency of (Simd)FloatMultAcc to 5, based on the Cortex-A72,
where the "latency" of FMADD is 3 if the next instruction is an FMADD and
has only the augend-to-destination dependency; otherwise it is 7 cycles.


Diffs
-----

configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6

Diff: http://reviews.gem5.org/r/3547/diff/


Testing
-------

The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
The following load and store instructions were tested; each is followed by its #uops and opClass from stats.txt:

asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead

asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead

asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead

asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead

asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead

asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead

asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead

asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead

asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead

asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead

asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead

asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead

asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite

asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite

asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite

asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite

asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite

asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite

asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite

asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite

asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite


Thanks,

Fernando Endo
Jason Lowe-Power
2016-10-15 20:31:36 UTC
Permalink
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
http://reviews.gem5.org/r/3547/#review8849
-----------------------------------------------------------


Please mark this as submitted. Thanks.

- Jason Lowe-Power
Post by Fernando Endo
-----------------------------------------------------------
http://reviews.gem5.org/r/3547/
-----------------------------------------------------------
(Updated Aug. 1, 2016, 3:57 p.m.)
Review request for Default.
Repository: gem5
Description
-------
Modify the opClass assigned to AArch64 FP instructions from SimdFloat* to
Float*. Also create the FloatMemRead and FloatMemWrite opClasses, which
distinguish writes to the INT and FP register banks.
Change the latency of (Simd)FloatMultAcc to 5, based on the Cortex-A72,
where the "latency" of FMADD is 3 if the next instruction is an FMADD and
has only the augend-to-destination dependency; otherwise it is 7 cycles.
Diffs
-----
configs/common/O3_ARM_v7a.py cdb94f2332a6
src/arch/arm/isa/insts/fp64.isa cdb94f2332a6
src/arch/isa_parser.py cdb94f2332a6
src/cpu/FuncUnit.py cdb94f2332a6
src/cpu/minor/MinorCPU.py cdb94f2332a6
src/cpu/o3/FuncUnitConfig.py cdb94f2332a6
src/cpu/op_class.hh cdb94f2332a6
Diff: http://reviews.gem5.org/r/3547/diff/
Testing
-------
The changes from SimdFloat* to Float* were quickly tested; indeed, the neon64.isa and fp64.isa files already separate the implementations, so the change should not introduce bugs.
asm("ldr X2, [X30], #0"); // 1 MemRead
asm("ldp X2, X3, [X30], #0"); // 1 MemRead
asm("ldpsw X2, X3, [X30], #0"); // 1 MemRead
asm("ldr W2, [X30], #0"); // 1 MemRead
asm("ldp W2, W3, [X30], #0"); // 1 MemRead
asm("ldp Q0, Q1, [SP], #0"); // 2 FloatMemRead
asm("ldp D2, D3, [SP], #0"); // 1 FloatMemRead
asm("ldp S2, S3, [X30], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP], #0"); // 1 FloatMemRead
asm("ldr Q4, [SP, X28]"); // 1 FloatMemRead
asm("ldr D5, [SP], #0"); // 1 FloatMemRead
asm("ldr D5, [SP, X28]"); // 1 FloatMemRead
asm("ldr S6, [SP], #0"); // 1 FloatMemRead
asm("ldr S2, [X30, X28]"); // 1 FloatMemRead
asm("ldr H7, [X30], #0"); // 1 FloatMemRead
asm("ldr B8, [SP], #0"); // 1 FloatMemRead
asm("ldur B0, [X30, #0]"); // 1 FloatMemRead
asm("ldur H0, [X30, #0]"); // 1 FloatMemRead
asm("ldur S0, [X30, #0]"); // 1 FloatMemRead
asm("ldur D0, [X30, #0]"); // 1 FloatMemRead
asm("ldur Q0, [X30, #0]"); // 1 FloatMemRead
asm("ld1 {V9.B}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.B}[15], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.H}[7], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.S}[3], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[0], [X30]"); // 1 FloatMemRead
asm("ld1 {V9.D}[1], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.B, V10.B}[15], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.H, V10.H}[7], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.S, V10.S}[3], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[0], [X30]"); // 1 FloatMemRead
asm("ld2 {V9.D, V10.D}[1], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.B, V10.B, V11.B}[15], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.H, V10.H, V11.H}[7], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[0], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.S, V10.S, V11.S}[3], [X30]"); // 1 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[0], [X30]"); // 2 FloatMemRead
asm("ld3 {V9.D, V10.D, V11.D}[1], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.B, V10.B, V11.B, V12.B}[15], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.H, V10.H, V11.H, V12.H}[7], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[0], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.S, V10.S, V11.S, V12.S}[3], [X30]"); // 1 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[0], [X30]"); // 2 FloatMemRead
asm("ld4 {V9.D, V10.D, V11.D, V12.D}[1], [X30]"); // 2 FloatMemRead
asm("ld1 {V9.16B}, [X30], #16"); // 1 FloatMemRead
asm("ld1 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld1 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld2 {V9.8H, V10.8H}, [X30], #32"); // 2 FloatMemRead
asm("ld3 {V9.4S, V10.4S, V11.4S}, [X30], #48"); // 3 FloatMemRead
asm("ld4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X30], #64"); // 4 FloatMemRead
asm("ld1r {V9.16B}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.8H}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.4S}, [X30]"); // 1 FloatMemRead
asm("ld1r {V9.2D}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.16B, V10.16B}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.8H, V10.8H}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.4S, V10.4S}, [X30]"); // 1 FloatMemRead
asm("ld2r {V9.2D, V10.2D}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.16B, V10.16B, V11.16B}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.8H, V10.8H, V11.8H}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.4S, V10.4S, V11.4S}, [X30]"); // 1 FloatMemRead
asm("ld3r {V9.2D, V10.2D, V11.2D}, [X30]"); // 2 FloatMemRead
asm("ld4r {V9.16B, V10.16B, V11.16B, V12.16B}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.8H, V10.8H, V11.8H, V12.8H}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.4S, V10.4S, V11.4S, V12.4S}, [X30]"); // 1 FloatMemRead
asm("ld4r {V9.2D, V10.2D, V11.2D, V12.2D}, [X30]"); // 2 FloatMemRead
asm("str X2, [X30], #0"); // 1 MemWrite
asm("stp X2, X3, [X30], #0"); // 2 MemWrite
asm("str W2, [X30], #0"); // 1 MemWrite
asm("stp W2, W3, [X30], #0"); // 1 MemWrite
asm("stp Q0, Q1, [X29], #0"); // 4 FloatMemWrite
asm("stp D2, D3, [X29], #0"); // 2 FloatMemWrite
asm("stp S2, S3, [X29], #0"); // 1 FloatMemWrite
asm("str Q4, [X29], #0"); // 2 FloatMemWrite
asm("str D5, [X29], #0"); // 1 FloatMemWrite
asm("str S6, [X29], #0"); // 1 FloatMemWrite
asm("str H7, [X29], #0"); // 1 FloatMemWrite
asm("str B8, [X29], #0"); // 1 FloatMemWrite
asm("stur B0, [X29, #0]"); // 1 FloatMemWrite
asm("stur H0, [X29, #0]"); // 1 FloatMemWrite
asm("stur S0, [X29, #0]"); // 1 FloatMemWrite
asm("stur D0, [X29, #0]"); // 1 FloatMemWrite
asm("stur Q0, [X29, #0]"); // 2 FloatMemWrite
asm("st1 {V9.B}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.B}[15], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.H}[7], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.S}[3], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[0], [X29]"); // 1 FloatMemWrite
asm("st1 {V9.D}[1], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.B, V10.B}[15], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.H, V10.H}[7], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.S, V10.S}[3], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[0], [X29]"); // 1 FloatMemWrite
asm("st2 {V9.D, V10.D}[1], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.B, V10.B, V11.B}[15], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.H, V10.H, V11.H}[7], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[0], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.S, V10.S, V11.S}[3], [X29]"); // 1 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[0], [X29]"); // 2 FloatMemWrite
asm("st3 {V9.D, V10.D, V11.D}[1], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.B, V10.B, V11.B, V12.B}[15], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.H, V10.H, V11.H, V12.H}[7], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[0], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.S, V10.S, V11.S, V12.S}[3], [X29]"); // 1 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[0], [X29]"); // 2 FloatMemWrite
asm("st4 {V9.D, V10.D, V11.D, V12.D}[1], [X29]"); // 2 FloatMemWrite
asm("st1 {V9.16B}, [X29], #16"); // 1 FloatMemWrite
asm("st1 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st1 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st1 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
asm("st2 {V9.8H, V10.8H}, [X29], #32"); // 2 FloatMemWrite
asm("st3 {V9.4S, V10.4S, V11.4S}, [X29], #48"); // 3 FloatMemWrite
asm("st4 {V9.2D, V10.2D, V11.2D, V12.2D}, [X29], #64"); // 4 FloatMemWrite
Thanks,
Fernando Endo
Loading...