Add NEON support for ARM64 (#62)

* Add support for arm64 using NEON instructions

Specifically using the PMULL/PMULL2 polynomial multiplication instructions followed by a reduction step (actually two steps).

* Add ARM performance numbers

* Formatting for performance table

* Refactoring of NEON version and 256-bit wide version

* Expand test slice beyond 32 (for AVX2 and NEON) and test galMulSliceXor explicitly.

* Fix ARM code with missing function.

* Fix missing newline
master
Frank Wessels 2017-08-26 02:47:42 -07:00 committed by Klaus Post
parent d78bf472d8
commit 7b88f42e61
6 changed files with 257 additions and 8 deletions

View File

@ -234,6 +234,16 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
```
# Performance on ARM64 NEON
Performance on ARM64 has been accelerated by using NEON instructions. The numbers below are for a single core of an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie, Go 1.7.4):
| Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
|------|--------|--------|--------------:|----------------:|-----------:|
| 5 | 2 | 40% | 189 | 1304 | 588% |
| 10 | 2 | 20% | 188 | 1738 | 925% |
| 10 | 4 | 40% | 96 | 839 | 877% |
# asm2plan9s
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.

View File

@ -20,7 +20,7 @@ func galMulAVX2(low, high, in, out []byte)
//go:noescape
func sSE2XorSlice(in, out []byte)
// This is what the assembler rountes does in blocks of 16 bytes:
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSSE3(low, high, in, out []byte) {
for n, input := range in {

48
galois_arm64.go Normal file
View File

@ -0,0 +1,48 @@
//+build !noasm
//+build !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
package reedsolomon
// galMulNEON is implemented in assembly (galois_arm64.s). It multiplies the
// bytes of in by c and stores the products in out, handling only complete
// 32-byte blocks; a trailing len(in)%32 bytes are left untouched. The
// assembly never reads len(out), so out must be at least as long as in.
//go:noescape
func galMulNEON(c uint64, in, out []byte)
// galMulXorNEON is implemented in assembly (galois_arm64.s). It multiplies the
// bytes of in by c and XORs the products into the existing contents of out,
// handling only complete 32-byte blocks; a trailing len(in)%32 bytes are left
// untouched. The assembly never reads len(out), so out must be at least as
// long as in.
//go:noescape
func galMulXorNEON(c uint64, in, out []byte)
// galMulSlice stores c*in[i] into out[i] for every byte of in.
//
// The NEON routine covers all complete 32-byte blocks; whatever is left
// (len(in) % 32 bytes) is finished here with a lookup in mulTable.
// The ssse3 and avx2 arguments are not used on arm64.
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulNEON(uint64(c), in, out)

	// Offset of the first byte the 32-byte-wide assembly loop did not reach.
	tail := len(in) &^ 31
	if tail == len(in) {
		return
	}

	table := mulTable[c]
	for idx, v := range in[tail:] {
		out[tail+idx] = table[v]
	}
}
// galMulSliceXor XORs c*in[i] into out[i] for every byte of in.
//
// The NEON routine covers all complete 32-byte blocks; whatever is left
// (len(in) % 32 bytes) is finished here with a lookup in mulTable.
// The ssse3 and avx2 arguments are not used on arm64.
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulXorNEON(uint64(c), in, out)

	// Offset of the first byte the 32-byte-wide assembly loop did not reach.
	tail := len(in) &^ 31
	if tail == len(in) {
		return
	}

	table := mulTable[c]
	for idx, v := range in[tail:] {
		out[tail+idx] ^= table[v]
	}
}
// slice galois add
func sliceXor(in, out []byte, sse2 bool) {
for n, input := range in {
out[n] ^= input
}
}

141
galois_arm64.s Normal file
View File

@ -0,0 +1,141 @@
//+build !noasm !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
// the opcodes of their Plan9 equivalents
// polynomial multiplication
//
// Carry-less (polynomial) multiply of the 32 input bytes held in v26/v27 by
// the byte replicated in every lane of v28 (the callers set v28 with
// `dup v28.16b, w0`). Every 8-bit x 8-bit product is widened to a 16-bit
// lane, so the 32 products occupy four registers:
//   v0  = low  8 bytes of v26 * v28    v6  = high 8 bytes of v26 * v28
//   v12 = low  8 bytes of v27 * v28    v18 = high 8 bytes of v27 * v28
// The products are unreduced; FIRST_REDUCTION and SECOND_REDUCTION follow.
// Do not insert comment-only lines inside the macro: every line must end in
// the `\` continuation (followed only by the trailing disassembly comment).
#define POLYNOMIAL_MULTIPLICATION \
WORD $0x0e3ce340 \ // pmull v0.8h,v26.8b,v28.8b
WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
WORD $0x0e3ce36c \ // pmull v12.8h,v27.8b,v28.8b
WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
// first reduction
//
// Works on the four 16-bit product registers v0, v6, v12, v18 left by
// POLYNOMIAL_MULTIPLICATION, in three steps:
//   1. shrn  - extract the upper byte of every 16-bit lane into
//              v2, v8, v14, v20.
//   2. pmull - polynomial-multiply those upper bytes by v30 (the callers load
//              the 0x1d constant into every byte of v30), giving
//              v3, v9, v15, v21.
//   3. eor   - XOR the result back into v0, v6, v12, v18.
// v2, v8, v14, v20 are deliberately left intact: SECOND_REDUCTION reads them.
#define FIRST_REDUCTION \
WORD $0x0f088402 \ // shrn v2.8b, v0.8h, #8
WORD $0x0f0884c8 \ // shrn v8.8b, v6.8h, #8
WORD $0x0f08858e \ // shrn v14.8b, v12.8h, #8
WORD $0x0f088654 \ // shrn v20.8b, v18.8h, #8
WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
WORD $0x6e201c60 \ // eor v0.16b,v3.16b,v0.16b
WORD $0x6e261d26 \ // eor v6.16b,v9.16b,v6.16b
WORD $0x6e2c1dec \ // eor v12.16b,v15.16b,v12.16b
WORD $0x6e321eb2 // eor v18.16b,v21.16b,v18.16b
// second reduction
//
// Must directly follow FIRST_REDUCTION: it reads v0, v6, v12, v18 (partially
// reduced products) and v2, v8, v14, v20 (the upper bytes extracted there).
//   1. shrn  - extract the upper byte of every partially reduced 16-bit lane
//              into v4, v10, v16, v22.
//   2. eor   - XOR with the upper bytes from the first step (v2, v8, v14,
//              v20), leaving only the bits that the first reduction itself
//              pushed into the upper byte.
//   3. pmull - polynomial-multiply those bytes by v30 (0x1d constant), giving
//              v5, v11, v17, v23.
//   4. eor   - XOR into the products; note the destinations are renumbered to
//              the consecutive registers v0, v1, v2, v3 so that the callers
//              can feed the pairs {v0,v1} and {v2,v3} to TBL.
#define SECOND_REDUCTION \
WORD $0x0f088404 \ // shrn v4.8b, v0.8h, #8
WORD $0x0f0884ca \ // shrn v10.8b, v6.8h, #8
WORD $0x0f088590 \ // shrn v16.8b, v12.8h, #8
WORD $0x0f088656 \ // shrn v22.8b, v18.8h, #8
WORD $0x6e241c44 \ // eor v4.16b,v2.16b,v4.16b
WORD $0x6e2a1d0a \ // eor v10.16b,v8.16b,v10.16b
WORD $0x6e301dd0 \ // eor v16.16b,v14.16b,v16.16b
WORD $0x6e361e96 \ // eor v22.16b,v20.16b,v22.16b
WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
WORD $0x6e201ca0 \ // eor v0.16b,v5.16b,v0.16b
WORD $0x6e261d61 \ // eor v1.16b,v11.16b,v6.16b
WORD $0x6e2c1e22 \ // eor v2.16b,v17.16b,v12.16b
WORD $0x6e321ee3 // eor v3.16b,v23.16b,v18.16b
// func galMulNEON(c uint64, in, out []byte)
//
// Multiplies each byte of in by c (carry-less product reduced with the 0x1d
// polynomial constant) and stores the result in out, 32 bytes per iteration.
// Only complete 32-byte blocks are processed; if len(in) < 32 nothing is
// written, and a trailing len(in)%32 bytes are left for the caller.
// out_len is never loaded, so out must be at least as long as in.
//
// Registers: R0 = c, R1 = in pointer, R2 = bytes remaining minus 32,
// R3 = constants table, R5 = out pointer.
// The TEXT flag 7 is NOPROF|DUPOK|NOSPLIT (1|2|4 in textflag.h); $0 is the
// local frame size.
TEXT ·galMulNEON(SB), 7, $0
MOVD c+0(FP), R0
MOVD in_base+8(FP), R1
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5
// R2 = len - 32; a negative result means there is no full block to process
SUBS $32, R2
BMI complete
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
// (v30 = 0x1d in every byte, v31 = byte indices for TBL)
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
// Broadcast the low byte of c into all 16 lanes of v28
WORD $0x4e010c1c // dup v28.16b, w0
loop:
// Main loop
// Load 32 input bytes into v26/v27 and advance the in pointer by 32
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// v31 holds the even indices 0x00..0x1e, so each TBL gathers the low byte of
// every 16-bit lane of a register pair into one 16-byte vector
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Store result
// Write 32 output bytes and advance the out pointer by 32
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
// Continue while at least one more full 32-byte block remains
SUBS $32, R2
BPL loop
complete:
RET
// func galMulXorNEON(c uint64, in, out []byte)
//
// Multiplies each byte of in by c (carry-less product reduced with the 0x1d
// polynomial constant) and XORs the result into the existing contents of
// out, 32 bytes per iteration. Only complete 32-byte blocks are processed; if
// len(in) < 32 nothing is written, and a trailing len(in)%32 bytes are left
// for the caller. out_len is never loaded, so out must be at least as long
// as in.
//
// Registers: R0 = c, R1 = in pointer, R2 = bytes remaining minus 32,
// R3 = constants table, R5 = out pointer.
// The TEXT flag 7 is NOPROF|DUPOK|NOSPLIT (1|2|4 in textflag.h); $0 is the
// local frame size.
TEXT ·galMulXorNEON(SB), 7, $0
MOVD c+0(FP), R0
MOVD in_base+8(FP), R1
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5
// R2 = len - 32; a negative result means there is no full block to process
SUBS $32, R2
BMI completeXor
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
// (v30 = 0x1d in every byte, v31 = byte indices for TBL)
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
// Broadcast the low byte of c into all 16 lanes of v28
WORD $0x4e010c1c // dup v28.16b, w0
loopXor:
// Main loop
// Load 32 input bytes into v26/v27 and advance the in pointer by 32
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
// Load the current 32 output bytes into v24/v25 (no pointer update, the
// store below writes back to the same address)
WORD $0x4c40a8b8 // ld1 {v24.4s-v25.4s}, [x5]
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// v31 holds the even indices 0x00..0x1e, so each TBL gathers the low byte of
// every 16-bit lane of a register pair into one 16-byte vector
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Xor result and store
WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
// Continue while at least one more full 32-byte block remains
SUBS $32, R2
BPL loopXor
completeXor:
RET
// Constants table
// 32 bytes, loaded as one pair into v30 (bytes 0-15) and v31 (bytes 16-31)
// by galMulNEON and galMulXorNEON.
// generating polynomial is 29 (= 0x1d)
// replicated in every byte; used as the pmull multiplier in the reduction macros
DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
// constant for TBL instruction
// byte indices 0x00, 0x02, ..., 0x1e: selects every even byte (the low byte of
// each 16-bit lane) out of a two-register table
DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
// Flag 8 is RODATA in textflag.h; $32 is the size of the symbol in bytes
GLOBL ·constants(SB), 8, $32

View File

@ -1,4 +1,5 @@
//+build !amd64 noasm appengine
//+build !arm64 noasm appengine
// Copyright 2015, Klaus Post, see LICENSE for details.

View File

@ -128,19 +128,30 @@ func TestGalois(t *testing.T) {
t.Fatal("galMultiply(23, 45) != 41")
}
// Test slices (>16 entries to test assembler)
in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85}
// Test slices (>32 entries to test assembler -- AVX2 & NEON)
in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185}
out := make([]byte, len(in))
galMulSlice(25, in, out, false, false)
expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe}
expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}
expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78}
galMulSliceXor(52, in, out, false, false)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}
galMulSlice(177, in, out, false, false)
expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}
galMulSlice(177, in, out, false, false)
expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d}
galMulSliceXor(117, in, out, false, false)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}
if galExp(2, 2) != 4 {
@ -200,3 +211,41 @@ func TestSliceGalADD(t *testing.T) {
}
}
}
// benchmarkGalois times galMulSlice (multiplier 25) over zero-filled buffers
// of the given size and reports throughput via b.SetBytes.
func benchmarkGalois(b *testing.B, size int) {
	src, dst := make([]byte, size), make([]byte, size)

	b.SetBytes(int64(size))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		galMulSlice(25, src, dst, true, false)
	}
}
// BenchmarkGalois128K runs the galMulSlice benchmark on 128 KiB buffers.
func BenchmarkGalois128K(b *testing.B) {
	const size = 128 << 10
	benchmarkGalois(b, size)
}
// BenchmarkGalois1M runs the galMulSlice benchmark on 1 MiB buffers.
func BenchmarkGalois1M(b *testing.B) {
	const size = 1 << 20
	benchmarkGalois(b, size)
}
// benchmarkGaloisXor times galMulSliceXor (multiplier 177) over zero-filled
// buffers of the given size and reports throughput via b.SetBytes.
func benchmarkGaloisXor(b *testing.B, size int) {
	src, dst := make([]byte, size), make([]byte, size)

	b.SetBytes(int64(size))
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		galMulSliceXor(177, src, dst, true, false)
	}
}
// BenchmarkGaloisXor128K runs the galMulSliceXor benchmark on 128 KiB buffers.
func BenchmarkGaloisXor128K(b *testing.B) {
	const size = 128 << 10
	benchmarkGaloisXor(b, size)
}
// BenchmarkGaloisXor1M runs the galMulSliceXor benchmark on 1 MiB buffers.
func BenchmarkGaloisXor1M(b *testing.B) {
	const size = 1 << 20
	benchmarkGaloisXor(b, size)
}