Add NEON support for ARM64 (#62)

* Add support for arm64 using NEON instructions, specifically the PMULL/PMULL2 polynomial multiplication instructions followed by a reduction step (actually two steps). * Add ARM performance numbers. * Fix formatting of the performance table. * Refactor the NEON version and the 256-bit wide version. * Expand test slice beyond 32 entries (for AVX2 and NEON) and test galMulSliceXor explicitly. * Fix ARM code with missing function. * Fix missing newline.
2017-08-26 02:47:42 -07:00 · 2017-08-26 02:47:42 -07:00 · 7b88f42e61
parent d78bf472d8
commit 7b88f42e61
6 changed files with 257 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -234,6 +234,16 @@ BenchmarkReconstruct50x20x1M-8       1364.35      4189.79      3.07x
 BenchmarkReconstruct10x4x16M-8       1484.35      5779.53      3.89x
 ```

+# Performance on ARM64 NEON
+
+By exploiting NEON instructions, performance on ARM64 has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2 GHz (Debian 8.0 Jessie running Go 1.7.4):
+
+| Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
+|------|--------|--------|--------------:|----------------:|-----------:|
+| 5    | 2      | 40%    |           189 |            1304 |       588% |
+| 10   | 2      | 20%    |           188 |            1738 |       925% |
+| 10   | 4      | 40%    |            96 |             839 |       877% |
+
 # asm2plan9s

 [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
--- a/galois_amd64.go
+++ b/galois_amd64.go
@ -20,7 +20,7 @@ func galMulAVX2(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice(in, out []byte)

-// This is what the assembler rountes does in blocks of 16 bytes:
+// This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
 	for n, input := range in {
--- a/galois_arm64.go
+++ b/galois_arm64.go
@ -0,0 +1,48 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+package reedsolomon
+
+//go:noescape
+func galMulNEON(c uint64, in, out []byte)
+
+//go:noescape
+func galMulXorNEON(c uint64, in, out []byte)
+
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	galMulNEON(uint64(c), in, out)
+	done = (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] = mt[in[i]]
+		}
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	galMulXorNEON(uint64(c), in, out)
+	done = (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] ^= mt[in[i]]
+		}
+	}
+}
+
+// sliceXor performs the slice Galois add: each byte of in is XORed into the
+// byte of out at the same index. sse2 is not used by this implementation.
+func sliceXor(in, out []byte, sse2 bool) {
+	for i := 0; i < len(in); i++ {
+		out[i] ^= in[i]
+	}
+}
--- a/galois_arm64.s
+++ b/galois_arm64.s
@ -0,0 +1,141 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+// Run github.com/minio/asm2plan9s on this file to assemble the ARM instructions
+// given in the trailing comments into their Plan9 WORD opcode equivalents
+
+// polynomial multiplication: multiply the 32 input bytes held in v26/v27 by v28, leaving 16-bit products in v0, v6, v12 and v18
+#define POLYNOMIAL_MULTIPLICATION \
+	WORD $0x0e3ce340 \ // pmull  v0.8h,v26.8b,v28.8b
+	WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
+	WORD $0x0e3ce36c \ // pmull  v12.8h,v27.8b,v28.8b
+	WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
+
+// first reduction: take the high byte of each 16-bit product (shrn #8), multiply it by the constant in v30 and XOR the result back into the product
+#define FIRST_REDUCTION \
+	WORD $0x0f088402 \ // shrn  v2.8b, v0.8h, #8
+	WORD $0x0f0884c8 \ // shrn  v8.8b, v6.8h, #8
+	WORD $0x0f08858e \ // shrn  v14.8b, v12.8h, #8
+	WORD $0x0f088654 \ // shrn  v20.8b, v18.8h, #8
+	WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
+	WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
+	WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
+	WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
+	WORD $0x6e201c60 \ // eor   v0.16b,v3.16b,v0.16b
+	WORD $0x6e261d26 \ // eor   v6.16b,v9.16b,v6.16b
+	WORD $0x6e2c1dec \ // eor   v12.16b,v15.16b,v12.16b
+	WORD $0x6e321eb2 // eor   v18.16b,v21.16b,v18.16b
+
+// second reduction: XOR the new high byte with the one saved by the first reduction, multiply that by v30 and XOR into the running value; results land in v0-v3
+#define SECOND_REDUCTION \
+	WORD $0x0f088404 \ // shrn  v4.8b, v0.8h, #8
+	WORD $0x0f0884ca \ // shrn  v10.8b, v6.8h, #8
+	WORD $0x0f088590 \ // shrn  v16.8b, v12.8h, #8
+	WORD $0x0f088656 \ // shrn  v22.8b, v18.8h, #8
+	WORD $0x6e241c44 \ // eor   v4.16b,v2.16b,v4.16b
+	WORD $0x6e2a1d0a \ // eor   v10.16b,v8.16b,v10.16b
+	WORD $0x6e301dd0 \ // eor   v16.16b,v14.16b,v16.16b
+	WORD $0x6e361e96 \ // eor   v22.16b,v20.16b,v22.16b
+	WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
+	WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
+	WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
+	WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
+	WORD $0x6e201ca0 \ // eor   v0.16b,v5.16b,v0.16b
+	WORD $0x6e261d61 \ // eor   v1.16b,v11.16b,v6.16b
+	WORD $0x6e2c1e22 \ // eor   v2.16b,v17.16b,v12.16b
+	WORD $0x6e321ee3 // eor   v3.16b,v23.16b,v18.16b
+
+// func galMulNEON(c uint64, in, out []byte) -- writes the product c*in to out for every complete 32-byte block of in; a trailing partial block is not touched
+TEXT ·galMulNEON(SB), 7, $0
+	MOVD c+0(FP), R0         // multiplier
+	MOVD in_base+8(FP), R1   // read pointer into in
+	MOVD in_len+16(FP), R2   // length of message
+	MOVD out_base+32(FP), R5 // write pointer into out
+	SUBS $32, R2             // negative when no complete 32-byte block is available
+	BMI  complete

+	// Load constants table pointer
+	MOVD $·constants(SB), R3

+	// and load constants: v30 = the 0x1d polynomial bytes, v31 = byte indices for TBL
+	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]

+	WORD $0x4e010c1c // dup    v28.16b, w0 (copy the multiplier into all 16 byte lanes)

+loop:
+	// Main loop: fetch the next 32 input bytes into v26/v27; x1 is post-incremented by 32
+	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32

+	POLYNOMIAL_MULTIPLICATION

+	FIRST_REDUCTION

+	SECOND_REDUCTION

+	// combine results: v31 selects the even-numbered byte of each 16-bit lane, packing the 32 result bytes into v0/v1
+	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
+	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b

+	// Store result; x5 is post-incremented by 32
+	WORD $0x4c9faca0 // st1    {v0.2d-v1.2d}, [x5], #32

+	SUBS $32, R2
+	BPL  loop // keep going while another complete block remains

+complete:
+	RET
+
+// func galMulXorNEON(c uint64, in, out []byte) -- XORs the product c*in into out for every complete 32-byte block of in; a trailing partial block is not touched
+TEXT ·galMulXorNEON(SB), 7, $0
+	MOVD c+0(FP), R0         // multiplier
+	MOVD in_base+8(FP), R1   // read pointer into in
+	MOVD in_len+16(FP), R2   // length of message
+	MOVD out_base+32(FP), R5 // read/write pointer into out
+	SUBS $32, R2             // negative when no complete 32-byte block is available
+	BMI  completeXor

+	// Load constants table pointer
+	MOVD $·constants(SB), R3

+	// and load constants: v30 = the 0x1d polynomial bytes, v31 = byte indices for TBL
+	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]

+	WORD $0x4e010c1c // dup    v28.16b, w0 (copy the multiplier into all 16 byte lanes)

+loopXor:
+	// Main loop: fetch the next 32 input bytes into v26/v27 (x1 post-incremented) and the current 32 output bytes into v24/v25
+	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32
+	WORD $0x4c40a8b8 // ld1   {v24.4s-v25.4s}, [x5]

+	POLYNOMIAL_MULTIPLICATION

+	FIRST_REDUCTION

+	SECOND_REDUCTION

+	// combine results: v31 selects the even-numbered byte of each 16-bit lane, packing the 32 result bytes into v0/v1
+	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
+	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b

+	// Xor result with the previously loaded output bytes and store; x5 is post-incremented by 32
+	WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
+	WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
+	WORD $0x4c9faca0 // st1   {v0.2d-v1.2d}, [x5], #32

+	SUBS $32, R2
+	BPL  loopXor // keep going while another complete block remains

+completeXor:
+	RET
+
+// Constants table (32 bytes; both routines load it into v30 and v31)
+//   generating polynomial is 29 (= 0x1d), repeated in all 16 bytes of the first half
+DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
+DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
+//   constant for TBL instruction: the even byte indices 0x00, 0x02, ..., 0x1e
+DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
+DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210

+GLOBL ·constants(SB), 8, $32
--- a/galois_noasm.go
+++ b/galois_noasm.go
@ -1,4 +1,5 @@
 //+build !amd64 noasm appengine
+//+build !arm64 noasm appengine

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/galois_test.go
+++ b/galois_test.go
@ -128,19 +128,30 @@ func TestGalois(t *testing.T) {
 		t.Fatal("galMultiply(23, 45) != 41")
 	}

-	// Test slices (>16 entries to test assembler)
-	in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85}
+	// Test slices (>32 entries to test assembler -- AVX2 & NEON)
+	in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185}
 	out := make([]byte, len(in))
 	galMulSlice(25, in, out, false, false)
-	expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe}
+	expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47}
+	if 0 != bytes.Compare(out, expect) {
+		t.Errorf("got %#v, expected %#v", out, expect)
+	}
+	expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78}
+	galMulSliceXor(52, in, out, false, false)
+	if 0 != bytes.Compare(out, expectXor) {
+		t.Errorf("got %#v, expected %#v", out, expectXor)
+	}
+
+	galMulSlice(177, in, out, false, false)
+	expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a}
 	if 0 != bytes.Compare(out, expect) {
 		t.Errorf("got %#v, expected %#v", out, expect)
 	}

-	galMulSlice(177, in, out, false, false)
-	expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb}
-	if 0 != bytes.Compare(out, expect) {
-		t.Errorf("got %#v, expected %#v", out, expect)
+	expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d}
+	galMulSliceXor(117, in, out, false, false)
+	if 0 != bytes.Compare(out, expectXor) {
+		t.Errorf("got %#v, expected %#v", out, expectXor)
 	}

 	if galExp(2, 2) != 4 {
@ -200,3 +211,41 @@ func TestSliceGalADD(t *testing.T) {
 		}
 	}
 }
+
+// benchmarkGalois times galMulSlice with multiplier 25 over a zero-filled
+// buffer of size bytes and reports throughput based on that size.
+func benchmarkGalois(b *testing.B, size int) {
+	src, dst := make([]byte, size), make([]byte, size)

+	b.SetBytes(int64(size))
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		galMulSlice(25, src, dst, true, false)
+	}
+}
+
+func BenchmarkGalois128K(b *testing.B) {
+	benchmarkGalois(b, 128*1024)
+}
+
+func BenchmarkGalois1M(b *testing.B) {
+	benchmarkGalois(b, 1024*1024)
+}
+
+// benchmarkGaloisXor times galMulSliceXor with multiplier 177 over a
+// zero-filled buffer of size bytes and reports throughput based on that size.
+func benchmarkGaloisXor(b *testing.B, size int) {
+	src, dst := make([]byte, size), make([]byte, size)

+	b.SetBytes(int64(size))
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		galMulSliceXor(177, src, dst, true, false)
+	}
+}
+
+func BenchmarkGaloisXor128K(b *testing.B) {
+	benchmarkGaloisXor(b, 128*1024)
+}
+
+func BenchmarkGaloisXor1M(b *testing.B) {
+	benchmarkGaloisXor(b, 1024*1024)
+}