parent
278ba25f43
commit
8885f3a1c7
16
README.md
16
README.md
|
@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon
|
|||
|
||||
# Changes
|
||||
|
||||
## December 18, 2018
|
||||
|
||||
Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
|
||||
|
||||
## November 18, 2017
|
||||
|
||||
Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.
|
||||
|
@ -259,6 +263,18 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
|
|||
| 10 | 2 | 20% | 188 | 1738 | 925% |
|
||||
| 10 | 4 | 40% | 96 | 839 | 877% |
|
||||
|
||||
# Performance on ppc64le
|
||||
|
||||
The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture, as can be seen below:
|
||||
|
||||
```
|
||||
benchmark old MB/s new MB/s speedup
|
||||
BenchmarkGalois128K-160 948.87 8878.85 9.36x
|
||||
BenchmarkGalois1M-160 968.85 9041.92 9.33x
|
||||
BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x
|
||||
BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x
|
||||
```
|
||||
|
||||
# asm2plan9s
|
||||
|
||||
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
//+build !amd64 noasm appengine gccgo
|
||||
//+build !arm64 noasm appengine gccgo
|
||||
//+build !ppc64le noasm appengine gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
//+build !noasm
|
||||
//+build !appengine
|
||||
//+build !gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
// Copyright 2018, Minio, Inc.
|
||||
|
||||
package reedsolomon
|
||||
|
||||
// galMulPpc is implemented in assembly. For every byte b of in it stores
// low[b&0xf] ^ high[b>>4] at the same index of out, working in blocks of
// 16 bytes (see the reference implementation in the comment below).
// Only the length of in drives the routine; callers in this file pass a
// non-empty in whose length is a multiple of 16.
//go:noescape
|
||||
func galMulPpc(low, high, in, out []byte)
|
||||
|
||||
// galMulPpcXor is the accumulating variant of galMulPpc, also implemented in
// assembly: for every byte b of in it XORs low[b&0xf] ^ high[b>>4] into the
// byte already present at the same index of out.
//go:noescape
|
||||
func galMulPpcXor(low, high, in, out []byte)
|
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/*
|
||||
func galMulPpc(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
out[n] = low[l] ^ high[h]
|
||||
}
|
||||
}
|
||||
func galMulPpcXor(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
out[n] ^= low[l] ^ high[h]
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
done := (len(in) >> 4) << 4
|
||||
if done > 0 {
|
||||
galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
|
||||
}
|
||||
remain := len(in) - done
|
||||
if remain > 0 {
|
||||
mt := mulTable[c]
|
||||
for i := done; i < len(in); i++ {
|
||||
out[i] = mt[in[i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
done := (len(in) >> 4) << 4
|
||||
if done > 0 {
|
||||
galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
|
||||
}
|
||||
remain := len(in) - done
|
||||
if remain > 0 {
|
||||
mt := mulTable[c]
|
||||
for i := done; i < len(in); i++ {
|
||||
out[i] ^= mt[in[i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sliceXor adds in to out in the Galois field, i.e. it XORs each byte of in
// into the byte at the same index of out. Bytes of out beyond len(in) are left
// untouched. out must be at least as long as in. The sse2 flag is not used by
// this implementation.
func sliceXor(in, out []byte, sse2 bool) {
	for i := range in {
		out[i] ^= in[i]
	}
}
|
|
@ -0,0 +1,126 @@
|
|||
//+build !noasm
//+build !appengine
//+build !gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
// Copyright 2018, Minio, Inc.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// General-purpose register assignments shared by both routines below:
// pointers to the low/high lookup tables, the input and output buffers, the
// input length, the address of the constants table, the running byte offset
// into in/out, and two fixed offsets (16 and 32) used to index the constants.
#define LOW R3
|
||||
#define HIGH R4
|
||||
#define IN R5
|
||||
#define LEN R6
|
||||
#define OUT R7
|
||||
#define CONSTANTS R8
|
||||
#define OFFSET R9
|
||||
#define OFFSET1 R10
|
||||
#define OFFSET2 R11
|
||||
|
||||
// Every vector register gets two names: NAME is its VSX name (VS34..VS41),
// used by the LXVD2X/STXVD2X loads and stores, and NAME_ is its VMX name
// (V2..V9), used by VPERM/VSRB/VAND/VXOR. On POWER, VS(32+n) and Vn refer to
// the same physical register, so both names address the same data.
// X6/X7 hold the low/high lookup tables.
#define X6 VS34
|
||||
#define X6_ V2
|
||||
#define X7 VS35
|
||||
#define X7_ V3
|
||||
// MSG holds the current 16 input bytes (later their low nibbles and the
// combined result); MSG_HI holds the high nibbles.
#define MSG VS36
|
||||
#define MSG_ V4
|
||||
#define MSG_HI VS37
|
||||
#define MSG_HI_ V5
|
||||
// RESULT holds the existing output bytes; only galMulPpcXor uses it.
#define RESULT VS38
|
||||
#define RESULT_ V6
|
||||
// ROTATE, MASK and FLIP are loaded from ·constants at offsets 0, 16 and 32.
#define ROTATE VS39
|
||||
#define ROTATE_ V7
|
||||
#define MASK VS40
|
||||
#define MASK_ V8
|
||||
#define FLIP VS41
|
||||
#define FLIP_ V9
|
||||
|
||||
|
||||
// func galMulPpc(low, high, in, out []byte)
//
// For each 16-byte block of in, computes low[b&0xf] ^ high[b>>4] for every
// byte b and stores the 16 results at the same offset in out.
//
// Preconditions visible from the code: only in_len is read (the lengths of
// low, high and out are never consulted), and the loop body runs before the
// length test, so at least one full 16-byte block is always processed and
// blocks keep being processed while in_len > offset. Callers must therefore
// pass a non-empty in whose length is a multiple of 16, and an out that is at
// least as long.
|
||||
TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96
|
||||
// Load the slice data pointers and len(in) from the argument frame
// (each slice header occupies 24 bytes: ptr, len, cap).
MOVD low+0(FP), LOW
|
||||
MOVD high+24(FP), HIGH
|
||||
MOVD in+48(FP), IN
|
||||
MOVD in_len+56(FP), LEN
|
||||
MOVD out+72(FP), OUT
|
||||
|
||||
// Byte offsets of the second and third 16-byte entries in ·constants.
MOVD $16, OFFSET1
|
||||
MOVD $32, OFFSET2
|
||||
|
||||
// Load the shift count (0x04 per byte), the nibble mask (0x0f per byte) and
// the FLIP permute control vector.
MOVD $·constants(SB), CONSTANTS
|
||||
LXVD2X (CONSTANTS)(R0), ROTATE
|
||||
LXVD2X (CONSTANTS)(OFFSET1), MASK
|
||||
LXVD2X (CONSTANTS)(OFFSET2), FLIP
|
||||
|
||||
// Load the first 16 bytes of each lookup table and reorder them through FLIP
// (presumably to undo the element ordering of LXVD2X on little-endian so that
// table byte i ends up at permute index i - confirm).
// NOTE(review): V31 is never initialized in this routine. It is only the
// second VPERM source, and every selector used here is in 0..15 (FLIP holds
// 0x00..0x0f, MSG_ is masked with 0x0f, MSG_HI_ is shifted right by 4), so it
// should never be selected from - confirm.
LXVD2X (LOW)(R0), X6
|
||||
LXVD2X (HIGH)(R0), X7
|
||||
VPERM X6_, V31, FLIP_, X6_
|
||||
VPERM X7_, V31, FLIP_, X7_
|
||||
|
||||
MOVD $0, OFFSET
|
||||
|
||||
loop:
|
||||
// Fetch the next 16 input bytes.
LXVD2X (IN)(OFFSET), MSG
|
||||
|
||||
// Split each byte into its high nibble (shift right by 4) and low nibble
// (mask with 0x0f), then use the nibbles as indexes into the two tables.
VSRB MSG_, ROTATE_, MSG_HI_
|
||||
VAND MSG_, MASK_, MSG_
|
||||
VPERM X6_, V31, MSG_, MSG_
|
||||
VPERM X7_, V31, MSG_HI_, MSG_HI_
|
||||
|
||||
// Combine the two partial products.
VXOR MSG_, MSG_HI_, MSG_
|
||||
|
||||
// Overwrite the corresponding 16 output bytes.
STXVD2X MSG, (OUT)(OFFSET)
|
||||
|
||||
// Advance to the next block and repeat while LEN > OFFSET.
ADD $16, OFFSET, OFFSET
|
||||
CMP LEN, OFFSET
|
||||
BGT loop
|
||||
RET
|
||||
|
||||
|
||||
// func galMulPpcXor(low, high, in, out []byte)
//
// For each 16-byte block of in, computes low[b&0xf] ^ high[b>>4] for every
// byte b and XORs the 16 results into the bytes already stored at the same
// offset in out.
//
// Preconditions visible from the code: only in_len is read (the lengths of
// low, high and out are never consulted), and the loop body runs before the
// length test, so at least one full 16-byte block is always processed and
// blocks keep being processed while in_len > offset. Callers must therefore
// pass a non-empty in whose length is a multiple of 16, and an out that is at
// least as long.
|
||||
TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96
|
||||
// Load the slice data pointers and len(in) from the argument frame
// (each slice header occupies 24 bytes: ptr, len, cap).
MOVD low+0(FP), LOW
|
||||
MOVD high+24(FP), HIGH
|
||||
MOVD in+48(FP), IN
|
||||
MOVD in_len+56(FP), LEN
|
||||
MOVD out+72(FP), OUT
|
||||
|
||||
// Byte offsets of the second and third 16-byte entries in ·constants.
MOVD $16, OFFSET1
|
||||
MOVD $32, OFFSET2
|
||||
|
||||
// Load the shift count (0x04 per byte), the nibble mask (0x0f per byte) and
// the FLIP permute control vector.
MOVD $·constants(SB), CONSTANTS
|
||||
LXVD2X (CONSTANTS)(R0), ROTATE
|
||||
LXVD2X (CONSTANTS)(OFFSET1), MASK
|
||||
LXVD2X (CONSTANTS)(OFFSET2), FLIP
|
||||
|
||||
// Load the first 16 bytes of each lookup table and reorder them through FLIP
// (presumably to undo the element ordering of LXVD2X on little-endian so that
// table byte i ends up at permute index i - confirm).
// NOTE(review): V31 is never initialized in this routine. It is only the
// second VPERM source, and every selector used here is in 0..15 (FLIP holds
// 0x00..0x0f, MSG_ is masked with 0x0f, MSG_HI_ is shifted right by 4), so it
// should never be selected from - confirm.
LXVD2X (LOW)(R0), X6
|
||||
LXVD2X (HIGH)(R0), X7
|
||||
VPERM X6_, V31, FLIP_, X6_
|
||||
VPERM X7_, V31, FLIP_, X7_
|
||||
|
||||
MOVD $0, OFFSET
|
||||
|
||||
loopXor:
|
||||
// Fetch the next 16 input bytes and the 16 bytes currently in out.
LXVD2X (IN)(OFFSET), MSG
|
||||
LXVD2X (OUT)(OFFSET), RESULT
|
||||
|
||||
// Split each byte into its high nibble (shift right by 4) and low nibble
// (mask with 0x0f), then use the nibbles as indexes into the two tables.
VSRB MSG_, ROTATE_, MSG_HI_
|
||||
VAND MSG_, MASK_, MSG_
|
||||
VPERM X6_, V31, MSG_, MSG_
|
||||
VPERM X7_, V31, MSG_HI_, MSG_HI_
|
||||
|
||||
// Combine the two partial products, then fold them into the existing output.
VXOR MSG_, MSG_HI_, MSG_
|
||||
VXOR MSG_, RESULT_, RESULT_
|
||||
|
||||
// Write the accumulated 16 bytes back to out.
STXVD2X RESULT, (OUT)(OFFSET)
|
||||
|
||||
// Advance to the next block and repeat while LEN > OFFSET.
ADD $16, OFFSET, OFFSET
|
||||
CMP LEN, OFFSET
|
||||
BGT loopXor
|
||||
RET
|
||||
|
||||
// ·constants holds three 16-byte vectors used by galMulPpc and galMulPpcXor.
//
// Offset 0x00: 0x04 in every byte - per-byte shift count loaded into ROTATE
// and used by VSRB to extract the high nibble.
DATA ·constants+0x0(SB)/8, $0x0404040404040404
|
||||
DATA ·constants+0x8(SB)/8, $0x0404040404040404
|
||||
// Offset 0x10: 0x0f in every byte - mask loaded into MASK and used by VAND to
// extract the low nibble.
DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||
DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
|
||||
// Offset 0x20: byte values 0x00..0x0f - permute control loaded into FLIP and
// applied once to each lookup table after it is loaded.
DATA ·constants+0x20(SB)/8, $0x0706050403020100
|
||||
DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908
|
||||
|
||||
// 48 bytes in total; flag 8 is RODATA in textflag.h (read-only data).
GLOBL ·constants(SB), 8, $48
|
Loading…
Reference in New Issue