Make sure assembler is formatted (#145)

* Make sure assembler is formatted
master
Klaus Post 2020-05-14 12:04:55 +02:00 committed by GitHub
parent 27f8a7b6bf
commit f338110979
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 40 deletions

View File

@ -20,7 +20,7 @@ go:
install:
- go get ./...
script:
script:
- go vet ./...
- go test -cpu=1,2 .
- go test -tags=noasm -cpu=1,2 .
@ -29,10 +29,14 @@ script:
- go build examples/stream-decoder.go
- go build examples/stream-encoder.go
stages:
- gofmt
- test
- deploy
jobs:
allow_failures:
- go: 'master'
- arch: s390x
fast_finish: true
include:
- stage: gofmt
@ -40,7 +44,10 @@ jobs:
os: linux
arch: amd64
script:
- diff <(gofmt -d .) <("")
- diff <(gofmt -d .) <(printf "")
- diff <(gofmt -d ./examples) <(printf "")
- go install github.com/klauspost/asmfmt/cmd/asmfmt
- diff <(asmfmt -d .) <(printf "")
- stage: race
go: 1.14.x
os: linux

View File

@ -16,8 +16,8 @@
VPTERNLOGD $0x96, LO, HI, OUT
#define GALOIS(C1, C2, IN, LO, HI, OUT) \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
GALOIS_MUL(LO, HI, LO, HI, OUT)
//
@ -73,49 +73,49 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0
loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4
LOAD(0x00) // &in[0][0]
LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z20, Z14, Z15, Z4)
CMPQ AX, $1
JE skip_avx512_parallel81
LOAD(0x18) // &in[1][0]
LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z24, Z28, Z14, Z15, Z4)
CMPQ AX, $2
JE skip_avx512_parallel81
LOAD(0x30) // &in[2][0]
LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z21, Z14, Z15, Z4)
CMPQ AX, $3
JE skip_avx512_parallel81
LOAD(0x48) // &in[3][0]
LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z25, Z29, Z14, Z15, Z4)
CMPQ AX, $4
JE skip_avx512_parallel81
LOAD(0x60) // &in[4][0]
LOAD(0x60) // &in[4][0]
GALOIS_MUL(Z18, Z22, Z14, Z15, Z4)
CMPQ AX, $5
JE skip_avx512_parallel81
LOAD(0x78) // &in[5][0]
LOAD(0x78) // &in[5][0]
GALOIS_MUL(Z26, Z30, Z14, Z15, Z4)
CMPQ AX, $6
JE skip_avx512_parallel81
LOAD(0x90) // &in[6][0]
LOAD(0x90) // &in[6][0]
GALOIS_MUL(Z19, Z23, Z14, Z15, Z4)
CMPQ AX, $7
JE skip_avx512_parallel81
LOAD(0xa8) // &in[7][0]
LOAD(0xa8) // &in[7][0]
GALOIS_MUL(Z27, Z31, Z14, Z15, Z4)
skip_avx512_parallel81:
@ -191,28 +191,28 @@ loopback_avx512_parallel82:
VMOVDQU64.Z (DX), K1, Z4
VMOVDQU64.Z (CX), K1, Z5
LOAD(0x00) // &in[0][0]
LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z24, Z14, Z15, Z4)
GALOIS_MUL(Z20, Z27, Z12, Z13, Z5)
CMPQ AX, $1
JE skip_avx512_parallel82
LOAD(0x18) // &in[1][0]
LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z25, Z26, Z14, Z15, Z4)
GALOIS_MUL(Z28, Z29, Z12, Z13, Z5)
CMPQ AX, $2
JE skip_avx512_parallel82
LOAD(0x30) // &in[2][0]
LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z30, Z14, Z15, Z4)
GALOIS_MUL(Z21, Z8, Z12, Z13, Z5)
CMPQ AX, $3
JE skip_avx512_parallel82
LOAD(0x48) // &in[3][0]
LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z31, Z11, Z14, Z15, Z4)
GALOIS_MUL(Z9, Z10, Z12, Z13, Z5)

View File

@ -5,13 +5,13 @@
// func galMulNEON(low, high, in, out []byte)
TEXT ·galMulNEON(SB), 7, $0
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5
SUBS $32, R2
BMI complete
MOVD low+0(FP), R10 // R10: &low
MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16]
@ -22,7 +22,7 @@ TEXT ·galMulNEON(SB), 7, $0
//
MOVD $0x0f, R3
VMOV R3, V8.B[0]
VDUP V8.B[0], V8.B16
VDUP V8.B[0], V8.B16
loop:
// Main loop
@ -31,8 +31,8 @@ loop:
// Get low input and high input
VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
// Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16
@ -41,8 +41,8 @@ loop:
VTBL V11.B16, [V7.B16], V15.B16
// Combine results
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
@ -53,16 +53,15 @@ loop:
complete:
RET
// func galMulXorNEON(low, high, in, out []byte)
TEXT ·galMulXorNEON(SB), 7, $0
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5
SUBS $32, R2
BMI completeXor
MOVD low+0(FP), R10 // R10: &low
MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16]
@ -73,7 +72,7 @@ TEXT ·galMulXorNEON(SB), 7, $0
//
MOVD $0x0f, R3
VMOV R3, V8.B[0]
VDUP V8.B[0], V8.B16
VDUP V8.B[0], V8.B16
loopXor:
// Main loop
@ -83,8 +82,8 @@ loopXor:
// Get low input and high input
VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16
// Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16
@ -93,10 +92,10 @@ loopXor:
VTBL V11.B16, [V7.B16], V15.B16
// Combine results
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V20.B16, V4.B16, V4.B16
VEOR V21.B16, V5.B16, V5.B16
VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16
VEOR V20.B16, V4.B16, V4.B16
VEOR V21.B16, V5.B16, V5.B16
// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
@ -109,8 +108,8 @@ completeXor:
// func galXorNEON(in, out []byte)
TEXT ·galXorNEON(SB), 7, $0
MOVD in_base+0(FP), R1
MOVD in_len+8(FP), R2 // length of message
MOVD in_base+0(FP), R1
MOVD in_len+8(FP), R2 // length of message
MOVD out_base+24(FP), R5
SUBS $32, R2
BMI completeXor
@ -120,8 +119,8 @@ loopXor:
VLD1.P 32(R1), [V0.B16, V1.B16]
VLD1 (R5), [V20.B16, V21.B16]
VEOR V20.B16, V0.B16, V4.B16
VEOR V21.B16, V1.B16, V5.B16
VEOR V20.B16, V0.B16, V4.B16
VEOR V21.B16, V1.B16, V5.B16
// Store result
VST1.P [V4.D2, V5.D2], 32(R5)
@ -131,3 +130,4 @@ loopXor:
completeXor:
RET