Make sure assembler is formatted (#145)

* Make sure assembler is formatted
master
Klaus Post 2020-05-14 12:04:55 +02:00 committed by GitHub
parent 27f8a7b6bf
commit f338110979
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 40 deletions

View File

@ -29,10 +29,14 @@ script:
- go build examples/stream-decoder.go - go build examples/stream-decoder.go
- go build examples/stream-encoder.go - go build examples/stream-encoder.go
stages:
- gofmt
- test
- deploy
jobs: jobs:
allow_failures: allow_failures:
- go: 'master' - go: 'master'
- arch: s390x
fast_finish: true fast_finish: true
include: include:
- stage: gofmt - stage: gofmt
@ -40,7 +44,10 @@ jobs:
os: linux os: linux
arch: amd64 arch: amd64
script: script:
- diff <(gofmt -d .) <("") - diff <(gofmt -d .) <(printf "")
- diff <(gofmt -d ./examples) <(printf "")
- go install github.com/klauspost/asmfmt/cmd/asmfmt
- diff <(asmfmt -d .) <(printf "")
- stage: race - stage: race
go: 1.14.x go: 1.14.x
os: linux os: linux

View File

@ -16,8 +16,8 @@
VPTERNLOGD $0x96, LO, HI, OUT VPTERNLOGD $0x96, LO, HI, OUT
#define GALOIS(C1, C2, IN, LO, HI, OUT) \ #define GALOIS(C1, C2, IN, LO, HI, OUT) \
VSHUFI64X2 $C1, IN, IN, LO \ VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \ VSHUFI64X2 $C2, IN, IN, HI \
GALOIS_MUL(LO, HI, LO, HI, OUT) GALOIS_MUL(LO, HI, LO, HI, OUT)
// //
@ -73,49 +73,49 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0
loopback_avx512_parallel81: loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4 VMOVDQU64.Z (DX), K1, Z4
LOAD(0x00) // &in[0][0] LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z20, Z14, Z15, Z4) GALOIS_MUL(Z16, Z20, Z14, Z15, Z4)
CMPQ AX, $1 CMPQ AX, $1
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x18) // &in[1][0] LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z24, Z28, Z14, Z15, Z4) GALOIS_MUL(Z24, Z28, Z14, Z15, Z4)
CMPQ AX, $2 CMPQ AX, $2
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x30) // &in[2][0] LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z21, Z14, Z15, Z4) GALOIS_MUL(Z17, Z21, Z14, Z15, Z4)
CMPQ AX, $3 CMPQ AX, $3
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x48) // &in[3][0] LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z25, Z29, Z14, Z15, Z4) GALOIS_MUL(Z25, Z29, Z14, Z15, Z4)
CMPQ AX, $4 CMPQ AX, $4
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x60) // &in[4][0] LOAD(0x60) // &in[4][0]
GALOIS_MUL(Z18, Z22, Z14, Z15, Z4) GALOIS_MUL(Z18, Z22, Z14, Z15, Z4)
CMPQ AX, $5 CMPQ AX, $5
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x78) // &in[5][0] LOAD(0x78) // &in[5][0]
GALOIS_MUL(Z26, Z30, Z14, Z15, Z4) GALOIS_MUL(Z26, Z30, Z14, Z15, Z4)
CMPQ AX, $6 CMPQ AX, $6
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0x90) // &in[6][0] LOAD(0x90) // &in[6][0]
GALOIS_MUL(Z19, Z23, Z14, Z15, Z4) GALOIS_MUL(Z19, Z23, Z14, Z15, Z4)
CMPQ AX, $7 CMPQ AX, $7
JE skip_avx512_parallel81 JE skip_avx512_parallel81
LOAD(0xa8) // &in[7][0] LOAD(0xa8) // &in[7][0]
GALOIS_MUL(Z27, Z31, Z14, Z15, Z4) GALOIS_MUL(Z27, Z31, Z14, Z15, Z4)
skip_avx512_parallel81: skip_avx512_parallel81:
@ -191,28 +191,28 @@ loopback_avx512_parallel82:
VMOVDQU64.Z (DX), K1, Z4 VMOVDQU64.Z (DX), K1, Z4
VMOVDQU64.Z (CX), K1, Z5 VMOVDQU64.Z (CX), K1, Z5
LOAD(0x00) // &in[0][0] LOAD(0x00) // &in[0][0]
GALOIS_MUL(Z16, Z24, Z14, Z15, Z4) GALOIS_MUL(Z16, Z24, Z14, Z15, Z4)
GALOIS_MUL(Z20, Z27, Z12, Z13, Z5) GALOIS_MUL(Z20, Z27, Z12, Z13, Z5)
CMPQ AX, $1 CMPQ AX, $1
JE skip_avx512_parallel82 JE skip_avx512_parallel82
LOAD(0x18) // &in[1][0] LOAD(0x18) // &in[1][0]
GALOIS_MUL(Z25, Z26, Z14, Z15, Z4) GALOIS_MUL(Z25, Z26, Z14, Z15, Z4)
GALOIS_MUL(Z28, Z29, Z12, Z13, Z5) GALOIS_MUL(Z28, Z29, Z12, Z13, Z5)
CMPQ AX, $2 CMPQ AX, $2
JE skip_avx512_parallel82 JE skip_avx512_parallel82
LOAD(0x30) // &in[2][0] LOAD(0x30) // &in[2][0]
GALOIS_MUL(Z17, Z30, Z14, Z15, Z4) GALOIS_MUL(Z17, Z30, Z14, Z15, Z4)
GALOIS_MUL(Z21, Z8, Z12, Z13, Z5) GALOIS_MUL(Z21, Z8, Z12, Z13, Z5)
CMPQ AX, $3 CMPQ AX, $3
JE skip_avx512_parallel82 JE skip_avx512_parallel82
LOAD(0x48) // &in[3][0] LOAD(0x48) // &in[3][0]
GALOIS_MUL(Z31, Z11, Z14, Z15, Z4) GALOIS_MUL(Z31, Z11, Z14, Z15, Z4)
GALOIS_MUL(Z9, Z10, Z12, Z13, Z5) GALOIS_MUL(Z9, Z10, Z12, Z13, Z5)

View File

@ -5,13 +5,13 @@
// func galMulNEON(low, high, in, out []byte) // func galMulNEON(low, high, in, out []byte)
TEXT ·galMulNEON(SB), 7, $0 TEXT ·galMulNEON(SB), 7, $0
MOVD in_base+48(FP), R1 MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5 MOVD out_base+72(FP), R5
SUBS $32, R2 SUBS $32, R2
BMI complete BMI complete
MOVD low+0(FP), R10 // R10: &low MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16] VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16] VLD1 (R11), [V7.B16]
@ -31,8 +31,8 @@ loop:
// Get low input and high input // Get low input and high input
VUSHR $4, V0.B16, V10.B16 VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16 VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16 VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16 VAND V8.B16, V1.B16, V1.B16
// Mul low part and mul high part // Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16 VTBL V0.B16, [V6.B16], V4.B16
@ -41,8 +41,8 @@ loop:
VTBL V11.B16, [V7.B16], V15.B16 VTBL V11.B16, [V7.B16], V15.B16
// Combine results // Combine results
VEOR V5.B16, V4.B16, V4.B16 VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16 VEOR V15.B16, V14.B16, V5.B16
// Store result // Store result
VST1.P [V4.D2, V5.D2], 32(R5) VST1.P [V4.D2, V5.D2], 32(R5)
@ -53,16 +53,15 @@ loop:
complete: complete:
RET RET
// func galMulXorNEON(low, high, in, out []byte) // func galMulXorNEON(low, high, in, out []byte)
TEXT ·galMulXorNEON(SB), 7, $0 TEXT ·galMulXorNEON(SB), 7, $0
MOVD in_base+48(FP), R1 MOVD in_base+48(FP), R1
MOVD in_len+56(FP), R2 // length of message MOVD in_len+56(FP), R2 // length of message
MOVD out_base+72(FP), R5 MOVD out_base+72(FP), R5
SUBS $32, R2 SUBS $32, R2
BMI completeXor BMI completeXor
MOVD low+0(FP), R10 // R10: &low MOVD low+0(FP), R10 // R10: &low
MOVD high+24(FP), R11 // R11: &high MOVD high+24(FP), R11 // R11: &high
VLD1 (R10), [V6.B16] VLD1 (R10), [V6.B16]
VLD1 (R11), [V7.B16] VLD1 (R11), [V7.B16]
@ -83,8 +82,8 @@ loopXor:
// Get low input and high input // Get low input and high input
VUSHR $4, V0.B16, V10.B16 VUSHR $4, V0.B16, V10.B16
VUSHR $4, V1.B16, V11.B16 VUSHR $4, V1.B16, V11.B16
VAND V8.B16, V0.B16, V0.B16 VAND V8.B16, V0.B16, V0.B16
VAND V8.B16, V1.B16, V1.B16 VAND V8.B16, V1.B16, V1.B16
// Mul low part and mul high part // Mul low part and mul high part
VTBL V0.B16, [V6.B16], V4.B16 VTBL V0.B16, [V6.B16], V4.B16
@ -93,10 +92,10 @@ loopXor:
VTBL V11.B16, [V7.B16], V15.B16 VTBL V11.B16, [V7.B16], V15.B16
// Combine results // Combine results
VEOR V5.B16, V4.B16, V4.B16 VEOR V5.B16, V4.B16, V4.B16
VEOR V15.B16, V14.B16, V5.B16 VEOR V15.B16, V14.B16, V5.B16
VEOR V20.B16, V4.B16, V4.B16 VEOR V20.B16, V4.B16, V4.B16
VEOR V21.B16, V5.B16, V5.B16 VEOR V21.B16, V5.B16, V5.B16
// Store result // Store result
VST1.P [V4.D2, V5.D2], 32(R5) VST1.P [V4.D2, V5.D2], 32(R5)
@ -109,8 +108,8 @@ completeXor:
// func galXorNEON(in, out []byte) // func galXorNEON(in, out []byte)
TEXT ·galXorNEON(SB), 7, $0 TEXT ·galXorNEON(SB), 7, $0
MOVD in_base+0(FP), R1 MOVD in_base+0(FP), R1
MOVD in_len+8(FP), R2 // length of message MOVD in_len+8(FP), R2 // length of message
MOVD out_base+24(FP), R5 MOVD out_base+24(FP), R5
SUBS $32, R2 SUBS $32, R2
BMI completeXor BMI completeXor
@ -120,8 +119,8 @@ loopXor:
VLD1.P 32(R1), [V0.B16, V1.B16] VLD1.P 32(R1), [V0.B16, V1.B16]
VLD1 (R5), [V20.B16, V21.B16] VLD1 (R5), [V20.B16, V21.B16]
VEOR V20.B16, V0.B16, V4.B16 VEOR V20.B16, V0.B16, V4.B16
VEOR V21.B16, V1.B16, V5.B16 VEOR V21.B16, V1.B16, V5.B16
// Store result // Store result
VST1.P [V4.D2, V5.D2], 32(R5) VST1.P [V4.D2, V5.D2], 32(R5)
@ -131,3 +130,4 @@ loopXor:
completeXor: completeXor:
RET RET