diff --git a/.travis.yml b/.travis.yml index 21aae6b..94ec04e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ go: install: - go get ./... -script: +script: - go vet ./... - go test -cpu=1,2 . - go test -tags=noasm -cpu=1,2 . @@ -29,10 +29,14 @@ script: - go build examples/stream-decoder.go - go build examples/stream-encoder.go +stages: + - gofmt + - test + - deploy + jobs: allow_failures: - go: 'master' - - arch: s390x fast_finish: true include: - stage: gofmt @@ -40,7 +44,10 @@ jobs: os: linux arch: amd64 script: - - diff <(gofmt -d .) <("") + - diff <(gofmt -d .) <(printf "") + - diff <(gofmt -d ./examples) <(printf "") + - go install github.com/klauspost/asmfmt/cmd/asmfmt + - diff <(asmfmt -d .) <(printf "") - stage: race go: 1.14.x os: linux diff --git a/galoisAvx512_amd64.s b/galoisAvx512_amd64.s index e631b02..97ad420 100644 --- a/galoisAvx512_amd64.s +++ b/galoisAvx512_amd64.s @@ -16,8 +16,8 @@ VPTERNLOGD $0x96, LO, HI, OUT #define GALOIS(C1, C2, IN, LO, HI, OUT) \ - VSHUFI64X2 $C1, IN, IN, LO \ - VSHUFI64X2 $C2, IN, IN, HI \ + VSHUFI64X2 $C1, IN, IN, LO \ + VSHUFI64X2 $C2, IN, IN, HI \ GALOIS_MUL(LO, HI, LO, HI, OUT) // @@ -73,49 +73,49 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0 loopback_avx512_parallel81: VMOVDQU64.Z (DX), K1, Z4 - LOAD(0x00) // &in[0][0] + LOAD(0x00) // &in[0][0] GALOIS_MUL(Z16, Z20, Z14, Z15, Z4) CMPQ AX, $1 JE skip_avx512_parallel81 - LOAD(0x18) // &in[1][0] + LOAD(0x18) // &in[1][0] GALOIS_MUL(Z24, Z28, Z14, Z15, Z4) CMPQ AX, $2 JE skip_avx512_parallel81 - LOAD(0x30) // &in[2][0] + LOAD(0x30) // &in[2][0] GALOIS_MUL(Z17, Z21, Z14, Z15, Z4) CMPQ AX, $3 JE skip_avx512_parallel81 - LOAD(0x48) // &in[3][0] + LOAD(0x48) // &in[3][0] GALOIS_MUL(Z25, Z29, Z14, Z15, Z4) CMPQ AX, $4 JE skip_avx512_parallel81 - LOAD(0x60) // &in[4][0] + LOAD(0x60) // &in[4][0] GALOIS_MUL(Z18, Z22, Z14, Z15, Z4) CMPQ AX, $5 JE skip_avx512_parallel81 - LOAD(0x78) // &in[5][0] + LOAD(0x78) // &in[5][0] GALOIS_MUL(Z26, Z30, Z14, Z15, Z4) CMPQ AX, $6 JE skip_avx512_parallel81 - LOAD(0x90) // &in[6][0] + LOAD(0x90) // &in[6][0] GALOIS_MUL(Z19, Z23, Z14, Z15, Z4) CMPQ AX, $7 JE skip_avx512_parallel81 - LOAD(0xa8) // &in[7][0] + LOAD(0xa8) // &in[7][0] GALOIS_MUL(Z27, Z31, Z14, Z15, Z4) skip_avx512_parallel81: @@ -191,28 +191,28 @@ loopback_avx512_parallel82: VMOVDQU64.Z (DX), K1, Z4 VMOVDQU64.Z (CX), K1, Z5 - LOAD(0x00) // &in[0][0] + LOAD(0x00) // &in[0][0] GALOIS_MUL(Z16, Z24, Z14, Z15, Z4) GALOIS_MUL(Z20, Z27, Z12, Z13, Z5) CMPQ AX, $1 JE skip_avx512_parallel82 - LOAD(0x18) // &in[1][0] + LOAD(0x18) // &in[1][0] GALOIS_MUL(Z25, Z26, Z14, Z15, Z4) GALOIS_MUL(Z28, Z29, Z12, Z13, Z5) CMPQ AX, $2 JE skip_avx512_parallel82 - LOAD(0x30) // &in[2][0] + LOAD(0x30) // &in[2][0] GALOIS_MUL(Z17, Z30, Z14, Z15, Z4) GALOIS_MUL(Z21, Z8, Z12, Z13, Z5) CMPQ AX, $3 JE skip_avx512_parallel82 - LOAD(0x48) // &in[3][0] + LOAD(0x48) // &in[3][0] GALOIS_MUL(Z31, Z11, Z14, Z15, Z4) GALOIS_MUL(Z9, Z10, Z12, Z13, Z5) diff --git a/galois_arm64.s b/galois_arm64.s index 3149a0b..890f555 100644 --- a/galois_arm64.s +++ b/galois_arm64.s @@ -5,13 +5,13 @@ // func galMulNEON(low, high, in, out []byte) TEXT ·galMulNEON(SB), 7, $0 - MOVD in_base+48(FP), R1 - MOVD in_len+56(FP), R2 // length of message + MOVD in_base+48(FP), R1 + MOVD in_len+56(FP), R2 // length of message MOVD out_base+72(FP), R5 SUBS $32, R2 BMI complete - MOVD low+0(FP), R10 // R10: &low + MOVD low+0(FP), R10 // R10: &low MOVD high+24(FP), R11 // R11: &high VLD1 (R10), [V6.B16] VLD1 (R11), [V7.B16] @@ -22,7 +22,7 @@ TEXT ·galMulNEON(SB), 7, $0 // MOVD $0x0f, R3 VMOV R3, V8.B[0] - VDUP V8.B[0], V8.B16 + VDUP V8.B[0], V8.B16 loop: // Main loop @@ -31,8 +31,8 @@ loop: // Get low input and high input VUSHR $4, V0.B16, V10.B16 VUSHR $4, V1.B16, V11.B16 - VAND V8.B16, V0.B16, V0.B16 - VAND V8.B16, V1.B16, V1.B16 + VAND V8.B16, V0.B16, V0.B16 + VAND V8.B16, V1.B16, V1.B16 // Mul low part and mul high part VTBL V0.B16, [V6.B16], V4.B16 @@ -41,8 +41,8 @@ loop: VTBL V11.B16, [V7.B16], V15.B16 // Combine results - VEOR V5.B16, V4.B16, V4.B16 - VEOR V15.B16, V14.B16, V5.B16 + VEOR V5.B16, V4.B16, V4.B16 + VEOR V15.B16, V14.B16, V5.B16 // Store result VST1.P [V4.D2, V5.D2], 32(R5) @@ -53,16 +53,15 @@ loop: complete: RET - // func galMulXorNEON(low, high, in, out []byte) TEXT ·galMulXorNEON(SB), 7, $0 - MOVD in_base+48(FP), R1 - MOVD in_len+56(FP), R2 // length of message + MOVD in_base+48(FP), R1 + MOVD in_len+56(FP), R2 // length of message MOVD out_base+72(FP), R5 SUBS $32, R2 BMI completeXor - MOVD low+0(FP), R10 // R10: &low + MOVD low+0(FP), R10 // R10: &low MOVD high+24(FP), R11 // R11: &high VLD1 (R10), [V6.B16] VLD1 (R11), [V7.B16] @@ -73,7 +72,7 @@ TEXT ·galMulXorNEON(SB), 7, $0 // MOVD $0x0f, R3 VMOV R3, V8.B[0] - VDUP V8.B[0], V8.B16 + VDUP V8.B[0], V8.B16 loopXor: // Main loop @@ -83,8 +82,8 @@ loopXor: // Get low input and high input VUSHR $4, V0.B16, V10.B16 VUSHR $4, V1.B16, V11.B16 - VAND V8.B16, V0.B16, V0.B16 - VAND V8.B16, V1.B16, V1.B16 + VAND V8.B16, V0.B16, V0.B16 + VAND V8.B16, V1.B16, V1.B16 // Mul low part and mul high part VTBL V0.B16, [V6.B16], V4.B16 @@ -93,10 +92,10 @@ loopXor: VTBL V11.B16, [V7.B16], V15.B16 // Combine results - VEOR V5.B16, V4.B16, V4.B16 - VEOR V15.B16, V14.B16, V5.B16 - VEOR V20.B16, V4.B16, V4.B16 - VEOR V21.B16, V5.B16, V5.B16 + VEOR V5.B16, V4.B16, V4.B16 + VEOR V15.B16, V14.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 // Store result VST1.P [V4.D2, V5.D2], 32(R5) @@ -109,8 +108,8 @@ completeXor: // func galXorNEON(in, out []byte) TEXT ·galXorNEON(SB), 7, $0 - MOVD in_base+0(FP), R1 - MOVD in_len+8(FP), R2 // length of message + MOVD in_base+0(FP), R1 + MOVD in_len+8(FP), R2 // length of message MOVD out_base+24(FP), R5 SUBS $32, R2 BMI completeXor @@ -120,8 +119,8 @@ loopXor: VLD1.P 32(R1), [V0.B16, V1.B16] VLD1 (R5), [V20.B16, V21.B16] - VEOR V20.B16, V0.B16, V4.B16 - VEOR V21.B16, V1.B16, V5.B16 + VEOR V20.B16, V0.B16, V4.B16 + VEOR V21.B16, V1.B16, V5.B16 // Store result VST1.P [V4.D2, V5.D2], 32(R5) @@ -131,3 +130,4 @@ loopXor: completeXor: RET +