//+build !noasm !appengine !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.

// func galMulNEON(low, high, in, out []byte)
//
// Table-driven byte transform, 32 bytes per iteration:
//
//	out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4]
//
// (presumably a GF(2^8) multiply by a constant whose nibble tables are
// low/high - confirm against the Go caller).
//
// Only whole 32-byte blocks of in are processed. If len(in) < 32 nothing is
// written, and any trailing len(in) % 32 bytes are left for the caller.
// low and high are each read as 16 bytes; len(out) is never consulted.
//
// Register roles:
//	R1      read cursor into in  (post-incremented by 32)
//	R5      write cursor into out (post-incremented by 32)
//	R2      bytes still to process minus 32; negative => no full block left
//	R3      scratch for building the nibble mask
//	R10/R11 addresses of the low / high tables
//	V6, V7  low table, high table
//	V8      0x0f in every byte lane
TEXT ·galMulNEON(SB), 7, $0
	// Bail out before touching anything else when there is no full block.
	MOVD in_len+56(FP), R2        // R2 = len(in)
	SUBS $32, R2
	BMI  mulDone

	MOVD in_base+48(FP), R1       // R1 = &in[0]
	MOVD out_base+72(FP), R5      // R5 = &out[0]

	// Load both 16-entry lookup tables.
	MOVD low_base+0(FP), R10      // R10 = &low[0]
	MOVD high_base+24(FP), R11    // R11 = &high[0]
	VLD1 (R10), [V6.B16]
	VLD1 (R11), [V7.B16]

	// Broadcast 0x0f to all 16 lanes of V8. Done in two steps (insert into
	// lane 0, then duplicate the lane) because the assembler rejects the
	// single-instruction form `VDUP R3, V8.B16`
	// (dup v8.16b, w3 == WORD $0x4e010c68).
	MOVD $0x0f, R3
	VMOV R3, V8.B[0]
	VDUP V8.B[0], V8.B16

mulLoop:
	// Fetch the next 32 input bytes and advance the read cursor.
	VLD1.P 32(R1), [V0.B16, V1.B16]

	// First 16 bytes: split into nibbles, look each up, combine.
	VUSHR $4, V0.B16, V10.B16         // V10 = high nibbles
	VAND  V8.B16, V0.B16, V0.B16      // V0  = low nibbles
	VTBL  V0.B16, [V6.B16], V4.B16    // low-table lookup
	VTBL  V10.B16, [V7.B16], V5.B16   // high-table lookup
	VEOR  V5.B16, V4.B16, V4.B16      // V4  = result for bytes 0..15

	// Second 16 bytes: same steps.
	VUSHR $4, V1.B16, V11.B16         // V11 = high nibbles
	VAND  V8.B16, V1.B16, V1.B16      // V1  = low nibbles
	VTBL  V1.B16, [V6.B16], V14.B16   // low-table lookup
	VTBL  V11.B16, [V7.B16], V15.B16  // high-table lookup
	VEOR  V15.B16, V14.B16, V5.B16    // V5  = result for bytes 16..31

	// Write 32 result bytes and advance the write cursor.
	VST1.P [V4.D2, V5.D2], 32(R5)

	// Another full block left?
	SUBS $32, R2
	BPL  mulLoop

mulDone:
	RET
|
|
|
|
// func galMulXorNEON(low, high, in, out []byte)
//
// Same table-driven transform as a nibble lookup through low/high, but the
// result is XOR-accumulated into out instead of overwriting it:
//
//	out[i] ^= low[in[i] & 0x0f] ^ high[in[i] >> 4]
//
// Only whole 32-byte blocks of in are processed. If len(in) < 32 nothing is
// written, and any trailing len(in) % 32 bytes are left for the caller.
// low and high are each read as 16 bytes; len(out) is never consulted.
//
// Register roles:
//	R1       read cursor into in  (post-incremented by 32)
//	R5       read/write cursor into out (post-incremented by 32 on store)
//	R2       bytes still to process minus 32; negative => no full block left
//	R3       scratch for building the nibble mask
//	R10/R11  addresses of the low / high tables
//	V6, V7   low table, high table
//	V8       0x0f in every byte lane
//	V20, V21 current contents of the 32 output bytes
TEXT ·galMulXorNEON(SB), 7, $0
	// Bail out before touching anything else when there is no full block.
	MOVD in_len+56(FP), R2        // R2 = len(in)
	SUBS $32, R2
	BMI  mulXorDone

	MOVD in_base+48(FP), R1       // R1 = &in[0]
	MOVD out_base+72(FP), R5      // R5 = &out[0]

	// Load both 16-entry lookup tables.
	MOVD low_base+0(FP), R10      // R10 = &low[0]
	MOVD high_base+24(FP), R11    // R11 = &high[0]
	VLD1 (R10), [V6.B16]
	VLD1 (R11), [V7.B16]

	// Broadcast 0x0f to all 16 lanes of V8. Done in two steps (insert into
	// lane 0, then duplicate the lane) because the assembler rejects the
	// single-instruction form `VDUP R3, V8.B16`
	// (dup v8.16b, w3 == WORD $0x4e010c68).
	MOVD $0x0f, R3
	VMOV R3, V8.B[0]
	VDUP V8.B[0], V8.B16

mulXorLoop:
	// Fetch 32 input bytes (advancing R1) and the 32 output bytes they
	// will be folded into (R5 is advanced later, by the store).
	VLD1.P 32(R1), [V0.B16, V1.B16]
	VLD1   (R5), [V20.B16, V21.B16]

	// First 16 bytes: split into nibbles, look each up, combine, accumulate.
	VUSHR $4, V0.B16, V10.B16         // V10 = high nibbles
	VAND  V8.B16, V0.B16, V0.B16      // V0  = low nibbles
	VTBL  V0.B16, [V6.B16], V4.B16    // low-table lookup
	VTBL  V10.B16, [V7.B16], V5.B16   // high-table lookup
	VEOR  V5.B16, V4.B16, V4.B16      // V4  = looked-up value, bytes 0..15
	VEOR  V20.B16, V4.B16, V4.B16     // V4 ^= existing out bytes 0..15

	// Second 16 bytes: same steps.
	VUSHR $4, V1.B16, V11.B16         // V11 = high nibbles
	VAND  V8.B16, V1.B16, V1.B16      // V1  = low nibbles
	VTBL  V1.B16, [V6.B16], V14.B16   // low-table lookup
	VTBL  V11.B16, [V7.B16], V15.B16  // high-table lookup
	VEOR  V15.B16, V14.B16, V5.B16    // V5  = looked-up value, bytes 16..31
	VEOR  V21.B16, V5.B16, V5.B16     // V5 ^= existing out bytes 16..31

	// Write 32 result bytes and advance the output cursor.
	VST1.P [V4.D2, V5.D2], 32(R5)

	// Another full block left?
	SUBS $32, R2
	BPL  mulXorLoop

mulXorDone:
	RET
|
|
|
|
// func galXorNEON(in, out []byte)
//
// XORs in into out, 32 bytes per iteration:
//
//	out[i] ^= in[i]
//
// Only whole 32-byte blocks of in are processed. If len(in) < 32 nothing is
// written, and any trailing len(in) % 32 bytes are left for the caller.
// len(out) is never consulted.
//
// Register roles:
//	R1       read cursor into in  (post-incremented by 32)
//	R5       read/write cursor into out (post-incremented by 32 on store)
//	R2       bytes still to process minus 32; negative => no full block left
//	V0, V1   32 input bytes
//	V20, V21 current contents of the 32 output bytes
TEXT ·galXorNEON(SB), 7, $0
	// Bail out before touching anything else when there is no full block.
	MOVD in_len+8(FP), R2         // R2 = len(in)
	SUBS $32, R2
	BMI  xorDone

	MOVD in_base+0(FP), R1        // R1 = &in[0]
	MOVD out_base+24(FP), R5      // R5 = &out[0]

xorLoop:
	// Fetch 32 input bytes (advancing R1) and the 32 output bytes they
	// will be folded into (R5 is advanced later, by the store).
	VLD1.P 32(R1), [V0.B16, V1.B16]
	VLD1   (R5), [V20.B16, V21.B16]

	VEOR V20.B16, V0.B16, V4.B16  // bytes 0..15:  in ^ out
	VEOR V21.B16, V1.B16, V5.B16  // bytes 16..31: in ^ out

	// Write 32 result bytes and advance the output cursor.
	VST1.P [V4.D2, V5.D2], 32(R5)

	// Another full block left?
	SUBS $32, R2
	BPL  xorLoop

xorDone:
	RET
|
|
|