2015-06-21 23:54:13 +03:00
/ / + build ! n o a s m ! a p p e n g i n e
2015-06-21 22:23:22 +03:00
// Copyright 2015, Klaus Post, see LICENSE for details.
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master
2015-06-24 17:57:38 +03:00
// func galMulSSSE3Xor(low, high, in, out []byte)
//
// For every complete 16-byte group of in, computes
//     out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
// using PSHUFB as a 16-entry byte table lookup.
//
// Requirements visible from the code:
//   - low and high are each read as one 16-byte vector.
//   - len(in)/16 groups are processed; a trailing partial group is left
//     untouched, and out is read/written for exactly that many bytes.
//   - len(out) is never consulted, so the caller must size out accordingly.
//
// Register use: SI, DX, BX, R9, X0-X8.
TEXT ·galMulSSSE3Xor(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low[0]
	MOVQ   high+24(FP), DX   // DX: &high[0]
	MOVOU  (SI), X6          // X6: low-nibble lookup table
	MOVOU  (DX), X7          // X7: high-nibble lookup table
	MOVQ   $15, BX           // BX: nibble mask 0x0f
	MOVQ   BX, X8            // X8: byte 0 = 0x0f
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in[0]
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out[0]
	PSHUFB X5, X8            // X8: 0x0f replicated into all 16 bytes
	SHRQ   $4, R9            // R9: number of 16-byte groups
	TESTQ  R9, R9            // same zero test as the AVX2 routines in this file
	JZ     done_xor          // nothing to do for fewer than 16 bytes

loopback_xor:
	MOVOU  (SI), X0          // X0: in[x : x+16]
	MOVOU  (DX), X4          // X4: out[x : x+16]
	MOVOU  X0, X1            // X1: copy of the input
	MOVOU  X6, X2            // X2: low table copy (PSHUFB overwrites its destination)
	MOVOU  X7, X3            // X3: high table copy
	PSRLQ  $4, X1            // X1: input shifted right by 4 bits
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[in & 15]
	PSHUFB X1, X3            // X3: high[in >> 4]
	PXOR   X2, X3            // X3: product
	PXOR   X4, X3            // X3: product xor existing out
	MOVOU  X3, (DX)          // store to out
	ADDQ   $16, SI           // in  += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9            // one group done
	JNZ    loopback_xor

done_xor:
	RET
2015-06-21 22:23:22 +03:00
2015-06-24 17:57:38 +03:00
// func galMulSSSE3(low, high, in, out []byte)
//
// For every complete 16-byte group of in, computes
//     out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
// using PSHUFB as a 16-entry byte table lookup. The previous contents of
// out are overwritten, not combined.
//
// Requirements visible from the code:
//   - low and high are each read as one 16-byte vector.
//   - len(in)/16 groups are processed; a trailing partial group is left
//     untouched, and out is written for exactly that many bytes.
//   - len(out) is never consulted, so the caller must size out accordingly.
//
// Register use: SI, DX, BX, R9, X0-X3, X5-X8.
TEXT ·galMulSSSE3(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low[0]
	MOVQ   high+24(FP), DX   // DX: &high[0]
	MOVOU  (SI), X6          // X6: low-nibble lookup table
	MOVOU  (DX), X7          // X7: high-nibble lookup table
	MOVQ   $15, BX           // BX: nibble mask 0x0f
	MOVQ   BX, X8            // X8: byte 0 = 0x0f
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in[0]
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out[0]
	PSHUFB X5, X8            // X8: 0x0f replicated into all 16 bytes
	SHRQ   $4, R9            // R9: number of 16-byte groups
	TESTQ  R9, R9            // same zero test as the AVX2 routines in this file
	JZ     done              // nothing to do for fewer than 16 bytes

loopback:
	MOVOU  (SI), X0          // X0: in[x : x+16]
	MOVOU  X0, X1            // X1: copy of the input
	MOVOU  X6, X2            // X2: low table copy (PSHUFB overwrites its destination)
	MOVOU  X7, X3            // X3: high table copy
	PSRLQ  $4, X1            // X1: input shifted right by 4 bits
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[in & 15]
	PSHUFB X1, X3            // X3: high[in >> 4]
	PXOR   X2, X3            // X3: product
	MOVOU  X3, (DX)          // store to out
	ADDQ   $16, SI           // in  += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9            // one group done
	JNZ    loopback

done:
	RET
2015-06-21 22:23:22 +03:00
2015-12-07 15:40:57 +03:00
// func galMulAVX2Xor(low, high, in, out []byte)
//
// 32-bytes-per-iteration variant: for every complete 32-byte group of in,
//     out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
// A trailing partial group (len(in) % 32 bytes) is not processed.
// The AVX2 instructions are emitted as raw bytes; the mnemonic each byte
// run encodes is given next to it in Yasm (destination-first) notation.
TEXT ·galMulAVX2Xor(SB), 7, $0
	// Fetch both 16-byte lookup tables.
	MOVQ  low+0(FP), SI      // SI: &low[0]
	MOVOU (SI), X6           // X6: low table
	MOVQ  high+24(FP), DX    // DX: &high[0]
	MOVOU (DX), X7           // X7: high table

	// Nibble mask seed: byte 0 of X5 = 0x0f.
	MOVQ  $15, BX
	MOVQ  BX, X5

	// Mirror each table into the upper 128-bit lane and broadcast the mask.
	BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01 // VINSERTI128 YMM6, YMM6, XMM6, 1
	BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01 // VINSERTI128 YMM7, YMM7, XMM7, 1
	BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5             // VPBROADCASTB YMM8, XMM5

	// Data pointers and iteration count.
	MOVQ  in+48(FP), SI      // SI: &in[0]
	MOVQ  out+72(FP), DX     // DX: &out[0]
	MOVQ  in_len+56(FP), R9  // R9: len(in)
	SHRQ  $5, R9             // R9: len(in) / 32
	TESTQ R9, R9
	JZ    done_xor_avx2

loopback_xor_avx2:
	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06             // VMOVDQU YMM0, [rsi]        ; in block
	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x22             // VMOVDQU YMM4, [rdx]        ; out block
	BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4      ; shift for high nibbles
	BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8   ; low nibbles
	BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8   ; high nibbles
	BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0   ; low-table lookup
	BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1   ; high-table lookup
	BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xdb             // VPXOR   YMM3, YMM2, YMM3   ; product
	BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xe4             // VPXOR   YMM4, YMM3, YMM4   ; product xor out
	BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22             // VMOVDQU [rdx], YMM4        ; store
	ADDQ $32, DX             // out += 32
	ADDQ $32, SI             // in  += 32
	SUBQ $1, R9
	JNZ  loopback_xor_avx2

done_xor_avx2:
	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
	RET
// func galMulAVX2(low, high, in, out []byte)
//
// 32-bytes-per-iteration variant: for every complete 32-byte group of in,
//     out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
// The previous contents of out are overwritten, not combined.
//
// Requirements visible from the code:
//   - low and high are each read as one 16-byte vector.
//   - len(in)/32 groups are processed; a trailing partial group is left
//     untouched, and out is written for exactly that many bytes.
//   - len(out) is never consulted, so the caller must size out accordingly.
//
// The AVX2 instructions are emitted as raw bytes; the mnemonic each byte
// run encodes is given next to it in Yasm (destination-first) notation.
TEXT ·galMulAVX2(SB), 7, $0
	MOVQ  low+0(FP), SI      // SI: &low[0]
	MOVQ  high+24(FP), DX    // DX: &high[0]
	MOVQ  $15, BX            // BX: nibble mask 0x0f
	MOVQ  BX, X5             // X5: byte 0 = 0x0f
	MOVOU (SI), X6           // X6: low-nibble lookup table
	MOVOU (DX), X7           // X7: high-nibble lookup table
	MOVQ  in_len+56(FP), R9  // R9: len(in)

	// Mirror each table into the upper 128-bit lane and broadcast the mask.
	BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01 // VINSERTI128 YMM6, YMM6, XMM6, 1
	BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01 // VINSERTI128 YMM7, YMM7, XMM7, 1
	BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5             // VPBROADCASTB YMM8, XMM5

	SHRQ  $5, R9             // R9: number of 32-byte groups
	MOVQ  out+72(FP), DX     // DX: &out[0]
	MOVQ  in+48(FP), SI      // SI: &in[0]
	TESTQ R9, R9
	JZ    done_avx2          // nothing to do for fewer than 32 bytes

loopback_avx2:
	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06             // VMOVDQU YMM0, [rsi]        ; in block
	BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4      ; shift for high nibbles
	BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8   ; YMM0: low nibbles
	BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8   ; YMM1: high nibbles
	BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0   ; YMM2: low[in & 15]
	BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1   ; YMM3: high[in >> 4]
	BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xe3             // VPXOR   YMM4, YMM2, YMM3   ; YMM4: product
	BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22             // VMOVDQU [rdx], YMM4        ; store
	ADDQ $32, SI             // in  += 32
	ADDQ $32, DX             // out += 32
	SUBQ $1, R9              // one group done
	JNZ  loopback_avx2       // falls through to done_avx2 when finished

done_avx2:
	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
	RET