From 0e9e10435f3bc870657652624d04e1237557bffe Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 5 May 2020 16:36:01 +0200
Subject: [PATCH] avx2: Add 64 bytes per loop processing (#128)

* avx2: Add 64 bytes per loop processing

Not super clean benchmark run, but `BenchmarkGalois` is consistently faster.

```
benchmark                                 old ns/op     new ns/op     delta
BenchmarkGalois128K-32                    2551          2261          -11.37%
BenchmarkGalois1M-32                      22492         21107         -6.16%
BenchmarkGaloisXor128K-32                 2972          2808          -5.52%
BenchmarkGaloisXor1M-32                   25181         23951         -4.88%
BenchmarkEncode10x2x10000-32              5081          4722          -7.07%
BenchmarkEncode100x20x10000-32            383800        346655        -9.68%
BenchmarkEncode17x3x1M-32                 264806        263191        -0.61%
BenchmarkEncode10x4x16M-32                8337857       8376910       +0.47%
BenchmarkEncode5x2x1M-32                  77119         73598         -4.57%
BenchmarkEncode10x2x1M-32                 108424        102423        -5.53%
BenchmarkEncode10x4x1M-32                 194427        184301        -5.21%
BenchmarkEncode50x20x1M-32                3870301       3747639       -3.17%
BenchmarkEncode17x3x16M-32                10617586      10602449      -0.14%
BenchmarkEncode_8x4x8M-32                 3227254       3229451       +0.07%
BenchmarkEncode_12x4x12M-32               6841898       6847261       +0.08%
BenchmarkEncode_16x4x16M-32               11153469      11048738      -0.94%
BenchmarkEncode_16x4x32M-32               21947506      21826647      -0.55%
BenchmarkEncode_16x4x64M-32               43163608      42971338      -0.45%
BenchmarkEncode_8x5x8M-32                 3856675       3780730       -1.97%
BenchmarkEncode_8x6x8M-32                 4322023       4437109       +2.66%
BenchmarkEncode_8x7x8M-32                 5011434       4959623       -1.03%
BenchmarkEncode_8x9x8M-32                 6243694       6098824       -2.32%
BenchmarkEncode_8x10x8M-32                6724456       6657099       -1.00%
BenchmarkEncode_8x11x8M-32                7207693       7340332       +1.84%
BenchmarkEncode_8x8x05M-32                176877        172183        -2.65%
BenchmarkEncode_8x8x1M-32                 309716        301743        -2.57%
BenchmarkEncode_8x8x8M-32                 5498952       5489078       -0.18%
BenchmarkEncode_8x8x32M-32                22630195      22557074      -0.32%
BenchmarkEncode_24x8x24M-32               28488886      28220702      -0.94%
BenchmarkEncode_24x8x48M-32               56124735      54862495      -2.25%
BenchmarkVerify10x2x10000-32              9874          9356          -5.25%
BenchmarkVerify50x5x50000-32              175610        159735        -9.04%
BenchmarkVerify10x2x1M-32                 331276        311726        -5.90%
BenchmarkVerify5x2x1M-32                  265466        248075        -6.55%
BenchmarkVerify10x4x1M-32                 701627        606420        -13.57%
BenchmarkVerify50x20x1M-32                4338171       4245635       -2.13%
BenchmarkVerify10x4x16M-32                12312830      11932698      -3.09%
BenchmarkReconstruct10x2x10000-32         1594          1504          -5.65%
BenchmarkReconstruct50x5x50000-32         95101         79558         -16.34%
BenchmarkReconstruct10x2x1M-32            38479         37225         -3.26%
BenchmarkReconstruct5x2x1M-32             30968         30013         -3.08%
BenchmarkReconstruct10x4x1M-32            81630         75350         -7.69%
BenchmarkReconstruct50x20x1M-32           1136952       1040156       -8.51%
BenchmarkReconstruct10x4x16M-32           685408        656484        -4.22%
BenchmarkReconstructData10x2x10000-32     1609          1486          -7.64%
BenchmarkReconstructData50x5x50000-32     87090         71512         -17.89%
BenchmarkReconstructData10x2x1M-32        31497         30347         -3.65%
BenchmarkReconstructData5x2x1M-32         23379         22611         -3.28%
BenchmarkReconstructData10x4x1M-32        63853         61035         -4.41%
BenchmarkReconstructData50x20x1M-32       1048807       966201        -7.88%
BenchmarkReconstructData10x4x16M-32       866658        892252        +2.95%
BenchmarkReconstructP10x2x10000-32        544           540           -0.74%
BenchmarkReconstructP10x5x20000-32        1242          1206          -2.90%
BenchmarkSplit10x4x160M-32                2735508       2743214       +0.28%
BenchmarkSplit5x2x5M-32                   276232        288523        +4.45%
BenchmarkSplit10x2x1M-32                  44389         45517         +2.54%
BenchmarkSplit10x4x10M-32                 477282        460888        -3.43%
BenchmarkSplit50x20x50M-32                1608821       1602105       -0.42%
BenchmarkSplit17x3x272M-32                2035932       2034705       -0.06%
BenchmarkParallel_8x8x05M-32              346733        351837        +1.47%
BenchmarkParallel_20x10x05M-32            577127        586232        +1.58%
BenchmarkParallel_8x8x1M-32               722453        729294        +0.95%
BenchmarkParallel_8x8x8M-32               5717650       5817130       +1.74%
BenchmarkParallel_8x8x32M-32              22914260      24132696      +5.32%
BenchmarkStreamEncode10x2x10000-32        6703131       7141021       +6.53%
BenchmarkStreamEncode100x20x10000-32      38175873      39767386      +4.17%
BenchmarkStreamEncode17x3x1M-32           8920549       9218973       +3.35%
BenchmarkStreamEncode10x4x16M-32          21841702      21784898      -0.26%
BenchmarkStreamEncode5x2x1M-32            4088001       3247404       -20.56%
BenchmarkStreamEncode10x2x1M-32           5860652       5932381       +1.22%
BenchmarkStreamEncode10x4x1M-32           7555172       7589960       +0.46%
BenchmarkStreamEncode50x20x1M-32          30006814      30250054      +0.81%
BenchmarkStreamEncode17x3x16M-32          32757489      32818254      +0.19%
BenchmarkStreamVerify10x2x10000-32        6714996       6831093       +1.73%
BenchmarkStreamVerify50x5x50000-32        18525904      18761767      +1.27%
BenchmarkStreamVerify10x2x1M-32           5232278       5444148       +4.05%
BenchmarkStreamVerify5x2x1M-32            3673843       3755283       +2.22%
BenchmarkStreamVerify10x4x1M-32           7184419       7185293       +0.01%
BenchmarkStreamVerify50x20x1M-32          28441187      28574766      +0.47%
BenchmarkStreamVerify10x4x16M-32          8538440       8668614       +1.52%

benchmark                                 old MB/s      new MB/s      speedup
BenchmarkGalois128K-32                    51374.59      57976.36      1.13x
BenchmarkGalois1M-32                      46620.03      49679.10      1.07x
BenchmarkGaloisXor128K-32                 44106.22      46671.56      1.06x
BenchmarkGaloisXor1M-32                   41641.82      43779.89      1.05x
BenchmarkEncode10x2x10000-32              19682.61      21176.81      1.08x
BenchmarkEncode100x20x10000-32            2605.52       2884.71       1.11x
BenchmarkEncode17x3x1M-32                 67316.54      67729.50      1.01x
BenchmarkEncode10x4x16M-32                20121.74      20027.93      1.00x
BenchmarkEncode5x2x1M-32                  67984.17      71236.47      1.05x
BenchmarkEncode10x2x1M-32                 96710.29      102377.00     1.06x
BenchmarkEncode10x4x1M-32                 53931.74      56894.82      1.05x
BenchmarkEncode50x20x1M-32                13546.44      13989.82      1.03x
BenchmarkEncode17x3x16M-32                26862.29      26900.64      1.00x
BenchmarkEncode_8x4x8M-32                 20794.42      20780.27      1.00x
BenchmarkEncode_12x4x12M-32               22069.16      22051.88      1.00x
BenchmarkEncode_16x4x16M-32               24067.44      24295.58      1.01x
BenchmarkEncode_16x4x32M-32               24461.59      24597.04      1.01x
BenchmarkEncode_16x4x64M-32               24876.09      24987.40      1.00x
BenchmarkEncode_8x5x8M-32                 17400.71      17750.24      1.02x
BenchmarkEncode_8x6x8M-32                 15527.19      15124.46      0.97x
BenchmarkEncode_8x7x8M-32                 13391.15      13531.04      1.01x
BenchmarkEncode_8x9x8M-32                 10748.26      11003.58      1.02x
BenchmarkEncode_8x10x8M-32                9979.82       10080.80      1.01x
BenchmarkEncode_8x11x8M-32                9310.73       9142.48       0.98x
BenchmarkEncode_8x8x05M-32                23713.12      24359.50      1.03x
BenchmarkEncode_8x8x1M-32                 27084.87      27800.50      1.03x
BenchmarkEncode_8x8x8M-32                 12203.94      12225.89      1.00x
BenchmarkEncode_8x8x32M-32                11861.83      11900.28      1.00x
BenchmarkEncode_24x8x24M-32               21200.54      21402.01      1.01x
BenchmarkEncode_24x8x48M-32               21522.77      22017.95      1.02x
BenchmarkVerify10x2x10000-32              10127.24      10688.01      1.06x
BenchmarkVerify50x5x50000-32              28472.25      31301.75      1.10x
BenchmarkVerify10x2x1M-32                 31652.63      33637.74      1.06x
BenchmarkVerify5x2x1M-32                  19749.74      21134.27      1.07x
BenchmarkVerify10x4x1M-32                 14944.92      17291.25      1.16x
BenchmarkVerify50x20x1M-32                12085.46      12348.87      1.02x
BenchmarkVerify10x4x16M-32                13625.80      14059.87      1.03x
BenchmarkReconstruct10x2x10000-32         62723.68      66470.81      1.06x
BenchmarkReconstruct50x5x50000-32         52575.87      62847.32      1.20x
BenchmarkReconstruct10x2x1M-32            272507.04     281685.84     1.03x
BenchmarkReconstruct5x2x1M-32             169299.03     174685.39     1.03x
BenchmarkReconstruct10x4x1M-32            128455.17     139161.42     1.08x
BenchmarkReconstruct50x20x1M-32           46113.48      50404.73      1.09x
BenchmarkReconstruct10x4x16M-32           244777.11     255561.72     1.04x
BenchmarkReconstructData10x2x10000-32     62160.46      67305.98      1.08x
BenchmarkReconstructData50x5x50000-32     57411.81      69917.97      1.22x
BenchmarkReconstructData10x2x1M-32        332909.82     345526.29     1.04x
BenchmarkReconstructData5x2x1M-32         224254.60     231868.74     1.03x
BenchmarkReconstructData10x4x1M-32        164216.61     171799.68     1.05x
BenchmarkReconstructData50x20x1M-32       49988.98      54262.82      1.09x
BenchmarkReconstructData10x4x16M-32       193585.15     188032.29     0.97x
BenchmarkReconstructP10x2x10000-32        183806.57     185284.57     1.01x
BenchmarkReconstructP10x5x20000-32        160985.46     165852.51     1.03x
BenchmarkParallel_8x8x05M-32              12096.63      11921.17      0.99x
BenchmarkParallel_20x10x05M-32            18168.91      17886.72      0.98x
BenchmarkParallel_8x8x1M-32               11611.28      11502.36      0.99x
BenchmarkParallel_8x8x8M-32               11737.14      11536.42      0.98x
BenchmarkParallel_8x8x32M-32              11714.78      11123.31      0.95x
BenchmarkStreamEncode10x2x10000-32        14.92         14.00         0.94x
BenchmarkStreamEncode100x20x10000-32      26.19         25.15         0.96x
BenchmarkStreamEncode17x3x1M-32           1998.28       1933.60       0.97x
BenchmarkStreamEncode10x4x16M-32          7681.28       7701.31       1.00x
BenchmarkStreamEncode5x2x1M-32            1282.50       1614.48       1.26x
BenchmarkStreamEncode10x2x1M-32           1789.18       1767.55       0.99x
BenchmarkStreamEncode10x4x1M-32           1387.89       1381.53       1.00x
BenchmarkStreamEncode50x20x1M-32          1747.23       1733.18       0.99x
BenchmarkStreamEncode17x3x16M-32          8706.79       8690.67       1.00x
BenchmarkStreamVerify10x2x10000-32        14.89         14.64         0.98x
BenchmarkStreamVerify50x5x50000-32        269.89        266.50        0.99x
BenchmarkStreamVerify10x2x1M-32           2004.05       1926.06       0.96x
BenchmarkStreamVerify5x2x1M-32            1427.08       1396.13       0.98x
BenchmarkStreamVerify10x4x1M-32           1459.51       1459.34       1.00x
BenchmarkStreamVerify50x20x1M-32          1843.41       1834.79       1.00x
BenchmarkStreamVerify10x4x16M-32          19649.04      19353.98      0.98x
```
---
 galois_amd64.go |  71 +++++++++++++++++++-------
 galois_amd64.s  | 132 ++++++++++++++++++++++++++++++++++++++++++++++++
 reedsolomon.go  |   8 +--
 3 files changed, 190 insertions(+), 21 deletions(-)

diff --git a/galois_amd64.go b/galois_amd64.go
index 2b63983..bf7faca 100644
--- a/galois_amd64.go
+++ b/galois_amd64.go
@@ -21,6 +21,15 @@ func galMulAVX2(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice(in, out []byte)
 
+//go:noescape
+func galMulAVX2Xor_64(low, high, in, out []byte) // asm: out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 64 bytes per loop; the len(in)%64 tail is left to the caller
+
+//go:noescape
+func galMulAVX2_64(low, high, in, out []byte) // asm: out[i] = low[in[i]&15] ^ high[in[i]>>4], 64 bytes per loop; the len(in)%64 tail is left to the caller
+
+//go:noescape
+func sSE2XorSlice_64(in, out []byte) // asm: out[i] ^= in[i], 64 bytes per loop; the len(in)%64 tail is left to the caller
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@@ -40,17 +49,29 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
 }
 */
 
+// bigSwitchover is the minimum len(in), in bytes, at which the 64-bytes-per-loop assembler routines are used.
+const bigSwitchover = 128
+
 func galMulSlice(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover { // large input: run the 64-bytes-per-loop routine first
+			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6 // len(in) rounded down to a multiple of 64
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 32 { // >= (not >), as in galMulSliceXor: a remainder of exactly 32 bytes is one whole AVX2 block
+			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5 // len(in) rounded down to a multiple of 32
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4 // len(in) rounded down to a multiple of 16
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@@ -59,16 +80,25 @@ func galMulSlice(c byte, in, out []byte, o *options) {
 }
 
 func galMulSliceXor(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover { // large input: run the 64-bytes-per-loop routine first
+			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6 // len(in) rounded down to a multiple of 64
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 32 { // at least one whole 32-byte block remains
+			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5 // len(in) rounded down to a multiple of 32
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4 // len(in) rounded down to a multiple of 16
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@@ -78,13 +108,20 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 
 // slice galois add
 func sliceXor(in, out []byte, sse2 bool) {
-	var done int
 	if sse2 {
-		sSE2XorSlice(in, out)
-		done = (len(in) >> 4) << 4
+		if len(in) >= bigSwitchover { // large input: run the 64-bytes-per-loop routine first
+			sSE2XorSlice_64(in, out)
+			done := (len(in) >> 6) << 6 // len(in) rounded down to a multiple of 64
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 16 { // at least one whole 16-byte block remains
+			sSE2XorSlice(in, out)
+			done := (len(in) >> 4) << 4 // len(in) rounded down to a multiple of 16
+			in = in[done:]
+			out = out[done:]
+		}
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	for i := range in {
 		out[i] ^= in[i]
diff --git a/galois_amd64.s b/galois_amd64.s
index b768028..3501110 100644
--- a/galois_amd64.s
+++ b/galois_amd64.s
@@ -234,3 +234,135 @@ loopback_xor_sse2:
 
 done_xor_sse2:
 	RET
+
+// func galMulAVX2Xor_64(low, high, in, out []byte): out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 64 bytes per loop; the len(in)%64 tail is not touched.
+TEXT ·galMulAVX2Xor_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask (0x0f)
+	MOVQ  BX, X5            // X5: low mask in its lowest byte
+	MOVOU (SI), X6          // X6: low table (16 bytes)
+	MOVOU (DX), X7          // X7: high table (16 bytes)
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table in both 128-bit lanes
+	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table in both 128-bit lanes
+	VPBROADCASTB X5, Y8         // Y8: lomask (unpacked), 0x0f in every byte
+
+	SHRQ  $6, R9           // R9: len(in) / 64 = number of 64-byte blocks
+	MOVQ  out+72(FP), DX   // DX: &out
+	MOVQ  in+48(FP), SI    // SI: &in
+	TESTQ R9, R9
+	JZ    done_xor_avx2_64 // fewer than 64 bytes: nothing to do
+
+loopback_xor_avx2_64:
+	VMOVDQU (SI), Y0      // Y0: input, bytes 0-31
+	VMOVDQU 32(SI), Y10   // Y10: input 2, bytes 32-63
+	VMOVDQU (DX), Y4      // Y4: current output, bytes 0-31
+	VMOVDQU 32(DX), Y14   // Y14: current output 2, bytes 32-63
+	VPSRLQ  $4, Y0, Y1    // Y1: input >> 4 (masked to high nibbles below)
+	VPSRLQ  $4, Y10, Y11  // Y11: input 2 >> 4 (masked to high nibbles below)
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y3    // Y3: product (low part ^ high part)
+	VPXOR   Y13, Y12, Y13 // Y13: product 2
+	VPXOR   Y4, Y3, Y4    // Y4: Result (output ^ product)
+	VPXOR   Y14, Y13, Y14 // Y14: Result 2 (output 2 ^ product 2)
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI              // in+=64
+	ADDQ $64, DX              // out+=64
+	SUBQ $1, R9               // one block done
+	JNZ  loopback_xor_avx2_64
+
+done_xor_avx2_64:
+	VZEROUPPER
+	RET
+
+// func galMulAVX2_64(low, high, in, out []byte): out[i] = low[in[i]&15] ^ high[in[i]>>4], 64 bytes per loop; the len(in)%64 tail is not touched.
+TEXT ·galMulAVX2_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask (0x0f)
+	MOVQ  BX, X5            // X5: low mask in its lowest byte
+	MOVOU (SI), X6          // X6: low table (16 bytes)
+	MOVOU (DX), X7          // X7: high table (16 bytes)
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table in both 128-bit lanes
+	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table in both 128-bit lanes
+	VPBROADCASTB X5, Y8         // Y8: lomask (unpacked), 0x0f in every byte
+
+	SHRQ  $6, R9         // R9: len(in) / 64 = number of 64-byte blocks
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_avx2_64   // fewer than 64 bytes: nothing to do
+
+loopback_avx2_64:
+	VMOVDQU (SI), Y0      // Y0: input, bytes 0-31
+	VMOVDQU 32(SI), Y10   // Y10: input 2, bytes 32-63
+	VPSRLQ  $4, Y0, Y1    // Y1: input >> 4 (masked to high nibbles below)
+	VPSRLQ  $4, Y10, Y11  // Y11: input 2 >> 4 (masked to high nibbles below)
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y4    // Y4: Result
+	VPXOR   Y13, Y12, Y14 // Y14: Result 2
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI          // in+=64
+	ADDQ $64, DX          // out+=64
+	SUBQ $1, R9           // one block done
+	JNZ  loopback_avx2_64
+
+done_avx2_64:
+	VZEROUPPER
+	RET
+
+// func sSE2XorSlice_64(in, out []byte): out[i] ^= in[i], 64 bytes per loop; the len(in)%64 tail is not touched.
+TEXT ·sSE2XorSlice_64(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $6, R9           // R9: len(in) / 64 = number of 64-byte blocks
+	CMPQ R9, $0
+	JEQ  done_xor_sse2_64 // fewer than 64 bytes: nothing to do
+
+loopback_xor_sse2_64:
+	MOVOU (SI), X0             // in[x:x+16]
+	MOVOU 16(SI), X2           // in[x+16:x+32]
+	MOVOU 32(SI), X4           // in[x+32:x+48]
+	MOVOU 48(SI), X6           // in[x+48:x+64]
+	MOVOU (DX), X1             // out[x:x+16]
+	MOVOU 16(DX), X3           // out[x+16:x+32]
+	MOVOU 32(DX), X5           // out[x+32:x+48]
+	MOVOU 48(DX), X7           // out[x+48:x+64]
+	PXOR  X0, X1               // X1: out ^ in for each 16-byte lane
+	PXOR  X2, X3
+	PXOR  X4, X5
+	PXOR  X6, X7
+	MOVOU X1, (DX)
+	MOVOU X3, 16(DX)
+	MOVOU X5, 32(DX)
+	MOVOU X7, 48(DX)
+	ADDQ  $64, SI              // in+=64
+	ADDQ  $64, DX              // out+=64
+	SUBQ  $1, R9               // one block done
+	JNZ   loopback_xor_sse2_64
+
+done_xor_sse2_64:
+	RET
diff --git a/reedsolomon.go b/reedsolomon.go
index 999fcc3..9bfbb26 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -503,8 +503,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
@@ -576,8 +576,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {