Add an efficient implementation of shard counts up to 65536 (#191)

This is a O(n*log n) implementation of Reed-Solomon codes, ported from the C++ library https://github.com/catid/leopard. The implementation is based on the paper "Novel Polynomial Basis with Fast Fourier Transform and Its Application to Reed-Solomon Erasure Codes" Several TODOs are left for future commits: - Performance optimizations, in particular SIMD and multiple goroutines - Documentation - Detailed Tests - Merging of reedSolomonFF16 and reedSolomon types - Turn the straight C++ port into more idiomatic Go This change also bumps one race testing timeout, to ensure adequate time on CI.
2022-07-21 14:27:10 +02:00 · 2022-07-21 14:27:10 +02:00 · 49be604db0
parent 6ef9059245
commit 49be604db0
5 changed files with 887 additions and 11 deletions
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@ -64,12 +64,12 @@ jobs:
    - name: Test Races, noasm, 1 cpu
      env:
        CGO_ENABLED: 1
-      run: go test -tags=noasm -cpu=1 -short -race .
+      run: go test -tags=noasm -cpu=1 -short -race -timeout 20m .

    - name: Test Races, noasm, 4 cpu
      env:
        CGO_ENABLED: 1
-      run: go test -tags=noasm -cpu=4 -short -race .
+      run: go test -tags=noasm -cpu=4 -short -race -timeout 20m .

    - name: Test Races, no avx512
      env:
--- a/leopard.go
+++ b/leopard.go
@ -0,0 +1,859 @@
+package reedsolomon
+
+// This is a O(n*log n) implementation of Reed-Solomon
+// codes, ported from the C++ library https://github.com/catid/leopard.
+//
+// The implementation is based on the paper
+//
+// S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung,
+// "Novel Polynomial Basis with Fast Fourier Transform
+// and Its Application to Reed-Solomon Erasure Codes"
+// IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016.
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"math/bits"
+	"sync"
+	"unsafe"
+)
+
+// reedSolomonFF16 is like reedSolomon but for more than 256 total shards.
+type reedSolomonFF16 struct {
+	DataShards   int // Number of data shards, should not be modified.
+	ParityShards int // Number of parity shards, should not be modified.
+	Shards       int // Total number of shards. Calculated, and should not be modified.
+
+	o options
+}
+
+// newFF16 is like New, but for more than 256 total shards.
+func newFF16(dataShards, parityShards int, opts ...Option) (*reedSolomonFF16, error) {
+	initConstants()
+
+	if dataShards <= 0 || parityShards < 0 {
+		return nil, ErrInvShardNum
+	}
+
+	if dataShards+parityShards > 65536 {
+		return nil, ErrMaxShardNum
+	}
+
+	r := &reedSolomonFF16{
+		DataShards:   dataShards,
+		ParityShards: parityShards,
+		Shards:       dataShards + parityShards,
+		o:            defaultOptions,
+	}
+	for _, opt := range opts {
+		opt(&r.o)
+	}
+	return r, nil
+}
+
+type ffe uint16
+
+const (
+	bitwidth   = 16
+	order      = 1 << bitwidth
+	modulus    = order - 1
+	polynomial = 0x1002D
+)
+
+var (
+	fftSkew  *[modulus]ffe
+	logWalsh *[order]ffe
+)
+
+// Logarithm Tables
+var (
+	logLUT *[order]ffe
+	expLUT *[order]ffe
+)
+
+// Stores the partial products of x * y at offset x + y * 65536
+// Repeated accesses from the same y value are faster
+var mul16LUTs *[order]mul16LUT
+
+type mul16LUT struct {
+	LUT [4 * 16]ffe
+}
+
+func (r *reedSolomonFF16) Encode(shards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+
+	if err := checkShards(shards, false); err != nil {
+		return err
+	}
+	return r.encode(shards)
+}
+
+func (r *reedSolomonFF16) encode(shards [][]byte) error {
+	shardSize := len(shards[0])
+	if shardSize%64 != 0 {
+		return ErrShardSize
+	}
+
+	m := ceilPow2(r.ParityShards)
+
+	work := make([][]byte, m*2)
+	for i := range work {
+		work[i] = make([]byte, shardSize)
+	}
+
+	mtrunc := m
+	if r.DataShards < mtrunc {
+		mtrunc = r.DataShards
+	}
+
+	skewLUT := fftSkew[m-1:]
+
+	sh := shards
+	ifftDITEncoder(
+		sh[:r.DataShards],
+		mtrunc,
+		work,
+		nil, // No xor output
+		m,
+		skewLUT,
+		&r.o,
+	)
+
+	lastCount := r.DataShards % m
+	if m >= r.DataShards {
+		goto skip_body
+	}
+
+	// For sets of m data pieces:
+	for i := m; i+m <= r.DataShards; i += m {
+		sh = sh[m:]
+		skewLUT = skewLUT[m:]
+
+		// work <- work xor IFFT(data + i, m, m + i)
+
+		ifftDITEncoder(
+			sh, // data source
+			m,
+			work[m:], // temporary workspace
+			work,     // xor destination
+			m,
+			skewLUT,
+			&r.o,
+		)
+	}
+
+	// Handle final partial set of m pieces:
+	if lastCount != 0 {
+		sh = sh[m:]
+		skewLUT = skewLUT[m:]
+
+		// work <- work xor IFFT(data + i, m, m + i)
+
+		ifftDITEncoder(
+			sh, // data source
+			lastCount,
+			work[m:], // temporary workspace
+			work,     // xor destination
+			m,
+			skewLUT,
+			&r.o,
+		)
+	}
+
+skip_body:
+	// work <- FFT(work, m, 0)
+	fftDIT(work, r.ParityShards, m, fftSkew[:], &r.o)
+
+	for i, w := range work[:r.ParityShards] {
+		sh := shards[i+r.DataShards]
+		if cap(sh) >= shardSize {
+			sh = append(sh[:0], w...)
+		} else {
+			sh = w
+		}
+		shards[i+r.DataShards] = sh
+	}
+
+	return nil
+}
+
+func (r *reedSolomonFF16) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error {
+	return errors.New("not implemented")
+}
+
+func (r *reedSolomonFF16) Join(dst io.Writer, shards [][]byte, outSize int) error {
+	return errors.New("not implemented")
+}
+
+func (r *reedSolomonFF16) Update(shards [][]byte, newDatashards [][]byte) error {
+	return errors.New("not implemented")
+}
+
+func (r *reedSolomonFF16) Split(data []byte) ([][]byte, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (r *reedSolomonFF16) ReconstructSome(shards [][]byte, required []bool) error {
+	return r.ReconstructData(shards)
+}
+
+func (r *reedSolomonFF16) Reconstruct(shards [][]byte) error {
+	return r.reconstruct(shards, true)
+}
+
+func (r *reedSolomonFF16) ReconstructData(shards [][]byte) error {
+	return r.reconstruct(shards, false)
+}
+
+func (r *reedSolomonFF16) Verify(shards [][]byte) (bool, error) {
+	if len(shards) != r.Shards {
+		return false, ErrTooFewShards
+	}
+	if err := checkShards(shards, false); err != nil {
+		return false, err
+	}
+
+	// Re-encode parity shards to temporary storage.
+	shardSize := len(shards[0])
+	outputs := make([][]byte, r.Shards)
+	copy(outputs, shards[:r.DataShards])
+	for i := r.DataShards; i < r.Shards; i++ {
+		outputs[i] = make([]byte, shardSize)
+	}
+	if err := r.Encode(outputs); err != nil {
+		return false, err
+	}
+
+	// Compare.
+	for i := r.DataShards; i < r.Shards; i++ {
+		if !bytes.Equal(outputs[i], shards[i]) {
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
+func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+
+	if err := checkShards(shards, true); err != nil {
+		return err
+	}
+
+	shardSize := len(shards[0])
+	if shardSize%64 != 0 {
+		return ErrShardSize
+	}
+
+	m := ceilPow2(r.ParityShards)
+	n := ceilPow2(m + r.DataShards)
+
+	// Fill in error locations.
+	var errLocs [order]ffe
+	for i := 0; i < r.ParityShards; i++ {
+		if len(shards[i+r.DataShards]) == 0 {
+			errLocs[i] = 1
+		}
+	}
+	for i := r.ParityShards; i < m; i++ {
+		errLocs[i] = 1
+	}
+	for i := 0; i < r.DataShards; i++ {
+		if len(shards[i]) == 0 {
+			errLocs[i+m] = 1
+		}
+	}
+	// Evaluate error locator polynomial
+
+	fwht(errLocs[:], order, m+r.DataShards)
+
+	for i := 0; i < order; i++ {
+		errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus)
+	}
+
+	fwht(errLocs[:], order, order)
+
+	work := make([][]byte, n)
+	for i := range work {
+		work[i] = make([]byte, shardSize)
+	}
+
+	// work <- recovery data
+
+	for i := 0; i < r.ParityShards; i++ {
+		if len(shards[i+r.DataShards]) != 0 {
+			mul(work[i], shards[i+r.DataShards], errLocs[i])
+		} else {
+			memclr(work[i])
+		}
+	}
+	for i := r.ParityShards; i < m; i++ {
+		memclr(work[i])
+	}
+
+	// work <- original data
+
+	for i := 0; i < r.DataShards; i++ {
+		if len(shards[i]) != 0 {
+			mul(work[m+i], shards[i], errLocs[m+i])
+		} else {
+			memclr(work[m+i])
+		}
+	}
+	for i := m + r.DataShards; i < n; i++ {
+		memclr(work[i])
+	}
+
+	// work <- IFFT(work, n, 0)
+
+	ifftDITDecoder(
+		m+r.DataShards,
+		work,
+		n,
+		fftSkew[:],
+		&r.o,
+	)
+
+	// work <- FormalDerivative(work, n)
+
+	for i := 1; i < n; i++ {
+		width := ((i ^ (i - 1)) + 1) >> 1
+		slicesXor(work[i-width:i], work[i:i+width], &r.o)
+	}
+
+	// work <- FFT(work, n, 0) truncated to m + dataShards
+
+	outputCount := m + r.DataShards
+
+	fftDIT(work, outputCount, n, fftSkew[:], &r.o)
+
+	// Reveal erasures
+	//
+	//  Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) )
+	//  mul_mem(x, y, log_m, ) equals x[] = y[] * log_m
+	//
+	// mem layout: [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N]
+	end := r.DataShards
+	if recoverAll {
+		end = r.Shards
+	}
+	for i := 0; i < end; i++ {
+		if len(shards[i]) != 0 {
+			continue
+		}
+		if cap(shards[i]) >= shardSize {
+			shards[i] = shards[i][:shardSize]
+		} else {
+			shards[i] = make([]byte, shardSize)
+		}
+		if i >= r.DataShards {
+			// Parity shard.
+			mul(shards[i], work[i-r.DataShards], modulus-errLocs[i-r.DataShards])
+		} else {
+			// Data shard.
+			mul(shards[i], work[i+m], modulus-errLocs[i+m])
+		}
+	}
+
+	return nil
+}
+
+// Basic no-frills version for decoder
+func ifftDITDecoder(mtrunc int, work [][]byte, m int, skewLUT []ffe, o *options) {
+	// Decimation in time: Unroll 2 layers at a time
+	dist := 1
+	dist4 := 4
+	for dist4 <= m {
+		// For each set of dist*4 elements:
+		for r := 0; r < mtrunc; r += dist4 {
+			iend := r + dist
+			log_m01 := skewLUT[iend-1]
+			log_m02 := skewLUT[iend+dist-1]
+			log_m23 := skewLUT[iend+dist*2-1]
+
+			// For each set of dist elements:
+			for i := r; i < iend; i++ {
+				ifftDIT4(work[i:], dist, log_m01, log_m23, log_m02, o)
+			}
+		}
+		dist = dist4
+		dist4 <<= 2
+	}
+
+	// If there is one layer left:
+	if dist < m {
+		// Assuming that dist = m / 2
+		if dist*2 != m {
+			panic("internal error")
+		}
+
+		log_m := skewLUT[dist-1]
+
+		if log_m == modulus {
+			slicesXor(work[dist:2*dist], work[:dist], o)
+		} else {
+			for i := 0; i < dist; i++ {
+				ifftDIT2(
+					work[i],
+					work[i+dist],
+					log_m,
+					o,
+				)
+			}
+		}
+	}
+}
+
+// In-place FFT for encoder and decoder
+func fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) {
+	// Decimation in time: Unroll 2 layers at a time
+	dist4 := m
+	dist := m >> 2
+	for dist != 0 {
+		// For each set of dist*4 elements:
+		for r := 0; r < mtrunc; r += dist4 {
+			iend := r + dist
+			log_m01 := skewLUT[iend-1]
+			log_m02 := skewLUT[iend+dist-1]
+			log_m23 := skewLUT[iend+dist*2-1]
+
+			// For each set of dist elements:
+			for i := r; i < iend; i++ {
+				fftDIT4(
+					work[i:],
+					dist,
+					log_m01,
+					log_m23,
+					log_m02,
+					o,
+				)
+			}
+		}
+		dist4 = dist
+		dist >>= 2
+	}
+
+	// If there is one layer left:
+	if dist4 == 2 {
+		for r := 0; r < mtrunc; r += 2 {
+			log_m := skewLUT[r+1-1]
+
+			if log_m == modulus {
+				sliceXor(work[r], work[r+1], o)
+			} else {
+				fftDIT2(work[r], work[r+1], log_m, o)
+			}
+		}
+	}
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	// First layer:
+	if log_m02 == modulus {
+		sliceXor(work[0], work[dist*2], o)
+		sliceXor(work[dist], work[dist*3], o)
+	} else {
+		fftDIT2(work[0], work[dist*2], log_m02, o)
+		fftDIT2(work[dist], work[dist*3], log_m02, o)
+	}
+
+	// Second layer:
+	if log_m01 == modulus {
+		sliceXor(work[0], work[dist], o)
+	} else {
+		fftDIT2(work[0], work[dist], log_m01, o)
+	}
+
+	if log_m23 == modulus {
+		sliceXor(work[dist*2], work[dist*3], o)
+	} else {
+		fftDIT2(work[dist*2], work[dist*3], log_m23, o)
+	}
+}
+
+// 2-way butterfly
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	refMulAdd(x, y, log_m)
+	sliceXor(x, y, o)
+}
+
+// Unrolled IFFT for encoder
+func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe, o *options) {
+	// I tried rolling the memcpy/memset into the first layer of the FFT and
+	// found that it only yields a 4% performance improvement, which is not
+	// worth the extra complexity.
+	for i := 0; i < mtrunc; i++ {
+		copy(work[i], data[i])
+	}
+	for i := mtrunc; i < m; i++ {
+		memclr(work[i])
+	}
+
+	// I tried splitting up the first few layers into L3-cache sized blocks but
+	// found that it only provides about 5% performance boost, which is not
+	// worth the extra complexity.
+
+	// Decimation in time: Unroll 2 layers at a time
+	dist := 1
+	dist4 := 4
+	for dist4 <= m {
+		// For each set of dist*4 elements:
+		for r := 0; r < mtrunc; r += dist4 {
+			iend := r + dist
+			log_m01 := skewLUT[iend]
+			log_m02 := skewLUT[iend+dist]
+			log_m23 := skewLUT[iend+dist*2]
+
+			// For each set of dist elements:
+			for i := r; i < iend; i++ {
+				ifftDIT4(
+					work[i:],
+					dist,
+					log_m01,
+					log_m23,
+					log_m02,
+					o,
+				)
+			}
+		}
+
+		dist = dist4
+		dist4 <<= 2
+		// I tried alternating sweeps left->right and right->left to reduce cache misses.
+		// It provides about 1% performance boost when done for both FFT and IFFT, so it
+		// does not seem to be worth the extra complexity.
+	}
+
+	// If there is one layer left:
+	if dist < m {
+		// Assuming that dist = m / 2
+		if dist*2 != m {
+			panic("internal error")
+		}
+
+		logm := skewLUT[dist]
+
+		if logm == modulus {
+			slicesXor(work[dist:dist*2], work[:dist], o)
+		} else {
+			for i := 0; i < dist; i++ {
+				ifftDIT2(work[i], work[i+dist], logm, o)
+			}
+		}
+	}
+
+	// I tried unrolling this but it does not provide more than 5% performance
+	// improvement for 16-bit finite fields, so it's not worth the complexity.
+	if xorRes != nil {
+		slicesXor(xorRes[:m], work[:m], o)
+	}
+}
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	// First layer:
+	if log_m01 == modulus {
+		sliceXor(work[0], work[dist], o)
+	} else {
+		ifftDIT2(work[0], work[dist], log_m01, o)
+	}
+
+	if log_m23 == modulus {
+		sliceXor(work[dist*2], work[dist*3], o)
+	} else {
+		ifftDIT2(work[dist*2], work[dist*3], log_m23, o)
+	}
+
+	// Second layer:
+	if log_m02 == modulus {
+		sliceXor(work[0], work[dist*2], o)
+		sliceXor(work[dist], work[dist*3], o)
+	} else {
+		ifftDIT2(work[0], work[dist*2], log_m02, o)
+		ifftDIT2(work[dist], work[dist*3], log_m02, o)
+	}
+}
+
+// 2-way butterfly
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	sliceXor(x, y, o)
+	refMulAdd(x, y, log_m)
+}
+
+// Reference version of muladd: x[] ^= y[] * log_m
+func refMulAdd(x, y []byte, log_m ffe) {
+	lut := mul16LUTs[log_m].LUT
+
+	for off := 0; off < len(x); off += 64 {
+		for i := 0; i < 32; i++ {
+			lo := y[off+i]
+			hi := y[off+i+32]
+
+			prod :=
+				lut[(lo&15)] ^
+					lut[(lo>>4)+16] ^
+					lut[(hi&15)+32] ^
+					lut[(hi>>4)+48]
+
+			x[off+i] ^= byte(prod)
+			x[off+i+32] ^= byte(prod >> 8)
+		}
+	}
+}
+
+func memclr(s []byte) {
+	for i := range s {
+		s[i] = 0
+	}
+}
+
+// slicesXor calls xor for every slice pair in v1, v2.
+func slicesXor(v1, v2 [][]byte, o *options) {
+	for i, v := range v1 {
+		sliceXor(v2[i], v, o)
+	}
+}
+
+func mul(x, y []byte, log_m ffe) {
+	refMul(x, y, log_m)
+}
+
+// Reference version of mul: x[] = y[] * log_m
+func refMul(x, y []byte, log_m ffe) {
+	lut := mul16LUTs[log_m].LUT
+
+	for off := 0; off < len(x); off += 64 {
+		for i := 0; i < 32; i++ {
+			lo := y[off+i]
+			hi := y[off+i+32]
+
+			prod :=
+				lut[(lo&15)] ^
+					lut[(lo>>4)+16] ^
+					lut[(hi&15)+32] ^
+					lut[(hi>>4)+48]
+
+			x[off+i] = byte(prod)
+			x[off+i+32] = byte(prod >> 8)
+		}
+	}
+}
+
+// Returns a * Log(b)
+func mulLog(a, log_b ffe) ffe {
+	/*
+	   Note that this operation is not a normal multiplication in a finite
+	   field because the right operand is already a logarithm.  This is done
+	   because it moves K table lookups from the Decode() method into the
+	   initialization step that is less performance critical.  The LogWalsh[]
+	   table below contains precalculated logarithms so it is easier to do
+	   all the other multiplies in that form as well.
+	*/
+	if a == 0 {
+		return 0
+	}
+	return expLUT[addMod(logLUT[a], log_b)]
+}
+
+// z = x + y (mod kModulus)
+func addMod(a, b ffe) ffe {
+	sum := uint(a) + uint(b)
+
+	// Partial reduction step, allowing for kModulus to be returned
+	return ffe(sum + sum>>bitwidth)
+}
+
+// z = x - y (mod kModulus)
+func subMod(a, b ffe) ffe {
+	dif := uint(a) - uint(b)
+
+	// Partial reduction step, allowing for kModulus to be returned
+	return ffe(dif + dif>>bitwidth)
+}
+
+// ceilPow2 returns power of two at or above n.
+func ceilPow2(n int) int {
+	const w = int(unsafe.Sizeof(n) * 8)
+	return 1 << (w - bits.LeadingZeros(uint(n-1)))
+}
+
+// Decimation in time (DIT) Fast Walsh-Hadamard Transform
+// Unrolls pairs of layers to perform cross-layer operations in registers
+// mtrunc: Number of elements that are non-zero at the front of data
+func fwht(data []ffe, m, mtrunc int) {
+	// Decimation in time: Unroll 2 layers at a time
+	dist := 1
+	dist4 := 4
+	for dist4 <= m {
+		// For each set of dist*4 elements:
+		for r := 0; r < mtrunc; r += dist4 {
+			// For each set of dist elements:
+			for i := r; i < r+dist; i++ {
+				fwht4(data[i:], dist)
+			}
+		}
+		dist = dist4
+		dist4 <<= 2
+	}
+
+	// If there is one layer left:
+	if dist < m {
+		for i := 0; i < dist; i++ {
+			fwht2(&data[i], &data[i+dist])
+		}
+	}
+}
+
+func fwht4(data []ffe, s int) {
+	s2 := s << 1
+
+	t0 := &data[0]
+	t1 := &data[s]
+	t2 := &data[s2]
+	t3 := &data[s2+s]
+
+	fwht2(t0, t1)
+	fwht2(t2, t3)
+	fwht2(t0, t2)
+	fwht2(t1, t3)
+}
+
+// {a, b} = {a + b, a - b} (Mod Q)
+func fwht2(a, b *ffe) {
+	sum := addMod(*a, *b)
+	dif := subMod(*a, *b)
+	*a = sum
+	*b = dif
+}
+
+var initOnce sync.Once
+
+func initConstants() {
+	initOnce.Do(func() {
+		initLUTs()
+		initFFTSkew()
+		initMul16LUT()
+	})
+}
+
+// Initialize logLUT, expLUT.
+func initLUTs() {
+	cantorBasis := [bitwidth]ffe{
+		0x0001, 0xACCA, 0x3C0E, 0x163E,
+		0xC582, 0xED2E, 0x914C, 0x4012,
+		0x6C98, 0x10D8, 0x6A72, 0xB900,
+		0xFDB8, 0xFB34, 0xFF38, 0x991E,
+	}
+
+	expLUT = &[order]ffe{}
+	logLUT = &[order]ffe{}
+
+	// LFSR table generation:
+	state := 1
+	for i := ffe(0); i < modulus; i++ {
+		expLUT[state] = i
+		state <<= 1
+		if state >= order {
+			state ^= polynomial
+		}
+	}
+	expLUT[0] = modulus
+
+	// Conversion to Cantor basis:
+
+	logLUT[0] = 0
+	for i := 0; i < bitwidth; i++ {
+		basis := cantorBasis[i]
+		width := 1 << i
+
+		for j := 0; j < width; j++ {
+			logLUT[j+width] = logLUT[j] ^ basis
+		}
+	}
+
+	for i := 0; i < order; i++ {
+		logLUT[i] = expLUT[logLUT[i]]
+	}
+
+	for i := 0; i < order; i++ {
+		expLUT[logLUT[i]] = ffe(i)
+	}
+
+	expLUT[modulus] = expLUT[0]
+}
+
+// Initialize fftSkew.
+func initFFTSkew() {
+	var temp [bitwidth - 1]ffe
+
+	// Generate FFT skew vector {1}:
+
+	for i := 1; i < bitwidth; i++ {
+		temp[i-1] = ffe(1 << i)
+	}
+
+	fftSkew = &[modulus]ffe{}
+	logWalsh = &[order]ffe{}
+
+	for m := 0; m < bitwidth-1; m++ {
+		step := 1 << (m + 1)
+
+		fftSkew[1<<m-1] = 0
+
+		for i := m; i < bitwidth-1; i++ {
+			s := 1 << (i + 1)
+
+			for j := 1<<m - 1; j < s; j += step {
+				fftSkew[j+s] = fftSkew[j] ^ temp[i]
+			}
+		}
+
+		temp[m] = modulus - logLUT[mulLog(temp[m], logLUT[temp[m]^1])]
+
+		for i := m + 1; i < bitwidth-1; i++ {
+			sum := addMod(logLUT[temp[i]^1], temp[m])
+			temp[i] = mulLog(temp[i], sum)
+		}
+	}
+
+	for i := 0; i < modulus; i++ {
+		fftSkew[i] = logLUT[fftSkew[i]]
+	}
+
+	// Precalculate FWHT(Log[i]):
+
+	for i := 0; i < order; i++ {
+		logWalsh[i] = logLUT[i]
+	}
+	logWalsh[0] = 0
+
+	fwht(logWalsh[:], order, order)
+}
+
+func initMul16LUT() {
+	mul16LUTs = &[order]mul16LUT{}
+
+	// For each log_m multiplicand:
+	for log_m := 0; log_m < order; log_m++ {
+		lut := &mul16LUTs[log_m]
+
+		for nibble, shift := 0, 0; nibble < 4; {
+			nibble_lut := lut.LUT[nibble*16:]
+
+			for xnibble := 0; xnibble < 16; xnibble++ {
+				prod := mulLog(ffe(xnibble<<shift), ffe(log_m))
+				nibble_lut[xnibble] = prod
+			}
+			nibble++
+			shift += 4
+		}
+	}
+}
--- a/reedsolomon.go
+++ b/reedsolomon.go
@ -270,9 +270,17 @@ func buildXorMatrix(dataShards, totalShards int) (matrix, error) {
 // New creates a new encoder and initializes it to
 // the number of data shards and parity shards that
 // you want to use. You can reuse this encoder.
-// Note that the maximum number of total shards is 256.
+// Note that the maximum number of total shards is 65536, with some
+// restrictions for a total larger than 256:
+//
+//   - Shard sizes must be multiple of 64
+//   - The methods Join/Split/Update/EncodeIdx are not supported
+//
 // If no options are supplied, default options are used.
 func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
+	if dataShards+parityShards > 256 {
+		return newFF16(dataShards, parityShards, opts...)
+	}
 	r := reedSolomon{
 		DataShards:   dataShards,
 		ParityShards: parityShards,
@ -287,10 +295,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		return nil, ErrInvShardNum
 	}

-	if dataShards+parityShards > 256 {
-		return nil, ErrMaxShardNum
-	}
-
 	if parityShards == 0 {
 		return &r, nil
 	}
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@ -191,7 +191,9 @@ func TestEncoding(t *testing.T) {
 // note that par1 matric will fail on some combinations.
 var testSizes = [][2]int{
 	{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
-	{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20}}
+	{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20},
+	{256, 1},
+}
 var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
 var testDataSizesShort = []int{10, 10001, 100003}

@ -201,10 +203,14 @@ func testEncoding(t *testing.T, o ...Option) {
 		rng := rand.New(rand.NewSource(0xabadc0cac01a))
 		t.Run(fmt.Sprintf("%dx%d", data, parity), func(t *testing.T) {
 			sz := testDataSizes
-			if testing.Short() {
+			if testing.Short() || data+parity > 256 {
 				sz = testDataSizesShort
 			}
 			for _, perShard := range sz {
+				if data+parity > 256 {
+					// Round up to 64 bytes.
+					perShard = (perShard + 63) &^ 63
+				}
 				t.Run(fmt.Sprint(perShard), func(t *testing.T) {

 					r, err := New(data, parity, testOptions(o...)...)
@ -295,6 +301,9 @@ func testEncodingIdx(t *testing.T, o ...Option) {
 		data, parity := size[0], size[1]
 		rng := rand.New(rand.NewSource(0xabadc0cac01a))
 		t.Run(fmt.Sprintf("%dx%d", data, parity), func(t *testing.T) {
+			if data+parity > 256 {
+				t.Skip("EncodingIdx not supported for total shards > 256")
+			}
 			sz := testDataSizes
 			if testing.Short() {
 				sz = testDataSizesShort
@ -1663,11 +1672,11 @@ func TestNew(t *testing.T) {
 		{255, 1, nil},
 		{255, 0, nil},
 		{1, 0, nil},
-		{256, 256, ErrMaxShardNum},
+		{65536, 65536, ErrMaxShardNum},

 		{0, 1, ErrInvShardNum},
 		{1, -1, ErrInvShardNum},
-		{256, 1, ErrMaxShardNum},
+		{65636, 1, ErrMaxShardNum},

 		// overflow causes r.Shards to be negative
 		{256, int(^uint(0) >> 1), errInvalidRowSize},
--- a/streaming.go
+++ b/streaming.go
@ -147,6 +147,10 @@ type rsStream struct {
 // you want to use. You can reuse this encoder.
 // Note that the maximum number of data shards is 256.
 func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) {
+	if dataShards+parityShards > 256 {
+		return nil, ErrMaxShardNum
+	}
+
 	r := rsStream{o: defaultOptions}
 	for _, opt := range o {
 		opt(&r.o)