Add support for PAR1 (#55)

PAR1 is a file format which uses a Reed-Solomon code similar to the current one, except it uses a different (flawed) coding matrix. Add support for it via a WithPAR1Matrix option, so that this code can be used to encode/decode PAR1 files. Also add the option to existing tests, and add a test demonstrating the flaw in PAR1's coding matrix. Also fix a mistakenly inverted test in testOpts(). Incidentally, PAR1 is obsoleted by PAR2, which uses GF(2^16) and tries to fix the flaw in the coding matrix; however, PAR2's coding matrix is still flawed! The real solution is to build the coding matrix like in this repository. PAR1 spec: http://parchive.sourceforge.net/docs/specifications/parity-volume-spec-1.0/article-spec.html Paper describing the (flawed) Reed-Solomon code used by PAR1: http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
2017-06-20 11:24:57 -07:00 · 2017-06-20 11:24:57 -07:00 · 18d548df63
parent 87c4e5ae75
commit 18d548df63
3 changed files with 207 additions and 15 deletions
--- a/options.go
+++ b/options.go
@ -13,6 +13,7 @@ type options struct {
 	maxGoroutines     int
 	minSplitSize      int
 	useAVX2, useSSSE3 bool
+	usePAR1Matrix     bool
 }

 var defaultOptions = options{
@ -43,7 +44,7 @@ func WithMaxGoroutines(n int) Option {
 	}
 }

-// MinSplitSize Is the minimum encoding size in bytes per goroutine.
+// WithMinSplitSize is the minimum encoding size in bytes per goroutine.
 // See WithMaxGoroutines on how jobs are split.
 // If n <= 0, it is ignored.
 func WithMinSplitSize(n int) Option {
@ -65,3 +66,13 @@ func withAVX2(enabled bool) Option {
 		o.useAVX2 = enabled
 	}
 }
+
+// WithPAR1Matrix returns an Option that makes the encoder build its
+// coding matrix the way PARv1 does. Note that the PARv1 construction
+// is buggy: recovery can be impossible for some combinations of
+// surviving shards, even when enough parity shards are available.
+func WithPAR1Matrix() Option {
+	return func(opts *options) {
+		// The option takes no argument, so it can only switch the
+		// PAR1 matrix on.
+		opts.usePAR1Matrix = true
+	}
+}
--- a/reedsolomon.go
+++ b/reedsolomon.go
@ -94,6 +94,68 @@ var ErrInvShardNum = errors.New("cannot create Encoder with zero or less data/pa
 // GF(2^8).
 var ErrMaxShardNum = errors.New("cannot create Encoder with more than 256 data+parity shards")

+// buildMatrix returns the encoding matrix for the given number of
+// data shards and total (data + parity) shards.
+//
+// The first dataShards rows of the result form an identity matrix,
+// so encoding leaves the data shards untouched.
+//
+// Any error reported by the underlying matrix operations is returned
+// as-is together with a nil matrix.
+func buildMatrix(dataShards, totalShards int) (matrix, error) {
+	// A plain Vandermonde matrix is the starting point. It would be
+	// usable on its own in theory, but encoding with it would alter
+	// the data shards.
+	vand, err := vandermonde(totalShards, dataShards)
+	if err != nil {
+		return nil, err
+	}

+	// Take the square block formed by the first dataShards rows...
+	square, err := vand.SubMatrix(0, 0, dataShards, dataShards)
+	if err != nil {
+		return nil, err
+	}

+	// ...and invert it.
+	squareInv, err := square.Invert()
+	if err != nil {
+		return nil, err
+	}

+	// Multiplying by that inverse turns the top square into the
+	// identity while keeping every square subset of rows invertible.
+	return vand.Multiply(squareInv)
+}
+
+// buildMatrixPAR1 returns the encoding matrix prescribed by the PARv1
+// spec for the given number of data shards and total shards. Note
+// that the PARv1 construction is buggy: for some sets of surviving
+// shards recovery is impossible even though enough parity shards are
+// present.
+//
+// The first dataShards rows of the result form an identity matrix,
+// so encoding leaves the data shards untouched. Each following row
+// k (counting parity rows from 0) holds galExp(c+1, k) in column c,
+// i.e. a transposed Vandermonde matrix whose base values start at 1
+// instead of 0.
+func buildMatrixPAR1(dataShards, totalShards int) (matrix, error) {
+	result, err := newMatrix(totalShards, dataShards)
+	if err != nil {
+		return nil, err
+	}

+	for r := range result {
+		// Identity part: a single 1 on the diagonal.
+		if r < dataShards {
+			result[r][r] = 1
+			continue
+		}

+		// Parity part: successive powers of the bases 1, 2, 3, ...
+		power := r - dataShards
+		for c := range result[r] {
+			result[r][c] = galExp(byte(c+1), power)
+		}
+	}
+	return result, nil
+}
+
 // New creates a new encoder and initializes it to
 // the number of data shards and parity shards that
 // you want to use. You can reuse this encoder.
@ -118,22 +180,16 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		return nil, ErrMaxShardNum
 	}

-	// Start with a Vandermonde matrix.  This matrix would work,
-	// in theory, but doesn't have the property that the data
-	// shards are unchanged after encoding.
-	vm, err := vandermonde(r.Shards, dataShards)
+	var err error
+	if r.o.usePAR1Matrix {
+		r.m, err = buildMatrixPAR1(dataShards, r.Shards)
+	} else {
+		r.m, err = buildMatrix(dataShards, r.Shards)
+	}
 	if err != nil {
 		return nil, err
 	}

-	// Multiply by the inverse of the top square of the matrix.
-	// This will make the top square be the identity matrix, but
-	// preserve the property that any square subset of rows  is
-	// invertible.
-	top, _ := vm.SubMatrix(0, 0, dataShards, dataShards)
-	top, _ = top.Invert()
-	r.m, _ = vm.Multiply(top)
-
 	// Inverted matrices are cached in a tree keyed by the indices
 	// of the invalid rows of the data to reconstruct.
 	// The inversion root node will have the identity matrix as
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@ -14,11 +14,99 @@ import (
 	"testing"
 )

+// isIncreasingAndContainsDataRow reports whether indices is strictly
+// increasing and its first (and therefore smallest) entry is less
+// than len(indices). Data rows occupy the upper square portion of
+// the matrix, so such a first entry selects a data row.
+//
+// indices must be non-empty; the first element is read
+// unconditionally.
+func isIncreasingAndContainsDataRow(indices []int) bool {
+	n := len(indices)
+	for next := 1; next < n; next++ {
+		if indices[next-1] >= indices[next] {
+			return false
+		}
+	}
+	return indices[0] < n
+}
+
+// incrementIndices advances indices in place like an odometer whose
+// digits each run over [0, indexBound), the last element being the
+// least significant digit. It returns false once the most significant
+// digit overflows (that digit is then left equal to indexBound), and
+// true otherwise. An empty slice yields true without any change.
+func incrementIndices(indices []int, indexBound int) (valid bool) {
+	pos := len(indices) - 1
+	for pos >= 0 {
+		indices[pos]++
+		switch {
+		case indices[pos] < indexBound:
+			// No carry needed; done.
+			return true
+		case pos == 0:
+			// The leading digit overflowed: nothing left to enumerate.
+			return false
+		}

+		// Wrap this digit and carry into the next more significant one.
+		indices[pos] = 0
+		pos--
+	}

+	return true
+}
+
+// incrementIndicesUntilIncreasingAndContainsDataRow keeps advancing
+// indices with incrementIndices (using maxIndex as the exclusive
+// bound for every entry) until isIncreasingAndContainsDataRow accepts
+// them. It returns true when such a combination was reached and
+// false when the enumeration ran out first.
+func incrementIndicesUntilIncreasingAndContainsDataRow(
+	indices []int, maxIndex int) bool {
+	for incrementIndices(indices, maxIndex) {
+		if isIncreasingAndContainsDataRow(indices) {
+			return true
+		}
+	}

+	return false
+}
+
+// findSingularSubMatrix searches m for a square sub-matrix, built
+// from len(m[0]) of its rows, that cannot be inverted. Only row
+// selections that are strictly increasing and include at least one
+// data row are tried (see isIncreasingAndContainsDataRow).
+//
+// It returns the first singular sub-matrix found, or (nil, nil) when
+// every candidate is invertible. Any error other than errSingular,
+// including a failure to allocate the candidate matrix, is returned
+// to the caller.
+//
+// m must have at least one row; its first row determines the number
+// of columns.
+func findSingularSubMatrix(m matrix) (matrix, error) {
+	rows := len(m)
+	cols := len(m[0])
+	rowIndices := make([]int, cols)
+	for incrementIndicesUntilIncreasingAndContainsDataRow(rowIndices, rows) {
+		subMatrix, err := newMatrix(cols, cols)
+		if err != nil {
+			return nil, err
+		}

+		// Copy the selected rows of m into the candidate.
+		for i, r := range rowIndices {
+			for c := 0; c < cols; c++ {
+				subMatrix[i][c] = m[r][c]
+			}
+		}

+		_, err = subMatrix.Invert()
+		switch {
+		case err == errSingular:
+			return subMatrix, nil
+		case err != nil:
+			return nil, err
+		}
+	}

+	return nil, nil
+}
+
+func TestBuildMatrixPAR1Singular(t *testing.T) {
+	totalShards := 8
+	dataShards := 4
+	m, err := buildMatrixPAR1(dataShards, totalShards)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	singularSubMatrix, err := findSingularSubMatrix(m)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if singularSubMatrix == nil {
+		t.Fatal("No singular sub-matrix found")
+	}
+
+	t.Logf("matrix %s has singular sub-matrix %s", m, singularSubMatrix)
+}
+
 func testOpts() [][]Option {
-	if !testing.Short() {
-		return [][]Option{}
+	if testing.Short() {
+		return [][]Option{
+			{WithPAR1Matrix()},
+		}
 	}
 	opts := [][]Option{
+		{WithPAR1Matrix()},
 		{WithMaxGoroutines(1), WithMinSplitSize(500), withSSE3(false), withAVX2(false)},
 		{WithMaxGoroutines(5000), WithMinSplitSize(50), withSSE3(false), withAVX2(false)},
 		{WithMaxGoroutines(5000), WithMinSplitSize(500000), withSSE3(false), withAVX2(false)},
@ -162,6 +250,43 @@ func testReconstruct(t *testing.T, o ...Option) {
 	}
 }

+// TestReconstructPAR1Singular checks that reconstruction with the
+// PAR1 matrix fails with errSingular for a specific set of missing
+// shards, even though four of the eight shards are still present.
+func TestReconstructPAR1Singular(t *testing.T) {
+	perShard := 50
+	r, err := New(4, 4, WithPAR1Matrix())
+	if err != nil {
+		t.Fatal(err)
+	}
+	shards := make([][]byte, 8)
+	for s := range shards {
+		shards[s] = make([]byte, perShard)
+	}

+	// Fixed seed so the shard contents are the same on every run.
+	rand.Seed(0)
+	for s := 0; s < 8; s++ {
+		fillRandom(shards[s])
+	}

+	err = r.Encode(shards)
+	if err != nil {
+		t.Fatal(err)
+	}

+	// Reconstruct with only the last data shard present, and the
+	// first, second, and fourth parity shard present (based on
+	// the result of TestBuildMatrixPAR1Singular). This should
+	// fail.
+	shards[0] = nil
+	shards[1] = nil
+	shards[2] = nil
+	shards[6] = nil

+	err = r.Reconstruct(shards)
+	if err != errSingular {
+		// Report both values: err may be nil here, and printing it
+		// alone would not say what was expected.
+		t.Fatalf("expected %v, got %v", errSingular, err)
+	}
+}
+
 func TestVerify(t *testing.T) {
 	testVerify(t)
 	for _, o := range testOpts() {