diff --git a/GNUmakefile b/GNUmakefile
index 0f35276..80cd3d3 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,24 +1,23 @@
 #
 # GNUmakefile for Galois field library
 #
-#
+# The default flags do *not* have the SSE instructions enabled.
+# Please cd to flag_tester and run which_compile_flags.sh to see which SSE instructions
+# your machine and compiler support, and which flags you should include below.
+
+CFLAGS = -O3
+LDFLAGS = -O3
 SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
        gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
        gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
-       gf_inline_time.c
+       gf_inline_time.c gf_example_5.c gf_example_6.c gf_example_7.c
 HDRS = gf_complete.h gf_int.h
 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
-              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
-
-CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
-LDFLAGS = -O3 -msse4 -maes -mpclmul
-
-# Use these if you don't have INTEL_PCLMUL
-# CFLAGS = -O3 -msse4 -DINTEL_SSE4
-# LDFLAGS = -O3 -msse4
+              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time \
+              gf_example_5 gf_example_6 gf_example_7
 RM = /bin/rm -f
@@ -45,6 +44,9 @@ gf_example_1: gf_example_1.o gf_complete.a
 gf_example_2: gf_example_2.o gf_complete.a
 gf_example_3: gf_example_3.o gf_complete.a
 gf_example_4: gf_example_4.o gf_complete.a
+gf_example_5: gf_example_5.o gf_complete.a
+gf_example_6: gf_example_6.o gf_complete.a
+gf_example_7: gf_example_7.o gf_complete.a
 gf_mult: gf_mult.o gf_complete.a
 gf_div: gf_div.o gf_complete.a
 gf_poly: gf_poly.o gf_complete.a
@@ -54,7 +56,8 @@ clean:
 	$(RM) $(OBJS) gf_div.c
 spotless: clean
-	$(RM) *~ $(EXECUTABLES)
+	$(RM) *~ $(EXECUTABLES) which_compile_flags
+	$(RM) gf_complete.a
 gf_div.o: gf_complete.h gf_method.h
 gf_methods.o: gf_complete.h gf_method.h
@@ -71,8 +74,12 @@ gf_example_1.o: gf_complete.h gf_rand.h
 gf_example_2.o: gf_complete.h gf_rand.h
 gf_example_3.o: gf_complete.h gf_rand.h
 gf_example_4.o: gf_complete.h gf_rand.h
+gf_example_5.o: gf_complete.h gf_rand.h
+gf_example_6.o: gf_complete.h gf_rand.h
+gf_example_7.o: gf_complete.h gf_rand.h
 gf_general.o: gf_complete.h gf_int.h gf_general.h gf_rand.h
 gf_mult.o: gf_complete.h gf_method.h
+gf.o: gf_complete.h gf_int.h
 gf_method.o: gf_complete.h
 gf_div.c: gf_mult.c
diff --git a/Log-Zero-for-w=8.odg b/Log-Zero-for-w=8.odg
deleted file mode 100644
index 138a673..0000000
Binary files a/Log-Zero-for-w=8.odg and /dev/null differ
diff --git a/Manual.pdf b/Manual.pdf
new file mode 100644
index 0000000..fdc9756
Binary files /dev/null and b/Manual.pdf differ
diff --git a/README b/README
deleted file mode 100644
index 4169e1c..0000000
--- a/README
+++ /dev/null
@@ -1 +0,0 @@
-This is a README file.
diff --git a/README.txt b/README.txt
index 91fecc5..0726922 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,13 @@
-This is GF-Complete, Revision 0.1.
+This is GF-Complete, Revision 1.0.
+
+The user's manual is in the file Manual.pdf.
+
+There are two online homes for GF-Complete:
+
+ - https://bitbucket.org/jimplank/gf-complete
+ - http://www.cs.utk.edu/~plank/plank/papers/CS-13-716.html
+
+When compiling this for the first time, cd to flag_tester, and
+do "sh which_compile_flags.sh xxx", where xxx is the compiler
+that you will use in the GNUmakefile.
-
-Please see http://www.cs.utk.edu/~plank/plank/papers/CS-13-703.html for the user's
-manual and other important documentation about this library, including more
-recent revisions.
diff --git a/explanation.html b/explanation.html
deleted file mode 100644
index 72f03d0..0000000
--- a/explanation.html
+++ /dev/null
@@ -1,777 +0,0 @@
-

Code structure as of 7/20/2012

-written by Jim.

-Ok -- once again, I have messed with the structure. My goals are flexibility and
-efficiency. It's similar to the stuff before, but better because it makes things
-like Euclid's method much cleaner.

-I think we're ready to hack.


Files


Prototypes and typedefs in gf.h

-The main structure that users will see is in gf.h, and it is of type
-gf_t:

-typedef struct gf {
-  gf_func_a_b    multiply;
-  gf_func_a_b    divide;
-  gf_func_a      inverse;
-  gf_region      multiply_region;
-  void           *scratch;
-} gf_t;
-

-We can beef it up later with buf-buf or buf-acc. The problem is that the paper is
-already bloated, so right now, I want to keep it lean.

-The types of the procedures are big unions, so that they work with the following
-types of arguments:

-typedef uint8_t     gf_val_4_t;
-typedef uint8_t     gf_val_8_t;
-typedef uint16_t    gf_val_16_t;
-typedef uint32_t    gf_val_32_t;
-typedef uint64_t    gf_val_64_t;
-typedef uint64_t    *gf_val_128_t;
-typedef uint32_t    gf_val_gen_t;   /* The intent here is for general values <= 32 */
-

-To use one of these, you need to create one with gf_init_easy() or
-gf_init_hard(). Let's concentrate on the former:

-extern int gf_init_easy(gf_t *gf, int w, int mult_type);
-

-You pass it memory for a gf_t, a value of w and
-a variable that says how to do multiplication. The valid values of mult_type
-are enumerated in gf.h:

-typedef enum {GF_MULT_DEFAULT,
-              GF_MULT_SHIFT,
-              GF_MULT_GROUP,
-              GF_MULT_BYTWO_p,
-              GF_MULT_BYTWO_b,
-              GF_MULT_TABLE,
-              GF_MULT_LOG_TABLE,
-              GF_MULT_SPLIT_TABLE,
-              GF_MULT_COMPOSITE } gf_mult_type_t;
-

-After creating the gf_t, you use its multiply method
-to multiply, using the union's fields to work with the various types.
-It looks easier than my explanation. For example, suppose you wanted to
-multiply 5 and 4 in GF(2^4). You can do it as in
-gf_54.c:

-#include <stdio.h>
-#include <stdlib.h>
-#include "gf.h"
-
-int main()
-{
-  gf_t gf;
-
-  gf_init_easy(&gf, 4, GF_MULT_DEFAULT);
-  printf("%d\n", gf.multiply.w4(&gf, 5, 4));
-  exit(0);
-}
-

-If you wanted to multiply in GF(2^8), then you'd have to use 8 as a parameter
-to gf_init_easy, and call the multiplier as gf.multiply.w8().

-When you're done with your gf_t, you should call gf_free() on it so
-that it can free memory that it has allocated. We'll talk more about memory
-later, but if you create your gf_t with gf_init_easy, then it calls malloc(),
-and if you care about freeing memory, you'll have to call gf_free().
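-For instance, here is a minimal sketch of the full lifecycle. It assumes the
-two-argument gf_free(gf, recursive) described later in this document; treat
-it as illustrative rather than canonical:
-
-#include "gf.h"
-
-int main()
-{
-  gf_t gf;
-
-  if (gf_init_easy(&gf, 4, GF_MULT_DEFAULT) == 0) return 1;
-  /* ... use gf.multiply.w4(&gf, a, b), etc. ... */
-  gf_free(&gf, 0);   /* 0 = don't recursively free base fields */
-  return 0;
-}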


Memory allocation

-Each implementation of a multiplication technique keeps around its
-own data. For example, GF_MULT_TABLE keeps around
-multiplication and division tables, and GF_MULT_LOG maintains log and
-antilog tables. This data is stored in the pointer scratch. My intent
-is that the memory that is there is all that's required. In other
-words, the multiply(), divide(), inverse() and
-multiply_region() calls don't do any memory allocation.
-Moreover, gf_init_easy() only allocates one chunk of memory --
-the one in scratch.

-If you don't want to have the initialization call allocate memory, you can
-use gf_init_hard():

-extern int gf_init_hard(gf_t *gf,
-                        int w,
-                        int mult_type,
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1,
-                        int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory);
-

-The first three parameters are the same as gf_init_easy().
-You can add additional arguments for performing multiply_region, and
-for performing division in the region_type and divide_type
-arguments. Their values are also defined in gf.h. You can
-mix the region_type values (e.g. "DOUBLE" and "SSE"):

-#define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
-
-typedef uint32_t gf_region_type_t;
-
-typedef enum { GF_DIVIDE_DEFAULT,
-               GF_DIVIDE_MATRIX,
-               GF_DIVIDE_EUCLID } gf_division_type_t;
-

-You can change
-the primitive polynomial with prim_poly, give additional arguments with
-arg1 and arg2 and give a base Galois Field for composite fields.
-Finally, you can pass it a pointer to memory in scratch_memory. That
-way, you can avoid having gf_init_hard() call malloc().

-There is a procedure called gf_scratch_size() that lets you know the minimum
-size for scratch_memory, depending on w, the multiplication type
-and the arguments:

-extern int gf_scratch_size(int w,
-                           int mult_type,
-                           int region_type,
-                           int divide_type,
-                           int arg1,
-                           int arg2);
-

-You can specify default arguments in gf_init_hard().
-If any argument is equal to its default, then default actions are taken (e.g. a
-standard primitive polynomial is used, or memory is allocated for scratch_memory).
-In fact, gf_init_easy() simply calls gf_init_hard() with the default
-parameters.
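-As a sketch, that forwarding presumably looks like the following (the real
-code lives in gf.c; treat this as illustrative):
-
-int gf_init_easy(gf_t *gf, int w, int mult_type)
-{
-  /* 0's select the default polynomial and arguments; NULL scratch_memory
-     makes gf_init_hard() malloc() the scratch itself. */
-  return gf_init_hard(gf, w, mult_type, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
-                      0, 0, 0, NULL, NULL);
-}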

-gf_free() frees memory that was allocated with gf_init_easy()
-or gf_init_hard(). The recursive parameter is in case you
-use composite fields, and want to recursively free the base fields.
-If you pass scratch_memory to gf_init_hard(), then you typically
-don't need to call gf_free(). It won't hurt to call it, though.
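-To make the no-malloc path concrete, here is a sketch; the method and
-argument values are hypothetical -- use whatever combination you actually
-need:
-
-int sz;
-void *scratch;
-gf_t gf;
-
-sz = gf_scratch_size(4, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
-scratch = malloc(sz);                  /* or point into memory you already manage */
-gf_init_hard(&gf, 4, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
-             0, 0, 0, NULL, scratch);  /* prim_poly/arg1/arg2 = 0 mean defaults */
-/* ... use gf; we own scratch, so no gf_free() is required ... */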



gf_mult and gf_div

-For the moment, I have only a few things completely implemented, but that's
-because I want to be able to explain the structure, and how to specify
-methods. In particular, for w=4, I have implemented SHIFT and LOG. For
-w=8, 16, 32, 64 I have implemented SHIFT. For all w ≤ 32, I have implemented
-both Euclid's algorithm for inversion, and the matrix method for inversion.
-For w=64, it's just Euclid. You can test these all with gf_mult and gf_div.
-Here are a few calls:
-UNIX> gf_mult 7 11 4                - Default
-4
-UNIX> gf_mult 7 11 4 SHIFT - -      - Use shift
-4
-UNIX> gf_mult 7 11 4 LOG - -        - Use logs
-4
-UNIX> gf_div 4 7 4                  - Default
-11
-UNIX> gf_div 4 7 4 LOG - -          - Use logs
-11
-UNIX> gf_div 4 7 4 LOG - EUCLID     - Use Euclid instead of logs
-11
-UNIX> gf_div 4 7 4 LOG - MATRIX     - Use Matrix inversion instead of logs
-11
-UNIX> gf_div 4 7 4 SHIFT - -        - Default
-11
-UNIX> gf_div 4 7 4 SHIFT - EUCLID   - Use Euclid (which is the default)
-11
-UNIX> gf_div 4 7 4 SHIFT - MATRIX   - Use Matrix inversion instead of logs
-11
-UNIX> gf_mult 200 211 8        - The remaining calls use shift/Euclid
-201
-UNIX> gf_div 201 211 8
-200
-UNIX> gf_mult 60000 65111 16
-63515
-UNIX> gf_div 63515 65111 16
-60000
-UNIX> gf_mult abcd0001 9afbf788 32h
-b0359681
-UNIX> gf_div b0359681 9afbf788 32h
-abcd0001
-UNIX> gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h
-3a7def35185bd571
-UNIX> gf_div 3a7def35185bd571 9afbf7887f6d8e5b 64h
-abcd00018c8b8c8a
-UNIX> 
-
-You can see all the methods with gf_methods. We have a lot of implementing to do:
-UNIX> gf_methods
-To specify the methods, do one of the following: 
-       - leave empty to use defaults
-       - use a single dash to use defaults
-       - specify MULTIPLY REGION DIVIDE
-
-Legal values of MULTIPLY:
-       SHIFT: shift
-       GROUP g_mult g_reduce: the Group technique - see the paper
-       BYTWO_p: BYTWO doubling the product.
-       BYTWO_b: BYTWO doubling b (more efficient than BYTWO_p)
-       TABLE: Full multiplication table
-       LOG:   Discrete logs
-       LOG_ZERO: Discrete logs with a large table for zeros
-       SPLIT g_a g_b: Split tables defined by g_a and g_b
-       COMPOSITE k l [METHOD]: Composite field, recursively specify the
-                               method of the base field in GF(2^l)
-
-Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'
-       -: Use defaults
-       SINGLE/DOUBLE/QUAD: Expand tables
-       LAZY: Lazily create table (only applies to TABLE and SPLIT)
-       SSE/NOSSE: Use 128-bit SSE instructions if you can
-       CAUCHY/ALTMAP/STDMAP: Use different memory mappings
-
-Legal values of DIVIDE:
-       -: Use defaults
-       MATRIX: Use matrix inversion
-       EUCLID: Use the extended Euclidean algorithm.
-
-See the user's manual for more information.
-There are many restrictions, so it is better to simply use defaults in most cases.
-UNIX> 
-

gf_unit and gf_time

-gf_unit.c is a unit tester, and
-gf_time.c is a time tester.
-They are called as follows:

-UNIX> gf_unit w tests seed [METHOD] 
-UNIX> gf_time w tests seed size(bytes) iterations [METHOD] 
-

-The tests parameter is one or more characters specifying which tests to run
-(for example, the AV used below).

-seed is a seed for srand48() -- using -1 defaults to the current time.

-For example, testing the defaults with w=4:

-UNIX> gf_unit 4 AV 1 LOG - -
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-Testing buffer-constant, src != dest, xor = 0
-Testing buffer-constant, src != dest, xor = 1
-Testing buffer-constant, src == dest, xor = 0
-Testing buffer-constant, src == dest, xor = 1
-UNIX> gf_unit 4 AV 1 SHIFT - -
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-No multiply_region.
-UNIX> 
-
-There is no multiply_region() method defined for SHIFT.
-Thus, the procedures are NULL and the unit tester ignores them.
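-Callers can rely on the same convention. A hedged sketch (I am assuming the
-region call's argument order is (gf, src, dest, val, bytes, xor); check gf.h
-for the real signature):
-
-if (gf.multiply_region.w4 != NULL) {
-  gf.multiply_region.w4(&gf, src, dest, val, bytes, 0);  /* xor = 0: overwrite dest */
-} else {
-  /* fall back to multiplying word-by-word with gf.multiply.w4() */
-}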

-At the moment, I only have the unit tester working for w=4.

-gf_time takes the size of an array (in bytes) and a number of iterations, and
-tests the speed of both single and region operations. The tests it runs are
-visible in the output below.

-Here are some examples with SHIFT and LOG on my mac.
-UNIX> gf_time 4 A 1 102400 1024 LOG - -
-Seed: 1
-Multiply:   0.538126 s      185.830 Mega-ops/s
-Divide:     0.520825 s      192.003 Mega-ops/s
-Inverse:    0.631198 s      158.429 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.478395 s      209.032 MB/s
-Buffer-Const,s!=d,xor=1:    0.524245 s      190.751 MB/s
-Buffer-Const,s==d,xor=0:    0.471851 s      211.931 MB/s
-Buffer-Const,s==d,xor=1:    0.528275 s      189.295 MB/s
-UNIX> gf_time 4 A 1 102400 1024 LOG - EUCLID
-Seed: 1
-Multiply:   0.555512 s      180.014 Mega-ops/s
-Divide:     5.359434 s       18.659 Mega-ops/s
-Inverse:    4.911719 s       20.359 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.496097 s      201.573 MB/s
-Buffer-Const,s!=d,xor=1:    0.538536 s      185.689 MB/s
-Buffer-Const,s==d,xor=0:    0.485564 s      205.946 MB/s
-Buffer-Const,s==d,xor=1:    0.540227 s      185.107 MB/s
-UNIX> gf_time 4 A 1 102400 1024 LOG - MATRIX
-Seed: 1
-Multiply:   0.544005 s      183.822 Mega-ops/s
-Divide:     7.602822 s       13.153 Mega-ops/s
-Inverse:    7.000564 s       14.285 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.474868 s      210.585 MB/s
-Buffer-Const,s!=d,xor=1:    0.527588 s      189.542 MB/s
-Buffer-Const,s==d,xor=0:    0.473130 s      211.358 MB/s
-Buffer-Const,s==d,xor=1:    0.529877 s      188.723 MB/s
-UNIX> gf_time 4 A 1 102400 1024 SHIFT - -
-Seed: 1
-Multiply:   2.708842 s       36.916 Mega-ops/s
-Divide:     8.756882 s       11.420 Mega-ops/s
-Inverse:    5.695511 s       17.558 Mega-ops/s
-UNIX> 
-
-At the moment, I only have the timer working for w=4.

Walking you through LOG

-To see how scratch is used to store data, let's look at what happens when
-you call gf_init_easy(&gf, 4, GF_MULT_LOG_TABLE).
-First, gf_init_easy() calls gf_init_hard() with default parameters.
-This is in gf.c.

-gf_init_hard()'s first job is to set up the scratch.
-The scratch's type is gf_internal_t, defined in
-gf_int.h:

-typedef struct {
-  int mult_type;
-  int region_type;
-  int divide_type;
-  int w;
-  uint64_t prim_poly;
-  int free_me;
-  int arg1;
-  int arg2;
-  gf_t *base_gf;
-  void *private;
-} gf_internal_t;
-

-All the fields are straightforward, with the exception of private. That is
-a (void *) which points to the implementation's private data.
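-The access pattern, which appears verbatim in the routines below, is simply:
-
-gf_internal_t *h = (gf_internal_t *) gf->scratch;
-struct gf_logtable_data *ltd = (struct gf_logtable_data *) h->private;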

-Here's the code for gf_init_hard():

-int gf_init_hard(gf_t *gf, int w, int mult_type, 
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1, int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory) 
-{
-  int sz;
-  gf_internal_t *h;
-
-
-  if (scratch_memory == NULL) {
-    sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-    if (sz <= 0) return 0;
-    h = (gf_internal_t *) malloc(sz);
-    h->free_me = 1;
-  } else {
-    h = scratch_memory;
-    h->free_me = 0;
-  }
-  gf->scratch = (void *) h;
-  h->mult_type = mult_type;
-  h->region_type = region_type;
-  h->divide_type = divide_type;
-  h->w = w;
-  h->prim_poly = prim_poly;
-  h->arg1 = arg1;
-  h->arg2 = arg2;
-  h->base_gf = base_gf;
-  h->private = (void *) gf->scratch;
-  h->private += (sizeof(gf_internal_t));
-
-  switch(w) {
-    case 4: return gf_w4_init(gf);
-    case 8: return gf_w8_init(gf);
-    case 16: return gf_w16_init(gf);
-    case 32: return gf_w32_init(gf);
-    case 64: return gf_w64_init(gf);
-    case 128: return gf_dummy_init(gf);
-    default: return 0;
-  }
-}
-

-The first thing it does is determine if it has to allocate space for scratch.
-If it must, it uses gf_scratch_size() to figure out how big the space must be.
-It then sets gf->scratch to this space, and sets all of the fields of the
-scratch to the arguments of gf_init_hard(). The private pointer is
-set to the space just after the gf_internal_t struct at the front of
-gf->scratch. Again, it is up to
-gf_scratch_size() to make sure there is enough space for the scratch, and
-for all of the private data needed by the implementation.
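-In other words, the single allocation is laid out as pictured below; this is
-just a diagram of the pointer arithmetic above, not a new data structure:
-
-/*
- *  gf->scratch
- *  |
- *  v
- *  +-----------------+---------------------------------------+
- *  | gf_internal_t   | implementation's private data         |
- *  +-----------------+---------------------------------------+
- *                    ^
- *                    h->private = gf->scratch + sizeof(gf_internal_t)
- */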

-Once the scratch is set up, gf_init_hard() calls gf_w4_init(). This is
-in gf_w4.c, and it is a
-simple dispatcher to the various initialization routines, plus it
-sets EUCLID and MATRIX if need be:

-int gf_w4_init(gf_t *gf)
-{
-  gf_internal_t *h;
-
-  h = (gf_internal_t *) gf->scratch;
-  if (h->prim_poly == 0) h->prim_poly = 0x13;
-
-  gf->multiply.w4 = NULL;
-  gf->divide.w4 = NULL;
-  gf->inverse.w4 = NULL;
-  gf->multiply_region.w4 = NULL;
-
-  switch(h->mult_type) {
-    case GF_MULT_SHIFT:     if (gf_w4_shift_init(gf) == 0) return 0; break;
-    case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
-    case GF_MULT_DEFAULT:   if (gf_w4_log_init(gf) == 0) return 0; break;
-    default: return 0;
-  }
-  if (h->divide_type == GF_DIVIDE_EUCLID) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-    gf->inverse.w4 = gf_w4_euclid;
-  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-    gf->inverse.w4 = gf_w4_matrix;
-  }
-
-  if (gf->inverse.w4 != NULL && gf->divide.w4 == NULL) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-  }
-  if (gf->inverse.w4 == NULL && gf->divide.w4 != NULL) {
-    gf->inverse.w4 = gf_w4_inverse_from_divide;
-  }
-  return 1;
-}
-

-The code in gf_w4_log_init() sets up the log and antilog tables, and sets
-the multiply.w4, divide.w4 etc. routines to be the ones for logs. The
-tables are put into gf->scratch->private, which is typecast to a struct
-gf_logtable_data *:

-struct gf_logtable_data {
-    gf_val_4_t      log_tbl[GF_FIELD_SIZE];
-    gf_val_4_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    gf_val_4_t      *antilog_tbl_div;
-};
-.......
-
-static 
-int gf_w4_log_init(gf_t *gf)
-{
-  gf_internal_t *h;
-  struct gf_logtable_data *ltd;
-  int i, b;
-
-  h = (gf_internal_t *) gf->scratch;
-  ltd = h->private;
-
-  ltd->log_tbl[0] = 0;
-
-  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
-  b = 1;
-  for (i = 0; i < GF_FIELD_SIZE-1; i++) {
-      ltd->log_tbl[b] = (gf_val_8_t)i;
-      ltd->antilog_tbl[i] = (gf_val_8_t)b;
-      ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = (gf_val_8_t)b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
-  }
-    
-  gf->inverse.w4 = gf_w4_inverse_from_divide;
-  gf->divide.w4 = gf_w4_log_divide;
-  gf->multiply.w4 = gf_w4_log_multiply;
-  gf->multiply_region.w4 = gf_w4_log_multiply_region;
-  return 1;
-}
-

-And of course the individual routines use h->private to access the tables:

-static
-inline
-gf_val_8_t gf_w4_log_multiply (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
-  struct gf_logtable_data *ltd;
-    
-  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
-  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
-}
-
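-The division routine isn't shown, but the point of antilog_tbl_div (set above
-to the middle of the doubled antilog table) is that a negative difference of
-logs still indexes valid memory. Presumably it looks roughly like this sketch:
-
-static
-inline
-gf_val_8_t gf_w4_log_divide (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
-  struct gf_logtable_data *ltd;
-
-  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
-  /* log(a) - log(b) may be negative; antilog_tbl_div absorbs the offset. */
-  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl_div[ltd->log_tbl[a] - ltd->log_tbl[b]];
-}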

-Finally, it's important that the proper sizes are put into
-gf_w4_scratch_size() for each implementation:

-int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
-{
-  int region_tbl_size;
-  switch(mult_type)
-  {
-    case GF_MULT_DEFAULT:
-    case GF_MULT_LOG_TABLE:
-      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
-      break;
-    case GF_MULT_SHIFT:
-      return sizeof(gf_internal_t);
-      break;
-    default:
-      return -1;
-   }
-}
-

-I hope that's enough explanation for y'all to start implementing. Let me know
-if you have problems -- thanks -- Jim


-The initial structure has been set for w=4, 8, 16, 32 and 64, with
-implementations of SHIFT and EUCLID, and for w <= 32, MATRIX. There are
-some weird caveats, though.

Things we need to Implement: w=4

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Single TABLE Done - Jim
Double TABLE Done - Jim
Double TABLE, SSE Done - Jim
Quad TABLE Done - Jim
Lazy Quad TABLE Done - Jim
LOG Done - Jim


Things we need to Implement: w=8

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Single TABLE Done - Kevin
Double TABLE Done - Jim
Lazy Double TABLE Done - Jim
Split 2 1 (Half) SSE Done - Jim
Composite, k=2 Done - Kevin (alt mapping not passing unit test)
LOG Done - Kevin
LOG ZERO Done - Jim


Things we need to Implement: w=16

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Lazy TABLE Done - Jim
Split 4 16 No-SSE, lazy Done - Jim
Split 4 16 SSE, lazy Done - Jim
Split 4 16 SSE, lazy, alternate mapping Done - Jim
Split 8 16, lazy Done - Jim
Composite, k=2, stdmap recursive Done - Kevin
Composite, k=2, altmap recursive Done - Kevin
Composite, k=2, stdmap inline Done - Kevin
LOG Done - Kevin
LOG ZERO Done - Kevin
Group 4 4 Done - Jim: I don't see a reason to implement others, although 4-8 will be faster, and 8 8 will have faster region ops. They'll never beat SPLIT.


Things we need to Implement: w=32

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Split 2 32,lazy Done - Jim
Split 2 32, SSE, lazy Done - Jim
Split 4 32, lazy Done - Jim
Split 4 32, SSE,ALTMAP lazy Done - Jim
Split 4 32, SSE, lazy Done - Jim
Split 8 8 Done - Jim
Group, g_s == g_r Done - Jim
Group, any g_s and g_r Done - Jim
Composite, k=2, stdmap recursive Done - Kevin
Composite, k=2, altmap recursive Done - Kevin
Composite, k=2, stdmap inline Done - Kevin


Things we need to Implement: w=64

SHIFT Done - Jim
BYTWO_p -
BYTWO_b -
BYTWO_p, SSE -
BYTWO_b, SSE -
Split 16 1 SSE, maybe lazy -
Split 8 1 lazy -
Split 8 8 -
Split 8 8 lazy -
Group -
Composite, k=2, alternate mapping -


Things we need to Implement: w=128

SHIFT Done - Will
BYTWO_p -
BYTWO_b -
BYTWO_p, SSE -
BYTWO_b, SSE -
Split 32 1 SSE, maybe lazy -
Split 16 1 lazy -
Split 16 16 - Maybe that's insanity -
Split 16 16 lazy -
Group (SSE) -
Composite, k=?, alternate mapping -


Things we need to Implement: w=general between 1 & 32

CAUCHY Region (SSE XOR) Done - Jim
SHIFT Done - Jim
TABLE Done - Jim
LOG Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
Group, g_s == g_r Done - Jim
Group, any g_s and g_r Done - Jim
Split - do we need it? Done - Jim
Composite - do we need it? -
Split - do we need it? -
Logzero? -

diff --git a/flag_tester/README.txt b/flag_tester/README.txt new file mode 100644 index 0000000..19101ff --- /dev/null +++ b/flag_tester/README.txt @@ -0,0 +1,10 @@ +Run which_compile_flags.sh and it will print out the compile flags to use in + GNUmakefile. By default, this script uses "cc" as its compiler but you can + pass in the name of your compiler as an argument. + +EXAMPLE: "./which_compile_flags.sh clang" + +This script will run "clang" in the above example so be warned that if you type +something like "rm" for that argument, you get what you asked for. Also, make +sure that the compiler that you pass to which_compile_flags.sh is the same as +the compiler in GNUmakefile. diff --git a/flag_tester/flag_test.c b/flag_tester/flag_test.c new file mode 100644 index 0000000..cecf472 --- /dev/null +++ b/flag_tester/flag_test.c @@ -0,0 +1,120 @@ +/* + * flag_test.c - copied from whats_my_sse.c to output proper compile + * flags for the GNUmakefile + * + */ + +#include +#include +#include +#include "intel_cpu_capabilities.h" + +void usage() +{ + fprintf(stderr, "usage: flag_test \n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + //make sure to extend these buffers if more flags are added to this program + char cflags[1000], ldflags[1000], buf[1000]; + FILE *file; + char sse_found = 0; + + if(argc != 2) + usage(); + + sprintf(cflags, "CFLAGS = -O3"); + sprintf(ldflags, "LDFLAGS = -O3"); + + if(cpu_has_feature(CPU_CAP_SSE42)) + { + sprintf(buf, "%s sse_test.c -o sse4 -msse4 -DSSE4 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("sse4", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./sse4 > temp.txt 2> /dev/null"); + system("diff sse4_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -msse4 -DINTEL_SSE4"); + strcat(ldflags, " -msse4"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_SSSE3) && !sse_found) + { + sprintf(buf, "%s sse_test.c -o ssse3 -mssse3 -DSSSE3 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("ssse3", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./ssse3 > temp.txt 2> /dev/null"); + system("diff ssse3_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -mssse3 -DINTEL_SSSE3"); + strcat(ldflags, " -mssse3"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_SSE2) && !sse_found) + { + sprintf(buf, "%s sse_test.c -o sse2 -msse2 -DSSE2 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("sse2", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./sse2 > temp.txt 2> /dev/null"); + system("diff sse2_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -msse2 -DINTEL_SSE2"); + strcat(ldflags, " -msse2"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_PCLMULQDQ) && sse_found) + { + sprintf(buf, "%s pclmul_test.c -o pclmul -maes -mpclmul 2> /dev/null" + , argv[1]); + system(buf); + if(file = fopen("pclmul", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./pclmul > temp.txt 2> /dev/null"); + system("diff pclmul_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -maes -mpclmul -DINTEL_PCLMUL"); + 
strcat(ldflags, " -maes -mpclmul"); + } + fclose(file); + } + } + + printf("%s\n%s\n", cflags, ldflags); +} diff --git a/intel_cpu_capabilities.h b/flag_tester/intel_cpu_capabilities.h similarity index 95% rename from intel_cpu_capabilities.h rename to flag_tester/intel_cpu_capabilities.h index 5fe0fea..6d1bbeb 100644 --- a/intel_cpu_capabilities.h +++ b/flag_tester/intel_cpu_capabilities.h @@ -16,7 +16,7 @@ #define CPU_CPSSE 0x2000 #define CPU_CAP_SSE3 (CPU_CPSSE | 0) #define CPU_CAP_PCLMULQDQ (CPU_CPSSE | 1) -#define CPU_CAP_SSSE3 (CPU_CPSSE | 10) +#define CPU_CAP_SSSE3 (CPU_CPSSE | 9) #define CPU_CAP_SSE41 (CPU_CPSSE | 19) #define CPU_CAP_SSE42 (CPU_CPSSE | 20) #define CPU_CAP_AVX (CPU_CPSSE | 28) @@ -25,7 +25,6 @@ __asm__ __volatile__ ("cpuid":\ "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func)); -inline int cpu_has_feature (unsigned which) { diff --git a/flag_tester/pclmul_test.c b/flag_tester/pclmul_test.c new file mode 100644 index 0000000..bdae184 --- /dev/null +++ b/flag_tester/pclmul_test.c @@ -0,0 +1,40 @@ +#include +#include +#include + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + + +int main() +{ + uint64_t answer; + uint32_t pp; + __m128i a, b, c; + + a = _mm_set1_epi8(0x0D); + b = _mm_set_epi32(0,0,0,0x0A); + pp = 0x13; + MM_PRINT8("a", a); + MM_PRINT8("b", b); + + c = _mm_clmulepi64_si128(a, b, 0); + MM_PRINT8("a clm b", c); + + a = _mm_set1_epi8(0xf0); + MM_PRINT8("a", a); + b = _mm_and_si128(a, c); + b = _mm_srli_epi64(b, 4); + MM_PRINT8("shifted", b); + + + a = _mm_set_epi32(0,0,0,pp); + MM_PRINT8("PP", a); + + b = _mm_clmulepi64_si128(a, b, 0); + MM_PRINT8("PP clm over", b); + + c = _mm_xor_si128(c,b); + MM_PRINT8("Answer", c); + //answer = _mm_extract_epi64(c, 0); + //printf("%llx\n", answer); +} diff --git a/flag_tester/pclmul_test.txt b/flag_tester/pclmul_test.txt new file mode 100644 index 0000000..6102f94 --- /dev/null +++ b/flag_tester/pclmul_test.txt @@ -0,0 +1,8 @@ +a 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d +b 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0a +a clm b 00 00 00 00 00 00 00 00 72 72 72 72 72 72 72 72 +a f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +shifted 00 00 00 00 00 00 00 00 07 07 07 07 07 07 07 07 +PP 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 13 +PP clm over 00 00 00 00 00 00 00 00 79 79 79 79 79 79 79 79 +Answer 00 00 00 00 00 00 00 00 0b 0b 0b 0b 0b 0b 0b 0b diff --git a/flag_tester/sse2_test.txt b/flag_tester/sse2_test.txt new file mode 100644 index 0000000..f79b6e0 --- /dev/null +++ b/flag_tester/sse2_test.txt @@ -0,0 +1,30 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b 
epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 diff --git a/flag_tester/sse4_test.txt b/flag_tester/sse4_test.txt new file mode 100644 index 0000000..3f6d7ec --- /dev/null +++ b/flag_tester/sse4_test.txt @@ -0,0 +1,35 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +d insert32 @ 2 00 00 00 00 ab cd 12 34 00 00 00 00 00 00 00 00 +extract_epi32 @ 2: abcd1234 +d insert64 @ 0 00 00 00 00 ab cd 12 34 fe dc ba 12 91 82 73 64 +extract_epi64 @ 0: fedcba1291827364 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 +a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 diff --git a/flag_tester/sse_test.c b/flag_tester/sse_test.c new file mode 100644 index 0000000..e40cf25 --- /dev/null +++ b/flag_tester/sse_test.c @@ -0,0 +1,142 @@ +#ifdef SSE4 +#define SSSE3 +#include +#endif + +#ifdef SSSE3 +#define SSE2 +#include +#endif + +#ifdef SSE2 +#include +#endif + +#include +#include +#include + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; 
ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +int main() +{ + uint32_t u32; + uint64_t u64; + uint8_t *ui8 = malloc(20), i; + __m128i a, b, c, d; + + for(i=0; i < 20; i++) + ui8[i] = i; + + a = _mm_load_si128( (__m128i *) ui8 ); + b = _mm_loadu_si128( (__m128i *) (ui8+1)); + c = _mm_loadu_si128( (__m128i *) (ui8+2)); + d = _mm_loadu_si128( (__m128i *) (ui8+3)); + + MM_PRINT8("a", a); + MM_PRINT8("b", b); + MM_PRINT8("c", c); + MM_PRINT8("d", d); + + a = _mm_slli_epi16(a, 2); + b = _mm_slli_epi32(b, 2); + c = _mm_slli_epi64(c, 2); + d = _mm_slli_si128(d, 2); + + MM_PRINT8("a sl16", a); + MM_PRINT8("b sl32", b); + MM_PRINT8("c sl64", c); + MM_PRINT8("d sl128", d); + + a = _mm_srli_epi16(a, 2); + b = _mm_srli_epi32(b, 2); + c = _mm_srli_epi64(c, 2); + d = _mm_srli_si128(d, 2); + + MM_PRINT8("a sr16", a); + MM_PRINT8("b sr32", b); + MM_PRINT8("c sr64", c); + MM_PRINT8("d sr128", d); + + d = _mm_xor_si128(a, b); + MM_PRINT8("d = a^b", d); + + d = _mm_sub_epi8(a, b); + MM_PRINT8("d = a-b epi8", d); + + d = _mm_sub_epi16(a, b); + MM_PRINT8("d = a-b epi16", d); + + d = _mm_sub_epi32(a, b); + MM_PRINT8("d = a-b epi32", d); + + d = _mm_sub_epi64(a, b); + MM_PRINT8("d = a-b epi64", d); + + d = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + MM_PRINT8("d set_epi8", d); + + d = _mm_set_epi32(0x12345678, 0x9abcdef0, 0x12345678, 0x9abcdef0); + MM_PRINT8("d set_epi32", d); + + d = _mm_set1_epi64x(0xF0F0F0F0F0F0F0F0ULL); + MM_PRINT8("d set1_epi64", d); + + d = _mm_set1_epi32(0xe2e2e2e2); + MM_PRINT8("d set1_epi32", d); + + d = _mm_set1_epi16(0xaff3); + MM_PRINT8("d set1_epi16", d); + + d = _mm_set1_epi8(0xc5); + MM_PRINT8("d set1_epi8", d); + + d = _mm_packus_epi16(d, d); + MM_PRINT8("d packus_epi16(d,d)", d); + + c = _mm_unpackhi_epi8(a, d); + MM_PRINT8("c unpackhi(a,d)", c); + + b = _mm_unpacklo_epi8(c, a); + MM_PRINT8("b unpacklo(c,a)", b); + + d = _mm_and_si128(d, b); + MM_PRINT8("d and(d,b)", d); + + _mm_store_si128( (__m128i *) ui8, a); + printf("a stored to mem: "); + for(i=0; i < 16; i++) + printf("%u ", ui8[i]); + printf("\n"); + + d = _mm_setzero_si128(); + MM_PRINT8("d setzero", d); + + u32 = 0xABCD1234; + u64 = 0xFEDCBA1291827364ULL; + + #ifdef SSE4 + d = _mm_insert_epi32(d, u32, 2); + MM_PRINT8("d insert32 @ 2", d); + + u32 = 0; + u32 = _mm_extract_epi32(d, 2); + printf("extract_epi32 @ 2: %x\n", u32); + + d = _mm_insert_epi64(d, u64, 0); + MM_PRINT8("d insert64 @ 0", d); + + u64 = 0; + u64 = _mm_extract_epi64(d, 0); + printf("extract_epi64 @ 0: %" PRIx64 "\n", u64); + #endif + + c = _mm_set1_epi8(5); + MM_PRINT8("c", c); + + #ifdef SSSE3 + a = _mm_shuffle_epi8(b, c); + MM_PRINT8("a shuffle(b, c)", a); + #endif + +} diff --git a/flag_tester/ssse3_test.txt b/flag_tester/ssse3_test.txt new file mode 100644 index 0000000..17bee1a --- /dev/null +++ b/flag_tester/ssse3_test.txt @@ -0,0 +1,31 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = 
a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 +a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 diff --git a/whats_my_sse.c b/flag_tester/whats_my_sse.c similarity index 100% rename from whats_my_sse.c rename to flag_tester/whats_my_sse.c diff --git a/flag_tester/which_compile_flags.sh b/flag_tester/which_compile_flags.sh new file mode 100755 index 0000000..f39c609 --- /dev/null +++ b/flag_tester/which_compile_flags.sh @@ -0,0 +1,19 @@ +if [ -n "$1" ]; then + CC=$1 +else + CC=cc +fi + +$CC flag_test.c -o flag_test 2> /dev/null +if [ -e "flag_test" ]; then + OUTPUT=`./flag_test $CC 2> /dev/null` + if [ -n "$OUTPUT" ]; then + echo "$OUTPUT" + else + printf "CFLAGS = -O3\nLDFLAGS = -O3\n" + fi +else + printf "$CC failed to compile flag_test.c\n" +fi + +rm sse4 sse2 ssse3 pclmul diff.txt flag_test temp.txt 2> /dev/null diff --git a/gf.c b/gf.c index 4304e1d..b027473 100644 --- a/gf.c +++ b/gf.c @@ -8,6 +8,405 @@ #include #include +int _gf_errno = GF_E_DEFAULT; + +void gf_error() +{ + char *s; + + switch(_gf_errno) { + case GF_E_DEFAULT: s = "No Error."; break; + case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; + case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; + case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; + case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; + case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; + case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; + case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; + case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; + case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; + case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; + case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; + case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; + case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; + case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; + case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; + case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. 
(Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; + case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; + case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; + case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; + case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; + case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; + case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; + case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; + case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; + case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; + case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; + case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; + case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; + case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; + case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; + case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; + case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; + case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; + case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; + case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; + case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; + case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; + case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; + case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; + case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; + case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; + case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; + case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; + case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; + case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; + case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; + case GF_E_GR_SSE4: s = "With -m GROUP, w == 128, you need SSE4."; break; + case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; + case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; 
break; + case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break; + case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; + case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; + case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; + case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; + case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; + case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; + case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; + case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; + case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; + case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; + case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; + case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; + case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; + case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; + case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; + case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; + case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; + case GF_E_UNK_REG: s = "Unknown region type."; break; + case GF_E_UNK_DIV: s = "Unknown division type."; break; + default: s = "Undefined error."; + } + + fprintf(stderr, "%s\n", s); +} + +uint64_t gf_composite_get_default_poly(gf_t *base) +{ + gf_internal_t *h; + int rv; + + h = (gf_internal_t *) base->scratch; + if (h->w == 4) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x13) return 2; + return 0; + } + if (h->w == 8) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x11d) return 3; + return 0; + } + if (h->w == 16) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x105; + return 0; + } else { + if (h->prim_poly == 0x1100b) return 2; + if (h->prim_poly == 0x1002d) return 7; + return 0; + } + } + if (h->w == 32) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 2) return 0x10005; + if (rv == 7) return 0x10008; + if (rv == 0x105) return 0x10002; + return 0; + } else { + if (h->prim_poly == 0x400007) return 2; + if (h->prim_poly == 
0xc5) return 3; + return 0; + } + } + if (h->w == 64) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x100000009ULL; + if (rv == 2) return 0x100000004ULL; + if (rv == 0x10005) return 0x100000003ULL; + if (rv == 0x10002) return 0x100000005ULL; + if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, + but I want to differentiate cases). */ + return 0; + } else { + if (h->prim_poly == 0x1bULL) return 2; + return 0; + } + } + return 0; +} + +int gf_error_check(int w, int mult_type, int region_type, int divide_type, + int arg1, int arg2, uint64_t poly, gf_t *base) +{ + int sse4 = 0; + int sse3 = 0; + int sse2 = 0; + int pclmul = 0; + int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; + uint64_t pp; + gf_internal_t *sub, *subsub, *subsubsub; + + rdouble = (region_type & GF_REGION_DOUBLE_TABLE); + rquad = (region_type & GF_REGION_QUAD_TABLE); + rlazy = (region_type & GF_REGION_LAZY); + rsse = (region_type & GF_REGION_SSE); + rnosse = (region_type & GF_REGION_NOSSE); + raltmap = (region_type & GF_REGION_ALTMAP); + rcauchy = (region_type & GF_REGION_CAUCHY); + + if (divide_type != GF_DIVIDE_DEFAULT && + divide_type != GF_DIVIDE_MATRIX && + divide_type != GF_DIVIDE_EUCLID) { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + + tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | + GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); + if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } + +#ifdef INTEL_SSE2 + sse2 = 1; +#endif + +#ifdef INTEL_SSSE3 + sse3 = 1; +#endif + +#ifdef INTEL_SSE4 + sse4 = 1; +#endif + +#ifdef INTEL_PCLMUL + pclmul = 1; +#endif + + + if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } + + if (mult_type != GF_MULT_COMPOSITE && w < 64) { + if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } + } + + if (mult_type == GF_MULT_DEFAULT) { + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } + if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } + if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } + return 1; + } + + if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } + if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } + if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } + if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } + + if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && + mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG1SET; + return 0; + } + + if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG2SET; + return 0; + } + + if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } + + if (rdouble) { + if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } + if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } + if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } + return 1; + } + + if (rquad) { + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } + if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } + return 1; + } + 
+ if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } + + if (mult_type == GF_MULT_SHIFT) { + if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } + return 1; + } + + if (mult_type == GF_MULT_CARRY_FREE) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } + if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } + if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } + if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } + if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + + if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { + if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } + if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } + return 1; + } + + if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO + || mult_type == GF_MULT_LOG_ZERO_EXT ) { + if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } + + if (mult_type == GF_MULT_LOG_TABLE) return 1; + + if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } + + if (mult_type == GF_MULT_LOG_ZERO) return 1; + + if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } + return 1; + } + + if (mult_type == GF_MULT_GROUP) { + if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } + if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } + if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } + if (w == 128 && (arg1 != 4 || + (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } + if (w == 128 && !sse4) { _gf_errno = GF_E_GR_SSE4; return 0; } + if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } + if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } + return 1; + } + + if (mult_type == GF_MULT_TABLE) { + if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } + if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } + return 1; + } + + if (mult_type == GF_MULT_SPLIT_TABLE) { + if (arg1 > arg2) { + tmp = arg1; + arg1 = arg2; + arg2 = tmp; + } + if (w == 8) { + if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } + } else if (w == 16) { + if (arg1 == 4 && arg2 == 16) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } + } else { _gf_errno = GF_E_SP_16AR; return 0; } + } else if (w == 32) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 32) || + (arg1 == 16 && arg2 == 32)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } + } else if ((arg1 == 4 && arg2 == 32) || + (arg1 == 4 && 
arg2 == 32)) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && arg1 != 4) { _gf_errno = GF_E_SP_32_A; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } + } else { _gf_errno = GF_E_SP_32AR; return 0; } + } else if (w == 64) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 64) || + (arg1 == 16 && arg2 == 64)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } + } else if (arg1 == 4 && arg2 == 64) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } + } else { _gf_errno = GF_E_SP_64AR; return 0; } + } else if (w == 128) { + if (arg1 == 8 && arg2 == 128) { + if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } + } else if (arg1 == 4 && arg2 == 128) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } + if (!raltmap && rsse) { _gf_errno = GF_E_SP128AL; return 0; } + } else { _gf_errno = GF_E_SP128AR; return 0; } + } else { _gf_errno = GF_E_SPLIT_W; return 0; } + return 1; + } + + if (mult_type == GF_MULT_COMPOSITE) { + if (w != 8 && w != 16 && w != 32 + && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } + if ((poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } + if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } + if (base != NULL) { + sub = (gf_internal_t *) base->scratch; + if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } + if (poly == 0) { + if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } + } + } + return 1; + } + + _gf_errno = GF_E_UNKNOWN; + return 0; +} + int gf_scratch_size(int w, int mult_type, int region_type, @@ -15,6 +414,8 @@ int gf_scratch_size(int w, int arg1, int arg2) { + if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; + switch(w) { case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); @@ -26,16 +427,31 @@ int gf_scratch_size(int w, } } -int gf_dummy_init(gf_t *gf) +extern int gf_size(gf_t *gf) { - return 0; + gf_internal_t *h; + int s; + + s = sizeof(gf_t); + h = (gf_internal_t *) gf->scratch; + s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); + if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); + return s; } + int gf_init_easy(gf_t *gf, int w) { - return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL); + return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL); } +/* Allen: What's going on here is this function is putting info into the + scratch mem of gf, and then calling the relevant REAL init + func for the word size. Probably done this way to consolidate + those aspects of initialization that don't rely on word size, + and then take care of word-size-specific stuff. 
*/ + int gf_init_hard(gf_t *gf, int w, int mult_type, int region_type, int divide_type, @@ -46,11 +462,14 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, { int sz; gf_internal_t *h; - + + if (gf_error_check(w, mult_type, region_type, divide_type, + arg1, arg2, prim_poly, base_gf) == 0) return 0; + sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); - - if (sz <= 0) return 0; - + if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught + in gf_error_check() */ + if (scratch_memory == NULL) { h = (gf_internal_t *) malloc(sz); h->free_me = 1; @@ -71,8 +490,6 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, h->private += (sizeof(gf_internal_t)); gf->extract_word.w32 = NULL; - //printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type); - switch(w) { case 4: return gf_w4_init(gf); case 8: return gf_w8_init(gf); @@ -94,6 +511,7 @@ int gf_free(gf_t *gf, int recursive) free(h->base_gf); } if (h->free_me) free(h); + return 0; /* Making compiler happy */ } void gf_alignment_error(char *s, int a) @@ -105,9 +523,9 @@ void gf_alignment_error(char *s, int a) } static -void gf_invert_binary_matrix(int *mat, int *inv, int rows) { +void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { int cols, i, j, k; - int tmp; + uint32_t tmp; cols = rows; @@ -172,34 +590,6 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) return inv[0]; } -/* -void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) -{ - uint64_t p, ta, shift, tb; - uint64_t *s64, *d64 - - s64 = rd->s_start; - d64 = rd->d_start; - - while (s64 < (uint64_t *) rd->s_top) { - p = (rd->xor) ? *d64 : 0; - ta = *s64; - - shift = 0; - while (ta != 0) { - tb = base[ta&0xffff]; - p ^= (tb << shift); - ta >>= 16; - shift += 16; - } - - *d64 = p; - d64++; - s64++; - } -} -*/ - void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) { uint64_t a, prod; @@ -226,8 +616,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) prod ^= base[a >> 48]; prod ^= *d64; *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } } else { while (d64 != top) { @@ -243,8 +633,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) prod <<= 16; prod ^= base[a >> 48]; *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } } } @@ -307,9 +697,71 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v } } -/* If align>16, you align to 16 bytes, but make sure that within the aligned region bytes is a multiple of align. However, you make sure that the region itself is a multiple of align. +/* JSP - The purpose of this procedure is to error check alignment, + and to set up the region operation so that it can best leverage + large words. - If align = -1, then this is cauchy. You need to make sure that bytes is a multiple of w. */ + It stores its information in rd. + + Assuming you're not doing Cauchy coding, (see below for that), + then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably + should change that). + + src and dest must then be aligned on ceil(w/8)-byte boundaries. + Moreover, bytes must be a multiple of ceil(w/8). If the variable + align is equal to ceil(w/8), then we will set s_start = src, + d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). + And we return -- the implementation will go ahead and do the + multiplication on individual words (e.g. using discrete logs). 
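+
+   (A worked instance of that simplest case: w = 16, so ceil(w/8) = 2.
+   src, dest and bytes must all be multiples of 2, and with align = 2 we
+   get s_start = src, d_start = dest and s_top = src + bytes, so the
+   implementation just multiplies one 16-bit word at a time.)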
+
+   If align is greater than ceil(w/8), then the implementation needs
+   to work on groups of "align" bytes.  For example, suppose you are
+   implementing BYTWO, without SSE.  Then you will be doing the region
+   multiplication in units of 8 bytes, so align = 8.  Or, suppose you
+   are doing a Quad table in GF(2^4).  You will be doing the region
+   multiplication in units of 2 bytes, so align = 2.  Or, suppose you
+   are doing split multiplication with SSE operations in GF(2^8).
+   Then align = 16.  Worse yet, suppose you are doing split
+   multiplication with SSE operations in GF(2^16), with or without
+   ALTMAP.  Then, you will be doing the multiplication on 256 bits at
+   a time.  So align = 32.
+
+   When align does not equal ceil(w/8), we split the region
+   multiplication into three parts.  We are going to make s_start be
+   the first address greater than or equal to src that is a multiple
+   of align.  s_top is going to be the largest address <= src+bytes
+   such that (s_top - s_start) is a multiple of align.  We do the
+   same with d_start and d_top.  When we say that "src and dest must
+   be aligned with respect to each other," we mean that s_start-src
+   must equal d_start-dest.
+
+   Now, the region multiplication is done in three parts -- the part
+   between src and s_start must be done using single words.
+   Similarly, the part between s_top and src+bytes must also be done
+   using single words.  The part between s_start and s_top will be
+   done in chunks of "align" bytes.
+
+   One final thing -- if align > 16, then s_start and d_start will be
+   aligned on a 16 byte boundary.  Perhaps we should have two
+   variables: align and chunksize.  Then we'd have s_start & d_start
+   aligned to "align", and have s_top-s_start be a multiple of
+   chunksize.  That may be less confusing, but it would be a big
+   change.
+
+   Finally, if align = -1, then we are doing Cauchy multiplication,
+   using only XOR's.  In this case, we're not going to care about
+   alignment because we are just doing XOR's.  Instead, the only
+   thing we care about is that bytes must be a multiple of w.
+
+   This is not to say that alignment doesn't matter in performance
+   with XOR's.  See that discussion in gf_multby_one().
+
+   After you call gf_set_region_data(), the procedure
+   gf_do_initial_region_alignment() calls gf->multiply.w32() on
+   everything between src and s_start.  The procedure
+   gf_do_final_region_alignment() calls gf->multiply.w32() on
+   everything between s_top and src+bytes.
+ */

void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
@@ -326,7 +778,7 @@ void gf_set_region_data(gf_region_data *rd,
   uint32_t a;
   unsigned long uls, uld;

-  if (gf == NULL) {
+  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
     wb = 1;
   } else {
     h = gf->scratch;
@@ -347,7 +799,7 @@ void gf_set_region_data(gf_region_data *rd,

   a = (align <= 16) ? align : 16;

-  if (align == -1) { /* This is cauchy.  Error check bytes, then set up the pointers
+  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
                         so that there are no alignment regions.
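+                        (That is, s_start = src, d_start = dest and
+                        s_top = src + bytes; the XOR-only Cauchy code then
+                        walks the whole region with no single-word cleanup.)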
*/ if (bytes % h->w != 0) { fprintf(stderr, "Error in region multiply operation.\n"); @@ -386,14 +838,14 @@ void gf_set_region_data(gf_region_data *rd, } uls %= a; - if (uls != 0) uls = (align-uls); + if (uls != 0) uls = (a-uls); rd->s_start = rd->src + uls; rd->d_start = rd->dest + uls; bytes -= uls; - bytes -= (bytes % align); rd->s_top = rd->s_start + bytes; rd->d_top = rd->d_start + bytes; + } void gf_do_initial_region_alignment(gf_region_data *rd) @@ -413,25 +865,76 @@ void gf_multby_zero(void *dest, int bytes, int xor) return; } +/* JSP - gf_multby_one tries to do this in the most efficient way + possible. If xor = 0, then simply call memcpy() since that + should be optimized by the system. Otherwise, try to do the xor + in the following order: + + If src and dest are aligned with respect to each other on 16-byte + boundaries and you have SSE instructions, then use aligned SSE + instructions. + + If they aren't but you still have SSE instructions, use unaligned + SSE instructions. + + If there are no SSE instructions, but they are aligned with + respect to each other on 8-byte boundaries, then do them with + uint64_t's. + + Otherwise, call gf_unaligned_xor(), which does the following: + align a destination pointer along an 8-byte boundary, and then + memcpy 32 bytes at a time from the src pointer to an array of + doubles. I'm not sure if that's the best -- probably needs + testing, but this seems like it could be a black hole. + */ + +static void gf_unaligned_xor(void *src, void *dest, int bytes); + void gf_multby_one(void *src, void *dest, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 __m128i ms, md; #endif + unsigned long uls, uld; uint8_t *s8, *d8, *dtop8; uint64_t *s64, *d64, *dtop64; int abytes; - gf_region_data rd; + if (!xor) { memcpy(dest, src, bytes); return; } + uls = (unsigned long) src; + uld = (unsigned long) dest; -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 s8 = (uint8_t *) src; d8 = (uint8_t *) dest; - abytes = bytes & 0xfffffff0; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; + } + + abytes = (bytes & 0xfffffff0); while (d8 < (uint8_t *) dest + abytes) { ms = _mm_loadu_si128 ((__m128i *)(s8)); @@ -449,8 +952,11 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) return; #endif - /* If you don't have SSE, you'd better be aligned..... */ - + if (uls % 8 != uld % 8) { + gf_unaligned_xor(src, dest, bytes); + return; + } + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); s8 = (uint8_t *) src; d8 = (uint8_t *) dest; @@ -480,3 +986,47 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) } return; } + +#define UNALIGNED_BUFSIZE (8) + +static void gf_unaligned_xor(void *src, void *dest, int bytes) +{ + uint64_t scopy[UNALIGNED_BUFSIZE], *d64; + int i; + gf_region_data rd; + uint8_t *s8, *d8; + + /* JSP - call gf_set_region_data(), but use dest in both places. This is + because I only want to set up dest. If I used src, gf_set_region_data() + would fail because src and dest are not aligned to each other wrt + 8-byte pointers. I know this will actually align d_start to 16 bytes. 
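+     In practice that is harmless here: the byte loop below simply does a
+     few extra single-byte XOR's before d8 reaches rd.d_start.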
+ If I change gf_set_region_data() to split alignment & chunksize, then + I could do this correctly. */ + + gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + while (d8 < (uint8_t *) rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + + d64 = (uint64_t *) d8; + while (d64 < (uint64_t *) rd.d_top) { + memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); + s8 += 8*UNALIGNED_BUFSIZE; + for (i = 0; i < UNALIGNED_BUFSIZE; i++) { + *d64 ^= scopy[i]; + d64++; + } + } + + d8 = (uint8_t *) d64; + while (d8 < (uint8_t *) (dest+bytes)) { + *d8 ^= *s8; + d8++; + s8++; + } +} diff --git a/gf_54.c b/gf_54.c deleted file mode 100644 index fc37783..0000000 --- a/gf_54.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Multiplies four and five in GF(2^4). - */ - -#include -#include -#include - -#include "gf_complete.h" - -main() -{ - gf_t gf; - void *scratch; - int size; - - size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE, - GF_REGION_SSE | GF_REGION_ALTMAP, - GF_DIVIDE_DEFAULT, - 16, 4); - if (size == -1) exit(1); /* It failed. That shouldn't happen*/ - scratch = (void *) malloc(size); - if (scratch == NULL) { perror("malloc"); exit(1); } - if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, - GF_REGION_SSE | GF_REGION_ALTMAP, - GF_DIVIDE_DEFAULT, - 0, 16, 4, NULL, scratch)) exit(1); - printf("Yo\n"); -} diff --git a/gf_add.c b/gf_add.c index 78d443f..545b4b7 100644 --- a/gf_add.c +++ b/gf_add.c @@ -16,7 +16,7 @@ void usage(char *s) fprintf(stderr, " If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n"); fprintf(stderr, "\n"); fprintf(stderr, " legal w are: 1-32, 64 and 128\n"); - fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); if (s != NULL) fprintf(stderr, "%s", s); exit(1); diff --git a/gf_complete.h b/gf_complete.h index ac6688e..de3b753 100644 --- a/gf_complete.h +++ b/gf_complete.h @@ -4,22 +4,30 @@ #pragma once #include -#ifdef INTEL_SSE4 -#include -#include -#include +#ifdef INTEL_SSE4 + #define INTEL_SSSE3 + #include #endif -#ifdef INTEL_PCLMUL -#include +#ifdef INTEL_SSSE3 + #define INTEL_SSE2 + #include #endif -/* This does either memcpy or xor, depending on "xor" */ +#ifdef INTEL_SSE2 + #include +#endif -extern void gf_multby_one(void *src, void *dest, int bytes, int xor); +#ifdef INTEL_PCLMUL + #include + #ifdef INTEL_SSE4 + #define INTEL_SSE4_PCLMUL + #endif + #ifdef INTEL_SSSE3 + #define INTEL_SSSE3_PCLMUL + #endif +#endif -#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0) -#define GF_W128_EQUAL(val1, val2) ((val1[0] == val2[0]) && (val1[1] == val2[1])) /* These are the different ways to perform multiplication. Not all are implemented for all values of w. @@ -27,30 +35,30 @@ extern void gf_multby_one(void *src, void *dest, int bytes, int xor); typedef enum {GF_MULT_DEFAULT, GF_MULT_SHIFT, + GF_MULT_CARRY_FREE, GF_MULT_GROUP, GF_MULT_BYTWO_p, GF_MULT_BYTWO_b, GF_MULT_TABLE, GF_MULT_LOG_TABLE, + GF_MULT_LOG_ZERO, + GF_MULT_LOG_ZERO_EXT, GF_MULT_SPLIT_TABLE, GF_MULT_COMPOSITE } gf_mult_type_t; /* These are the different ways to optimize region - operations. They are bits because you can compose them: - You can mix SINGLE/DOUBLE/QUAD, LAZY, SSE/NOSSE, STDMAP/ALTMAP/CAUCHY. + operations. They are bits because you can compose them. Certain optimizations only apply to certain gf_mult_type_t's. 
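+   For example, one legal composition (it is the one the deleted gf_54.c
+   used, and gf_error_check() in gf.c still accepts it) is:
+
+     GF_REGION_SSE | GF_REGION_ALTMAP    /* with w=16, SPLIT 16 4 */
+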
Again, please see documentation for how to use these */

#define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE   (0x2)
+#define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSSE        (0x10)
+#define GF_REGION_ALTMAP       (0x20)
+#define GF_REGION_CAUCHY       (0x40)

typedef uint32_t gf_region_type_t;
@@ -74,6 +82,9 @@
typedef uint32_t    gf_val_32_t;
typedef uint64_t    gf_val_64_t;
typedef uint64_t   *gf_val_128_t;

+extern int _gf_errno;
+extern void gf_error();
+
typedef struct gf *GFP;

typedef union gf_func_a_b {
@@ -109,8 +120,21 @@ typedef struct gf {
   void *scratch;
 } gf_t;

+/* Initializes the GF to defaults.  Pass it a pointer to a gf_t.
+   Returns 0 on failure, 1 on success. */
+
extern int gf_init_easy(GFP gf, int w);

+/* Initializes the GF changing the defaults.
+   Returns 0 on failure, 1 on success.
+   Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use one of the gf_mult_type_t /
+   gf_divide_type_t values.
+   For region_type, OR together the GF_REGION_xxx's defined above.
+   Use 0 as prim_poly for defaults.  Otherwise, the leading 1 is optional.
+   Use NULL for scratch_memory to have init_hard allocate memory.  Otherwise,
+   use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
extern int gf_init_hard(GFP gf,
                        int w,
                        int mult_type,
@@ -122,6 +146,9 @@ extern int gf_init_hard(GFP gf,
                        GFP base_gf,
                        void *scratch_memory);

+/* Determines the size for scratch_memory.
+   Returns 0 on failure and non-zero on success. */
+
extern int gf_scratch_size(int w,
                           int mult_type,
                           int region_type,
@@ -129,25 +156,32 @@ extern int gf_scratch_size(int w,
                           int arg1,
                           int arg2);

+/* This reports the gf_scratch_size of a gf_t that has already been created */
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+   If recursive = 1, then it calls itself recursively on base_gf. */
+
extern int gf_free(GFP gf, int recursive);

/* This is support for inline single multiplications and divisions.
   I know it's yucky, but if you've got to be fast, you've got to be
-  fast.  We'll support inlines for w=4, w=8 and w=16.
+  fast.  We support inlining for w=4, w=8 and w=16.

   To use inline multiplication and division with w=4 or 8, you should use the
   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
-  will return NULL. */
+  will return NULL.
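+
+  A sketch of the intended use for w=4, with names from this header
+  (it assumes gf was built with a single-table method):
+
+    uint8_t *mt = gf_w4_get_mult_table(&gf);
+    if (mt != NULL) prod = GF_W4_INLINE_MULTDIV(mt, a, b);
+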
Similarly, with w=16, the gf_t must be LOG */ uint8_t *gf_w4_get_mult_table(GFP gf); uint8_t *gf_w4_get_div_table(GFP gf); -#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|b]) +#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)]) uint8_t *gf_w8_get_mult_table(GFP gf); uint8_t *gf_w8_get_div_table(GFP gf); -#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) a)<<8)|b]) +#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)]) uint16_t *gf_w16_get_log_table(GFP gf); uint16_t *gf_w16_get_mult_alog_table(GFP gf); diff --git a/gf_example_5.c b/gf_example_5.c new file mode 100644 index 0000000..3e303a3 --- /dev/null +++ b/gf_example_5.c @@ -0,0 +1,73 @@ +/* + * gf_example_5.c + * + * Demonstrating altmap and extract_word + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_5\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint16_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 16, 4, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint16_t *) malloc(200); + b = (uint16_t *) malloc(200); + + a += 6; + b += 6; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1); + + gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %4d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %04x", a[i+j]); + printf("\n"); + + printf("b:"); + for (j = 0; j < 10; j++) printf(" %04x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x ", i, + gf.extract_word.w32(&gf, a, 30*2, i), + gf.extract_word.w32(&gf, b, 30*2, i)); + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15, + gf.extract_word.w32(&gf, a, 30*2, i+15), + gf.extract_word.w32(&gf, b, 30*2, i+15)); + } +} diff --git a/gf_example_6.c b/gf_example_6.c new file mode 100644 index 0000000..86dda11 --- /dev/null +++ b/gf_example_6.c @@ -0,0 +1,79 @@ +/* + * gf_example_6.c + * + * Demonstrating altmap and extract_word + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_6\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t *a, *b; + int i, j; + gf_t gf, gf_16; + + if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard (6) failed\n"); + exit(1); + } + + if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 2, 0, &gf_16, NULL) == 0) { + fprintf(stderr, "gf_init_hard (32) failed\n"); + exit(1); + } + + a = (uint32_t *) malloc(200); + b = (uint32_t *) malloc(200); + + a += 3; + b += 3; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1); + + gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %8d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %08x", a[i+j]); + printf("\n"); + + 
printf("b:"); + for (j = 0; j < 10; j++) printf(" %08x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x ", i, + gf.extract_word.w32(&gf, a, 30*4, i), + gf.extract_word.w32(&gf, b, 30*4, i)); + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15, + gf.extract_word.w32(&gf, a, 30*4, i+15), + gf.extract_word.w32(&gf, b, 30*4, i+15)); + } +} diff --git a/gf_example_7.c b/gf_example_7.c new file mode 100644 index 0000000..445ae20 --- /dev/null +++ b/gf_example_7.c @@ -0,0 +1,70 @@ +/* + * gf_example_7.c + * + * Demonstrating extract_word and Cauchy + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_7\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint8_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint8_t *) malloc(3); + b = (uint8_t *) malloc(3); + + MOA_Seed(0); + + for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1); + + gf.multiply_region.w32(&gf, a, b, 5, 3, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + printf("\n"); + printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]); + printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]); + printf("\n"); + + printf("a bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("b bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("\n"); + for (i = 0; i < 8; i++) { + printf("Word %2d: %d * 5 = %d\n", i, + gf.extract_word.w32(&gf, a, 3, i), + gf.extract_word.w32(&gf, b, 3, i)); + } +} diff --git a/gf_general.c b/gf_general.c index ac0c236..02efdc7 100644 --- a/gf_general.c +++ b/gf_general.c @@ -95,12 +95,20 @@ void gf_general_set_random(gf_general_t *v, int w, int zero_ok) } } -void gf_general_val_to_s(gf_general_t *v, int w, char *s) +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex) { if (w <= 32) { - sprintf(s, "%x", v->w32); + if (hex) { + sprintf(s, "%x", v->w32); + } else { + sprintf(s, "%d", v->w32); + } } else if (w <= 64) { - sprintf(s, "%llx", (long long unsigned int) v->w64); + if (hex) { + sprintf(s, "%llx", (long long unsigned int) v->w64); + } else { + sprintf(s, "%lld", (long long unsigned int) v->w64); + } } else { if (v->w128[0] == 0) { sprintf(s, "%llx", (long long unsigned int) v->w128[1]); @@ -111,6 +119,64 @@ void gf_general_val_to_s(gf_general_t *v, int w, char *s) } } +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex) +{ + int l; + int save; + + if (w <= 32) { + if (hex) { + if (sscanf(s, "%x", &(v->w32)) == 0) return 0; + } else { + if (sscanf(s, "%d", &(v->w32)) == 0) return 0; + } + if (w == 32) return 1; + if (w == 31) { + if (v->w32 & (1 << 31)) return 0; + return 1; + } + if (v->w32 & ~((1 << w)-1)) return 0; + return 1; + } else if (w <= 64) { + if (hex) return (sscanf(s, "%llx", &(v->w64)) == 1); + return (sscanf(s, "%lld", &(v->w64)) == 1); + } else { + if (!hex) return 0; + l = strlen(s); + if (l <= 16) { + v->w128[0] = 0; + return (sscanf(s, "%llx", &(v->w128[1])) == 1); + } else { + if (l > 32) return 0; + save = s[l-16]; + s[l-16] = '\0'; + if (sscanf(s, "%llx", 
&(v->w128[0])) == 0) { + s[l-16] = save; + return 0; + } + return (sscanf(s+(l-16), "%llx", &(v->w128[1])) == 1); + } + } +} + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = a->w32 ^ b->w32; + } else if (w <= 64) { + c->w64 = a->w64 ^ b->w64; + } else { + c->w128[0] = a->w128[0] ^ b->w128[0]; + c->w128[1] = a->w128[1] ^ b->w128[1]; + } +} + void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) { gf_internal_t *h; @@ -229,19 +295,19 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o if (!gf_general_are_equal(&ft, &sb, w)) { - printf("Problem with region multiply (all values in hex):\n"); - printf(" Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n", + fprintf(stderr,"Problem with region multiply (all values in hex):\n"); + fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n", (unsigned long) final_target, i, words, xor); - gf_general_val_to_s(a, w, sa); - gf_general_val_to_s(&oa, w, soa); - gf_general_val_to_s(&ot, w, sot); - gf_general_val_to_s(&ft, w, sft); - gf_general_val_to_s(&sb, w, ssb); - printf(" Value: %s\n", sa); - printf(" Original source word: %s\n", soa); - if (xor) printf(" XOR with target word: %s\n", sot); - printf(" Product word: %s\n", sft); - printf(" It should be: %s\n", ssb); + gf_general_val_to_s(a, w, sa, 1); + gf_general_val_to_s(&oa, w, soa, 1); + gf_general_val_to_s(&ot, w, sot, 1); + gf_general_val_to_s(&ft, w, sft, 1); + gf_general_val_to_s(&sb, w, ssb, 1); + fprintf(stderr," Value: %s\n", sa); + fprintf(stderr," Original source word: %s\n", soa); + if (xor) fprintf(stderr," XOR with target word: %s\n", sot); + fprintf(stderr," Product word: %s\n", sft); + fprintf(stderr," It should be: %s\n", ssb); exit(0); } } @@ -251,7 +317,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) { void *top; gf_general_t g; - uint8_t *r8; + uint8_t *r8, *r8a; uint16_t *r16; uint32_t *r32; uint64_t *r64; @@ -263,6 +329,8 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) However, don't allow for zeros in rb, because that will screw up division. + When w is 4, you fill the regions with random 4-bit words in each byte. + Otherwise, treat every four bytes as an uint32_t and fill it with a random value mod (1 << w). */ @@ -296,6 +364,17 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) } rb += (w/8); } + } else if (w == 4) { + r8a = (uint8_t *) ra; + r8 = (uint8_t *) rb; + while (r8 < (uint8_t *) top) { + gf_general_set_random(&g, w, 1); + *r8a = g.w32; + gf_general_set_random(&g, w, 0); + *r8 = g.w32; + r8a++; + r8++; + } } else { r32 = (uint32_t *) ra; for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1); @@ -306,7 +385,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) /* This sucks, but in order to time, you really need to avoid putting ifs in the inner loops. So, I'm doing a separate timing test for each w: - 8, 16, 32, 64, 128 and everything else. Fortunately, the "everything else" + (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else" tests can be equivalent to w=32. 
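+   Each specialized loop has the same shape; roughly, for the w=4/8 case
+   (the others differ only in pointer and value widths):
+
+     while (r8a < top8) {
+       *r8a = gf->multiply.w32(gf, *r8a, *r8b);
+       r8a++; r8b++;
+     }
+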
I'm also putting the results back into ra, because otherwise, the optimizer might @@ -327,7 +406,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha w = h->w; top = ra + size; - if (w == 8) { + if (w == 8 || w == 4) { r8a = (uint8_t *) ra; r8b = (uint8_t *) rb; top8 = (uint8_t *) top; diff --git a/gf_general.h b/gf_general.h index 0848f36..b257348 100644 --- a/gf_general.h +++ b/gf_general.h @@ -32,10 +32,12 @@ int gf_general_is_zero(gf_general_t *v, int w); int gf_general_is_one(gf_general_t *v, int w); int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); -void gf_general_val_to_s(gf_general_t *v, int w, char *s); +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); void gf_general_set_random(gf_general_t *v, int w, int zero_ok); +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); diff --git a/gf_inline_time.c b/gf_inline_time.c index d52c814..55709cd 100644 --- a/gf_inline_time.c +++ b/gf_inline_time.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "gf_complete.h" #include "gf_rand.h" diff --git a/gf_int.h b/gf_int.h index bd544bc..bdff2a2 100644 --- a/gf_int.h +++ b/gf_int.h @@ -51,11 +51,15 @@ extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divid void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); - extern void gf_alignment_error(char *s, int a); extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); +/* This returns the correct default for prim_poly when base is used as the base + field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ + +extern uint64_t gf_composite_get_default_poly(gf_t *base); + /* This structure lets you define a region multiply. It helps because you can handle unaligned portions of the data with the procedures below, which really cleans up the code. 
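+
+   The usual pattern inside a region-multiply implementation is (a sketch,
+   using the helpers declared below):
+
+     gf_region_data rd;
+     gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+     gf_do_initial_region_alignment(&rd);
+     ... main loop over rd.s_start .. rd.s_top, 8 bytes at a time ...
+     gf_do_final_region_alignment(&rd);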
*/ @@ -96,3 +100,97 @@ extern void gf_do_final_region_alignment(gf_region_data *rd); extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); extern void gf_multby_zero(void *dest, int bytes, int xor); +extern void gf_multby_one(void *src, void *dest, int bytes, int xor); + +typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ + GF_E_MDEFREG, /* Reg != Default && Mult == Default */ + GF_E_MDEFARG, /* Args != Default && Mult == Default */ + GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ + GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ + GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ + GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ + GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ + GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ + GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ + GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ + GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ + GF_E_BAD___W, /* Illegal w */ + GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ + GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ + GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ + GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ + GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ + GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ + GF_E_QUAD__J, /* Reg == QUAD && other Reg */ + GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ + GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ + GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ + GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ + GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ + GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ + GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ + GF_E_LOGBADW, /* Mult == LOGx, w too big*/ + GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ + GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ + GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ + GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ + GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ + GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ + GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ + GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ + GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4 */ + GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ + GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ + GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ + GF_E_TABLE_W, /* Mult == TABLE, w too big */ + GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ + GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ + GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ + GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ + GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ + GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ + GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ + GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ + GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ + GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */ + GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ + GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ + GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ + GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ + GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ 
+              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+              GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+              GF_E_COMP__W, /* Mult == COMP, Bad w. */
+              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+              GF_E_UNKNOWN, /* Unknown mult_type. */
+              GF_E_UNK_REG, /* Unknown region_type. */
+              GF_E_UNK_DIV, /* Unknown divide_type. */
+              GF_E_CFM___W, /* Mult == CFM, Bad w. */
+              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_FEWARGS, /* Too few args in argc/argv. */
+              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+              GF_E_TWOMULT, /* In create_from... two -m's. */
+              GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+              GF_E_DEFAULT } gf_error_type_t;
+
diff --git a/gf_method.c b/gf_method.c
index f65c4e3..bc9bd35 100644
--- a/gf_method.c
+++ b/gf_method.c
@@ -11,179 +11,172 @@
 #include

 #include "gf_complete.h"
+#include "gf_int.h"
 #include "gf_method.h"

-void methods_to_stderr()
-{
-  fprintf(stderr, "To specify the methods, do one of the following: \n");
-  fprintf(stderr, " - leave empty to use defaults\n");
-  fprintf(stderr, " - use a single dash to use defaults\n");
-  fprintf(stderr, " - specify MULTIPLY REGION DIVIDE\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of MULTIPLY:\n");
-  fprintf(stderr, " SHIFT: shift\n");
-  fprintf(stderr, " GROUP g_mult g_reduce: the Group technique - see the paper\n");
-  fprintf(stderr, " BYTWO_p: BYTWO doubling the product.\n");
-  fprintf(stderr, " BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)\n");
-  fprintf(stderr, " TABLE: Full multiplication table\n");
-  fprintf(stderr, " LOG: Discrete logs\n");
-  fprintf(stderr, " LOG_ZERO: Discrete logs with a large table for zeros\n");
-  fprintf(stderr, " LOG_ZERO_EXT: Discrete logs with an extra large table for zeros\n");
-  fprintf(stderr, " SPLIT g_a g_b: Split tables defined by g_a and g_b\n");
-  fprintf(stderr, " COMPOSITE k rec METHOD: Composite field.  GF((2^l)^k), l=w/k.\n");
-  fprintf(stderr, "   rec = 0 means inline single multiplication\n");
-  fprintf(stderr, "   rec = 1 means recursive single multiplication\n");
-  fprintf(stderr, "   METHOD is the method of the base field in GF(2^l)\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of REGION: Specify multiples with commas e.g.
'DOUBLE,LAZY'\n"); - fprintf(stderr, " -: Use defaults\n"); - fprintf(stderr, " SINGLE/DOUBLE/QUAD: Expand tables\n"); - fprintf(stderr, " LAZY: Lazily create table (only applies to TABLE and SPLIT)\n"); - fprintf(stderr, " SSE/NOSSE: Use 128-bit SSE instructions if you can\n"); - fprintf(stderr, " CAUCHY/ALTMAP/STDMAP: Use different memory mappings\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Legal values of DIVIDE:\n"); - fprintf(stderr, " -: Use defaults\n"); - fprintf(stderr, " MATRIX: Use matrix inversion\n"); - fprintf(stderr, " EUCLID: Use the extended Euclidian algorithm.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "See the user's manual for more information.\n"); - fprintf(stderr, "There are many restrictions, so it is better to simply use defaults in most cases.\n"); -} - int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) { int mult_type, divide_type, region_type; - uint32_t prim_poly = 0; int arg1, arg2, subrg_size; + uint64_t prim_poly; gf_t *base; char *crt, *x, *y; - if (argc <= starting || strcmp(argv[starting], "-") == 0) { - if (!gf_init_easy(gf, w)) return 0; - return (argc <= starting) ? starting : starting+1; - } - + mult_type = GF_MULT_DEFAULT; region_type = GF_REGION_DEFAULT; divide_type = GF_DIVIDE_DEFAULT; - - arg1 = 0; - arg2 = 0; prim_poly = 0; base = NULL; - subrg_size = 0; - - if (argc < starting+3) return 0; - - if (strcmp(argv[starting], "SHIFT") == 0) { - mult_type = GF_MULT_SHIFT; - starting++; - } else if (strcmp(argv[starting], "GROUP") == 0) { - mult_type = GF_MULT_GROUP; - if (argc < starting+5) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 0 || arg2 <= 0 || arg1 >= w || arg2 >= w) return 0; - starting += 3; - } else if (strcmp(argv[starting], "BYTWO_p") == 0) { - mult_type = GF_MULT_BYTWO_p; - starting++; - } else if (strcmp(argv[starting], "BYTWO_b") == 0) { - mult_type = GF_MULT_BYTWO_b; - starting++; - } else if (strcmp(argv[starting], "TABLE") == 0) { - mult_type = GF_MULT_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG") == 0) { - mult_type = GF_MULT_LOG_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { - mult_type = GF_MULT_LOG_TABLE; - arg1 = 1; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { - mult_type = GF_MULT_LOG_TABLE; - arg1 = 2; - starting++; - } else if (strcmp(argv[starting], "SPLIT") == 0) { - mult_type = GF_MULT_SPLIT_TABLE; - if (argc < starting+5) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 0 || arg2 <= 0 || w % arg1 != 0 || w % arg2 != 0) return 0; - starting += 3; - } else if (strcmp(argv[starting], "COMPOSITE") == 0) { - mult_type = GF_MULT_COMPOSITE; - if (argc < starting+6) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 1 || w %arg1 != 0 || ((arg2 | 1) != 1)) return 0; - base = (gf_t *) malloc(sizeof(gf_t)); - starting = create_gf_from_argv(base, w/arg1, argc, argv, starting+3); - if (starting == 0) { free(base); return 0; } - } else { - return 0; - } - - if (argc < starting+2) { - if (base != NULL) gf_free(base, 1); - return 0; - } - - if (strcmp(argv[starting], "-") == 0) { - region_type = GF_REGION_DEFAULT; - } else { - crt = strdup(argv[starting]); - region_type = 0; - x = crt; - do { - y = strchr(x, ','); - if (y != NULL) *y = '\0'; - if (strcmp(x, "DOUBLE") == 0) { - region_type |= 
GF_REGION_DOUBLE_TABLE; - } else if (strcmp(x, "QUAD") == 0) { - region_type |= GF_REGION_QUAD_TABLE; - } else if (strcmp(x, "SINGLE") == 0) { - region_type |= GF_REGION_SINGLE_TABLE; - } else if (strcmp(x, "LAZY") == 0) { - region_type |= GF_REGION_LAZY; - } else if (strcmp(x, "SSE") == 0) { - region_type |= GF_REGION_SSE; - } else if (strcmp(x, "NOSSE") == 0) { - region_type |= GF_REGION_NOSSE; - } else if (strcmp(x, "CAUCHY") == 0) { - region_type |= GF_REGION_CAUCHY; - } else if (strcmp(x, "ALTMAP") == 0) { - region_type |= GF_REGION_ALTMAP; - } else if (strcmp(x, "STDMAP") == 0) { - region_type |= GF_REGION_STDMAP; + arg1 = 0; + arg2 = 0; + while (1) { + if (argc > starting) { + if (strcmp(argv[starting], "-m") == 0) { + starting++; + if (mult_type != GF_MULT_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWOMULT; + return 0; + } + if (strcmp(argv[starting], "SHIFT") == 0) { + mult_type = GF_MULT_SHIFT; + starting++; + } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { + mult_type = GF_MULT_CARRY_FREE; + starting++; + } else if (strcmp(argv[starting], "GROUP") == 0) { + mult_type = GF_MULT_GROUP; + if (argc < starting + 3) { + _gf_errno = GF_E_GROUPAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_GROUPNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "BYTWO_p") == 0) { + mult_type = GF_MULT_BYTWO_p; + starting++; + } else if (strcmp(argv[starting], "BYTWO_b") == 0) { + mult_type = GF_MULT_BYTWO_b; + starting++; + } else if (strcmp(argv[starting], "TABLE") == 0) { + mult_type = GF_MULT_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG") == 0) { + mult_type = GF_MULT_LOG_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { + mult_type = GF_MULT_LOG_ZERO; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { + mult_type = GF_MULT_LOG_ZERO_EXT; + starting++; + } else if (strcmp(argv[starting], "SPLIT") == 0) { + mult_type = GF_MULT_SPLIT_TABLE; + if (argc < starting + 3) { + _gf_errno = GF_E_SPLITAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_SPLITNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "COMPOSITE") == 0) { + mult_type = GF_MULT_COMPOSITE; + if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; } + if (sscanf(argv[starting+1], "%d", &arg1) == 0) { + _gf_errno = GF_E_COMP_A2; + return 0; + } + starting += 2; + base = (gf_t *) malloc(sizeof(gf_t)); + starting = create_gf_from_argv(base, w/arg1, argc, argv, starting); + if (starting == 0) { + free(base); + return 0; + } + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNKNOWN; + return 0; + } + } else if (strcmp(argv[starting], "-r") == 0) { + starting++; + if (strcmp(argv[starting], "DOUBLE") == 0) { + region_type |= GF_REGION_DOUBLE_TABLE; + starting++; + } else if (strcmp(argv[starting], "QUAD") == 0) { + region_type |= GF_REGION_QUAD_TABLE; + starting++; + } else if (strcmp(argv[starting], "LAZY") == 0) { + region_type |= GF_REGION_LAZY; + starting++; + } else if (strcmp(argv[starting], "SSE") == 0) { + region_type |= GF_REGION_SSE; + starting++; + } else if (strcmp(argv[starting], "NOSSE") == 0) { + region_type |= GF_REGION_NOSSE; + starting++; + } else if (strcmp(argv[starting], "CAUCHY") == 0) { + region_type |= GF_REGION_CAUCHY; + starting++; + } else if (strcmp(argv[starting], "ALTMAP") == 
0) { + region_type |= GF_REGION_ALTMAP; + starting++; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNK_REG; + return 0; + } + } else if (strcmp(argv[starting], "-p") == 0) { + starting++; + if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_POLYSPC; + return 0; + } + starting++; + } else if (strcmp(argv[starting], "-d") == 0) { + starting++; + if (divide_type != GF_DIVIDE_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWO_DIV; + return 0; + } else if (strcmp(argv[starting], "EUCLID") == 0) { + divide_type = GF_DIVIDE_EUCLID; + starting++; + } else if (strcmp(argv[starting], "MATRIX") == 0) { + divide_type = GF_DIVIDE_MATRIX; + starting++; + } else { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + } else if (strcmp(argv[starting], "-") == 0) { + /* + printf("Scratch size: %d\n", gf_scratch_size(w, + mult_type, region_type, divide_type, arg1, arg2)); + */ + if (gf_init_hard(gf, w, mult_type, region_type, divide_type, + prim_poly, arg1, arg2, base, NULL) == 0) { + if (base != NULL) gf_free(base, 1); + return 0; + } else + return starting + 1; } else { if (base != NULL) gf_free(base, 1); - free(crt); + _gf_errno = GF_E_UNKFLAG; return 0; } - if (y != NULL) x = y+1; - } while (y != NULL); - free(crt); + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_FEWARGS; + return 0; + } } - - starting++; - - if (strcmp(argv[starting], "-") == 0) { - divide_type = GF_DIVIDE_DEFAULT; - } else if (strcmp(argv[starting], "MATRIX") == 0) { - divide_type = GF_DIVIDE_MATRIX; - } else if (strcmp(argv[starting], "EUCLID") == 0) { - divide_type = GF_DIVIDE_EUCLID; - } else { - if (base != NULL) gf_free(base, 1); - return 0; - } - starting++; - - if (!gf_init_hard(gf, w, mult_type, region_type, divide_type, prim_poly, arg1, arg2, base, NULL)) { - if (base != NULL) gf_free(base, 1); - return 0; - } - return starting; } diff --git a/gf_method.h b/gf_method.h index c7df540..ff29f25 100644 --- a/gf_method.h +++ b/gf_method.h @@ -8,8 +8,9 @@ #include "gf_complete.h" -/* This prints out the error string defining the methods that you can put on argv*/ -extern void methods_to_stderr(); +/* Parses argv starting at "starting". + + Returns 0 on failure. + On success, it returns one past the last argument it read in argv. 
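+
+   Example (a sketch): to build GF(2^16) with SPLIT 16 4 and ALTMAP, parse
+   an argument vector shaped like
+
+     char *args[] = { "-m", "SPLIT", "16", "4", "-r", "ALTMAP", "-" };
+     ...
+     if (create_gf_from_argv(&gf, 16, 7, args, 0) == 0) gf_error();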
*/

-/* Parses argv starting at "starting" */
extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);

diff --git a/gf_methods.c b/gf_methods.c
index 13aeb8e..c4db5f5 100644
--- a/gf_methods.c
+++ b/gf_methods.c
@@ -11,58 +11,26 @@
 #include "gf_complete.h"
 #include "gf_method.h"
+#include "gf_int.h"

-#define NMULTS (15)
-static char *mults[NMULTS] = { "SHIFT", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
-                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE-0", "COMPOSITE-1" };
+#define NMULTS (16)
+static char *mults[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
+                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
+                               "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };

-#define NREGIONS (96)
-static char *regions[NREGIONS] = { "-", "SINGLE", "DOUBLE", "QUAD",
-"LAZY", "SINGLE,LAZY", "DOUBLE,LAZY", "QUAD,LAZY", "SSE",
-"SINGLE,SSE", "DOUBLE,SSE", "QUAD,SSE", "LAZY,SSE",
-"SINGLE,LAZY,SSE", "DOUBLE,LAZY,SSE", "QUAD,LAZY,SSE", "NOSSE",
-"SINGLE,NOSSE", "DOUBLE,NOSSE", "QUAD,NOSSE", "LAZY,NOSSE",
-"SINGLE,LAZY,NOSSE", "DOUBLE,LAZY,NOSSE", "QUAD,LAZY,NOSSE",
-"STDMAP", "SINGLE,STDMAP", "DOUBLE,STDMAP", "QUAD,STDMAP",
-"LAZY,STDMAP", "SINGLE,LAZY,STDMAP", "DOUBLE,LAZY,STDMAP",
-"QUAD,LAZY,STDMAP", "SSE,STDMAP", "SINGLE,SSE,STDMAP",
-"DOUBLE,SSE,STDMAP", "QUAD,SSE,STDMAP", "LAZY,SSE,STDMAP",
-"SINGLE,LAZY,SSE,STDMAP", "DOUBLE,LAZY,SSE,STDMAP",
-"QUAD,LAZY,SSE,STDMAP", "NOSSE,STDMAP", "SINGLE,NOSSE,STDMAP",
-"DOUBLE,NOSSE,STDMAP", "QUAD,NOSSE,STDMAP", "LAZY,NOSSE,STDMAP",
-"SINGLE,LAZY,NOSSE,STDMAP", "DOUBLE,LAZY,NOSSE,STDMAP",
-"QUAD,LAZY,NOSSE,STDMAP", "ALTMAP", "SINGLE,ALTMAP", "DOUBLE,ALTMAP",
-"QUAD,ALTMAP", "LAZY,ALTMAP", "SINGLE,LAZY,ALTMAP",
-"DOUBLE,LAZY,ALTMAP", "QUAD,LAZY,ALTMAP", "SSE,ALTMAP",
-"SINGLE,SSE,ALTMAP", "DOUBLE,SSE,ALTMAP", "QUAD,SSE,ALTMAP",
-"LAZY,SSE,ALTMAP", "SINGLE,LAZY,SSE,ALTMAP",
-"DOUBLE,LAZY,SSE,ALTMAP", "QUAD,LAZY,SSE,ALTMAP", "NOSSE,ALTMAP",
-"SINGLE,NOSSE,ALTMAP", "DOUBLE,NOSSE,ALTMAP", "QUAD,NOSSE,ALTMAP",
-"LAZY,NOSSE,ALTMAP", "SINGLE,LAZY,NOSSE,ALTMAP",
-"DOUBLE,LAZY,NOSSE,ALTMAP", "QUAD,LAZY,NOSSE,ALTMAP", "CAUCHY",
-"SINGLE,CAUCHY", "DOUBLE,CAUCHY", "QUAD,CAUCHY", "LAZY,CAUCHY",
-"SINGLE,LAZY,CAUCHY", "DOUBLE,LAZY,CAUCHY", "QUAD,LAZY,CAUCHY",
-"SSE,CAUCHY", "SINGLE,SSE,CAUCHY", "DOUBLE,SSE,CAUCHY",
-"QUAD,SSE,CAUCHY", "LAZY,SSE,CAUCHY", "SINGLE,LAZY,SSE,CAUCHY",
-"DOUBLE,LAZY,SSE,CAUCHY", "QUAD,LAZY,SSE,CAUCHY", "NOSSE,CAUCHY",
-"SINGLE,NOSSE,CAUCHY", "DOUBLE,NOSSE,CAUCHY", "QUAD,NOSSE,CAUCHY",
-"LAZY,NOSSE,CAUCHY", "SINGLE,LAZY,NOSSE,CAUCHY",
-"DOUBLE,LAZY,NOSSE,CAUCHY", "QUAD,LAZY,NOSSE,CAUCHY" };
+#define NREGIONS (7)
+static char *regions[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE",
+                                   "ALTMAP", "CAUCHY" };

-#define NDIVS (3)
-static char *divides[NDIVS] = { "-", "MATRIX", "EUCLID" };
+#define NDIVS (2)
+static char *divides[NDIVS] = { "MATRIX", "EUCLID" };

-int main()
+int main()
{
-  int m, r, d, w, i, sa, j;
-  char *argv[20];
+  int m, r, d, w, i, sa, j, k, reset;
+  char *argv[50];
   gf_t gf;
   char divs[200], ks[10], ls[10];
-
-  methods_to_stderr();
-
-  printf("\n");
-  printf("Implemented Methods: \n\n");

   for (i = 2; i < 8; i++) {
     w = (1 << i);
@@ -70,9 +38,14 @@
     if (create_gf_from_argv(&gf, w, 1, argv, 0) > 0) {
       printf("w=%d: -\n", w);
       gf_free(&gf, 1);
+    } else if (_gf_errno == GF_E_DEFAULT) {
+      fprintf(stderr, "Unlabeled failed method: w=%d: -\n", w);
+      exit(1);
     }
+
     for (m = 0; m < NMULTS; m++) {
       sa = 0;
+      argv[sa++] =
"-m"; if (strcmp(mults[m], "GROUP44") == 0) { argv[sa++] = "GROUP"; argv[sa++] = "4"; @@ -96,46 +69,66 @@ int main() sprintf(ls, "%d", w); argv[sa++] = ls; argv[sa++] = "8"; + } else if (strcmp(mults[m], "SPLIT16") == 0) { + argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + argv[sa++] = ls; + argv[sa++] = "16"; } else if (strcmp(mults[m], "SPLIT88") == 0) { argv[sa++] = "SPLIT"; argv[sa++] = "8"; argv[sa++] = "8"; - } else if (strcmp(mults[m], "COMPOSITE-0") == 0) { + } else if (strcmp(mults[m], "COMPOSITE") == 0) { argv[sa++] = "COMPOSITE"; argv[sa++] = "2"; - argv[sa++] = "0"; - argv[sa++] = "-"; - } else if (strcmp(mults[m], "COMPOSITE-1") == 0) { - argv[sa++] = "COMPOSITE"; - argv[sa++] = "2"; - argv[sa++] = "1"; argv[sa++] = "-"; } else { argv[sa++] = mults[m]; } - for (r = 0; r < NREGIONS; r++) { - argv[sa++] = regions[r]; - strcpy(divs, ""); - for (d = 0; d < NDIVS; d++) { - argv[sa++] = divides[d]; -/* printf("w=%d:", w); - for (j = 0; j < sa; j++) printf(" %s", argv[j]); - printf("\n"); */ - if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { - strcat(divs, "|"); - strcat(divs, divides[d]); - gf_free(&gf, 1); - } - sa--; + reset = sa; + for (r = 0; r < (1 << NREGIONS); r++) { + sa = reset; + for (k = 0; k < NREGIONS; k++) { + if (r & 1 << k) { + argv[sa++] = "-r"; + argv[sa++] = regions[k]; + } } - if (strlen(divs) > 0) { + argv[sa++] = "-"; + if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { printf("w=%d:", w); for (j = 0; j < sa; j++) printf(" %s", argv[j]); - printf(" %s\n", divs+1); + printf("\n"); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]); + fprintf(stderr, "\n"); + exit(1); } sa--; + for (d = 0; d < NDIVS; d++) { + argv[sa++] = "-d"; + argv[sa++] = divides[d]; + /* printf("w=%d:", w); + for (j = 0; j < sa; j++) printf(" %s", argv[j]); + printf("\n"); */ + argv[sa++] = "-"; + if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { + printf("w=%d:", w); + for (j = 0; j < sa; j++) printf(" %s", argv[j]); + printf("\n"); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]); + fprintf(stderr, "\n"); + exit(1); + } + sa-=3; + } } - sa--; } } + return 0; } diff --git a/gf_mult.c b/gf_mult.c index dc85cc6..c93a4f9 100644 --- a/gf_mult.c +++ b/gf_mult.c @@ -12,105 +12,53 @@ #include "gf_complete.h" #include "gf_method.h" +#include "gf_general.h" -void usage(char *s) +void usage(int why) { fprintf(stderr, "usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n"); - fprintf(stderr, " If w has an h on the end, treat a, b and the product as hexadecimal (no 0x required)\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " legal w are: 1-32, 64 and 128\n"); - fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " For method specification, type gf_methods\n"); - - if (s != NULL) fprintf(stderr, "%s", s); + if (why == 'W') { + fprintf(stderr, "Bad w.\n"); + fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n"); + fprintf(stderr, "Append 'h' to w to treat a, b and the product as hexadecimal.\n"); + fprintf(stderr, "w=128 is hex only (i.e. 
'128' will be an error - do '128h')\n"); + } + if (why == 'A') fprintf(stderr, "Bad a\n"); + if (why == 'B') fprintf(stderr, "Bad b\n"); + if (why == 'M') { + fprintf(stderr, "Bad Method Specification: "); + gf_error(); + } exit(1); } -int read_128(char *s, uint64_t *v) -{ - int l, t; - char save; - - l = strlen(s); - if (l > 32) return 0; - - if (l > 16) { - if (sscanf(s + (l-16), "%llx", (long long unsigned int *) &(v[1])) == 0) return 0; - save = s[l-16]; - s[l-16] = '\0'; - t = sscanf(s, "%llx", (long long unsigned int *) &(v[0])); - s[l-16] = save; - return t; - } else { - v[0] = 0; - return sscanf(s, "%llx", (long long unsigned int *)&(v[1])); - } - return 1; -} - -void print_128(uint64_t *v) -{ - if (v[0] > 0) { - printf("%llx", (long long unsigned int) v[0]); - printf("%016llx", (long long unsigned int) v[1]); - } else { - printf("%llx", (long long unsigned int) v[1]); - } - printf("\n"); -} - - int main(int argc, char **argv) { - int hex, al, bl, w; - uint32_t a, b, c, top; - uint64_t a64, b64, c64; - uint64_t a128[2], b128[2], c128[2]; - char *format; + int hex, w; gf_t gf; + gf_general_t a, b, c; + char output[50]; - if (argc < 4) usage(NULL); - if (sscanf(argv[3], "%d", &w) == 0) usage("Bad w\n"); + if (argc < 4) usage(' '); - if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w"); + if (sscanf(argv[3], "%d", &w) == 0) usage('W'); + if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W'); hex = (strchr(argv[3], 'h') != NULL); - if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("\nBad Method\n"); + if (!hex && w == 128) usage('W'); - if (!hex && w == 128) usage(NULL); - - if (w <= 32) { - format = (hex) ? "%x" : "%u"; - if (sscanf(argv[1], format, &a) == 0) usage("Bad a\n"); - if (sscanf(argv[2], format, &b) == 0) usage("Bad b\n"); - - if (w < 32) { - top = (w == 31) ? 0x80000000 : (1 << w); - if (w != 32 && a >= top) usage("a is too large\n"); - if (w != 32 && b >= top) usage("b is too large\n"); - } - - c = gf.multiply.w32(&gf, a, b); - printf(format, c); - printf("\n"); - - } else if (w == 64) { - format = (hex) ? "%llx" : "%llu"; - if (sscanf(argv[1], format, &a64) == 0) usage("Bad a\n"); - if (sscanf(argv[2], format, &b64) == 0) usage("Bad b\n"); - c64 = gf.multiply.w64(&gf, a64, b64); - - printf(format, c64); - printf("\n"); - - } else if (w == 128) { - - if (read_128(argv[1], a128) == 0) usage("Bad a\n"); - if (read_128(argv[2], b128) == 0) usage("Bad b\n"); - gf.multiply.w128(&gf, a128, b128, c128); - - print_128(c128); + if (argc == 4) { + if (gf_init_easy(&gf, w) == 0) usage('M'); + } else { + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M'); } + + if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A'); + if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B'); + + gf_general_multiply(&gf, &a, &b, &c); + gf_general_val_to_s(&c, w, output, hex); + + printf("%s\n", output); exit(0); } diff --git a/gf_poly.c b/gf_poly.c index c057461..7134b2c 100644 --- a/gf_poly.c +++ b/gf_poly.c @@ -1,560 +1,268 @@ /* - * gf_poly.c - program to help find primitive polynomials in composite fields + gf_poly.c - program to help find irreducible polynomials in composite fields, + using the Ben-Or algorithm. + + James S. Plank + + Please see the following paper for a + description of the Ben-Or algorithm: + + author S. Gao and D. Panario + title Tests and Constructions of Irreducible Polynomials over Finite Fields + booktitle Foundations of Computational Mathematics + year 1997 + publisher Springer Verlag + pages 346-361 + + The basic technique is this. 
You have a polynomial f(x) whose coefficients are + in a base field GF(2^w). The polynomial is of degree n. You need to do the + following for all i from 1 to n/2: + + Construct x^(2^w)^i modulo f. That will be a polynomial of maximum degree n-1 + with coefficients in GF(2^w). You construct that polynomial by starting with x + and doubling it w times, each time taking the result modulo f. Then you + multiply that by itself i times, again each time taking the result modulo f. + + When you're done, you need to "subtract" x -- since addition = subtraction = + XOR, that means XOR x. + + Now, find the GCD of that last polynomial and f, using Euclid's algorithm. If + the GCD is not one, then f is reducible. If it is not reducible for each of + those i, then it is irreducible. + + In this code, I am using a gf_general_t to represent elements of GF(2^w). This + is so that I can use base fields that are GF(2^64) or GF(2^128). + + I have two main procedures. The first is x_to_q_to_i_minus_x, which calculates + x^(2^w)^i - x, putting the result into a gf_general_t * called retval. + + The second is gcd_one, which takes a polynomial of degree n and a second one + of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1. + + These can be made faster (e.g. calculate x^(2^w) once and store it). */ #include "gf_complete.h" #include "gf_method.h" +#include "gf_general.h" +#include "gf_int.h" #include #include #include -#define GF_POLY_COEF_MASK8 0xff -#define GF_POLY_COEF_MASK16 0xffff -#define GF_POLY_COEF_MASK32 0xffffffff -#define GF_POLY_COEF_MASK64 0xffffffffffffffff +char *BM = "Bad Method: "; -#define LLUI (long long unsigned int) - -struct gf_poly_coef_s; - -typedef struct gf_poly_coef_s { - uint64_t coef; - uint64_t power; - struct gf_poly_coef_s *next; -} gf_poly_coef_t; - -typedef struct gf_poly_s { - gf_poly_coef_t *leading_coef; - uint64_t num_coefs; - gf_t *coef_gf; - int w; -} gf_poly_t; - -static uint64_t gf_add(int w, uint64_t a, uint64_t b) +void usage(char *s) { - if (w == 8) { - return (a & GF_POLY_COEF_MASK8) ^ (b & GF_POLY_COEF_MASK8); - } else if (w == 16) { - return (a & GF_POLY_COEF_MASK16) ^ (b & GF_POLY_COEF_MASK16); - } else if (w == 32) { - return (a & GF_POLY_COEF_MASK32) ^ (b & GF_POLY_COEF_MASK32); - } else if (w == 64) { - return (a & GF_POLY_COEF_MASK64) ^ (b & GF_POLY_COEF_MASK64); - } -} - -static uint64_t gf_mult(int w, gf_t* gf, uint64_t a, uint64_t b) -{ - if (w <= 32) { - return gf->multiply.w32(gf, a, b); - } else if (w == 64) { - return gf->multiply.w64(gf, a, b); - } -} - -static uint64_t gf_divide(int w, gf_t* gf, uint64_t a, uint64_t b) -{ - if (w <= 32) { - return gf->divide.w32(gf, a, b); - } else if (w == 64) { - return gf->divide.w64(gf, a, b); - } -} - -static uint64_t gf_inverse(int w, gf_t* gf, uint64_t a) -{ - if (w <= 32) { - return gf->inverse.w32(gf, a); - } else if (w == 64) { - return gf->inverse.w64(gf, a); - } -} - -gf_poly_t* gf_poly_init(int w, gf_t *gf) -{ - gf_poly_t *gf_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t)); - - if (gf_poly == NULL || gf == NULL) { - return NULL; - } - - gf_poly->leading_coef = NULL; - gf_poly->num_coefs = 0; - gf_poly->coef_gf = gf; - gf_poly->w = w; - - return gf_poly; -} - -void gf_poly_print(gf_poly_t *gf_poly, char *message) -{ - gf_poly_coef_t *tmp; - - if (gf_poly == NULL) { - fprintf(stderr, "0 * x^0\n"); - return; - } - - tmp = gf_poly->leading_coef; - - while (tmp != NULL) { - printf("%llu * x^%llu", LLUI tmp->coef, LLUI tmp->power); - tmp = tmp->next; - if (tmp) { - printf(" + "); - } - } - - 
if (message != NULL) { - printf(": %s\n", message); - } -} - -gf_poly_t* gf_poly_copy(gf_poly_t *poly) -{ - gf_poly_t *new_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t)); - gf_poly_coef_t *tmp = poly->leading_coef; - - if (new_poly == NULL) { - return NULL; - } - - new_poly->leading_coef = NULL; - new_poly->num_coefs = 0; - new_poly->coef_gf = poly->coef_gf; - new_poly->w = poly->w; - - while (tmp != NULL) { - gf_poly_add_coef(new_poly, tmp->coef, tmp->power); - - tmp = tmp->next; - } - - return new_poly; -} - -void gf_poly_clear(gf_poly_t* a) -{ - while (a->leading_coef != NULL) { - gf_poly_coef_t *tmp = a->leading_coef; - - a->leading_coef = tmp->next; - - free(tmp); - } -} - -void gf_poly_free(gf_poly_t **a) -{ - gf_poly_clear(*a); - free(*a); - *a = NULL; -} - -gf_poly_coef_t* gf_poly_create_node(uint64_t coef, uint64_t power) -{ - gf_poly_coef_t* node = (gf_poly_coef_t*)malloc(sizeof(gf_poly_coef_t)); - - if (node == NULL) { - return NULL; - } - - node->coef = coef; - node->power = power; - node->next = NULL; - - return node; -} - -int gf_poly_remove_node(gf_poly_t *gf_poly, uint64_t power) -{ - gf_poly_coef_t* iter = gf_poly->leading_coef; - - if (iter->power == power) { - gf_poly->leading_coef = iter->next; - free(iter); - return 0; - } - - while (iter->next != NULL) { - if (iter->next->power == power) { - gf_poly_coef_t* tmp = iter->next; - iter->next = iter->next->next; - free(tmp); - return 0; - } - iter = iter->next; - } - - return -1; -} - -int gf_poly_add_coef(gf_poly_t *gf_poly, uint64_t coef_val, uint64_t power) -{ - gf_poly_coef_t* node; - gf_poly_coef_t* iter = gf_poly->leading_coef; - - /* - * The new node has the highest power, or there are no terms - */ - if (gf_poly->leading_coef == NULL || gf_poly->leading_coef->power < power) { - node = gf_poly_create_node(coef_val, power); - node->next = gf_poly->leading_coef; - gf_poly->leading_coef = node; - return 0; - } - - /* - * The new node is of the same power, add the coefs - */ - if (gf_poly->leading_coef->power == power) { - gf_poly->leading_coef->coef = gf_add(gf_poly->w, gf_poly->leading_coef->coef, coef_val); - if (gf_poly->leading_coef->coef == 0) { - gf_poly_remove_node(gf_poly, power); - } - return 0; - } - - while (iter->next != NULL) { - if (iter->next->power == power) { - iter->next->coef = gf_add(gf_poly->w, iter->next->coef, coef_val); - - if (iter->next->coef == 0) { - gf_poly_remove_node(gf_poly, power); - } - - return 0; - } - if (iter->next->power < power) { - node = gf_poly_create_node(coef_val, power); - node->next = iter->next; - iter->next = node; - return 0; - } - iter = iter->next; - } - - /* - * The power passed in is lower than any in the existing poly - */ - node = gf_poly_create_node(coef_val, power); - iter->next = node; - - return 0; -} - -/* - * Compute a+b and store in a - */ -int gf_poly_add(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_coef_t* iter = b->leading_coef; - - while (iter != NULL) { - gf_poly_add_coef(a, iter->coef, iter->power); - iter = iter->next; - } - - return 0; -} - -/* - * Compute a*b and store in a - */ -int gf_poly_mult(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_coef_t* a_iter = a->leading_coef; - - /* - * Remove one node at a time from 'a', starting with - * highest power. Multiply the removed (coef,power) - * by every entry of 'b,' adding each product into 'a.' 
- */ - while (a_iter != NULL) { - gf_poly_coef_t* tmp = a_iter; - gf_poly_coef_t* b_iter = b->leading_coef; - - uint64_t a_power = a_iter->power; - uint64_t a_coef = a_iter->coef; - a_iter = a_iter->next; - gf_poly_remove_node(a, tmp->power); - - while (b_iter != NULL) { - uint64_t new_power = b_iter->power + a_power; - uint64_t new_coef = gf_mult(a->w, a->coef_gf, b_iter->coef, a_coef); - - gf_poly_add_coef(a, new_coef, new_power); - - b_iter = b_iter->next; - } - } - return 0; -} - -/* - * Compute a % b and store in a - */ -int gf_poly_reduce(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_t* c = gf_poly_init(a->w, a->coef_gf); - gf_poly_coef_t* a_iter = a->leading_coef; - gf_poly_coef_t* b_iter = b->leading_coef; - - /* - * Reduce until the degree of 'a' is less than - * the degree of 'b.' At that point 'a' will - * contain the remainder of a / b. - */ - while (a_iter && (a_iter->power >= b_iter->power)) { - - /* - * Get the degree and leading coef of the current - * 'b'. - */ - uint64_t reduce_power = a_iter->power - b_iter->power; - uint64_t reduce_coef = gf_divide(a->w, a->coef_gf, a_iter->coef, b_iter->coef); - - /* - * Create a poly that will get rid of leading power - * of 'b' when added: c*x^(n-m)*b(x), where c - * is the leading coef of 'a', n is the deg of 'a' - * and m is the degree of 'b'. - */ - gf_poly_add_coef(c, reduce_coef, reduce_power); - gf_poly_mult(c, b); - - /* - * Add the newly created poly, which will reduce - * a(x) by at least one term (leading term). - */ - gf_poly_add(a, c); - - gf_poly_clear(c); - - /* - * Grab the new leading term of 'a' - */ - a_iter = a->leading_coef; - } -} - -/* - * Get the GCD of a and b, return the result - */ -gf_poly_t* gf_poly_gcd(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_t *r1, *r2; - gf_poly_t* tmp_swp; - - if (a->leading_coef == NULL || b->leading_coef == NULL) { - return NULL; - } - - if (a->leading_coef->power > b->leading_coef->power) { - r1 = a; - r2 = b; - } else { - r1 = b; - r2 = a; - } - - while ( 1 ) { - if (r2->leading_coef == NULL) { - break; - } - if (r2->leading_coef->power == 0 && r2->leading_coef->coef <= 1) { - break; - } - - gf_poly_reduce(r1, r2); - tmp_swp = r1; - r1 = r2; - r2 = tmp_swp; - } - - return r1; -} - -/* - * The Ben-Or algorithm for determining irreducibility - */ -int gf_poly_is_irred(gf_poly_t* poly) -{ - gf_poly_t *gcd; - gf_poly_t *prod_of_irred; - uint64_t prod_of_irred_power = ((unsigned long long) 1) << poly->w; - int n = poly->leading_coef->power / 2; - int i; - int ret = 0; - gf_poly_t *a = gf_poly_copy(poly); - - prod_of_irred = gf_poly_init(a->w, a->coef_gf); - - - for (i = 1; i <= n; i++) { - gf_poly_add_coef(prod_of_irred, 1, prod_of_irred_power); - gf_poly_add_coef(prod_of_irred, 1, 1); - - gf_poly_reduce(prod_of_irred, a); - - gcd = gf_poly_gcd(a, prod_of_irred); - - /* - * It is irreducible if it is not the product of - * non-trivial factors (non-constant). Therefore, - * the GCD of the poly and prod_of_irred should be - * a constant (0 or 0-degree polynomial). - */ - if (gcd == NULL) { - ret = -1; - break; - } else if (gcd->leading_coef->power != 0) { - ret = -1; - break; - } else if (gcd->leading_coef->power == 0) { - ret = 0; - break; + fprintf(stderr, "usage: gf_poly w(base-field) method power:coef [ power:coef .. 
]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " use - for the default method.\n"); + fprintf(stderr, " use 0x in front of the coefficient if it's in hex\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " For example, to test whether x^2 + 2x + 1 is irreducible\n"); + fprintf(stderr, " in GF(2^16), the call is:\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " gf_poly 16 - 2:1 1:2 0:1\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " See the user's manual for more information.\n"); + if (s != NULL) { + fprintf(stderr, "\n"); + if (s == BM) { + fprintf(stderr, "%s", s); + gf_error(); } else { - ret = -1; - break; + fprintf(stderr, "%s\n", s); } - - // Need if to avoid a overflow error - if ((i + 1) <= n) { - prod_of_irred_power *= prod_of_irred_power; + } + exit(1); +} + +int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod) +{ + gf_general_t *a, *b, zero, factor, p; + int i, j, da, db; + char buf[30]; + + gf_general_set_zero(&zero, w); + + a = (gf_general_t *) malloc(sizeof(gf_general_t) * n+1); + b = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i); + for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i); + + da = n; + while (1) { + for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ; + if (db < 0) return 0; + if (db == 0) return 1; + for (j = da; j >= db; j--) { + if (!gf_general_is_zero(a+j, w)) { + gf_general_divide(gf, a+j, b+db, &factor); + for (i = 0; i <= db; i++) { + gf_general_multiply(gf, b+i, &factor, &p); + gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db)); + } + } + } + for (i = 0; i < n; i++) { + gf_general_add(gf, a+i, &zero, &p); + gf_general_add(gf, b+i, &zero, a+i); + gf_general_add(gf, &p, &zero, b+i); } - gf_poly_clear(prod_of_irred); } - gf_poly_free(&a); - - return ret; } -int is_suitible_s(int w, gf_t *gf, uint64_t s) +void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval) { - uint64_t num_elems = ((unsigned long long) 1) << w; - uint64_t i = 2; - uint64_t i_inv; + gf_general_t x; + gf_general_t *x_to_q; + gf_general_t *product; + gf_general_t p, zero, factor; + int j, k, lq; + char buf[20]; - for (; i < num_elems; i++) { - i_inv = gf_inverse(w, gf, i); - if ((i ^ i_inv) == s) { - fprintf(stderr, "Bailed on %llu ^ %llu = %llu\n", LLUI i, LLUI i_inv, LLUI s); - return -1; + gf_general_set_zero(&zero, w); + product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2); + x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w); + gf_general_set_one(x_to_q+1, w); + + for (lq = 0; lq < logq; lq++) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for (k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } } - if (i % 1000000000 == 0) fprintf(stderr, "Processed %llu\n", LLUI i); + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j); + } + for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w); + gf_general_set_one(retval, w); + + while (i > 0) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for 
(k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, retval+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } + } + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j); + i--; } - return 0; + gf_general_set_one(&x, w); + gf_general_add(gf, &x, retval+1, retval+1); + + free(product); + free(x_to_q); } -static void -usage(char *cmd) -{ - fprintf(stderr, "%s w S \n", cmd); - fprintf(stderr, "\t will build a trinomial x^2+S*x+1\n"); - fprintf(stderr, "OR\n"); - fprintf(stderr, "%s w G coef1,power1 ... \n", cmd); - fprintf(stderr, "\t will build a polynomial coef1^(power1) + ... + coefn^(powern)\n"); - fprintf(stderr, "Example: ./gf_poly 8 - - - G 1,2 2,1 1,0\n"); - fprintf(stderr, "\t will build a polynomial x^2+2*x+1 with coefs from GF(2^8)\n"); -} - -/* - * Find irred poly of form x^2+sx+1 - * a_n*x^n + a_(n-1)*x^(n-1) + ... - * - * Terms are specified as: a_i,i a_j,j, ... where - * i is the degree of the term and a_i is the coef - * - */ -int main(int argc, char **argv) +main(int argc, char **argv) { + int w, i, power, n, ap, success, j; gf_t gf; - int ret; - int w; - int i; - uint64_t irred_coef_s; - gf_poly_t *irred_poly; - char *term; + gf_general_t *poly, *prod; + char *string, *ptr; + char buf[100]; - bzero(&gf, sizeof(gf_t)); + if (argc < 4) usage(NULL); - if (argc < 4) { - usage(argv[0]); - return -1; - } - - w = atoi(argv[1]); - - ret = create_gf_from_argv(&gf, w, argc, argv, 3); + if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w."); + ap = create_gf_from_argv(&gf, w, argc, argv, 2); - if (ret <= 0) { - fprintf(stderr, "Could not create a GF\n"); - return -1; - } - - irred_poly = gf_poly_init(w, &gf); + if (ap == 0) usage(BM); - i = ret + 1; + if (ap == argc) usage("No powers/coefficients given."); - if (strlen(argv[i]) > 1) { - usage(argv[0]); - exit(1); - } - - if (argv[i][0] == 'S') { - i++; - irred_coef_s = (uint64_t)strtoull(argv[i], NULL, 10); - - /* - * If this is a trinomial of the form x^2+s*x+1, then - * we can do a quick pre-check to see if this may be - * an irreducible polynomial. 
- */
-    if (is_suitible_s(w, &gf, irred_coef_s) < 0) {
-      fprintf(stderr, "%llu is not a suitable coeffient!\n", LLUI irred_coef_s);
-      return -1;
-    } else {
-      fprintf(stderr, "%llu IS A suitable coeffient!\n", LLUI irred_coef_s);
+  n = -1;
+  for (i = ap; i < argc; i++) {
+    if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
 }
+    if (power < 0) usage("Can't have negative powers\n");
+    if (power > n) n = power;
+  }
+  poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+  for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+  prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);
-      gf_poly_add_coef(irred_poly, 1, 2);
-      gf_poly_add_coef(irred_poly, irred_coef_s, 1);
-      gf_poly_add_coef(irred_poly, 1, 0);
+  for (i = ap; i < argc; i++) {
+    sscanf(argv[i], "%d:", &power);
+    ptr = strchr(argv[i], ':');
+    ptr++;
+    if (strncmp(ptr, "0x", 2) == 0) {
+      success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
+    } else {
+      success = gf_general_s_to_val(poly+power, w, ptr, 0);
+    }
+    if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+  }
-  } else if (argv[i][0] == 'G') {
-    term = argv[++i];
-
-
-    while (term != NULL) {
-      uint64_t coef = strtoull(strtok(term, ","), NULL, 10);
-      uint64_t power = strtoull(strtok(NULL, ","), NULL, 10);
-
-      gf_poly_add_coef(irred_poly, coef, power);
-
-      if (i < argc) {
-        term = argv[++i];
+  printf("Poly:");
+  for (power = n; power >= 0; power--) {
+    if (!gf_general_is_zero(poly+power, w)) {
+      printf("%s", (power == n) ?
" " : " + "); + if (!gf_general_is_one(poly+power, w)) { + gf_general_val_to_s(poly+power, w, buf, 1); + if (n > 0) { + printf("(0x%s)", buf); + } else { + printf("0x%s", buf); + } + } + if (power == 0) { + if (gf_general_is_one(poly+power, w)) printf("1"); + } else if (power == 1) { + printf("x"); } else { - break; + printf("x^%d", power); } } - } else { - usage(argv[0]); - exit(1); + } + printf("\n"); + + if (!gf_general_is_one(poly+n, w)) { + printf("\n"); + printf("Can't do Ben-Or, because the polynomial is not monic.\n"); + exit(0); + } + + for (i = 1; i <= n/2; i++) { + x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod); + if (!gcd_one(&gf, w, n, poly, prod)) { + printf("Reducible.\n"); + exit(0); + } } - gf_poly_print(irred_poly, " specified via the command line\n"); - - ret = gf_poly_is_irred(irred_poly); - - if (ret < 0) { - gf_poly_print(irred_poly, " IS NOT irreducible\n"); - } else { - gf_poly_print(irred_poly, " IS irreducible\n"); - } - - return 0; + printf("Irreducible.\n"); + exit(0); } diff --git a/gf_time.c b/gf_time.c index 8313b05..55f3e11 100644 --- a/gf_time.c +++ b/gf_time.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include "gf_complete.h" #include "gf_method.h" @@ -43,10 +43,14 @@ void problem(char *s) exit(1); } +char *BM = "Bad Method: "; + void usage(char *s) { fprintf(stderr, "usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n"); fprintf(stderr, "\n"); + fprintf(stderr, "does unit testing in GF(2^w)\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n"); fprintf(stderr, "\n"); fprintf(stderr, "Tests may be any combination of:\n"); @@ -63,9 +67,12 @@ void usage(char *s) fprintf(stderr, "\n"); fprintf(stderr, "Use -1 for time(0) as a seed.\n"); fprintf(stderr, "\n"); - fprintf(stderr, "For method specification, type gf_methods\n"); - fprintf(stderr, "\n"); - if (s != NULL) fprintf(stderr, "%s\n", s); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } exit(1); } @@ -84,9 +91,15 @@ int main(int argc, char **argv) time_t t0; uint8_t *ra, *rb; gf_general_t a; + if (argc < 6) usage(NULL); - if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n"); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w[-pp]\n"); + } + + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); if (sscanf(argv[4], "%d", &size) == 0) usage("Bad size\n"); if (sscanf(argv[5], "%d", &iterations) == 0) usage("Bad iterations\n"); @@ -99,7 +112,7 @@ int main(int argc, char **argv) if ((w > 32 && w != 64 && w != 128) || w < 0) usage("Bad w"); if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n"); - if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage("Bad Method"); + if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM); strcpy(tests, ""); for (i = 0; i < argv[2][i] != '\0'; i++) { diff --git a/gf_unit.c b/gf_unit.c index 03911c4..fbc21f9 100644 --- a/gf_unit.c +++ b/gf_unit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "gf_complete.h" #include "gf_int.h" @@ -18,6 +19,8 @@ #include "gf_general.h" #define REGION_SIZE (16384) +#define RMASK (0x00000000ffffffffLL) +#define LMASK (0xffffffff00000000LL) void problem(char *s) { @@ -26,11 +29,14 @@ void problem(char *s) exit(1); } +char *BM = "Bad Method: "; + void usage(char *s) { fprintf(stderr, "usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n"); - fprintf(stderr, "\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Legal w are: 1 - 32, 64 
and 128\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); fprintf(stderr, "\n"); fprintf(stderr, "Tests may be any combination of:\n"); fprintf(stderr, " A: All\n"); @@ -40,16 +46,28 @@ void usage(char *s) fprintf(stderr, "\n"); fprintf(stderr, "Use -1 for time(0) as a seed.\n"); fprintf(stderr, "\n"); - fprintf(stderr, "For method specification, type gf_methods\n"); - fprintf(stderr, "\n"); - if (s != NULL) fprintf(stderr, "%s\n", s); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } exit(1); } +void SigHandler(int v) +{ + fprintf(stderr, "Problem: SegFault!\n"); + fflush(stdout); + exit(2); +} + int main(int argc, char **argv) { + signal(SIGSEGV, SigHandler); + int w, i, verbose, single, region, tested, top; - int start, end, xor; + int s_start, d_start, bytes, xor, alignment_test; gf_t gf, gf_def; time_t t0; gf_internal_t *h; @@ -61,15 +79,21 @@ int main(int argc, char **argv) char *ra, *rb, *rc, *rd, *target; int align; + if (argc < 4) usage(NULL); - if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n"); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w\n"); + } + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); if (t0 == -1) t0 = time(0); MOA_Seed(t0); if (w > 32 && w != 64 && w != 128) usage("Bad w"); - if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("Bad Method"); + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage(BM); + printf("Size (bytes): %d\n", gf_size(&gf)); for (i = 0; i < strlen(argv[2]); i++) { if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n"); @@ -83,10 +107,18 @@ int main(int argc, char **argv) ai = (gf_general_t *) malloc(sizeof(gf_general_t)); bi = (gf_general_t *) malloc(sizeof(gf_general_t)); - ra = (char *) malloc(sizeof(char)*REGION_SIZE); - rb = (char *) malloc(sizeof(char)*REGION_SIZE); - rc = (char *) malloc(sizeof(char)*REGION_SIZE); - rd = (char *) malloc(sizeof(char)*REGION_SIZE); + //15 bytes extra to make sure it's 16byte aligned + ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rb = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rc = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rd = (char *) malloc(sizeof(char)*REGION_SIZE+15); + + //this still assumes 8 byte aligned pointer from malloc + //(which is usual on 32-bit machines) + ra += (uint64_t)ra & 0xf; + rb += (uint64_t)rb & 0xf; + rc += (uint64_t)rc & 0xf; + rd += (uint64_t)rd & 0xf; if (w <= 32) { mask = 0; @@ -97,8 +129,9 @@ int main(int argc, char **argv) single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL); region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL); - if (!gf_init_easy(&gf_def, w)) problem("No default for this value of w"); - + if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + (h->mult_type != GF_MULT_COMPOSITE) ? 
h->prim_poly : 0, 0, 0, NULL, NULL))
+    problem("No default for this value of w");
 if (w == 4) {
   mult4 = gf_w4_get_mult_table(&gf);
   div4 = gf_w4_get_div_table(&gf);
@@ -129,21 +162,71 @@ int main(int argc, char **argv)
     if (w <= 10) {
       a->w32 = i % (1 << w);
       b->w32 = (i >> w);
-    } else if (i < 10) {
-      gf_general_set_zero(a, w);
-      gf_general_set_random(b, w, 1);
-    } else if (i < 20) {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_zero(b, w);
-    } else if (i < 30) {
-      gf_general_set_one(a, w);
-      gf_general_set_random(b, w, 1);
-    } else if (i < 40) {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_one(b, w);
+
+    //Allen: the following conditions were being run 10 times each. That didn't seem like nearly enough to
+    //me for these special cases, so I converted to doing this mod stuff to easily make the number of times
+    //run both larger and proportional to the total size of the run.
 } else {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_random(b, w, 1);
+      switch (i % 32)
+      {
+        case 0:
+          gf_general_set_zero(a, w);
+          gf_general_set_random(b, w, 1);
+          break;
+        case 1:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_zero(b, w);
+          break;
+        case 2:
+          gf_general_set_one(a, w);
+          gf_general_set_random(b, w, 1);
+          break;
+        case 3:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_one(b, w);
+          break;
+        default:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_random(b, w, 1);
+      }
+    }
+
+    //Allen: the following special cases for w=64 are based on the code below for w=128.
+    //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64
+    //involve splitting it in two. I think they're less likely to give errors than the 128-bit case
+    //though, because the 128 bit case is always split in two.
+    //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+    if (w == 64) {
+      switch (i % 32)
+      {
+        case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break;
+        case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break;
+        case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+      }
+    }
+
+    //Allen: for w=128, we have important special cases where one half or the other of the number is all
+    //zeros. The probability of hitting such a number randomly is 2^-64, so if we don't force these cases
+    //we'll probably never hit them. This could be implemented more efficiently by changing the set-random
+    //function for w=128, but I think this is easier to follow.
+ //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases + if (w == 128) { + switch (i % 32) + { + case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break; + case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break; + case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + } } tested = 0; @@ -195,10 +278,10 @@ int main(int argc, char **argv) gf_general_multiply(&gf_def, a, b, d); if (!gf_general_are_equal(c, d, w)) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); - gf_general_val_to_s(d, w, ds); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s\n", as, bs, cs); printf(" The default gf multiplier returned %s\n", ds); @@ -216,9 +299,9 @@ int main(int argc, char **argv) if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) || (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) || (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs); ; @@ -229,9 +312,9 @@ int main(int argc, char **argv) /* Dumb check to make sure that it's not returning numbers that are too big: */ if (w < 32 && (c->w32 & mask) != c->w32) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs); exit(1); @@ -242,10 +325,10 @@ int main(int argc, char **argv) if (!gf_general_is_zero(a, w)) { gf_general_divide(&gf, c, a, d); if (!gf_general_are_equal(b, d, w)) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); - gf_general_val_to_s(d, w, ds); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); printf("Error in single multiplication/division (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds); exit(1); @@ -257,40 +340,82 @@ int main(int argc, char **argv) if (region) { if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); } - for (i = 0; i < 1000; i++) { - if (i < 20) { - gf_general_set_zero(a, w); - } else if (i < 40) { - gf_general_set_one(a, w); - } else if (i < 60) { - gf_general_set_two(a, w); - } else { - 
gf_general_set_random(a, w, 1); + for (i = 0; i < 1024; i++) { + //Allen: changing to a switch thing as with the single ops to make things proportional + switch (i % 32) + { + case 0: + gf_general_set_zero(a, w); + break; + case 1: + gf_general_set_one(a, w); + break; + case 2: + gf_general_set_two(a, w); + break; + default: + gf_general_set_random(a, w, 1); } MOA_Fill_Random_Region(ra, REGION_SIZE); MOA_Fill_Random_Region(rb, REGION_SIZE); - xor = i%2; + xor = (i/32)%2; align = w/8; if (align == 0) align = 1; if (align > 16) align = 16; + + /* JSP - Cauchy test. When w < 32 & it doesn't equal 4, 8 or 16, the default is + equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing + three alignments here: + + 1. Anything goes -- no alignment guaranteed. + 2. Perfect alignment. Here src and dest must be aligned wrt each other, + and bytes must be a multiple of 16*w. + 3. Imperfect alignment. Here we'll have src and dest be aligned wrt each + other, but bytes is simply a multiple of w. That means some XOR's will + be aligned, and some won't. + */ + if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) { - start = MOA_Random_W(5, 1); - end = REGION_SIZE - MOA_Random_W(5, 1); + alignment_test = (i%3); + + s_start = MOA_Random_W(5, 1); + if (alignment_test == 0) { + d_start = MOA_Random_W(5, 1); + } else { + d_start = s_start; + } + + bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start; + bytes -= MOA_Random_W(5, 1); + if (alignment_test == 1) { + bytes -= (bytes % (w*16)); + } else { + bytes -= (bytes % w); + } + target = rb; - while ((end-start)%w != 0) end--; + + /* JSP - Otherwise, we're testing a non-cauchy test, and alignment + must be more strict. We have to make sure that the regions are + aligned wrt each other on 16-byte pointers. */ + } else { - start = MOA_Random_W(5, 1) * align; - end = REGION_SIZE - (MOA_Random_W(5, 1) * align); + s_start = MOA_Random_W(5, 1) * align; + d_start = s_start; + bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1); + bytes -= (bytes % align); + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { target = rb ; } else { - target = ((i%4)/2) ? rb : ra; + target = (i/64)%2 ? rb : ra; } } + memcpy(rc, ra, REGION_SIZE); memcpy(rd, target, REGION_SIZE); - gf_general_do_region_multiply(&gf, a, ra+start, target+start, end-start, xor); - gf_general_do_region_check(&gf, a, rc+start, rd+start, target+start, end-start, xor); + gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor); + gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor); } } } diff --git a/gf_w128.c b/gf_w128.c index 0a2a93f..1465be5 100644 --- a/gf_w128.c +++ b/gf_w128.c @@ -12,7 +12,7 @@ #define two_x(a) {\ a[0] <<= 1; \ - if (a[1] & (uint64_t) 1 << 63) a[0] ^= 1; \ + if (a[1] & 1ULL << 63) a[0] ^= 1; \ a[1] <<= 1; } #define a_get_b(a, i, b, j) {\ @@ -28,11 +28,18 @@ struct gf_w128_split_4_128_data { uint64_t tables[2][32][16]; }; +struct gf_w128_split_8_128_data { + uint64_t last_value[2]; + uint64_t tables[2][16][256]; +}; + typedef struct gf_group_tables_s { gf_val_128_t m_table; gf_val_128_t r_table; } gf_group_tables_t; +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } + static void gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, @@ -70,11 +77,120 @@ int xor) } } +static +void +gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + int i; + gf_val_128_t s128; + gf_val_128_t d128; + uint64_t c128[2]; + gf_region_data rd; +#ifdef INTEL_SSE4_PCLMUL + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + set_zero(c128, 0); + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. 
*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] = (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); + } + } +#endif +} + /* * Some w128 notes: * --Big Endian * --return values allocated beforehand */ + +#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0) + void gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { @@ -99,6 +215,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 set_zero(pl, 0); set_zero(pr, 0); + /* Allen: a*b for right half of a */ for (i = 0; i < GF_FIELD_WIDTH/2; i++) { if (a[1] & (one << i)) { pl[1] ^= bl[1]; @@ -112,6 +229,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 br[1] <<= 1; } + /* Allen: a*b for left half of a */ for (i = 0; i < GF_FIELD_WIDTH/2; i++) { if (a[0] & (one << i)) { pl[0] ^= bl[0]; @@ -125,10 +243,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 br[0] <<= 1; } - one = lbit; - ppl[0] = lbit; - ppl[1] = h->prim_poly >> 1; - ppr[0] = lbit; + /* Allen: do first half of reduction (based on left quarter of initial product) */ + one = lbit >> 1; + ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */ + ppl[1] = h->prim_poly >> 2; + ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2); ppr[1] = 0; while (one != 0) { if (pl[0] & one) { @@ -147,6 +266,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 ppl[0] >>= 1; } + /* Allen: final half of reduction */ one = lbit; while (one != 0) { if (pl[1] & one) { @@ -162,12 +282,198 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t 
b128, gf_val_12 ppl[1] >>= 1; } + /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */ c128[0] = pr[0]; c128[1] = pr[1]; return; } +void +gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4_PCLMUL + + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0); + b = _mm_insert_epi64 (a, b128[1], 0); + a = _mm_insert_epi64 (a, a128[0], 1); + b = _mm_insert_epi64 (b, b128[0], 1); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + + /* we need to test algorithm 2 later*/ + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + + c128[0] = (uint64_t)_mm_extract_epi64(result1,1); + c128[1] = (uint64_t)_mm_extract_epi64(result1,0); +#endif +return; +} + +void +gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ + uint64_t topbit; /* this is used as a boolean value */ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + prod[0] = 0; + prod[1] = 0; + pmask = 0x8000000000000000ULL; + amask[0] = 0x8000000000000000ULL; + amask[1] = 0; + + while (amask[1] != 0 || amask[0] != 0) { + topbit = (prod[0] & pmask); + prod[0] <<= 1; + if (prod[1] & pmask) prod[0] ^= 1; + prod[1] <<= 1; + if (topbit) prod[1] ^= pp; + if ((a128[0] & amask[0]) || (a128[1] & amask[1])) { + prod[0] ^= b128[0]; + prod[1] ^= b128[1]; + } + amask[1] >>= 1; + if (amask[0] & 1) amask[1] ^= pmask; + amask[0] >>= 1; + } + c128[0] = prod [0]; + c128[1] = prod [1]; + return; +} + +void +gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + int i; + __m128i a, b, pp, one, prod, amask, l_middle_one, u_middle_one; + /*John: pmask is always the highest bit set, and the rest zeros. 
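(Editor's worked illustration of this loop, scaled down to GF(2^4) with
primitive polynomial x^4 + x + 1: to multiply a = 0110 by b = 0011, scan a
from its most significant bit; each step multiplies prod by x -- shift left,
XORing in the polynomial's low bits 0011 if a 1 falls off the top -- and then
XORs in b when the current bit of a is set:

    bit 3 of a = 0:  prod = 0000
    bit 2 of a = 1:  prod = 0000 ^ 0011 = 0011
    bit 1 of a = 1:  prod = 0110 ^ 0011 = 0101
    bit 0 of a = 0:  prod = 1010

and indeed (x^2 + x)(x + 1) = x^3 + x = 1010; no reduction fires here because
the top bit never shifts out. The 128-bit version is the same loop with the
product, masks, and reduction spread across two 64-bit words.)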
amask changes, it's a countdown.*/ + uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ + gf_internal_t *h; + + + h = (gf_internal_t *) gf->scratch; + pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + prod = _mm_setzero_si128(); + a = _mm_insert_epi64(prod, a128[1], 0x0); + a = _mm_insert_epi64(a, a128[0], 0x1); + b = _mm_insert_epi64(prod, b128[1], 0x0); + b = _mm_insert_epi64(b, b128[0], 0x1); + pmask = 0x80000000; + amask = _mm_insert_epi32(prod, 0x80000000, 0x3); + u_middle_one = _mm_insert_epi32(prod, 1, 0x2); + l_middle_one = _mm_insert_epi32(prod, 1 << 31, 0x1); + + for (i = 0; i < 64; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */ + if (middlebit) { + prod = _mm_xor_si128(prod, u_middle_one); + } + if (topbit) { + prod = _mm_xor_si128(prod, pp); + } + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/ + } + amask = _mm_insert_epi32(amask, 1 << 31, 0x1); + for (i = 64; i < 128; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); + if (middlebit) prod = _mm_xor_si128(prod, u_middle_one); + if (topbit) prod = _mm_xor_si128(prod, pp); + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); + } + c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); + c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); +#endif + return; +} + + +/* Ben: This slow function implements sse instrutions for bytwo_b because why not */ +void +gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + __m128i a, b, lmask, hmask, pp, c, middle_one; + gf_internal_t *h; + uint64_t topbit, middlebit; + + h = (gf_internal_t *) gf->scratch; + + c = _mm_setzero_si128(); + lmask = _mm_insert_epi64(c, 1ULL << 63, 0); + hmask = _mm_insert_epi64(c, 1ULL << 63, 1); + b = _mm_insert_epi64(c, a128[0], 1); + b = _mm_insert_epi64(b, a128[1], 0); + a = _mm_insert_epi64(c, b128[0], 1); + a = _mm_insert_epi64(a, b128[1], 0); + pp = _mm_insert_epi64(c, h->prim_poly, 0); + middle_one = _mm_insert_epi64(c, 1, 0x1); + + while (1) { + if (_mm_extract_epi32(a, 0x0) & 1) { + c = _mm_xor_si128(c, b); + } + middlebit = (_mm_extract_epi32(a, 0x2) & 1); + a = _mm_srli_epi64(a, 1); + if (middlebit) a = _mm_xor_si128(a, lmask); + if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){ + c128[0] = _mm_extract_epi64(c, 0x1); + c128[1] = _mm_extract_epi64(c, 0x0); + return; + } + topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1)); + middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0)); + b = _mm_slli_epi64(b, 1); + if (middlebit) b = _mm_xor_si128(b, middle_one); + if (topbit) b = _mm_xor_si128(b, pp); + } +#endif +} + void gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { @@ -177,7 +483,7 @@ gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_ h = (gf_internal_t *) gf->scratch; - bmask = (1L << 63); + bmask = (1ULL << 63); set_zero(c, 0); b[0] = a128[0]; b[1] = a128[1]; @@ -243,9 +549,9 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ ld->tables[0][i][k^j] = (v[0] ^ 
ld->tables[0][i][k]); ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); } - pp = (v[0] & (1L << 63)); + pp = (v[0] & (1ULL << 63)); v[0] <<= 1; - if (v[1] & (1L << 63)) v[0] ^= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; v[1] <<= 1; if (pp) v[1] ^= h->prim_poly; } @@ -254,6 +560,15 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ ld->last_value[0] = val[0]; ld->last_value[1] = val[1]; +/* + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]); + } + printf("\n"); + } + */ + i = 0; while (d64 < top) { v[0] = (xor) ? d64[0] : 0; v[1] = (xor) ? d64[1] : 0; @@ -280,6 +595,191 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ } } +static +void +gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + gf_internal_t *h; + int i, m, j, k, tindex; + uint64_t pp, v[2], s, *s64, *d64, *top; + __m128i si, tables[32][16], p[16], v0, mask1; + struct gf_w128_split_4_128_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k]; + ld->tables[1-(j/8)][i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); + */ + } + } + + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 16; k++) { + v0 = _mm_load_si128((__m128i *) s64); + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 16; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + 
gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); +#endif +} + +static +void +gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_8_128_data *ld; + + /* Check on alignment. Ignore it otherwise. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_8_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 16; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < (1 << 8); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + s = s64[0]; + i = 8; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + void gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { @@ -300,7 +800,7 @@ gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t va s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; - bmask = (1L << 63); + bmask = (1ULL << 63); while (d64 < top) { set_zero(c, 0); @@ -359,11 +859,7 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) uint64_t a128[2]; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - } else { - g_m = scratch->arg1; - } + g_m = scratch->arg1; prim_poly = scratch->prim_poly; @@ -385,10 +881,49 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) return; } +static +void gf_w128_group_m_sse_init(gf_t *gf, gf_val_128_t b128) +{ +#ifdef INTEL_SSE4 + int i, j; + int g_m; + uint64_t lbit, middlebit; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + + __m128i *table = (__m128i *)(gt->m_table), b, a, ubit, prim_poly; + prim_poly = _mm_insert_epi64(_mm_setzero_si128(), scratch->prim_poly, 0); + b = _mm_loadu_si128((__m128i *)(b128)); + + table[0] = _mm_setzero_si128(); + table[1] = table[0]; + table[1] = _mm_insert_epi64(table[1],b128[0],1); + table[1] = _mm_insert_epi64(table[1],b128[1],0); + lbit = 1; + lbit <<= 63; + ubit = _mm_set_epi32(0, 1, 0, 0); + for (i = 2; i < (1 << g_m); i <<= 1) { + a = table[(i >> 1)]; + middlebit = (_mm_extract_epi64(a, 0x0) & lbit); + a = _mm_slli_epi64(a, 1); + if (middlebit) a = _mm_xor_si128(a, ubit); + table[i] = a; + if 
(_mm_extract_epi64(table[i >> 1], 0x1) & lbit) table[i] = _mm_xor_si128(table[i], prim_poly); + for (j = 0; j < i; j++) { + table[i + j] = _mm_xor_si128(table[i], table[j]); + } + } + return; +#endif +} + void gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { - int i; + int i,j; /* index_r, index_m, total_m (if g_r > g_m) */ int i_r, i_m, t_m; int mask_m, mask_r; @@ -399,13 +934,8 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; mask_m = (1 << g_m) - 1; mask_r = (1 << g_r) - 1; @@ -413,7 +943,7 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) { gf_w128_group_m_init(gf, b128); } - + p_i[0] = 0; p_i[1] = 0; a[0] = a128[0]; @@ -458,11 +988,92 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ i_r <<= g_m; } } - c128[0] = p_i[0]; c128[1] = p_i[1]; } +void +gf_w128_group_sse_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + int i,j; + int i_r, i_m, t_m; + int mask_m, mask_r, mask_s; + int g_m, g_r; + uint32_t shiftbits; + uint64_t a[2], tbit = 1; + tbit <<= 63; + gf_internal_t *scratch; + gf_group_tables_t *gt; + __m128i p_i, *m_table, *r_table, zero; + + zero = _mm_setzero_si128(); + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + m_table = (__m128i *)(gt->m_table); + r_table = (__m128i *)(gt->r_table); + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + mask_s = mask_m << (32-g_m); /*sets g_m leftmost bits to 1*/ + if (b128[0] != _mm_extract_epi64(m_table[1], 1) || b128[1] != _mm_extract_epi64(m_table[1], 0)) { + gf_w128_group_m_sse_init(gf, b128); + } + + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (((uint64_t)_mm_extract_epi64(p_i,1)) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + c128[0] = _mm_extract_epi64(p_i, 1); + c128[1] = _mm_extract_epi64(p_i, 0); +#endif +} + static void gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) @@ -487,13 +1098,8 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - 
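/* Editor's note: the scalar and SSE GROUP routines share one scheme -- the
   product is accumulated g_m bits of a at a time: shift the 128-bit
   accumulator left by g_m, XOR in m_table[chunk] (precomputed multiples of
   b), and collect the g_m bits that fall off the top; once g_r such bits
   have gathered, fold them back in with a single XOR of r_table[i_r]. */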
if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; mask_m = (1 << g_m) - 1; mask_r = (1 << g_r) - 1; @@ -522,6 +1128,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, p_i[0] <<= g_m; p_i[0] ^= (p_i[1] >> (64-g_m)); p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; p_i[1] ^= gt->m_table[(2 * i_m) + 1]; t_m += g_m; @@ -533,7 +1140,6 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, i_r <<= g_m; } } - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { i_m = (a[1] >> (i * g_m)) & mask_m; i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; @@ -564,9 +1170,162 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, } } +static +void +gf_w128_group_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4 + int i; + int i_r, i_m, t_m; + int mask_m, mask_r, mask_s; + int g_m, g_r; + uint32_t shiftbits; + uint64_t a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + gf_region_data rd; + uint64_t *a128, *c128, *top; + __m128i *m_table, *r_table, p_i, zero; + zero = _mm_setzero_si128(); + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + m_table = (__m128i *)(gt->m_table); + r_table = (__m128i *)(gt->r_table); + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + mask_s = mask_m << (32-g_m); + + if (val[0] != _mm_extract_epi64(m_table[1], 1) || val[1] != _mm_extract_epi64(m_table[1], 0)) { + gf_w128_group_m_sse_init(gf, val); + } + + a128 = (uint64_t *) src; + c128 = (uint64_t *) dest; + top = (uint64_t *) rd.d_top; + + if (xor){ + while (c128 < top) { + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + c128[0] ^= _mm_extract_epi64(p_i, 1); + c128[1] ^= _mm_extract_epi64(p_i, 0); + a128 += 2; + c128 += 2; + } + }else{ + while (c128 < top) { + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + 
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + c128[0] = _mm_extract_epi64(p_i, 1); + c128[1] = _mm_extract_epi64(p_i, 0); + a128 += 2; + c128 += 2; + } + } +#endif +} /* a^-1 -> b */ -void + void gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t e_i[2], e_im1[2], e_ip1[2]; @@ -585,10 +1344,26 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) e_i[0] = a128[0]; e_i[1] = a128[1]; d_im1 = 128; + + //Allen: d_i starts at 63 here, and checks each bit of a, starting at the MSB, looking for the first nonzero bit. + //So d_i should be 0 if this half of a is all 0s; otherwise it is the position from the right of the first-from-the-left nonzero bit of this half of a. + //BUT if d_i is 0 at the end we won't know yet whether the rightmost bit of this half is 1 or not. + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ; + + //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet. + if (!((one << d_i) & e_i[0])) { - for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1] == 0); d_i--) ; + + //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a, + // but not bothering to test whether d_i hits zero, which is fine because we've already tested for a = 0. + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ; + } else { + + //Allen: if a 1 was found in the more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a. 
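+  //Example: if a has only bit 70 set, the first loop stops at d_i = 6 in the high half, and the "+= 64" below yields the true degree, 70.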
+ d_i += 64; } y_i[0] = 0; @@ -614,11 +1389,11 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i))); e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i)); } - d_ip1--; + d_ip1--; + if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; } while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--; while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--; } - gf->multiply.w128(gf, c_i, y_i, y_ip1); y_ip1[0] ^= y_im1[0]; y_ip1[1] ^= y_im1[1]; @@ -640,11 +1415,10 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) b = (uint64_t *) b128; b[0] = y_i[0]; b[1] = y_i[1]; - return; } -void + void gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { uint64_t d[2]; @@ -653,7 +1427,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val return; } -void + void gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t one128[2]; @@ -663,21 +1437,209 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) return; } + +static + void +gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t c0, c1, d, tmp; + uint64_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w64(base_gf, a1); + c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w64(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w64(base_gf, a1); + a0inv = base_gf->inverse.w64(base_gf, a0); + + d = base_gf->multiply.w64(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w64(base_gf, tmp); + + d = base_gf->multiply.w64(base_gf, d, tmp); + + c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w64(base_gf, d, a1inv); + } + inv[0] = c1; + inv[1] = c0; +} + +static + void +gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = b[1]; + uint64_t b1 = b[0]; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t a1b1; + + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly); +} + +static + void +gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + unsigned long uls, uld; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = val[1]; + uint64_t b1 = val[0]; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + 
base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } else { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } +} + +static +void +gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int + xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; + gf_val_64_t val0 = val[1]; + gf_val_64_t val1 = val[0]; + uint64_t *l, *hi; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); + gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 + ), sub_reg_size, 1); + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); +} + + + static +int gf_w128_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt; + } else { + gf->multiply_region.w128 = gf_w128_composite_multiply_region; + } + + gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; + + gf->multiply.w128 = gf_w128_composite_multiply; + gf->divide.w128 = gf_w128_divide_from_inverse; + gf->inverse.w128 = gf_w128_composite_inverse; + + return 1; +} + +static +int gf_w128_cfm_init(gf_t *gf) +{ +#ifdef INTEL_SSE4_PCLMUL + gf->inverse.w128 = gf_w128_euclid; + gf->multiply.w128 = gf_w128_clm_multiply; + gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single; + return 1; +#endif + + return 0; +} + static int gf_w128_shift_init(gf_t *gf) { + gf_internal_t *h; + h = (gf_internal_t*) gf->scratch; gf->multiply.w128 = gf_w128_shift_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_multiply_region_from_single; return 1; } -static + static int gf_w128_bytwo_init(gf_t *gf) { - gf->multiply.w128 = gf_w128_bytwo_b_multiply; + gf_internal_t *h; + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w128 = gf_w128_bytwo_p_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/ + /* John: the sse function is slower.*/ + } else { + gf->multiply.w128 = gf_w128_bytwo_b_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply; +Ben: This sse function is also slower. 
*/ + } gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_multiply_region_from_single; gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region; return 1; } @@ -686,7 +1648,7 @@ int gf_w128_bytwo_init(gf_t *gf) * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64 * bits in all of these numbers. */ -static + static void gf_w128_group_r_init(gf_t *gf) { int i, j; @@ -696,11 +1658,7 @@ void gf_w128_group_r_init(gf_t *gf) gf_group_tables_t *gt; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_r = 8; - } else { - g_r = scratch->arg2; - } + g_r = scratch->arg2; pp = scratch->prim_poly; gt->r_table[0] = 0; @@ -715,20 +1673,76 @@ void gf_w128_group_r_init(gf_t *gf) return; } -static + static +void gf_w128_group_r_sse_init(gf_t *gf) +{ +#ifdef INTEL_SSE4 + int i, j; + int g_r; + uint64_t pp; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + __m128i zero = _mm_setzero_si128(); + __m128i *table = (__m128i *)(gt->r_table); + g_r = scratch->arg2; + pp = scratch->prim_poly; + table[0] = zero; + for (i = 1; i < (1 << g_r); i++) { + table[i] = zero; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0)); + } + } + } + return; +#endif +} + + static int gf_w128_split_init(gf_t *gf) { - struct gf_w128_split_4_128_data *sd; + struct gf_w128_split_4_128_data *sd4; + struct gf_w128_split_8_128_data *sd8; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - sd = (struct gf_w128_split_4_128_data *) h->private; - sd->last_value[0] = 0; - sd->last_value[1] = 0; - gf->multiply.w128 = gf_w128_bytwo_b_multiply; + gf->multiply.w128 = gf_w128_bytwo_p_multiply; +#ifdef INTEL_SSE4_PCLMUL + if (!(h->region_type & GF_REGION_NOSSE)){ + gf->multiply.w128 = gf_w128_clm_multiply; + } +#endif + gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + + if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { + sd8 = (struct gf_w128_split_8_128_data *) h->private; + sd8->last_value[0] = 0; + sd8->last_value[1] = 0; + gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region; + } else { + sd4 = (struct gf_w128_split_4_128_data *) h->private; + sd4->last_value[0] = 0; + sd4->last_value[1] = 0; + if((h->region_type & GF_REGION_ALTMAP)) + { + #ifdef INTEL_SSE4 + if(!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; + else + return 0; + #else + return 0; + #endif + } + else { + gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + } + } return 1; } @@ -739,16 +1753,12 @@ int gf_w128_group_init(gf_t *gf) gf_internal_t *scratch; gf_group_tables_t *gt; int g_m, g_r, size_r; + long tmp; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; size_r = (1 << g_r); gt->r_table = scratch->private + (2 * sizeof(uint64_t *)); @@ -756,11 +1766,30 @@ int gf_w128_group_init(gf_t *gf) gt->m_table[2] = 0; gt->m_table[3] = 0; - gf_w128_group_r_init(gf); - gf->multiply.w128 = gf_w128_group_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_group_multiply_region; + + #ifdef INTEL_SSE4 + if(!(scratch->region_type & GF_REGION_NOSSE)) + 
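+  /* The SSE tables are kept as __m128i's: r_table must be forced onto a 16-byte boundary below, and this path assumes g_m == 4 with g_r equal to 4 or 8. */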
{ + if ((g_m != 4) || ((g_r != 4) && (g_r != 8))) + return 0; + gt->r_table = (void *)(((uint64_t)gt->r_table + 15) & (~0xfULL)); /* aligns gt->r_table on a 16-byte boundary */ + gt->m_table = gt->r_table + 2*size_r; + gt->m_table[2] = 0; + gt->m_table[3] = 0; + gf->multiply.w128 = gf_w128_group_sse_multiply; + gf->multiply_region.w128 = gf_w128_group_sse_multiply_region; + gf_w128_group_r_sse_init(gf); + } + else + gf_w128_group_r_init(gf); + #else + if(scratch->region_type & GF_REGION_SSE) return 0; + else gf_w128_group_r_init(gf); + #endif + return 1; } @@ -773,88 +1802,175 @@ void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_12 memcpy(rv, s, 16); } +static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int i, blocks; + uint64_t *r64, tmp; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + + index -= (((uint64_t *) rd.d_start) - r64)/2; + r64 = (uint64_t *) rd.d_start; + + blocks = index/16; + r64 += (blocks*32); + index %= 16; + r8 = (uint8_t *) r64; + r8 += index; + rv[0] = 0; + rv[1] = 0; + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[1] |= (tmp << (i*8)); + r8 += 16; + } + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[0] |= (tmp << (i*8)); + r8 += 16; + } + return; +} + + static +void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + index -= (((uint64_t *) rd.d_start) - r64)/2; + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index); + rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index); + + return; +} + int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int size_m, size_r; int w = 128; + if (divide_type==GF_DIVIDE_MATRIX) return 0; + switch(mult_type) { + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; + case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; - case GF_MULT_SPLIT_TABLE: - if (region_type != 0) return -1; - if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; - } - return -1; - break; case GF_MULT_DEFAULT: - arg1 = 4; - arg2 = 8; + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; + } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64; + } + return 0; + break; case GF_MULT_GROUP: - - /* arg1 == mult size, arg2 == reduce size */ - /* 
Should prevent anything over arg1 > 16 || arg2 > 16 */ - if (region_type != 0) return -1; - if (arg1 <= 0 || arg2 <= 0 || arg1 > 16 || arg2 > 16) return -1; - if (GF_FIELD_WIDTH % arg1 != 0 || GF_FIELD_WIDTH % arg2 != 0) return -1; - /* - * Currently implementing code where g_m and g_r are the same or where g_r is larger, as - * these it is more efficient to have g_r as large as possible (but still not > 16) - */ - if (arg1 > arg2) return -1; - - /* size of each group, 128 bits */ + /* JSP We've already error checked the arguments. */ size_m = (1 << arg1) * 2 * sizeof(uint64_t); - /* The PP is only 8 bits and we are limiting g_r to 16, so only uint64_t */ - size_r = (1 << arg2) * sizeof(uint64_t); - + size_r = (1 << arg2) * 2 * sizeof(uint64_t); /* * two pointers prepend the table data for structure * because the tables are of dynamic size */ - return sizeof(gf_internal_t) + size_m + size_r + 2 * sizeof(uint64_t *); + return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *); + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) { + return sizeof(gf_internal_t) + 4; + } else { + return 0; + } + break; + default: - return -1; + return 0; } } int gf_w128_init(gf_t *gf) { - gf_internal_t *h; + gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } gf->multiply.w128 = NULL; gf->divide.w128 = NULL; gf->inverse.w128 = NULL; gf->multiply_region.w128 = NULL; - switch(h->mult_type) { + case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break; default: return 0; } - gf->extract_word.w128 = gf_w128_extract_word; + /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there + are multiple flags in h->region_type */ + if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w128 = gf_w128_split_extract_word; + } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { + gf->extract_word.w128 = gf_w128_composite_extract_word; + } else { + gf->extract_word.w128 = gf_w128_extract_word; + } if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_euclid; - } /* } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_matrix; - } */ + } if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { gf->divide.w128 = gf_w128_divide_from_inverse; diff --git a/gf_w16.c b/gf_w16.c index 
e8b48fd..6bc25a6 100644 --- a/gf_w16.c +++ b/gf_w16.c @@ -14,50 +14,47 @@ #define GF_BASE_FIELD_WIDTH (8) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_S_GF_8_2 (63) -struct gf_logtable_data { +struct gf_w16_logtable_data { uint16_t log_tbl[GF_FIELD_SIZE]; uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; uint16_t inv_tbl[GF_FIELD_SIZE]; uint16_t *d_antilog; }; -struct gf_zero_logtable_data { - int log_tbl[GF_FIELD_SIZE]; +struct gf_w16_zero_logtable_data { + int log_tbl[GF_FIELD_SIZE]; uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; uint16_t *antilog_tbl; uint16_t inv_tbl[GF_FIELD_SIZE]; }; -struct gf_lazytable_data { - int log_tbl[GF_FIELD_SIZE]; +struct gf_w16_lazytable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; uint16_t inv_tbl[GF_FIELD_SIZE]; uint16_t lazytable[GF_FIELD_SIZE]; }; -struct gf_w8_logtable_data { - uint8_t log_tbl[GF_BASE_FIELD_SIZE]; - uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_w8_single_table_data { - uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; -}; - struct gf_w16_bytwo_data { uint64_t prim_poly; uint64_t mask1; uint64_t mask2; }; +struct gf_w16_split_8_8_data { + uint16_t tables[3][256][256]; +}; + struct gf_w16_group_4_4_data { uint16_t reduce[16]; uint16_t shift[16]; }; +struct gf_w16_composite_data { + uint8_t *mult_table; +}; + #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ t2 = b & am2; \ @@ -72,6 +69,9 @@ struct gf_w16_group_4_4_data { #define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } +#define GF_FIRST_BIT (1 << 15) +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? 
(((p) << 1) ^ h->prim_poly) : (p) << 1) + static inline gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) @@ -120,6 +120,212 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } +static +void +gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to 
see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + static inline gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) @@ -146,6 +352,7 @@ gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -227,16 +434,146 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only include it for completeness. It does have the feature that it requires no extra memory. 
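  It builds the full double-width carryless product one bit at a time and then folds the high bits back down with the primitive polynomial.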
-*/ + */ static inline gf_val_32_t +gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2, where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 2 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back into the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
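+     After the final reduction everything above bit 15 is zero, so the low 32 bits hold the 16-bit product.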
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + + +static +inline + gf_val_32_t gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t product, i, pp, a, b; gf_internal_t *h; - + a = a16; b = b16; h = (gf_internal_t *) gf->scratch; @@ -247,7 +584,7 @@ gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (1 << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; @@ -257,11 +594,37 @@ static int gf_w16_shift_init(gf_t *gf) { gf->multiply.w32 = gf_w16_shift_multiply; - gf->inverse.w32 = gf_w16_euclid; - gf->multiply_region.w32 = gf_w16_multiply_region_from_single; return 1; } +static +int gf_w16_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Determining how many reductions to do */ + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfe00 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_2; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; + } else if((0xf000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_3; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; + } else if ((0xe000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_4; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; +#endif + + return 0; +} + /* KMG: GF_MULT_LOGTABLE: */ static @@ -270,7 +633,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int { uint16_t *s16, *d16; int lv; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -279,7 +642,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; @@ -306,9 +669,9 @@ inline gf_val_32_t gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; } @@ -318,10 +681,10 @@ gf_val_32_t gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; if (a == 0 || b == 0) return 0; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; return (ltd->d_antilog[log_sum]); @@ -331,9 +694,9 @@ static gf_val_32_t gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) { - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } @@ -341,17 +704,20 @@ static int gf_w16_log_init(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; int i, b; + int check = 0; h = (gf_internal_t *) gf->scratch; ltd = h->private; - - ltd->log_tbl[0] = 0; + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) + ltd->log_tbl[i] = 0; ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (ltd->log_tbl[b] != 0) check = 1; ltd->log_tbl[b] = i; ltd->antilog_tbl[i] = b; ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; @@ -360,6 +726,24 @@ int gf_w16_log_init(gf_t *gf) b = b ^ h->prim_poly; } } + + /* If you can't construct the log table, there's a problem. This code is used for + some other implementations (e.g. in SPLIT), so if the log table doesn't work in + that instance, use CARRY_FREE / SHIFT instead. */ + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) { + +#ifdef INTEL_SSE4_PCLMUL + return gf_w16_cfm_init(gf); +#endif + return gf_w16_shift_init(gf); + } else { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + } + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ ltd->inv_tbl[1] = 1; for (i = 2; i < GF_FIELD_SIZE; i++) { @@ -377,8 +761,76 @@ int gf_w16_log_init(gf_t *gf) /* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. */ -static + +/* Ben: Does alternate mapping multiplication using a split table in the + lazy method without sse instructions*/ + +static void +gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, a, b, c, prod; + uint8_t *s8, *d8, *top; + gf_internal_t *h; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Constructs lazy multiplication table*/ + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + + + while (d8 < top) { + + /*Ben: Multiplies across 16 two byte quantities using alternate mapping + high bits are on the left, low bits are on the right. */ + + for (j=0;j<16;j++) { + + /*Ben: If the xor flag is set, the product should include what is in dest */ + prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; + + /*Ben: xors all 4 table lookups into the product variable*/ + + prod ^= ((table[0][*(s8+16)&0xf]) ^ + (table[1][(*(s8+16)&0xf0)>>4]) ^ + (table[2][*(s8)&0xf]) ^ + (table[3][(*(s8)&0xf0)>>4])); + + /*Ben: Stores product in the destination and moves on*/ + + *d8 = (uint8_t)(prod >> 8); + *(d8+16) = (uint8_t)(prod & 0x00ff); + s8++; + d8++; + } + s8+=16; + d8+=16; + } + gf_do_final_region_alignment(&rd); +} + +static + void gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t i, j, a, c, prod; @@ -391,14 +843,14 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); + gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - table[i][j] = gf_w16_log_multiply(gf, c, val); + table[i][j] = gf->multiply.w32(gf, c, val); } } @@ -423,7 +875,7 @@ static void gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, a, c, prod, *s64, *d64, *top64; + uint64_t j, k, v, a, c, prod, *s64, *d64, *top64; gf_internal_t *h; uint64_t htable[256], ltable[256]; gf_region_data rd; @@ -436,9 +888,16 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 h = (gf_internal_t *) gf->scratch; - for (j = 0; j < 256; j++) { - ltable[j] = gf_w16_log_multiply(gf, j, val); - htable[j] = gf_w16_log_multiply(gf, (j<<8), val); + v = val; + ltable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); + v = GF_MULTBY_TWO(v); + } + htable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); + v = GF_MULTBY_TWO(v); } s64 = (uint64_t *) rd.s_start; @@ -472,8 +931,8 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 prod ^= ltable[a >> 56]; prod ^= ((xor) ? *d64 : 0); *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } */ @@ -489,10 +948,12 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 a <<= 8; } + //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. + prod ^= ((xor) ? 
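 /* fold the current destination word into the product when xor is set */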
*d64 : 0); *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } gf_do_final_region_alignment(&rd); } @@ -502,7 +963,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v { uint64_t j, a, c, pp; gf_internal_t *h; - struct gf_lazytable_data *ltd; + struct gf_w16_lazytable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -512,7 +973,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; - ltd = (struct gf_lazytable_data *) h->private; + ltd = (struct gf_w16_lazytable_data *) h->private; ltd->lazytable[0] = 0; @@ -530,9 +991,8 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } while (c != 1); */ - a = ltd->log_tbl[val]; for (c = 1; c < GF_FIELD_SIZE; c++) { - ltd->lazytable[c] = ltd->antilog_tbl[ltd->log_tbl[c]+a]; + ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); } gf_two_byte_region_table_multiply(&rd, ltd->lazytable); @@ -543,7 +1003,7 @@ static void gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t a, c, prod; uint8_t low[4][16]; @@ -561,7 +1021,7 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - prod = gf_w16_log_multiply(gf, c, val); + prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } @@ -676,7 +1136,7 @@ static void gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; @@ -694,7 +1154,7 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - prod = gf_w16_log_multiply(gf, c, val); + prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } @@ -782,32 +1242,111 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des #endif } +uint32_t +gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t alow, blow; + struct gf_w16_split_8_8_data *d8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w16_split_8_8_data *) h->private; + + alow = a & 0xff; + blow = b & 0xff; + a >>= 8; + b >>= 8; + + return d8->tables[0][alow][blow] ^ + d8->tables[1][alow][b] ^ + d8->tables[1][a][blow] ^ + d8->tables[2][a][b]; +} + static int gf_w16_split_init(gf_t *gf) { gf_internal_t *h; - gf_w16_log_init(gf); + struct gf_w16_split_8_8_data *d8; + int i, j, exp, issse3; + uint32_t p, basep; h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; +issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; #endif - } else if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; - } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = 
gf_w16_split_4_16_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w16_split_8_8_data *) h->private; + basep = 1; + for (exp = 0; exp < 3; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + gf->multiply.w32 = gf_w16_split_8_8_multiply; + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + return 1; + + } + + /* We'll be using LOG for multiplication, unless the pp isn't primitive. + In that case, we'll be using SHIFT. */ + + gf_w16_log_init(gf); + + /* Defaults */ + + if (issse3) { + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + } else { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + } + + + if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + + } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { + if (issse3) { + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; } else { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; } } + return 1; } @@ -818,7 +1357,7 @@ int gf_w16_table_init(gf_t *gf) gf_w16_log_init(gf); h = (gf_internal_t *) gf->scratch; - gf->multiply_region.w32 = NULL; + gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; return 1; } @@ -830,7 +1369,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val uint16_t lv; int i; uint16_t *s16, *d16, *top16; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -839,7 +1378,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - ltd = (struct gf_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; top16 = (uint16_t *) rd.d_top; @@ -858,18 +1397,20 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val } /* This isn't necessary. 
*/ + gf_do_final_region_alignment(&rd); } /* Here -- double-check Kevin */ + static inline gf_val_32_t gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; } @@ -879,10 +1420,10 @@ gf_val_32_t gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; if (a == 0 || b == 0) return 0; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); return (ltd->antilog_tbl[log_sum]); @@ -892,9 +1433,9 @@ static gf_val_32_t gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) { - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } @@ -1015,7 +1556,7 @@ static void gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; @@ -1079,7 +1620,7 @@ static void gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1105,7 +1646,7 @@ static void gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1134,7 +1675,7 @@ static void gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1352,20 +1893,30 @@ int gf_w16_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w16_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w16_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } - gf->inverse.w32 = gf_w16_euclid; + return 1; } @@ -1373,7 +1924,7 @@ static int gf_w16_log_zero_init(gf_t *gf) { gf_internal_t *h; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data 
*ltd; int i, b; h = (gf_internal_t *) gf->scratch; @@ -1423,30 +1974,30 @@ gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); + rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); return rv; } static gf_val_32_t -gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w8_single_table_data * std; - + gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; uint8_t a1 = (a & 0xff00) >> 8; - uint8_t a1b1; + uint8_t a1b1, *mt; uint16_t rv; + struct gf_w16_composite_data *cd; - std = (struct gf_w8_single_table_data *) h->private; + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; - a1b1 = std->mult[a1][b1]; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); - rv = ((std->mult[a0][b0] ^ a1b1) | - ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); + rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); return rv; } @@ -1472,6 +2023,7 @@ gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b) * * a / b = a * c */ + static gf_val_32_t gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) @@ -1486,7 +2038,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_8_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -1497,7 +2049,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_8_2); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -1511,62 +2063,6 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) return c; } -static -gf_val_32_t -gf_w16_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint16_t binv; - - binv = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, binv); -} - -static -void -gf_w16_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - struct gf_w8_single_table_data * std; - uint8_t b0 = val & 0x00ff; - uint8_t b1 = (val & 0xff00) >> 8; - uint16_t *s16, *d16, *top; - uint8_t a0, a1, a1b1; - struct gf_logtable_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - - std = (struct gf_w8_single_table_data *) h->private; - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - - s16 = rd.s_start; - d16 = rd.d_start; - top = rd.d_top; - - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = 
((*s16) & 0xff00) >> 8; - a1b1 = std->mult[a1][b1]; - - *d16 ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); - s16++; - d16++; - } - } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = std->mult[a1][b1]; - - *d16 = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); - s16++; - d16++; - } - } -} - static void gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -1577,9 +2073,13 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va uint8_t b0 = val & 0x00ff; uint8_t b1 = (val & 0xff00) >> 8; uint16_t *s16, *d16, *top; - uint8_t a0, a1, a1b1; + uint8_t a0, a1, a1b1, *mt; gf_region_data rd; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); @@ -1588,27 +2088,61 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va d16 = rd.d_start; top = rd.d_top; - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); - s16++; - d16++; + if (mt == NULL) { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } } } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); - s16++; - d16++; + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } } } } @@ -1645,7 +2179,7 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void 
*src, void *dest, gf_val_32_ base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_8_2, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1653,34 +2187,26 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_ static int gf_w16_composite_init(gf_t *gf) { - struct gf_w8_single_table_data * std; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch; - uint16_t a, b; + struct gf_w16_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w16_composite_data *) h->private; + cd->mult_table = gf_w8_get_mult_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt; - } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_TABLE && - base_h->region_type == GF_REGION_DEFAULT) { - gf->multiply_region.w32 = gf_w16_composite_multiply_region_inline; } else { gf->multiply_region.w32 = gf_w16_composite_multiply_region; } - - if (h->arg2 == 0) { - std = (struct gf_w8_single_table_data *) h->private; - for (a = 0; a < 256; a++) { - for (b = 0; b < 256; b++) { - std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b); - } - } - gf->multiply.w32 = gf_w16_composite_multiply_table; - } else { - gf->multiply.w32 = gf_w16_composite_multiply_recursive; - } - gf->divide.w32 = gf_w16_composite_divide; + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w16_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w16_composite_multiply_inline; + } + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w16_composite_inverse; return 1; @@ -1815,79 +2341,50 @@ int gf_w16_group_init(gf_t *gf) int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss; - int sa; - - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - switch(mult_type) { case GF_MULT_TABLE: - region_type |= GF_REGION_LAZY; - if (arg1 != 0 || arg2 != 0 || region_type != GF_REGION_LAZY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_lazytable_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0 || (region_type | ss) != ss || - (region_type & ss) == ss) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); break; - case GF_MULT_DEFAULT: - case GF_MULT_LOG_TABLE: - if (arg2 != 0) return -1; - if (region_type != GF_REGION_DEFAULT) return -1; - if (arg1 == 1) { - return sizeof(gf_internal_t) + sizeof(struct gf_zero_logtable_data) + 64; - } else if (arg1 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else { - return -1; - } + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + break; + case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: - if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { - region_type |= GF_REGION_LAZY; - if 
(region_type != GF_REGION_LAZY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else if ((arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { - region_type &= (~GF_REGION_LAZY); /* Ignore GF_REGION_LAZY */ - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if ((region_type & ss) == 0) region_type |= GF_REGION_SSE; - if (region_type & GF_REGION_NOSSE) { - if (region_type != GF_REGION_NOSSE) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else { - if ((region_type | ss | sa) != (ss|sa)) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } - } - return -1; - break; - case GF_MULT_GROUP: - if (arg1 == 4 && arg2 == 4) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; + } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } else if (mult_type == GF_MULT_DEFAULT || + (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; } - return -1; + return 0; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if (arg1 == 2 && arg2 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; - } else if (arg1 == 2 && arg2 == 1) { - return sizeof(gf_internal_t) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; + break; default: - return -1; + return 0; } + return 0; } int gf_w16_init(gf_t *gf) @@ -1895,7 +2392,27 @@ int gf_w16_init(gf_t *gf) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x1100b; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; + } else { + + /* Allen: use the following primitive polynomial to make + carryless multiply work more efficiently for GF(2^16). 
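(0x1002d is x^16 + x^5 + x^3 + x^2 + 1; the long run of zero coefficients just below the leading term keeps the number of PCLMUL fold-and-reduce steps at its minimum, per the (w-2)/z rule Ben notes in gf_w32.c below.)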
+ + h->prim_poly = 0x1002d; + + The following is the traditional primitive polynomial for GF(2^16) */ + + h->prim_poly = 0x1100b; + } + } + + if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); gf->multiply.w32 = NULL; gf->divide.w32 = NULL; @@ -1903,21 +2420,17 @@ int gf_w16_init(gf_t *gf) gf->multiply_region.w32 = NULL; switch(h->mult_type) { - case GF_MULT_LOG_TABLE: - if (h->arg1 == 1) { - if (gf_w16_log_zero_init(gf) == 0) return 0; - } else { - if (gf_w16_log_init(gf) == 0) return 0; - } - break; + case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { @@ -1928,23 +2441,28 @@ int gf_w16_init(gf_t *gf) gf->inverse.w32 = gf_w16_matrix; } - if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w16_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w16_inverse_from_divide; - } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; + if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w32 = gf_w16_composite_extract_word; } else { gf->extract_word.w32 = gf_w16_split_extract_word; } + } else if (h->region_type == GF_REGION_CAUCHY) { + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; } else { gf->extract_word.w32 = gf_w16_extract_word; } + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w16_multiply_region_from_single; + } return 1; } @@ -1953,11 +2471,11 @@ int gf_w16_init(gf_t *gf) uint16_t *gf_w16_get_log_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (uint16_t *) ltd->log_tbl; } return NULL; @@ -1966,11 +2484,11 @@ uint16_t *gf_w16_get_log_table(gf_t *gf) uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) h->private; + ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) 
ltd->antilog_tbl; } return NULL; @@ -1979,11 +2497,11 @@ uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) uint16_t *gf_w16_get_div_alog_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) h->private; + ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) ltd->d_antilog; } return NULL; diff --git a/gf_w32.c b/gf_w32.c index b0ba8c5..cae188f 100644 --- a/gf_w32.c +++ b/gf_w32.c @@ -15,24 +15,14 @@ #define GF_BASE_FIELD_WIDTH (16) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) #define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_S_GF_16_2 (40188) -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); - - -struct gf_w16_logtable_data { - int log_tbl[GF_BASE_FIELD_SIZE]; - uint16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4]; - uint16_t *antilog_tbl; - uint16_t inv_tbl[GF_BASE_FIELD_SIZE]; - uint32_t log_s; -}; +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) struct gf_split_2_32_lazy_data { uint32_t tables[16][4]; uint32_t last_value; }; -struct gf_split_8_8_data { +struct gf_w32_split_8_8_data { uint32_t tables[7][256][256]; uint32_t region_tables[4][256]; uint32_t last_value; @@ -67,6 +57,11 @@ struct gf_w32_bytwo_data { uint64_t mask2; }; +struct gf_w32_composite_data { + uint16_t *log; + uint16_t *alog; +}; + #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } @@ -121,6 +116,168 @@ xor) } } +static +void +gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + +#ifdef INTEL_SSE4_PCLMUL + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + +static +void +gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + +#ifdef INTEL_SSE4_PCLMUL + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + +static +void +gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4_PCLMUL + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = 
gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + static inline uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) @@ -131,7 +288,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) uint32_t c_i; if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i = b; d_im1 = 32; for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; @@ -148,6 +305,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); d_ip1--; + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -237,6 +395,134 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) extra memory. */ + + + +static +inline +gf_val_32_t +gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 4 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
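After the two folds above, every coefficient of degree 32 or higher has been replaced through x^32 == prim_poly (mod the field polynomial), so the fully reduced product occupies only the low 32 bits of the register.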
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} +static +inline +gf_val_32_t +gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
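Four folds are needed here rather than two: the 0xfe000000 mask test in the init routines admits polynomials with as few as seven zero coefficients after the leading 1, so each fold clears fewer high-order bits.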
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + + static inline uint32_t @@ -244,7 +530,7 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) { uint64_t product, i, pp, a, b, one; gf_internal_t *h; - + a = a32; b = b32; h = (gf_internal_t *) gf->scratch; @@ -256,37 +542,63 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (one << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; } -static -int gf_w32_shift_init(gf_t *gf) + static +int gf_w32_cfm_init(gf_t *gf) { - gf->multiply.w32 = gf_w32_shift_multiply; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + gf->inverse.w32 = gf_w32_euclid; gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + + /*Ben: We also check to see if the prim poly will work for pclmul */ + /*Ben: Check to see how many reduction steps it will take*/ + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; + }else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3; + }else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; + #endif + + return 0; +} + + static +int gf_w32_shift_init(gf_t *gf) +{ + gf->inverse.w32 = gf_w32_euclid; + gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + gf->multiply.w32 = gf_w32_shift_multiply; return 1; } static -void + void gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) { int i; uint32_t j; - int g_s; shift[0] = 0; - - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - } else { - g_s = h->arg1; - } - for (i = 1; i < (1 << g_s); i <<= 1) { + + for (i = 1; i < (1 << h->arg1); i <<= 1) { for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; if (val & GF_FIRST_BIT) { val <<= 1; @@ -297,7 +609,7 @@ gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) } } -static + static void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -333,10 +645,10 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf ind = a32 >> rs; a32 <<= leftover; p = gd->shift[ind]; - + bits_left = rs; rs = 32 - g_s; - + while (bits_left > 0) { bits_left -= g_s; ind = a32 >> rs; @@ -352,7 +664,7 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf gf_do_final_region_alignment(&rd); } -static + static void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint32_t *s32, *d32, *top; @@ -368,13 +680,8 @@ void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gf_w32_group_set_shift_tables(gd->shift, val, h); @@ -527,13 +834,8 @@ 
gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) struct gf_w32_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gf_w32_group_set_shift_tables(gd->shift, b, h); @@ -684,7 +986,7 @@ static void gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; @@ -879,7 +1181,7 @@ static void gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -905,7 +1207,7 @@ static void gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -934,7 +1236,7 @@ static void gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint32_t itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1012,19 +1314,30 @@ int gf_w32_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w32_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w32_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } + gf->inverse.w32 = gf_w32_euclid; return 1; } @@ -1036,10 +1349,10 @@ gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32) { uint32_t product, i, j, mask, tb; gf_internal_t *h; - struct gf_split_8_8_data *d8; + struct gf_w32_split_8_8_data *d8; h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; + d8 = (struct gf_w32_split_8_8_data *) h->private; product = 0; mask = 0xff; @@ -1062,7 +1375,7 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t gf_internal_t *h; uint32_t *s32, *d32, *top, p, a, v; struct gf_split_8_32_lazy_data *d8; - struct gf_split_8_8_data *d88; + struct gf_w32_split_8_8_data *d88; uint32_t *t[4]; int i, j, k, change; uint32_t pp; @@ -1072,13 +1385,13 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; - if (h->arg1 == 32 || h->arg2 == 32) { + if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) { d8 = (struct gf_split_8_32_lazy_data *) h->private; for (i = 0; i < 4; i++) t[i] = d8->tables[i]; change = (val != 
d8->last_value); if (change) d8->last_value = val; } else { - d88 = (struct gf_split_8_8_data *) h->private; + d88 = (struct gf_w32_split_8_8_data *) h->private; for (i = 0; i < 4; i++) t[i] = d88->region_tables[i]; change = (val != d88->last_value); if (change) d88->last_value = val; @@ -1243,7 +1556,7 @@ static void gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, tindex; uint32_t pp, v, v2, s, *s32, *d32, *top; @@ -1380,7 +1693,7 @@ static void gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint32_t pp, v, s, *s32, *d32, *top, *realtop; @@ -1572,15 +1885,15 @@ static void gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint32_t pp, v, s, *s32, *d32, *top, tmp_table[16]; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16; + __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; __m128i tv1, tv2, tv3, tv0; uint8_t btable[16]; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1593,7 +1906,7 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint s32 = (uint32_t *) rd.s_start; d32 = (uint32_t *) rd.d_start; top = (uint32_t *) rd.d_top; - + v = val; for (i = 0; i < 8; i++) { tmp_table[0] = 0; @@ -1614,7 +1927,6 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint mask1 = _mm_set1_epi8(0xf); mask8 = _mm_set1_epi16(0xff); - mask16 = _mm_set1_epi32(0xffff); if (xor) { while (d32 != top) { @@ -1737,36 +2049,41 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint v1 = _mm_load_si128((__m128i *) s32); s32 += 4; v2 = _mm_load_si128((__m128i *) s32); s32 += 4; v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - + p0 = _mm_srli_epi16(v0, 8); p1 = _mm_srli_epi16(v1, 8); p2 = _mm_srli_epi16(v2, 8); p3 = _mm_srli_epi16(v3, 8); - + tv0 = _mm_and_si128(v0, mask8); tv1 = _mm_and_si128(v1, mask8); tv2 = _mm_and_si128(v2, mask8); tv3 = _mm_and_si128(v3, mask8); - + v0 = _mm_packus_epi16(p1, p0); v1 = _mm_packus_epi16(tv1, tv0); v2 = _mm_packus_epi16(p3, p2); v3 = _mm_packus_epi16(tv3, tv2); - + p0 = _mm_srli_epi16(v0, 8); p1 = _mm_srli_epi16(v1, 8); p2 = _mm_srli_epi16(v2, 8); p3 = _mm_srli_epi16(v3, 8); - + tv0 = _mm_and_si128(v0, mask8); tv1 = _mm_and_si128(v1, mask8); tv2 = _mm_and_si128(v2, mask8); tv3 = _mm_and_si128(v3, mask8); - + v0 = _mm_packus_epi16(p2, p0); v1 = _mm_packus_epi16(p3, p1); v2 = _mm_packus_epi16(tv2, tv0); v3 = _mm_packus_epi16(tv3, tv1); + + p0 = v0; + p1 = v1; + p2 = v2; + p3 = v3; si = _mm_and_si128(v0, mask1); p0 = _mm_shuffle_epi8(tables[6][0], si); @@ -1818,18 +2135,18 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); tv0 = _mm_unpackhi_epi8(p1, p3); tv1 = _mm_unpackhi_epi8(p0, p2); 
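/* These unpackhi/unpacklo pairs undo the earlier byte-splitting packs: the products were computed one byte lane per register so that _mm_shuffle_epi8 could perform sixteen 4-bit lookups at a time, and the interleaves that follow restore four registers of correctly ordered 32-bit words ahead of the aligned stores. */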
tv2 = _mm_unpacklo_epi8(p1, p3); tv3 = _mm_unpacklo_epi8(p0, p2); - + p0 = _mm_unpackhi_epi8(tv1, tv0); p1 = _mm_unpacklo_epi8(tv1, tv0); p2 = _mm_unpackhi_epi8(tv3, tv2); p3 = _mm_unpacklo_epi8(tv3, tv2); - + _mm_store_si128((__m128i *) d32, p0); _mm_store_si128((__m128i *) (d32+4), p1); _mm_store_si128((__m128i *) (d32+8), p2); @@ -1848,19 +2165,50 @@ int gf_w32_split_init(gf_t *gf) gf_internal_t *h; struct gf_split_2_32_lazy_data *ld2; struct gf_split_4_32_lazy_data *ld4; - struct gf_split_8_8_data *d8; + struct gf_w32_split_8_8_data *d8; struct gf_split_8_32_lazy_data *d32; struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; - int i, j, exp; + int i, j, exp, ispclmul, issse3; + + ispclmul = 0; +#ifdef INTEL_SSE4_PCLMUL + ispclmul = 1; +#endif + + issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif h = (gf_internal_t *) gf->scratch; /* Defaults */ - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; + gf->inverse.w32 = gf_w32_euclid; + /* JSP: First handle single multiplication: + If args == 8, then we're doing split 8 8. + Otherwise, if PCLMUL, we use that. + Otherwise, we use bytwo_p. + */ + + if (h->arg1 == 8 && h->arg2 == 8) { + gf->multiply.w32 = gf_w32_split_8_8_multiply; + } else if (ispclmul) { + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + } else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + } else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + } + } else { + gf->multiply.w32 = gf_w32_bytwo_p_multiply; + } + + /* Easy cases: 16/32 and 2/32 */ + if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { d16 = (struct gf_split_16_32_lazy_data *) h->private; d16->last_value = 0; @@ -1868,15 +2216,51 @@ int gf_w32_split_init(gf_t *gf) return 1; } - if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8)) { + if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { + ld2 = (struct gf_split_2_32_lazy_data *) h->private; + ld2->last_value = 0; + #ifdef INTEL_SSSE3 + if (!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + #else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) return 0; + #endif + return 1; + } + + /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ + + if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || + (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ld4 = (struct gf_split_4_32_lazy_data *) h->private; + ld4->last_value = 0; + if ((h->region_type & GF_REGION_NOSSE) || !issse3) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; + } else { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; + } + return 1; + } + + /* 8/32 or Default + no SSE */ + + if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || + h->mult_type == GF_MULT_DEFAULT) { d32 = (struct gf_split_8_32_lazy_data *) h->private; d32->last_value = 0; gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; return 1; } + /* Finally, if args == 8, then we have to set up the tables here. 
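That block re-selects the 8/8 handlers and then fills the seven 256x256 product tables of gf_w32_split_8_8_data, one table for each byte-offset combination of the two four-byte operands.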
*/ + if (h->arg1 == 8 && h->arg2 == 8) { - d8 = (struct gf_split_8_8_data *) h->private; + d8 = (struct gf_w32_split_8_8_data *) h->private; d8->last_value = 0; gf->multiply.w32 = gf_w32_split_8_8_multiply; gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; @@ -1908,31 +2292,10 @@ int gf_w32_split_init(gf_t *gf) } return 1; } - if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { - ld2 = (struct gf_split_2_32_lazy_data *) h->private; - ld2->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - } - return 1; - } - if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) { - ld4 = (struct gf_split_4_32_lazy_data *) h->private; - ld4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; - } - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } - return 1; - } - return 1; + + /* If we get here, then the arguments were bad. */ + + return 0; } static @@ -1943,13 +2306,8 @@ int gf_w32_group_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; int g_r, g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gd->shift = (uint32_t *) (&(gd->memory)); @@ -1983,11 +2341,6 @@ int gf_w32_group_init(gf_t *gf) } else { gf->multiply.w32 = gf_w32_group_multiply; gf->multiply_region.w32 = gf_w32_group_multiply_region; - if (h->mult_type == GF_MULT_DEFAULT) { -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; -#endif - } } gf->divide.w32 = NULL; gf->inverse.w32 = gf_w32_euclid; @@ -1995,44 +2348,6 @@ int gf_w32_group_init(gf_t *gf) return 1; } -static -uint32_t -gf_w32_composite_multiply_logtable(gf_t *gf, uint32_t a, uint32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w16_logtable_data * ltd = (struct gf_w16_logtable_data *) h->private; - - uint32_t b0 = b & 0xffff; - uint32_t b1 = b >> 16; - uint32_t a0 = a & 0xffff; - uint32_t a1 = a >> 16; - uint32_t a1b1; - uint32_t la0, la1, lb0, lb1, l11; - uint32_t p; - - la0 = ltd->log_tbl[a0]; - la1 = ltd->log_tbl[a1]; - lb0 = ltd->log_tbl[b0]; - lb1 = ltd->log_tbl[b1]; - - if (a1 && b1) { - l11 = (la1 + lb1); - a1b1 = ltd->antilog_tbl[l11]; - l11 = ltd->log_tbl[a1b1]; - p = ltd->antilog_tbl[l11+ltd->log_s]; - } else { - a1b1 = 0; - p = 0; - } - - if (a0 && b1) p ^= ltd->antilog_tbl[la0+lb1]; - - if (a1 && b0) p ^= ltd->antilog_tbl[la1+lb0]; - p <<= 16; - p ^= a1b1; - if (a0 && b0) p ^= ltd->antilog_tbl[la0+lb0]; - return p; -} static uint32_t @@ -2040,19 +2355,48 @@ gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - uint16_t b0 = b & 0x0000ffff; - uint16_t b1 = (b & 0xffff0000) >> 16; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t a1b1; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = (b & 0xffff0000) >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = (a & 0xffff0000) >> 16; + uint32_t a1b1; uint32_t rv; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - rv = 
((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); + rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1); return rv; } +/* JSP: This could be made faster. Someday, when I'm bored. */ + +static +uint32_t +gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = b >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = a >> 16; + uint32_t a1b1, prod; + uint16_t *log, *alog; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + return prod; +} + /* * Composite field division trick (explained in 2007 tech report) * @@ -2075,6 +2419,7 @@ gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) * * a / b = a * c */ + static uint32_t gf_w32_composite_inverse(gf_t *gf, uint32_t a) @@ -2089,7 +2434,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_16_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -2100,7 +2445,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_16_2); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -2114,115 +2459,89 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) return c; } -static -uint32_t -gf_w32_composite_divide(gf_t *gf, uint32_t a, uint32_t b) -{ - uint32_t binv; - - binv = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, binv); -} - -/* JSP: I'm not using this because I don't think it has value added. 
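(The removed inline variant below was also flagged as buggy where gf_w32_composite_init used to reference it, and the surviving region routine now reads the base field's log/antilog tables directly whenever they are available.)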
*/ -static -void -gf_w32_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - } - } -} - static void gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; + uint32_t b0 = val & 0x0000ffff; + uint32_t b1 = (val & 0xffff0000) >> 16; uint32_t *s32, *d32, *top; - uint16_t a0, a1, a1b1; + uint16_t a0, a1, a1b1, *log, *alog; + uint32_t prod; gf_region_data rd; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - s32 = rd.s_start; d32 = rd.d_start; top = rd.d_top; - if (xor) { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - s32++; - d32++; + if (log == NULL) { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + 
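/* Composite product, spelled out: with a = a1*X + a0 and b = b1*X + b0 over the 16-bit base field, and X^2 reduced via X^2 = s*X + 1 where s is h->prim_poly, (a1*X + a0)*(b1*X + b0) = (a1b0 ^ a0b1 ^ s*a1b1)*X + (a0b0 ^ a1b1), which is exactly how the high and low halves of *d32 are assembled below. */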
+ *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } } } else { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); - *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - s32++; - d32++; + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + *d32 ^= prod; + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + + *d32 = prod; + s32++; + d32++; + } } } } @@ -2259,7 +2578,7 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -2267,143 +2586,92 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v static int gf_w32_composite_init(gf_t *gf) { - struct gf_w16_logtable_data *ltd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch; - uint32_t a, b; - uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly; - int i; + struct gf_w32_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w32_composite_data *) h->private; + cd->log = gf_w16_get_log_table(h->base_gf); + cd->alog = gf_w16_get_mult_alog_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; - } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_LOG_TABLE && - base_h->arg1 == 0) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; -/* It would be this, were that not buggy and I cared: - gf->multiply_region.w32 = gf_w32_composite_multiply_region_inline; */ } else { gf->multiply_region.w32 = gf_w32_composite_multiply_region; } - if (h->arg2 == 0) { - ltd = (struct gf_w16_logtable_data *) h->private; - - ltd->log_tbl[0] = 0; - - bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); - - ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]); - - b = 1; - for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) { - ltd->log_tbl[b] = (uint16_t)i; - ltd->antilog_tbl[i] = (uint16_t)b; - 
ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (uint16_t)b; - b <<= 1; - if (b & GF_BASE_FIELD_SIZE) { - b = b ^ prim_poly; - } - } - ltd->log_s = ltd->log_tbl[GF_S_GF_16_2]; - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_BASE_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]]; - } - gf->multiply.w32 = gf_w32_composite_multiply_logtable; - } else { + if (cd->log == NULL) { gf->multiply.w32 = gf_w32_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w32_composite_multiply_inline; } - - gf->divide.w32 = gf_w32_composite_divide; + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w32_composite_inverse; return 1; } + + int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss, sa; + int ss; + int issse3 = 0; ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif switch(mult_type) { case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } - return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data); + return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64; break; - case GF_MULT_DEFAULT: case GF_MULT_GROUP: - if (mult_type == GF_MULT_DEFAULT) { - arg1 = 3; - arg2 = 8; - } - if (arg1 <= 0 || arg2 <= 0) return -1; - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) + sizeof(uint32_t) * (1 << arg1) + sizeof(uint32_t) * (1 << arg2) + 64; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8){ - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64; } if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64; } - if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; - } if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type | ss) != ss) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } - if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } + if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || + (mult_type == GF_MULT_DEFAULT && !issse3)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } - return -1; + if ((arg1 == 4 && arg2 == 32) || + (arg2 
== 4 && arg1 == 32) || + mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; + } + return 0; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if (arg1 == 2 && arg2 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } else if (arg1 == 2 && arg2 == 1) { - return sizeof(gf_internal_t) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64; + break; default: - return -1; + return 0; } + return 0; } int gf_w32_init(gf_t *gf) @@ -2411,22 +2679,43 @@ int gf_w32_init(gf_t *gf) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x400007; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + + /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/ + + /* h->prim_poly = 0xc5; */ + + /* Allen: The following is the traditional primitive polynomial for GF(2^32) */ + + h->prim_poly = 0x400007; + } + } + + /* No leading one */ + + if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; + gf->multiply.w32 = NULL; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; gf->multiply_region.w32 = NULL; switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; - default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { diff --git a/gf_w4.c b/gf_w4.c index 1175e01..50f00da 100644 --- a/gf_w4.c +++ b/gf_w4.c @@ -100,7 +100,6 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) y_im1 = 0; while (e_i != 1) { - e_ip1 = e_im1; d_ip1 = d_im1; c_i = 0; @@ -108,6 +107,7 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -146,6 +146,110 @@ gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b) return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly); } + +static +inline +gf_val_32_t +gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint8_t product, i, pp; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +/* Ben: This function works, but it is 33% slower than the 
normal shift mult */ + +static +inline +gf_val_32_t +gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0); + b = _mm_insert_epi32 (a, b4, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only + have to do the reduction only once, because (w-2)/z == 1. Where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_epi64 shifts the result to the right by 4 bits. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +void +gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + /* ------------------------------------------------------------ IMPLEMENTATION: LOG_TABLE: @@ -220,18 +324,28 @@ int gf_w4_log_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; ltd = h->private; - ltd->log_tbl[0] = 0; + for (i = 0; i < GF_FIELD_SIZE; i++) + ltd->log_tbl[i]=0; ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1); b = 1; - for (i = 0; i < GF_FIELD_SIZE-1; i++) { - ltd->log_tbl[b] = i; - ltd->antilog_tbl[i] = b; - ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } + i = 0; + do { + if (ltd->log_tbl[b] != 0 && i != 0) { + fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n"); + return 0; + } + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; + b <<= 1; + i++; + if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; + } while (b != 1); + + if (i != GF_FIELD_SIZE - 1) { + _gf_errno = GF_E_LOGPOLY; + return 0; } gf->inverse.w32 = gf_w4_inverse_from_divide; @@ -300,7 +414,7 @@ static void gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_region_data rd; uint8_t *base, *sptr, *dptr, *top; __m128i tl, loset, h4, r, va, th; @@ -351,37 +465,17 @@ int gf_w4_single_table_init(gf_t *gf) gf_internal_t *h; struct gf_single_table_data *std; int a, b, prod, loga, logb; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t 
antilog_tbl[GF_FIELD_SIZE*2]; - int sse; - sse = 0; -#ifdef INTEL_SSE4 - sse = 1; -#endif h = (gf_internal_t *) gf->scratch; std = (struct gf_single_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); std->mult[a][b] = prod; std->div[prod][b] = a; } @@ -390,11 +484,16 @@ int gf_w4_single_table_init(gf_t *gf) gf->inverse.w32 = NULL; gf->divide.w32 = gf_w4_single_table_divide; gf->multiply.w32 = gf_w4_single_table_multiply; - if ((h->region_type & GF_REGION_SSE) || (h->mult_type == GF_MULT_DEFAULT && sse)) { - gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; - } else { + #ifdef INTEL_SSSE3 + if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) + gf->multiply_region.w32 = gf_w4_single_table_multiply_region; + else + gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - } + if (h->region_type & GF_REGION_SSE) return 0; + #endif + return 1; } @@ -458,32 +557,17 @@ int gf_w4_double_table_init(gf_t *gf) gf_internal_t *h; struct gf_double_table_data *std; int a, b, c, prod, loga, logb, ab; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE*2]; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_double_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); mult[a][b] = prod; std->div[prod][b] = a; } @@ -600,32 +684,17 @@ int gf_w4_quad_table_init(gf_t *gf) gf_internal_t *h; struct gf_quad_table_data *std; int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE*2]; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_quad_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); mult[a][b] = prod; std->div[prod][b] = a; } @@ -702,13 +771,18 @@ int gf_w4_table_init(gf_t *gf) { int rt; gf_internal_t *h; + int issse3 = 0; + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif h = (gf_internal_t *) gf->scratch; rt = (h->region_type); - if (rt == 0 || rt == 
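/* The INTEL_SSSE3 gate above is deliberate: the table region code is built
   on _mm_shuffle_epi8 (PSHUFB), an SSSE3 instruction that treats a 16-byte
   register as a 16-entry lookup table, i.e. 16 GF(16) lookups per
   instruction.  A minimal model of the per-vector step, assuming tbl holds
   the products of val with the nibbles 0..15 (illustrative, not the exact
   library code):

     __m128i lookup32_nibbles(__m128i v, __m128i tbl)
     {
       __m128i mask = _mm_set1_epi8(0x0f);
       __m128i lo = _mm_and_si128(v, mask);
       __m128i hi = _mm_and_si128(_mm_srli_epi64(v, 4), mask);
       return _mm_or_si128(_mm_shuffle_epi8(tbl, lo),
                           _mm_slli_epi64(_mm_shuffle_epi8(tbl, hi), 4));
     }

   Each byte carries two GF(16) symbols, so both nibbles are translated and
   repacked; SSE2 alone has no byte shuffle, hence the SSSE3 requirement. */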
GF_REGION_CAUCHY) rt |= GF_REGION_SINGLE_TABLE; - if (rt & GF_REGION_SINGLE_TABLE) { - return gf_w4_single_table_init(gf); - } else if (rt & GF_REGION_DOUBLE_TABLE) { + + if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; + + if (rt & GF_REGION_DOUBLE_TABLE) { return gf_w4_double_table_init(gf); } else if (rt & GF_REGION_QUAD_TABLE) { if (rt & GF_REGION_LAZY) { @@ -717,7 +791,9 @@ int gf_w4_table_init(gf_t *gf) return gf_w4_quad_table_init(gf); } return gf_w4_double_table_init(gf); - } + } else { + return gf_w4_single_table_init(gf); + } return 0; } @@ -842,7 +918,7 @@ static void gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; @@ -895,7 +971,7 @@ static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -960,7 +1036,7 @@ static void gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -986,7 +1062,7 @@ static void gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1014,7 +1090,7 @@ static void gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1041,7 +1117,7 @@ static void gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1071,7 +1147,7 @@ static void gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1099,7 +1175,7 @@ static void gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1127,7 +1203,7 @@ static void gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1156,7 +1232,7 @@ static void gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1185,7 +1261,7 @@ static void gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1215,7 +1291,7 @@ static void gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1245,7 +1321,7 @@ static void gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1274,7 +1350,7 @@ static void gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct 
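/* Every _region_2 .. _region_7 specialization above unrolls the same
   scalar recurrence, shown here for reference (a sketch; pp is the
   polynomial with its leading term stripped, high_bit the top bit of the
   word):

     uint8_t bytwo_b_model(uint8_t a, uint8_t b, uint8_t pp, uint8_t high_bit)
     {
       uint8_t prod = 0;
       while (1) {
         if (a & 1) prod ^= b;
         a >>= 1;
         if (a == 0) return prod;
         b = (b & high_bit) ? (uint8_t) ((b << 1) ^ pp) : (uint8_t) (b << 1);
       }
     }

   For a small constant val the loop collapses into a fixed number of
   doublings and xors, which is exactly what the unrolled SSE2 bodies do on
   sixteen bytes (thirty-two nibbles) at a time. */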
gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1303,7 +1379,7 @@ static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -1853,114 +1929,107 @@ int gf_w4_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w4_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - } + if (h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w4_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - } + if (h->region_type & GF_REGION_SSE) + return 0; + #endif } - gf->inverse.w32 = gf_w4_euclid; return 1; } -/* ------------------------------------------------------------ - JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ - -static -inline -gf_val_32_t -gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +static +int gf_w4_cfm_init(gf_t *gf) { - uint8_t product, i, pp; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (1 << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { - if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; +#ifdef INTEL_SSE4_PCLMUL + gf->multiply.w32 = gf_w4_clm_multiply; + return 1; +#endif + return 0; } static int gf_w4_shift_init(gf_t *gf) { gf->multiply.w32 = gf_w4_shift_multiply; - gf->inverse.w32 = gf_w4_euclid; return 1; } +/* JSP: I'm putting all error-checking into gf_error_check(), so you don't + have to do error checking in scratch_size or in init */ + int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int region_tbl_size; - int sss; int ss; + int issse3 = 0; - sss = (GF_REGION_SINGLE_TABLE | GF_REGION_SSE | GF_REGION_NOSSE); - ss = (GF_REGION_SSE | GF_REGION_NOSSE); +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif switch(mult_type) { case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data); break; case GF_MULT_DEFAULT: case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) { + if (region_type == GF_REGION_CAUCHY) { return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - if (mult_type == GF_MULT_DEFAULT || region_type == 0) 
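/* The construction repeated in these init routines is the revised flag
   convention: pick the SSE region op unless GF_REGION_NOSSE is set, and if
   the binary was built without the needed instruction set, an explicit
   GF_REGION_SSE request fails cleanly.  Schematically:

     #ifdef INTEL_SSE2
       if (h->region_type & GF_REGION_NOSSE)
         gf->multiply_region.w32 = the_nosse_multiply_region;
       else
         gf->multiply_region.w32 = the_sse_multiply_region;
     #else
       gf->multiply_region.w32 = the_nosse_multiply_region;
       if (h->region_type & GF_REGION_SSE) return 0;
     #endif

   Per JSP's note above, argument validation otherwise moves into one place
   (gf_error_check) instead of being duplicated in every scratch_size. */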
region_type = GF_REGION_SINGLE_TABLE; - if (region_type & GF_REGION_SINGLE_TABLE) { - if ((region_type | sss) != sss) return -1; - if ((region_type & sss) == sss) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; - } else if (region_type & GF_REGION_DOUBLE_TABLE) { - if (region_type != GF_REGION_DOUBLE_TABLE) return -1; + + if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; + + if (region_type & GF_REGION_DOUBLE_TABLE) { return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; } else if (region_type & GF_REGION_QUAD_TABLE) { - if ((region_type | GF_REGION_LAZY) != (GF_REGION_QUAD_TABLE | GF_REGION_LAZY)) return -1; if ((region_type & GF_REGION_LAZY) == 0) { return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64; } else { return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64; } + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - return -1; break; + case GF_MULT_LOG_TABLE: - if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1; return sizeof(gf_internal_t); break; default: - return -1; + return 0; } + return 0; } int @@ -1970,7 +2039,7 @@ gf_w4_init (gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->prim_poly == 0) h->prim_poly = 0x13; - + h->prim_poly |= 0x10; gf->multiply.w32 = NULL; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; @@ -1978,13 +2047,13 @@ gf_w4_init (gf_t *gf) gf->extract_word.w32 = gf_w4_extract_word; switch(h->mult_type) { - case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - if (gf_w4_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w4_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: - case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; + case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; default: return 0; } @@ -1996,17 +2065,22 @@ gf_w4_init (gf_t *gf) gf->inverse.w32 = gf_w4_matrix; } - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w4_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w4_inverse_from_divide; - } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide; if (h->region_type == GF_REGION_CAUCHY) { gf->multiply_region.w32 = gf_wgen_cauchy_region; gf->extract_word.w32 = gf_wgen_extract_word; } + + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w4_multiply_region_from_single; + } + return 1; } diff --git a/gf_w64.c b/gf_w64.c index 95100f4..12ec5af 100644 --- a/gf_w64.c +++ b/gf_w64.c @@ -9,18 +9,12 @@ #include #define GF_FIELD_WIDTH (64) -#define GF_FIRST_BIT (1L << 63) +#define GF_FIRST_BIT (1ULL << 63) #define GF_BASE_FIELD_WIDTH (32) -#define GF_BASE_FIELD_SIZE (1L << 
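/* With the fallbacks above, a successful gf_w4_init() always leaves a full
   set of pointers: divide falls back to multiply-by-inverse, inverse to
   Euclid (or to divide), and multiply_region to the symbol-at-a-time loop.
   A caller can therefore rely on, for instance (initialization elided):

     gf_t gf;
     uint32_t c;
     c = gf.multiply.w32(&gf, 7, 10);
     c = gf.divide.w32(&gf, c, 10);

   Under the default polynomial 0x13 the first call yields 3 and the second
   recovers the original 7, with no NULL checks needed on either pointer. */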
GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) #define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -// 10000587 is a valid s for 2^16^2 -#define GF_S_GF_16_2_2 (1000587) - -// 1000012 is a valid s for 2^32 -#define GF_S_GF_32_2 (1000012) - struct gf_w64_group_data { uint64_t *reduce; uint64_t *shift; @@ -46,10 +40,6 @@ struct gf_split_8_8_data { uint64_t tables[15][256][256]; }; -typedef struct w64_composite_int_s { - uint64_t s; // 's' will be different depending on the base field -} w64_composite_int_t; - static inline gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a) @@ -79,6 +69,9 @@ xor) s64 = (gf_val_64_t *) src; d64 = (gf_val_64_t *) dest; + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + if (xor) { for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { d64[i] ^= gf->multiply.w64(gf, val, s64[i]); @@ -91,7 +84,186 @@ xor) } static -inline +void +gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + int i, size; + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i v, w; + __m128i m1, m2, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m2 = _mm_slli_si128(m1, 4); + m2 = _mm_or_si128(m1, m2); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + size = bytes/sizeof(gf_val_64_t); + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + 
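/* The _region_from_single_2 routine above (and the _4 variant that
   follows) processes two 64-bit symbols per 128-bit load: the clmul
   immediate selects which source half is multiplied, each half is reduced
   separately, and _mm_unpacklo_epi64 stitches the two reduced results back
   into one store.  In outline (reduce() stands for the mask-and-fold
   sequence spelled out above; it is not a real helper in this file):

     high = reduce(_mm_clmulepi64_si128(a, b, 1));
     low  = reduce(_mm_clmulepi64_si128(a, b, 0));
     out  = _mm_unpacklo_epi64(low, high);

   The m3/m4 masks are what isolate the 32-bit chunks being folded. */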
gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + int i, size; + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + size = bytes/sizeof(gf_val_64_t); + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static + inline gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) { gf_val_64_t e_i, e_im1, e_ip1; @@ -118,6 +290,7 @@ gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) c_i ^= (one << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); d_ip1--; + if (e_ip1 == 0) return 0; while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; } @@ -149,31 +322,41 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) h = (gf_internal_t *) gf->scratch; ppr = h->prim_poly; - ppl = 1; + /* Allen: set leading one of primitive polynomial */ + + ppl = 1; + a = a64; bl = 0; br = b64; one = 1; lbit = (one << 63); - pl = 0; - pr = 0; + pl = 0; /* Allen: left side of product */ + pr = 0; /* Allen: right side of product */ + /* Allen: unlike the corresponding functions for smaller word sizes, + * this loop carries out the initial carryless multiply by + * shifting b itself rather than simply looking at successively + * higher shifts of b */ + for (i = 0; i < 
GF_FIELD_WIDTH; i++) { if (a & (one << i)) { pl ^= bl; pr ^= br; } - /* printf("P: %016llx %016llx ", pl, pr); printf("B: %016llx %016llx\n", bl, br); */ + bl <<= 1; if (br & lbit) bl ^= 1; br <<= 1; } - one = lbit; - ppl = ((h->prim_poly >> 1) | lbit); - ppr = lbit; + /* Allen: the name of the variable "one" is no longer descriptive at this point */ + + one = lbit >> 1; + ppl = (h->prim_poly >> 2) | one; + ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); while (one != 0) { if (pl & one) { pl ^= ppl; @@ -190,12 +373,16 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) /* * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. */ + static inline gf_val_64_t -gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { -#ifdef INTEL_PCLMUL + gf_val_64_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; __m128i result; __m128i prim_poly; @@ -206,10 +393,17 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) b = _mm_insert_epi64 (a, b64, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); /* Do the initial multiply */ + result = _mm_clmulepi64_si128 (a, b, 0); + /* Mask off the high order 32 bits using subtraction of the polynomial. * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. */ + + /* Adam: We cant include the leading one in the 64 bit pclmul, + so we need to split up the high 8 bytes of the result into two + parts before we multiply them with the prim_poly.*/ + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); @@ -217,47 +411,64 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); - return ((gf_val_64_t)_mm_extract_epi64(result, 0)); + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); #endif + return rv; } - -#ifdef INTEL_PCLMUL + +static inline -__m128i -gf_w64_clm_multiply_single (__m128i v, __m128i b, __m128i pp_l, __m128i pp_h) +gf_val_64_t +gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { - __m128i r0, r1, c0, c1, w0, w1; + gf_val_64_t rv = 0; - r0 = _mm_clmulepi64_si128 (b, v, 0); - c0 = _mm_srli_si128 (r0, 12); - w0 = _mm_clmulepi64_si128 (pp_h, c0, 0); - r0 = _mm_xor_si128 (r0, w0); - c0 = _mm_srli_si128 (_mm_slli_si128 (r0, 4), 12); - w0 = _mm_clmulepi64_si128 (pp_l, c0, 0); - r0 = _mm_insert_epi64 (_mm_xor_si128 (r0, w0), 0, 1); +#ifdef INTEL_SSE4_PCLMUL - r1 = _mm_clmulepi64_si128 (b, v, 1); - c1 = _mm_srli_si128 (r1, 12); - w1 = _mm_clmulepi64_si128 (pp_h, c1, 0); - r1 = _mm_xor_si128 (r1, w1); - c1 = _mm_srli_si128 (_mm_slli_si128 (r1, 4), 12); - w1 = _mm_clmulepi64_si128 (pp_l, c1, 0); - r1 = _mm_slli_si128 (_mm_xor_si128 (r1, w1), 8); + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; - return (_mm_xor_si128 (r0, r1)); + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = 
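/* A compact model of the shift multiply above, all variables uint64_t:
   (bl:br) is b shifted left i places, and (pl:pr) accumulates the 128-bit
   carryless product:

     pl = 0; pr = 0; bl = 0; br = b;
     for (i = 0; i < 64; i++) {
       if (a & (1ULL << i)) { pl ^= bl; pr ^= br; }
       bl = (bl << 1) | (br >> 63);
       br <<= 1;
     }

   The corrected reduction constants follow from the product's true degree:
   the highest possible set bit is 2w-2 = 126, not 127, so the scan starts
   at one = lbit >> 1 (bit 126 overall), ppl carries the polynomial aligned
   to that bit, and ppr = prim_poly << (w-2) holds the coefficients that
   fall off the right end of ppl. */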
_mm_xor_si128 (result, w); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); +#endif + return rv; } -#endif -void + void gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_PCLMUL +#ifdef INTEL_SSE4_PCLMUL gf_internal_t *h; - int i, top; - uint8_t *s8, *d8; + int i, j, k; + uint8_t *s8, *d8, *dtop; + uint64_t *s64, *d64; gf_region_data rd; - __m128i v, b, xv, pp_l, pp_h, final; + __m128i v, b, m, prim_poly, c, fr, w, result; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -269,25 +480,67 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top - (uint8_t *)rd.d_start; + dtop = (uint8_t *) rd.d_top; v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); - pp_l = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - pp_h = _mm_slli_si128 (pp_l, 4); + m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); if (xor) { - for (i = 0; i < top; i += 16) { - b = _mm_load_si128((__m128i *) (s8 + i)); - final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h); - xv = _mm_load_si128((__m128i *) (d8 + i)); - final = _mm_xor_si128 (final, xv); - _mm_store_si128((__m128i *) (d8 + i), final); + while (d8 != dtop) { + s64 = (uint64_t *) s8; + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + result = _mm_load_si128((__m128i *) d8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; } } else { - for (i = 0; i < top; i += 16) { - b = _mm_load_si128((__m128i *) (s8 + i)); - final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h); - _mm_store_si128((__m128i *) (d8 + i), final); + while (d8 < dtop) { + s64 = (uint64_t *) s8; + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 
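/* The reduction pattern shared by these clm routines is repeated 32-bit
   folds of the identity x^64 = pp(x) (mod the full polynomial).  Writing
   the 128-bit product as h1*x^96 + h0*x^64 + lo with 32-bit h1 and h0:

     h1 * x^96 = (h1 * pp) * x^32
     h0 * x^64 =  h0 * pp

   If pp fits in 33 bits, h1*pp has degree at most 63, so two folds leave
   nothing at degree 64 or above and the low 64 bits are the answer; that
   is gf_w64_clm_multiply_2.  If pp only fits in 49 bits, a fold can push
   bits back above degree 63, and two rounds of two folds are needed:
   gf_w64_clm_multiply_4. */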
(_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; } } gf_do_final_region_alignment(&rd); @@ -486,18 +739,36 @@ int gf_w64_shift_init(gf_t *gf) { gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; - gf->multiply.w64 = gf_w64_shift_multiply; gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + return 1; +} -#ifdef INTEL_PCLMUL - if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply; - if (h->region_type != GF_REGION_NOSSE) gf->multiply_region.w64 = gf_w64_clm_multiply_region; +static +int gf_w64_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + gf->inverse.w64 = gf_w64_euclid; + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; #endif - return 1; + return 0; } static @@ -509,11 +780,7 @@ gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) uint64_t one = 1; int g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - } else { - g_s = h->arg1; - } + g_s = h->arg1; shift[0] = 0; for (i = 1; i < (1 << g_s); i <<= 1) { @@ -538,13 +805,8 @@ gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gf_w64_group_set_shift_tables(gd->shift, b, h); @@ -599,19 +861,18 @@ void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t v if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gd = (struct gf_w64_group_data *) h->private; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gf_w64_group_set_shift_tables(gd->shift, val, h); - for (i = 63; !(val & (1L << i)); i--) ; + for (i = 63; !(val & (1ULL << i)); i--) ; i += g_s; - if (i > 64) i = 64; /* i is the bit position of the first zero bit in any element of + + /* i is the bit position of the first zero bit in any element of gd->shift[] */ + + if (i > 64) i = 64; + fzb = i; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); @@ -770,13 +1031,8 @@ int gf_w64_group_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; int g_r, g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gd->shift = (uint64_t *) (&(gd->memory)); @@ -881,8 +1137,7 @@ gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) pp = h->prim_poly; prod = 0; - bmask = 0x80000000; - bmask <<= 32; + bmask = 0x8000000000000000ULL; while (1) { if (a & 1) prod ^= b; @@ -908,10 +1163,11 @@ gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) pp = h->prim_poly; prod = 0; - 
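/* GROUP multiplication in outline: gf_w64_group_set_shift_tables() fills
   shift[j] with the carryless product j(x)*b(x) for every g_s-bit j, and
   the multiply consumes a in g_s-bit slices.  A model of the main loop,
   assuming g_s divides 64 (one common arrangement; the real routine also
   folds overflow through the 2^g_r-entry reduce table as it goes, which is
   omitted here):

     p = 0;
     for (i = 64 - g_s; i >= 0; i -= g_s)
       p = (p << g_s) ^ shift[(a >> i) & ((1ULL << g_s) - 1)];

   The fzb ("first zero bit") computation in the region code above bounds
   how far the leftover bits in shift[] can reach, so the reduce step knows
   how many top bits to fold. */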
pmask = 0x80000000; - pmask <<= 32; - amask = 0x80000000; - amask <<= 32; + + /* changed from declare then shift to just declare.*/ + + pmask = 0x8000000000000000ULL; + amask = 0x8000000000000000ULL; while (amask != 0) { if (prod & pmask) { @@ -1052,7 +1308,7 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint64_t vrev, one64; @@ -1118,7 +1374,7 @@ static void gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint64_t one64, amask; uint8_t *d8, *s8, tb; @@ -1152,7 +1408,7 @@ static void gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint64_t one64, amask; uint8_t *d8, *s8, tb; @@ -1184,7 +1440,7 @@ static void gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint64_t itb, amask, one64; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1248,18 +1504,28 @@ int gf_w64_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w64 = gf_w64_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; - } else { - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - } + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w64 = gf_w64_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } gf->inverse.w64 = gf_w64_euclid; return 1; @@ -1277,12 +1543,11 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) uint32_t a0 = a & 0x00000000ffffffff; uint32_t a1 = (a & 0xffffffff00000000) >> 32; uint32_t a1b1; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); } /* @@ -1307,6 +1572,7 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) * * a / b = a * c */ + static gf_val_64_t gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) @@ -1318,11 +1584,10 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) uint32_t c0, c1, d, tmp; uint64_t c; uint32_t a0inv, a1inv; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, comp_int->s); + c0 = 
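/* The composite multiply above is plain degree-1 polynomial arithmetic
   over GF(2^32) with x^2 reduced via x^2 = s*x + 1, where s is now kept in
   h->prim_poly (previously the private w64_composite_int_t):

     (a1 x + a0)(b1 x + b0) = a1 b1 x^2 + (a1 b0 + a0 b1) x + a0 b0
                            = (a1 b0 + a0 b1 + s a1 b1) x + (a0 b0 + a1 b1)

   which is exactly the pair of 32-bit halves assembled above, with a1b1
   computed once and reused. */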
base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -1333,7 +1598,7 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ comp_int->s); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -1347,17 +1612,6 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) return c; } -static -gf_val_64_t -gf_w64_composite_divide(gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - gf_val_64_t binv; - - binv = gf_w64_composite_inverse(gf, b); - - return gf_w64_composite_multiply(gf, a, binv); -} - static void gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) @@ -1374,7 +1628,6 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va int num_syms = bytes / 8; int sym_divisible = bytes % 4; gf_region_data rd; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); @@ -1390,7 +1643,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } @@ -1401,7 +1654,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } @@ -1420,7 +1673,6 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_ uint8_t *dlow, *dhigh, *top; int sub_reg_size; gf_region_data rd; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (!xor) { memset(dest, 0, bytes); @@ -1440,7 +1692,7 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_ base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, comp_int->s, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1458,29 +1710,18 @@ int gf_w64_composite_init(gf_t *gf) gf->multiply_region.w64 = gf_w64_composite_multiply_region; } - if (h->base_gf != NULL) { - gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; - w64_composite_int_t *comp_int = 
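/* The a0 == 0 branch above can be verified directly.  For a = a1 x the
   code proposes c = c1 x + c0 with c1 = a1^-1 and c0 = s * a1^-1; then

     a * c = a1 c1 x^2 + a1 c0 x
           = a1 c1 (s x + 1) + a1 c0 x
           = (s a1 c1 + a1 c0) x + a1 c1

   and with a1 c1 = 1 the x coefficient is s + s = 0 while the constant
   term is 1, so a * c = 1 as required.  The general branch solves the
   analogous two-variable system when both halves are nonzero. */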
(w64_composite_int_t*)h->private; - - if (base_h->mult_type == GF_MULT_COMPOSITE) { - comp_int->s = GF_S_GF_16_2_2; - } else { - comp_int->s = GF_S_GF_32_2; - } - } - gf->multiply.w64 = gf_w64_composite_multiply; - gf->divide.w64 = gf_w64_composite_divide; + gf->divide.w64 = NULL; gf->inverse.w64 = gf_w64_composite_inverse; return 1; } static -void + void gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint64_t pp, v, s, *s64, *d64, *top; @@ -1494,7 +1735,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; - + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); gf_do_initial_region_alignment(&rd); @@ -1534,11 +1775,11 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des i = 0; for (k = 0; k < 8; k++) { v0 = _mm_load_si128((__m128i *) s64); + /* MM_PRINT8("v", v0); */ s64 += 2; si = _mm_and_si128(v0, mask1); - /* Happy now? */ for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } @@ -1551,6 +1792,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des i++; } for (i = 0; i < 8; i++) { + /* MM_PRINT8("v", p[i]); */ _mm_store_si128((__m128i *) d64, p[i]); d64 += 2; } @@ -1559,6 +1801,210 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des #endif } +static + void +gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4 + gf_internal_t *h; + int i, m, j, k, tindex; + uint64_t pp, v, s, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1, t2; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
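/* The sixteen lazy tables being filled here satisfy
   tables[i][j] = val * (j << 4i) in GF(2^64): entries are combined
   xor-wise from v, and v is doubled four times (one nibble position)
   between consecutive tables.  Given the tables, one 64-bit product is
   sixteen lookups, as in this scalar sketch:

     p = 0;
     for (i = 0; i < 16; i++)
       p ^= ld->tables[i][(s >> (4*i)) & 0xf];

   The SSE variants evaluate sixteen such products at once by giving each
   byte position its own register slice. */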
((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + mask16 = _mm_set1_epi32(0xffff); + + while (d64 != top) { + + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + + for (k = 0; k < 8; k++) { + st[k] = _mm_load_si128((__m128i *) s64); + s64 += 2; + } + + for (k = 0; k < 4; k ++) { + st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); + st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); + t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = _mm_srli_si128(st[k], 8); + st[k+4] = _mm_slli_si128(st[k+4], 8); + st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = t1; + } + +/* + printf("After pack pass 1\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + + t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); + st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); + st[0] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); + st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); + st[1] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); + st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); + st[4] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); + st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); + st[5] = t1; + +/* + printf("After pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); + st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); + st[0] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); + st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); + st[2] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); + st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); + st[4] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); + st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); + st[6] = t1; + +/* + printf("After final pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + */ + i = 0; + for (k = 0; k < 8; k++) { + si = _mm_and_si128(st[k], mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + st[k] = _mm_srli_epi32(st[k], 4); + si = _mm_and_si128(st[k], mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + + t1 = _mm_unpacklo_epi8(p[0], p[1]); + p[1] = _mm_unpackhi_epi8(p[0], p[1]); + p[0] = t1; + t1 = _mm_unpacklo_epi8(p[2], p[3]); + p[3] = _mm_unpackhi_epi8(p[2], p[3]); + p[2] = t1; + t1 = _mm_unpacklo_epi8(p[4], p[5]); + p[5] = _mm_unpackhi_epi8(p[4], p[5]); + p[4] = t1; + t1 = _mm_unpacklo_epi8(p[6], p[7]); + p[7] = _mm_unpackhi_epi8(p[6], p[7]); + p[6] = t1; + +/* + printf("After unpack pass 1:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi16(p[0], p[2]); + p[2] = _mm_unpackhi_epi16(p[0], p[2]); + p[0] = t1; + t1 = 
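/* The pack and unpack passes here only rearrange bytes: in ALTMAP layout
   the 128-byte chunk is stored "sliced", with byte k of every 64-bit word
   gathered into register k:

     slice[k] = { byte k of w0, byte k of w1, ..., byte k of w15 }

   In that form each source nibble drives one _mm_shuffle_epi8 per output
   slice, exactly as in the altmap routine above.  The standard-layout
   routine must therefore transpose on load (the pack passes) and transpose
   back on store (the unpack passes), which is the whole cost difference
   between the two mappings. */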
_mm_unpacklo_epi16(p[1], p[3]); + p[3] = _mm_unpackhi_epi16(p[1], p[3]); + p[1] = t1; + t1 = _mm_unpacklo_epi16(p[4], p[6]); + p[6] = _mm_unpackhi_epi16(p[4], p[6]); + p[4] = t1; + t1 = _mm_unpacklo_epi16(p[5], p[7]); + p[7] = _mm_unpackhi_epi16(p[5], p[7]); + p[5] = t1; + +/* + printf("After unpack pass 2:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi32(p[0], p[4]); + p[4] = _mm_unpackhi_epi32(p[0], p[4]); + p[0] = t1; + t1 = _mm_unpacklo_epi32(p[1], p[5]); + p[5] = _mm_unpackhi_epi32(p[1], p[5]); + p[1] = t1; + t1 = _mm_unpacklo_epi32(p[2], p[6]); + p[6] = _mm_unpackhi_epi32(p[2], p[6]); + p[2] = t1; + t1 = _mm_unpacklo_epi32(p[3], p[7]); + p[7] = _mm_unpackhi_epi32(p[3], p[7]); + p[3] = t1; + + if (xor) { + for (i = 0; i < 8; i++) { + t1 = _mm_load_si128((__m128i *) d64); + _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); + d64 += 2; + } + } else { + for (i = 0; i < 8; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + + } + + gf_do_final_region_alignment(&rd); +#endif +} + #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); static @@ -1575,27 +2021,72 @@ int gf_w64_split_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; /* Defaults */ + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; - gf->multiply.w64 = gf_w64_shift_multiply; + gf->multiply.w64 = gf_w64_bytwo_p_multiply; -#ifdef INTEL_PCLMUL - if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply; +#ifdef INTEL_SSE4_PCLMUL + if ((!(h->region_type & GF_REGION_NOSSE) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + }else{ + return 0; + } + } #endif gf->inverse.w64 = gf_w64_euclid; + /* Allen: set region pointers for default mult type. Single pointers are + * taken care of above (explicitly for sse, implicitly for no sse). 
*/ + +#ifdef INTEL_SSE4 + if (h->mult_type == GF_MULT_DEFAULT) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + } +#else + if (h->mult_type == GF_MULT_DEFAULT) { + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; + } +#endif + if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { + + if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0; + if(h->region_type & GF_REGION_ALTMAP) + { + #ifdef INTEL_SSSE3 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; - } else { -/* gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; */ - } - } else { - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + #else + return 0; + #endif + } + else //no altmap + { + #ifdef INTEL_SSE4 + if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } } if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { @@ -1611,7 +2102,9 @@ int gf_w64_split_init(gf_t *gf) if ((h->arg1 == 8 && h->arg2 == 8)) { d88 = (struct gf_split_8_8_data *) h->private; gf->multiply.w64 = gf_w64_split_8_8_multiply; + /* The performance of this guy sucks, so don't bother with a region op */ + basep = 1; for (exp = 0; exp < 15; exp++) { for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; @@ -1639,94 +2132,93 @@ int gf_w64_split_init(gf_t *gf) for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); } } - return -1; + return 1; } int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss, sa; + int issse4; - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - - if (divide_type == GF_DIVIDE_MATRIX) return -1; switch(mult_type) { case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_NOSSE && region_type != GF_REGION_SSE && region_type != GF_REGION_DEFAULT) return -1; + return sizeof(gf_internal_t); + break; + case GF_MULT_CARRY_FREE: return sizeof(gf_internal_t); break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } return sizeof(gf_internal_t); break; + case GF_MULT_DEFAULT: + + /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, + * then fall through to split table scratch size code. 
*/ + +#ifdef INTEL_SSE4 + issse4 = 1; + arg1 = 64; + arg2 = 4; +#else + issse4 = 0; + arg1 = 64; + arg2 = 8; +#endif + case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; } if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; } - if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)){ - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; - } + if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; } - return -1; - - case GF_MULT_DEFAULT: - arg1 = 4; - arg2 = 8; + return 0; case GF_MULT_GROUP: - if (arg1 <= 0 || arg2 <= 0) return -1; - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + sizeof(uint64_t) * (1 << arg1) + sizeof(uint64_t) * (1 << arg2) + 64; break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((arg1 == 2 && arg2 == 0) || (arg1 == 2 && arg2 == 1)) { - return sizeof(gf_internal_t) + sizeof(w64_composite_int_t) + 4; - } else { - return -1; - } + if (arg1 == 2) return sizeof(gf_internal_t) + 64; + return 0; break; default: - return -1; + return 0; } } int gf_w64_init(gf_t *gf) { - gf_internal_t *h; + gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x1b; /* Omitting the leftmost 1 as in w=32 */ + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + /* Omitting the leftmost 1 as in w=32 */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x1b; + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } gf->multiply.w64 = NULL; gf->divide.w64 = NULL; @@ -1734,10 +2226,11 @@ int gf_w64_init(gf_t *gf) gf->multiply_region.w64 = NULL; switch(h->mult_type) { - case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: + case 
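/* Taken together with the scratch-size fall-through above, GF_MULT_DEFAULT
   for w=64 now resolves to: multiply() via PCLMUL (clm_2 or clm_4, chosen
   by how many leading zero coefficients the polynomial has) when
   INTEL_SSE4_PCLMUL is compiled in, otherwise bytwo_p; and
   multiply_region() via SPLIT 64,4 with PSHUFB tables under INTEL_SSE4,
   otherwise SPLIT 64,8 byte tables.  That is why DEFAULT simply falls
   through to the SPLIT_TABLE case here. */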
GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; @@ -1748,11 +2241,6 @@ int gf_w64_init(gf_t *gf) gf->inverse.w64 = gf_w64_euclid; } -/* else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w64 = gf_w64_divide_from_inverse; - gf->inverse.w64 = gf_w64_matrix; - } */ - if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { gf->divide.w64 = gf_w64_divide_from_inverse; } @@ -1760,6 +2248,8 @@ int gf_w64_init(gf_t *gf) gf->inverse.w64 = gf_w64_inverse_from_divide; } + if (h->region_type == GF_REGION_CAUCHY) return 0; + if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w64 = gf_w64_composite_extract_word; diff --git a/gf_w8.c b/gf_w8.c index 306f911..45c500f 100644 --- a/gf_w8.c +++ b/gf_w8.c @@ -15,7 +15,6 @@ #define GF_BASE_FIELD_WIDTH (4) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_S_GF_4_2 (4) struct gf_w8_logtable_data { uint8_t log_tbl[GF_FIELD_SIZE]; @@ -37,6 +36,10 @@ struct gf_w8_logzero_small_table_data { uint8_t *div_tbl; }; +struct gf_w8_composite_data { + uint8_t *mult_table; +}; + /* Don't change the order of these relative to gf_w8_half_table_data */ struct gf_w8_default_data { @@ -139,6 +142,7 @@ uint32_t gf_w8_euclid (gf_t *gf, uint32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -164,6 +168,30 @@ gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index) return r8[index]; } +static +gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint8_t a, b; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r8 = (uint8_t *) start; + if (r8 + index < (uint8_t *) rd.d_start) return r8[index]; + if (r8 + index >= (uint8_t *) rd.d_top) return r8[index]; + index -= (((uint8_t *) rd.d_start) - r8); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 4)); +} + static inline uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) @@ -171,22 +199,372 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly); } -/* ------------------------------------------------------------ - IMPLEMENTATION: SHIFT: - - JSP: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ static inline -uint32_t +gf_val_32_t +gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. 
We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2, where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 1 byte. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result.
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + + +static +void +gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, 
_mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: SHIFT: + +JSP: The world's dumbest multiplication algorithm. I only +include it for completeness. It does have the feature that it requires no +extra memory. 
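For reference alongside the CLM kernels above and the SHIFT routine that follows, here is a minimal scalar sketch of both techniques for w = 8. It is a sketch, not the library's code: it assumes the full 9-bit AES polynomial 0x11b (x^8 + x^4 + x^3 + x + 1), picked only because 0x57 * 0x83 = 0xc1 is easy to check against FIPS-197. GF-Complete's default polynomial for w = 8 may differ, and the helper names are ours.

#include <stdint.h>
#include <stdio.h>

/* Carryless (polynomial) multiply: XOR of shifted copies, no reduction. */
static uint32_t clmul(uint32_t a, uint32_t b)
{
  int i;
  uint32_t r = 0;
  for (i = 0; i < 16; i++)
    if (a & (1u << i)) r ^= b << i;
  return r;
}

/* CLM flavor: one wide multiply, then fold the overflow (bits 8..14) back
   in by carrylessly multiplying it with the polynomial.  Two folds always
   suffice for w = 8, which is the guarantee Ben's comment describes. */
static uint8_t gf8_clm_mult(uint8_t a, uint8_t b, uint32_t poly)
{
  uint32_t r = clmul(a, b);   /* degree <= 14 */
  r ^= clmul(r >> 8, poly);   /* first fold: degree drops to <= 10 */
  r ^= clmul(r >> 8, poly);   /* second fold: degree drops below 8 */
  return (uint8_t) r;
}

/* SHIFT flavor: same schoolbook multiply, reduced bit by bit from the top.
   A product of two degree-7 polynomials has degree at most 14, so the
   reduction starts at bit 14 -- the bound this patch tightens in
   gf_w8_shift_multiply. */
static uint8_t gf8_shift_mult(uint8_t a, uint8_t b, uint16_t poly)
{
  int i;
  uint16_t prod = 0;
  for (i = 0; i < 8; i++)
    if (a & (1 << i)) prod ^= (uint16_t) (b << i);
  for (i = 14; i >= 8; i--)
    if (prod & (1 << i)) prod ^= (uint16_t) (poly << (i - 8));
  return (uint8_t) prod;
}

int main(void)
{
  printf("clm:   0x57 * 0x83 = 0x%02x\n", gf8_clm_mult(0x57, 0x83, 0x11b));
  printf("shift: 0x57 * 0x83 = 0x%02x\n", gf8_shift_mult(0x57, 0x83, 0x11b));
  return 0;  /* both lines print 0xc1 */
}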
+ */ + +static +inline + uint32_t gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) { uint16_t product, i, pp, a, b; gf_internal_t *h; - + a = a8; b = b8; h = (gf_internal_t *) gf->scratch; @@ -197,29 +575,55 @@ gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (1 << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; } +static +int gf_w8_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + +#ifdef INTEL_SSE4_PCLMUL + if ((0xe0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_2; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; + }else if ((0xc0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_3; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3; + }else if ((0x80 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_4; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4; + }else{ + return 0; + } + return 1; +#endif + + return 0; + +} + static int gf_w8_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w8_shift_multiply; - gf->inverse.w32 = gf_w8_euclid; +{ + gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */ return 1; } /* ------------------------------------------------------------ - IMPLEMENTATION: LOG_TABLE: +IMPLEMENTATION: LOG_TABLE: - JSP: Kevin wrote this, and I'm converting it to my structure. - */ +JSP: Kevin wrote this, and I'm converting it to my structure. +*/ static inline -uint32_t + uint32_t gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_table_data *ltd; @@ -230,7 +634,7 @@ gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_table_data *ltd; @@ -241,7 +645,7 @@ gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_small_table_data *std; @@ -253,7 +657,7 @@ gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_small_table_data *std; @@ -264,7 +668,7 @@ gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logtable_data *ltd; @@ -275,7 +679,7 @@ gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) { int log_sum = 0; @@ -289,7 +693,7 @@ gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) } static -uint32_t + uint32_t gf_w8_log_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logtable_data *ltd; @@ -299,7 +703,7 @@ gf_w8_log_inverse (gf_t *gf, uint32_t a) } static -uint32_t + uint32_t gf_w8_logzero_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logzero_table_data *ltd; @@ -309,7 +713,7 @@ gf_w8_logzero_inverse (gf_t *gf, uint32_t a) } static -uint32_t + uint32_t gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logzero_small_table_data *std; @@ -319,7 +723,7 @@ gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) } static -void + void gf_w8_log_multiply_region(gf_t *gf, void *src, void 
*dest, uint32_t val, int bytes, int xor) { int i; @@ -348,7 +752,7 @@ gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int byt } static -void + void gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; @@ -390,7 +794,7 @@ gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int } } -static + static int gf_w8_log_init(gf_t *gf) { gf_internal_t *h; @@ -400,13 +804,14 @@ int gf_w8_log_init(gf_t *gf) uint8_t *alt; uint8_t *inv; int i, b; + int check = 0; h = (gf_internal_t *) gf->scratch; - if (h->arg1 == 0) { + if (h->mult_type == GF_MULT_LOG_TABLE) { ltd = h->private; alt = ltd->antilog_tbl; inv = ltd->inv_tbl; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { std = h->private; alt = std->antilog_tbl; std->div_tbl = (alt + 255); @@ -418,10 +823,19 @@ int gf_w8_log_init(gf_t *gf) ztd->div_tbl = (alt + 255); inv = ztd->inv_tbl; } - - if (h->arg1 == 0) { + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) + ltd->log_tbl[i] = 0; + else if (h->mult_type == GF_MULT_LOG_ZERO) + std->log_tbl[i] = 0; + else + ztd->log_tbl[i] = 0; + } + + if (h->mult_type == GF_MULT_LOG_TABLE) { ltd->log_tbl[0] = 0; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { std->log_tbl[0] = 510; } else { ztd->log_tbl[0] = 512; @@ -429,23 +843,31 @@ int gf_w8_log_init(gf_t *gf) b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { - if (h->arg1 == 0) { - ltd->log_tbl[b] = i; - } else if (h->arg1 == 1) { - std->log_tbl[b] = i; - } else { - ztd->log_tbl[b] = i; - } - alt[i] = b; - alt[i+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } + if (h->mult_type == GF_MULT_LOG_TABLE) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + if (std->log_tbl[b] != 0) check = 1; + std->log_tbl[b] = i; + } else { + if (ztd->log_tbl[b] != 0) check = 1; + ztd->log_tbl[b] = i; + } + alt[i] = b; + alt[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + if (check) { + _gf_errno = GF_E_LOGPOLY; + return 0; } - if (h->arg1 == 1) bzero(alt+510, 255); - if (h->arg1 == 2) { + if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255); + + if (h->mult_type == GF_MULT_LOG_ZERO_EXT) { bzero(alt+512, 255); alt[512+512] = 0; } @@ -459,13 +881,13 @@ int gf_w8_log_init(gf_t *gf) if (i & (1 << 8)) i ^= h->prim_poly; b--; } while (i != 1); - - if (h->arg1 == 0) { + + if (h->mult_type == GF_MULT_LOG_TABLE) { gf->inverse.w32 = gf_w8_log_inverse; gf->divide.w32 = gf_w8_log_divide; gf->multiply.w32 = gf_w8_log_multiply; gf->multiply_region.w32 = gf_w8_log_multiply_region; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { gf->inverse.w32 = gf_w8_logzero_small_inverse; gf->divide.w32 = gf_w8_logzero_small_divide; gf->multiply.w32 = gf_w8_logzero_small_multiply; @@ -480,13 +902,13 @@ int gf_w8_log_init(gf_t *gf) } /* ------------------------------------------------------------ - IMPLEMENTATION: FULL_TABLE: +IMPLEMENTATION: FULL_TABLE: - JSP: Kevin wrote this, and I'm converting it to my structure. +JSP: Kevin wrote this, and I'm converting it to my structure. 
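Before the FULL_TABLE code: the log/antilog construction above is worth seeing in isolation. The antilog table is stored twice over so that a sum of two logs never needs a mod 255, and a repeated log entry during the walk is exactly the "polynomial is not primitive" condition that init now reports as GF_E_LOGPOLY. A stripped-down sketch, assuming the primitive polynomial 0x11d; the names are illustrative, not the library's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t log_tbl[256];
static uint8_t alog_tbl[512];  /* doubled: log_a + log_b <= 508, no mod 255 */

static int gf8_log_init(uint16_t prim_poly)
{
  int i;
  uint16_t b = 1;
  memset(log_tbl, 0, sizeof(log_tbl));       /* the patch adds the same zeroing */
  for (i = 0; i < 255; i++) {
    if (i > 0 && log_tbl[b] != 0) return 0;  /* revisited: x is not a generator */
    log_tbl[b] = (uint8_t) i;
    alog_tbl[i] = alog_tbl[i + 255] = (uint8_t) b;
    b <<= 1;                                 /* b = x^(i+1) ...                 */
    if (b & 0x100) b ^= prim_poly;           /* ... reduced mod prim_poly       */
  }
  return 1;
}

static uint8_t gf8_log_mult(uint8_t a, uint8_t b)
{
  if (a == 0 || b == 0) return 0;
  return alog_tbl[log_tbl[a] + log_tbl[b]];
}

int main(void)
{
  /* 0x11d is primitive, so the build succeeds.  0x11b (the AES polynomial)
     is irreducible but x only has order 51 in it, so the walk revisits an
     element early and the build is rejected -- the GF_E_LOGPOLY case. */
  printf("0x11d: %s\n", gf8_log_init(0x11d) ? "ok" : "rejected");
  printf("0x57 * 0x83 = 0x%02x\n", gf8_log_mult(0x57, 0x83));
  printf("0x11b: %s\n", gf8_log_init(0x11b) ? "ok" : "rejected");
  return 0;
}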
*/ static -gf_val_32_t + gf_val_32_t gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_single_table_data *ftd; @@ -496,7 +918,7 @@ gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_single_table_data *ftd; @@ -506,7 +928,7 @@ gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_default_data *ftd; @@ -516,7 +938,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_default_data *ftd; @@ -526,7 +948,7 @@ gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_data *ftd; @@ -536,7 +958,7 @@ gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_data *ftd; @@ -546,7 +968,7 @@ gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t *base; @@ -570,7 +992,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t base[(b << 8)| c] = (vb | vc); } } - + } else { dtd = (struct gf_w8_double_table_data *) h->private; base = &(dtd->mult[val][0]); @@ -583,7 +1005,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_lazy_data *ftd; @@ -593,7 +1015,7 @@ gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_lazy_data *ftd; @@ -603,7 +1025,7 @@ gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -628,11 +1050,12 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } } } + static -void + void gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint8_t *s8, *d8, *bh, *bl, *sptr, *dptr, *top; __m128i tbl, loset, t1, r, va, mth, mtl; uint64_t altable[4]; @@ -654,7 +1077,7 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val sptr = rd.s_start; dptr = rd.d_start; - + mth = _mm_loadu_si128 ((__m128i *)(bh)); mtl = _mm_loadu_si128 ((__m128i *)(bl)); loset = _mm_set1_epi8 (0x0f); @@ -693,11 +1116,11 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val /* ------------------------------------------------------------ - IMPLEMENTATION: FULL_TABLE: +IMPLEMENTATION: FULL_TABLE: */ static -gf_val_32_t + gf_val_32_t gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_half_table_data *htd; @@ -707,7 +1130,7 @@ gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_split_multiply_region(gf_t 
*gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { unsigned long uls, uld; @@ -735,12 +1158,12 @@ gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } -static + static int gf_w8_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w8_half_table_data *htd; - int a, b, c, d, pp; + int a, b, pp; h = (gf_internal_t *) gf->scratch; htd = (struct gf_w8_half_table_data *)h->private; @@ -748,34 +1171,34 @@ int gf_w8_split_init(gf_t *gf) bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); - - for (a = 1; a < GF_HALF_SIZE; a++) { - b = 1; - c = a; - d = (a << (GF_FIELD_WIDTH/2)); - do { - htd->low[b][a] = c; - htd->high[b][a] = d; - b <<= 1; - if (b & GF_FIELD_SIZE) b ^= pp; - c <<= 1; - if (c & GF_FIELD_SIZE) c ^= pp; - d <<= 1; - if (d & GF_FIELD_SIZE) d ^= pp; - } while (c != a); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_HALF_SIZE; b++) { + htd->low[a][b] = gf_w8_shift_multiply(gf,a,b); + htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4); + } } - gf->inverse.w32 = NULL; /* Will set from divide */ - gf->divide.w32 = NULL; /* Let the user figure it out. */ gf->multiply.w32 = gf_w8_split_multiply; - if (h->region_type == GF_REGION_NOSSE) { + + #ifdef INTEL_SSSE3 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w8_split_multiply_region; + else + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + #else gf->multiply_region.w32 = gf_w8_split_multiply_region; - } else { - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + return 1; } +/* JSP: This is disgusting, but it is what it is. If there is no SSE, + then the default is equivalent to single table. If there is SSE, then + we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. 
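split_init above now fills the two nibble tables directly from gf_w8_shift_multiply instead of walking logs, and selects the SSSE3 region routine only when the build flags allow it. The table layout itself is simple; a sketch, again assuming the 0x11b polynomial and our own helper names:

#include <stdint.h>
#include <stdio.h>

static uint8_t gf8_shift_mult(uint8_t a, uint8_t b, uint16_t poly)
{
  int i;
  uint16_t p = 0;
  for (i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t) (b << i);
  for (i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t) (poly << (i - 8));
  return (uint8_t) p;
}

/* low[a][n]  = a * n        (n = low nibble of the other operand)
   high[a][n] = a * (n << 4) (n = high nibble)                     */
static uint8_t low_tbl[256][16], high_tbl[256][16];

static void gf8_split_init(void)
{
  int a, n;
  for (a = 0; a < 256; a++)
    for (n = 0; n < 16; n++) {
      low_tbl[a][n]  = gf8_shift_mult((uint8_t) a, (uint8_t) n, 0x11b);
      high_tbl[a][n] = gf8_shift_mult((uint8_t) a, (uint8_t) (n << 4), 0x11b);
    }
}

/* Multiplication distributes over XOR, so a*b is two lookups and an XOR.
   The SSSE3 region routine performs sixteen such lookups per byte shuffle. */
static uint8_t gf8_split_mult(uint8_t a, uint8_t b)
{
  return low_tbl[a][b & 0x0f] ^ high_tbl[a][b >> 4];
}

int main(void)
{
  gf8_split_init();
  printf("0x57 * 0x83 = 0x%02x\n", gf8_split_mult(0x57, 0x83));  /* 0xc1 */
  return 0;
}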
*/ + static int gf_w8_table_init(gf_t *gf) { @@ -784,19 +1207,24 @@ int gf_w8_table_init(gf_t *gf) struct gf_w8_double_table_data *dtd = NULL; struct gf_w8_double_table_lazy_data *ltd = NULL; struct gf_w8_default_data *dd = NULL; - int a, b, c, prod, scase; + int a, b, c, prod, scase, issse; h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { + issse = 0; +#ifdef INTEL_SSSE3 + issse = 1; +#endif + + if (h->mult_type == GF_MULT_DEFAULT && issse) { dd = (struct gf_w8_default_data *)h->private; scase = 3; bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - } else if (h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY) || - (h->region_type & GF_REGION_SINGLE_TABLE)) { + } else if (h->mult_type == GF_MULT_DEFAULT || + h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) { ftd = (struct gf_w8_single_table_data *)h->private; bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); @@ -815,93 +1243,98 @@ int gf_w8_table_init(gf_t *gf) fprintf(stderr, "Internal error in gf_w8_table_init\n"); exit(0); } - + for (a = 1; a < GF_FIELD_SIZE; a++) { - b = 1; - prod = a; - do { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w8_shift_multiply(gf,a,b); switch (scase) { - case 0: - ftd->multtable[a][b] = prod; - ftd->divtable[prod][b] = a; - break; - case 1: - dtd->div[prod][b] = a; - for (c = 0; c < GF_FIELD_SIZE; c++) { - dtd->mult[a][(c<<8)|b] |= prod; - dtd->mult[a][(b<<8)|c] |= (prod<<8); - } - break; - case 2: - ltd->div[prod][b] = a; - ltd->smult[a][b] = prod; - break; - case 3: - dd->multtable[a][b] = prod; - dd->divtable[prod][b] = a; - if ((b & 0xf) == b) dd->low[a][b] = prod; - if ((b & 0xf0) == b) dd->high[a][b>>4] = prod; - break; + case 0: + ftd->multtable[a][b] = prod; + ftd->divtable[prod][b] = a; + break; + case 1: + dtd->div[prod][b] = a; + for (c = 0; c < GF_FIELD_SIZE; c++) { + dtd->mult[a][(c<<8)|b] |= prod; + dtd->mult[a][(b<<8)|c] |= (prod<<8); + } + break; + case 2: + ltd->div[prod][b] = a; + ltd->smult[a][b] = prod; + break; + case 3: + dd->multtable[a][b] = prod; + dd->divtable[prod][b] = a; + if ((b & 0xf) == b) { dd->low[a][b] = prod; } + if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; } + break; } - b <<= 1; - if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; - prod <<= 1; - if (prod & GF_FIELD_SIZE) prod = prod ^ h->prim_poly; - - } while (b != 1); + } } gf->inverse.w32 = NULL; /* Will set from divide */ switch (scase) { - case 0: - gf->divide.w32 = gf_w8_table_divide; - gf->multiply.w32 = gf_w8_table_multiply; - gf->multiply_region.w32 = gf_w8_table_multiply_region; - break; - case 1: - gf->divide.w32 = gf_w8_double_table_divide; - gf->multiply.w32 = gf_w8_double_table_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 2: - gf->divide.w32 = gf_w8_double_table_lazy_divide; - gf->multiply.w32 = gf_w8_double_table_lazy_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 3: - gf->divide.w32 = gf_w8_default_divide; - gf->multiply.w32 = gf_w8_default_multiply; - gf->multiply_region.w32 = gf_w8_split_multiply_region; -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + case 0: + gf->divide.w32 = gf_w8_table_divide; + gf->multiply.w32 = 
gf_w8_table_multiply; + gf->multiply_region.w32 = gf_w8_table_multiply_region; + break; + case 1: + gf->divide.w32 = gf_w8_double_table_divide; + gf->multiply.w32 = gf_w8_double_table_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 2: + gf->divide.w32 = gf_w8_double_table_lazy_divide; + gf->multiply.w32 = gf_w8_double_table_lazy_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 3: +#ifdef INTEL_SSSE3 + gf->divide.w32 = gf_w8_default_divide; + gf->multiply.w32 = gf_w8_default_multiply; + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; #endif - break; + break; } return 1; } static -void + void gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t val0 = val & 0x0f; uint8_t val1 = (val & 0xf0) >> 4; - int sub_reg_size = bytes / 2; + gf_region_data rd; + int sub_reg_size; - if (bytes % 2 != 0) gf_alignment_error("gf_w8_composite_multiply_region_alt", 1); + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } - base_gf->multiply_region.w32(base_gf, src, dest, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w32(base_gf, GF_S_GF_4_2, val1), sub_reg_size, 1); + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + sub_reg_size = (rd.d_top - rd.d_start) / 2; + + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); } static gf_val_32_t -gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; @@ -912,8 +1345,35 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) uint8_t a1b1; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); + + return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); +} + +static +gf_val_32_t +gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1, 
*mt; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); } /* @@ -938,6 +1398,7 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) * * a / b = a * c */ + static gf_val_32_t gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) @@ -949,10 +1410,9 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) uint8_t c0, c1, c, d, tmp; uint8_t a0inv, a1inv; - if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_4_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -963,49 +1423,36 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf; - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_4_2) & 0xf; + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf; tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf; d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf; - + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; } c = c0 | (c1 << 4); - + return c; } -static -gf_val_32_t -gf_w8_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_val_32_t binv; - - binv = gf_w8_composite_inverse(gf, b); - - return gf_w8_composite_multiply(gf, a, binv); -} - static void gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; + gf_region_data rd; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w4_single_table_data * std; uint8_t b0 = val & 0x0f; uint8_t b1 = (val & 0xf0) >> 4; - uint8_t *s8 = (uint8_t *) src; - uint8_t *d8 = (uint8_t *) dest; + uint8_t *s8; + uint8_t *d8; + uint8_t *mt; uint8_t a0, a1, a1b1; + struct gf_w8_composite_data *cd; - uls = ((unsigned long) src) & 0xf; - uld = ((unsigned long) dest) & 0xf; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1); + cd = (struct gf_w8_composite_data *) h->private; if (val == 0) { if (xor) return; @@ -1013,124 +1460,115 @@ gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val return; } - std = (struct gf_w4_single_table_data *) h->private; + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; - if (xor) { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); - + mt = cd->mult_table; + if (mt == NULL) { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, 
h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } } } else { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); - } - } - return; -} - -static -void -gf_w8_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w4_single_table_data * std; - uint8_t b0 = val & 0x0f; - uint8_t b1 = (val & 0xf0) >> 4; - uint8_t *s8 = (uint8_t *) src; - uint8_t *d8 = (uint8_t *) dest; - uint8_t a0, a1, a1b1; - - uls = ((unsigned long) src) & 0xf; - uld = ((unsigned long) dest) & 0xf; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1); - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - std = (struct gf_w4_single_table_data *) h->private; - - if (xor) { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4)); - - } - } else { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4)); + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } } } + gf_do_final_region_alignment(&rd); return; } static int gf_w8_composite_init(gf_t *gf) { - struct gf_w4_single_table_data * std; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t a, b; + struct gf_w8_composite_data *cd; - std = (struct gf_w4_single_table_data *) h->private; + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w8_composite_data *) h->private; + cd->mult_table = gf_w4_get_mult_table(h->base_gf); - for (a = 0; a < 16; a++) { - for (b = 0; b < 16; b++) { - std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b); - } - } - if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt; } else { - if (h->region_type & GF_REGION_SINGLE_TABLE) { - gf->multiply_region.w32 = gf_w8_composite_multiply_region_table; - } else { - gf->multiply_region.w32 = 
gf_w8_composite_multiply_region; - } + gf->multiply_region.w32 = gf_w8_composite_multiply_region; } - gf->multiply.w32 = gf_w8_composite_multiply; - gf->divide.w32 = gf_w8_composite_divide; + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w8_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w8_composite_multiply_inline; + } + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w8_composite_inverse; - + return 1; } static inline -gf_val_32_t + gf_val_32_t gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, pmask, amask; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; - + prod = 0; pmask = 0x80; amask = 0x80; @@ -1149,12 +1587,12 @@ gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) static inline -gf_val_32_t + gf_val_32_t gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, bmask; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; @@ -1174,13 +1612,13 @@ gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t *s64, *d64, t1, t2, ta, prod, amask; gf_region_data rd; struct gf_w8_bytwo_data *btd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1225,18 +1663,18 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } #define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi8(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } static -void + void gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; @@ -1244,7 +1682,7 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w8_bytwo_data *btd; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1289,10 +1727,10 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } static -void + void gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1315,10 +1753,10 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt } static -void + void gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1344,16 +1782,16 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) static -void + void gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_w8_bytwo_data *btd; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, 
xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1399,7 +1837,7 @@ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } static -void + void gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -1419,349 +1857,349 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t d64 = (uint64_t *) rd.d_start; switch (val) { - case 2: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + /* + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + */ + case 8: + if (xor) { + while (d64 < (uint64_t *) 
rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + /* + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; - case 3: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - case 4: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, 
t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; - case 5: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - case 6: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } -/* - case 7: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - */ - case 8: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - 
AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; -/* - case 9: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - case 10: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 11: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 12: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 13: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, 
t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 14: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 15: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; -*/ - default: if (xor) { while (d64 < (uint64_t *) rd.d_top) { prod = *d64 ; @@ -1798,7 +2236,7 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } -static + static int gf_w8_bytwo_init(gf_t *gf) { gf_internal_t *h; @@ -1825,48 +2263,54 @@ int gf_w8_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w8_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; - } else { +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - } + else + gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif } else { gf->multiply.w32 = gf_w8_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; - } else { +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - } + else + gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif } - gf->inverse.w32 = gf_w8_euclid; return 1; } /* ------------------------------------------------------------ General procedures. + You don't need to error check here or in init, because it's done + for you in gf_error_check().
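The BYTWO kernels above all reduce to one primitive: multiplying every byte of a machine word by two in GF(2^8) at once, which is what AB2 and SSE_AB2 accomplish with their two masks. A scalar sketch of that primitive, and of the bit-at-a-time multiply built from it, follows. It assumes the 0x11b polynomial (so the folded byte is 0x1b), and the multiply-by-0x1b replication trick is one common way to write this, not the library's exact formulation.

#include <stdint.h>
#include <stdio.h>

/* Multiply each of the eight bytes of w by two, in parallel.  Bytes whose
   top bit is set are about to overflow; "over" holds a 1 in the low bit of
   each such byte, and multiplying it by 0x1b replicates the reduction byte
   into exactly those lanes (the per-lane products cannot carry across). */
static uint64_t gf8_word_times_two(uint64_t w)
{
  uint64_t over = (w & 0x8080808080808080ULL) >> 7;
  return ((w << 1) & 0xfefefefefefefefeULL) ^ (over * 0x1b);
}

/* BYTWO_b in scalar form: walk the bits of b, doubling a at each step. */
static uint8_t gf8_bytwo_mult(uint8_t a, uint8_t b)
{
  uint8_t prod = 0;
  while (b) {
    if (b & 1) prod ^= a;
    b >>= 1;
    a = (uint8_t) ((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
  }
  return prod;
}

int main(void)
{
  printf("bytes*2: %016llx\n",
         (unsigned long long) gf8_word_times_two(0x8001578001578001ULL));
  printf("0x57 * 0x83 = 0x%02x\n", gf8_bytwo_mult(0x57, 0x83));  /* 0xc1 */
  return 0;
}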
*/ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int sse; - - sse = (GF_REGION_SSE | GF_REGION_NOSSE); - switch(mult_type) { case GF_MULT_DEFAULT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; +#ifdef INTEL_SSSE3 return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; +#endif + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) { + if (region_type == GF_REGION_CAUCHY) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; } - if (region_type == 0) region_type = GF_REGION_SINGLE_TABLE; - if (region_type & GF_REGION_SINGLE_TABLE) { - if (region_type != GF_REGION_SINGLE_TABLE) return 0; + if (region_type == GF_REGION_DEFAULT) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; } if (region_type & GF_REGION_DOUBLE_TABLE) { @@ -1875,62 +2319,62 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64; } else { - return -1; + return 0; } } - return -1; + return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | sse) != sse || (region_type & sse) == sse) return -1; - } return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data); break; case GF_MULT_SPLIT_TABLE: if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) { - if (region_type == GF_REGION_CAUCHY) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; - } - if (region_type == 0) region_type = GF_REGION_SSE; - if ((region_type | sse) != sse) return -1; - if ((region_type & sse) == sse) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; } - return -1; break; case GF_MULT_LOG_TABLE: - if ((arg1 != 0 && arg1 != 1 && arg1 != 2) || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; - if (arg1 == 0) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; - if (arg1 == 1) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + break; + case GF_MULT_LOG_ZERO_EXT: return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64; break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1; - if (arg1 == 2 && arg2 == 4) { - return sizeof(gf_internal_t) + sizeof(struct gf_w4_single_table_data) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64; default: - return -1; - } + return 0; + } + return 0; } int gf_w8_init(gf_t *gf) { - gf_internal_t 
*h; + gf_internal_t *h, *h_base; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x11d; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. */ + } else { + h->prim_poly = 0x11d; + } + } + if (h->mult_type != GF_MULT_COMPOSITE) { + h->prim_poly |= 0x100; + } gf->multiply.w32 = NULL; gf->divide.w32 = NULL; @@ -1939,16 +2383,20 @@ int gf_w8_init(gf_t *gf) gf->extract_word.w32 = gf_w8_extract_word; switch(h->mult_type) { - case GF_MULT_DEFAULT: if (gf_w8_table_init(gf) == 0) return 0; break; - case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_ZERO: + case GF_MULT_LOG_ZERO_EXT: + case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; + case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; default: return 0; } + if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w32 = gf_w8_divide_from_inverse; gf->inverse.w32 = gf_w8_euclid; @@ -1957,11 +2405,15 @@ int gf_w8_init(gf_t *gf) gf->inverse.w32 = gf_w8_matrix; } - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w8_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w8_inverse_from_divide; + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide; + + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w32 = gf_w8_composite_extract_word; } if (h->region_type == GF_REGION_CAUCHY) { @@ -1969,6 +2421,10 @@ int gf_w8_init(gf_t *gf) gf->extract_word.w32 = gf_wgen_extract_word; } + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w8_multiply_region_from_single; + } + return 1; } @@ -2001,7 +2457,7 @@ uint8_t *gf_w8_get_div_table(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w8_default_multiply) { ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; - return (uint8_t *) std->divtable; + return (uint8_t *) ftd->divtable; } else if (gf->multiply.w32 == gf_w8_table_multiply) { std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; return (uint8_t *) std->divtable; diff --git a/gf_wgen.c b/gf_wgen.c index 7d5144b..ede115c 100644 --- a/gf_wgen.c +++ b/gf_wgen.c @@ -93,6 +93,7 @@ gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if 
(e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; }
@@ -223,7 +224,7 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) pp = h->prim_poly; prod = 0; - pmask = (1 << (h->w)-1); + pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/ amask = pmask; while (amask != 0) {
@@ -508,16 +509,11 @@ int gf_wgen_table_8_init(gf_t *gf) } for (a = 1; a < (1 << w); a++) { - b = 1; - p = a; - do { - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1); - b &= ((1 << w)-1); - p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1); - p &= ((1 << w)-1); - } while (b != 1); + for (b = 1; b < (1 << w); b++) { + p = gf_wgen_shift_multiply(gf, a, b); + std->mult[(a<<w)|b] = p; + std->div[(p<<w)|a] = b; + } } gf->multiply.w32 = gf_wgen_table_8_multiply;
@@ -572,18 +568,13 @@ int gf_wgen_table_16_init(gf_t *gf) std->div[a] = 0; std->div[a<<w] = 0; } for (a = 1; a < (1 << w); a++) { - b = 1; - p = a; - do { - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1); - b &= ((1 << w)-1); - p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1); - p &= ((1 << w)-1); - } while (b != 1); + for (b = 1; b < (1 << w); b++) { + p = gf_wgen_shift_multiply(gf, a, b); + std->mult[(a<<w)|b] = p; + std->div[(p<<w)|a] = b; + } } gf->multiply.w32 = gf_wgen_table_16_multiply;
@@ -599,6 +590,11 @@ int gf_wgen_table_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->w <= 8) return gf_wgen_table_8_init(gf); if (h->w <= 14) return gf_wgen_table_16_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; } static
@@ -640,6 +636,7 @@ int gf_wgen_log_8_init(gf_t *gf) struct gf_wgen_log_w8_data *std; int w; uint32_t a, i; + int check = 0; h = (gf_internal_t *) gf->scratch; w = h->w;
@@ -649,17 +646,27 @@ std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - i = 0; + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); - + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + gf->multiply.w32 = gf_wgen_log_8_multiply; gf->divide.w32 = gf_wgen_log_8_divide; return 1;
@@ -704,6 +711,7 @@ int gf_wgen_log_16_init(gf_t *gf) struct gf_wgen_log_w16_data *std; int w; uint32_t a, i; + int check = 0; h = (gf_internal_t *) gf->scratch; w = h->w; std = (struct gf_wgen_log_w16_data *) h->private; std->log = &(std->base); std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - - i = 0; + + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf); + _gf_errno = GF_E_LOGPOLY; + return 0; + } gf->multiply.w32 = gf_wgen_log_16_multiply; gf->divide.w32 = gf_wgen_log_16_divide;
@@ -768,7 +787,8 @@ int gf_wgen_log_32_init(gf_t *gf) struct gf_wgen_log_w32_data *std; int w; uint32_t a, i; - + int check = 0; + h = (gf_internal_t *) gf->scratch; w = h->w; std = (struct gf_wgen_log_w32_data *) h->private;
@@ -777,17 +797,27 @@ std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - i = 0; + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); - + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + gf->multiply.w32 = gf_wgen_log_32_multiply; gf->divide.w32 = gf_wgen_log_32_divide; return 1;
@@ -802,15 +832,16 @@ int gf_wgen_log_init(gf_t *gf) if (h->w <= 8) return gf_wgen_log_8_init(gf); if (h->w <= 16) return gf_wgen_log_16_init(gf); if (h->w <= 32) return gf_wgen_log_32_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; } int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2) { - if (w > 32 || w < 0) return -1; - - if ((region_type | GF_REGION_CAUCHY) != GF_REGION_CAUCHY) return -1; - switch(mult_type) { case GF_MULT_DEFAULT:
@@ -828,40 +859,37 @@ case GF_MULT_SHIFT: case GF_MULT_BYTWO_b: case GF_MULT_BYTWO_p: - if (arg1 != 0 || arg2 != 0) return -1; return sizeof(gf_internal_t); break; case GF_MULT_GROUP: - if (arg1 <= 0 || arg2 <= 0) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) + sizeof(uint32_t) * (1 << arg1) + sizeof(uint32_t) * (1 << arg2) + 64; break; case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; if (w <= 8) { return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) + sizeof(uint8_t)*(1 << w)*(1<prim_poly = 00020000007; break; default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1); } + } else { + if (h->w == 32) { + h->prim_poly &= 0xffffffff; + } else { + h->prim_poly |= (1 << h->w); + if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0; + } } gf->multiply.w32 = NULL;
@@ -950,7 +985,7 @@ int gf_wgen_init(gf_t *gf) } else if (h->w <= 16) { if (gf_wgen_log_init(gf) == 0) return 0; } else { - if (gf_wgen_group_init(gf) == 0) return 0; + if (gf_wgen_bytwo_p_init(gf) == 0) return 0; } break; case GF_MULT_SHIFT: if (gf_wgen_shift_init(gf) == 0) return 0; break;
diff --git a/release-files.txt b/release-files.txt deleted file mode 100644 index ca25004..0000000 --- a/release-files.txt +++ /dev/null @@ -1,31 +0,0 @@ -License.txt -README.txt -GNUmakefile -gf.c -gf_add.c -gf_complete.h -gf_div.c -gf_example_1.c -gf_example_2.c -gf_example_3.c -gf_example_4.c -gf_general.c -gf_general.h -gf_int.h -gf_method.c -gf_method.h -gf_methods.c -gf_mult.c -gf_poly.c -gf_rand.c -gf_rand.h -gf_time.c -gf_unit.c -gf_w128.c -gf_w16.c -gf_w32.c -gf_w4.c -gf_w64.c -gf_w8.c -gf_wgen.c -whats_my_sse.c diff --git
a/tests.txt b/tests.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tmp-10-out.txt b/tmp-10-out.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tmp-time-test.sh b/tmp-time-test.sh deleted file mode 100644 index e30fca8..0000000 --- a/tmp-time-test.sh +++ /dev/null @@ -1,14 +0,0 @@ -if [ $# -lt 4 ]; then - echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2 - exit 1 -fi - -w=$1 -shift -i=1024 -while [ $i -le 134217728 ]; do - iter=`echo $i | awk '{ print (134217728/$1)*1 }'` - echo $i $iter $w $* `./gf_time $w G -1 $i $iter $* | head -n 3 | tail -n 2` - i=`echo $i | awk '{ print $1*2 }'` -done - diff --git a/tmp.c b/tmp.c deleted file mode 100644 index a6deaab..0000000 --- a/tmp.c +++ /dev/null @@ -1,1583 +0,0 @@ -/* - * gf_w32.c - * - * Routines for 32-bit Galois fields - */ - -#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } - -#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH (32) -#define GF_FIRST_BIT (1 << 31) - -#define GF_BASE_FIELD_WIDTH (16) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_S_GF_16_2 (40188) -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); - - -struct gf_w16_logtable_data { - int log_tbl[GF_BASE_FIELD_SIZE]; - gf_val_16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4]; - gf_val_16_t *antilog_tbl; - gf_val_16_t inv_tbl[GF_BASE_FIELD_SIZE]; -}; - -struct gf_split_2_32_lazy_data { - gf_val_32_t last_value; - gf_val_32_t tables[16][4]; -}; - -struct gf_split_8_8_data { - gf_val_32_t tables[7][256][256]; -}; - -struct gf_split_4_32_lazy_data { - gf_val_32_t last_value; - gf_val_32_t tables[8][16]; -}; - -static -inline -gf_val_32_t gf_w32_inverse_from_divide (gf_t *gf, gf_val_32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -gf_val_32_t gf_w32_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -void -gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int -xor) -{ - int i; - gf_val_32_t *s32; - gf_val_32_t *d32; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) { - d32[i] ^= gf->multiply.w32(gf, val, s32[i]); - } - } else { - for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) { - d32[i] = gf->multiply.w32(gf, val, s32[i]); - } - } -} - -static -inline -gf_val_32_t gf_w32_euclid (gf_t *gf, gf_val_32_t b) -{ - gf_val_32_t e_i, e_im1, e_ip1; - gf_val_32_t d_i, d_im1, d_ip1; - gf_val_32_t y_i, y_im1, y_ip1; - gf_val_32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 32; - for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - d_ip1--; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = 
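/* editor's annotation: rotate the extended-Euclid iterates forward for the next round */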
y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -inline -gf_val_32_t gf_w32_matrix (gf_t *gf, gf_val_32_t b) -{ - return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - -/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ - -static -inline -gf_val_32_t -gf_w32_shift_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - uint64_t product, i, pp, a, b, one; - gf_internal_t *h; - - a = a32; - b = b32; - h = (gf_internal_t *) gf->scratch; - one = 1; - pp = h->prim_poly | (one << 32); - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (one << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { - if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - -static -int gf_w32_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w32_shift_multiply; - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - return 1; -} - -static -inline -gf_val_32_t -gf_w32_split_8_8_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - uint32_t product, i, j, mask, tb; - gf_internal_t *h; - struct gf_split_8_8_data *d8; - - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; - product = 0; - mask = 0xff; - - for (i = 0; i < 4; i++) { - tb = b32; - for (j = 0; j < 4; j++) { - product ^= d8->tables[i+j][a32&mask][tb&mask]; - tb >>= 8; - } - a32 >>= 8; - } - return product; -} - -static -inline -void -gf_w32_split_8_8_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint32_t product, mask, tb, tv, tp; - gf_internal_t *h; - struct gf_split_8_8_data *d8; - uint32_t *p00, *p01, *p02, *p03; - uint32_t *p10, *p11, *p12, *p13; - uint32_t *p20, *p21, *p22, *p23; - uint32_t *p30, *p31, *p32, *p33; - uint32_t *s32, *d32, *top; - unsigned long uls, uld; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_8_8_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_8_8_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - tv = val; - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; - mask = 0xff; - - p00 = &(d8->tables[0][val&mask][0]); - p01 = &(d8->tables[1][val&mask][0]); - p02 = &(d8->tables[2][val&mask][0]); - p03 = &(d8->tables[3][val&mask][0]); - val >>= 8; - p10 = &(d8->tables[1][val&mask][0]); - p11 = &(d8->tables[2][val&mask][0]); - p12 = &(d8->tables[3][val&mask][0]); - p13 = &(d8->tables[4][val&mask][0]); - val >>= 8; - p20 = &(d8->tables[2][val&mask][0]); - p21 = &(d8->tables[3][val&mask][0]); - p22 = &(d8->tables[4][val&mask][0]); - p23 = &(d8->tables[5][val&mask][0]); - val >>= 8; - p30 = &(d8->tables[3][val&mask][0]); - p31 = &(d8->tables[4][val&mask][0]); - p32 = &(d8->tables[5][val&mask][0]); - p33 = &(d8->tables[6][val&mask][0]); - - s32 = (uint32_t *) src; - d32 = (uint32_t *) dest; - top = (d32 + (bytes/4)); - - while (d32 < top) { - tb = *s32; - tp = *d32; - product = (xor) ? 
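/* xor mode folds the table products into the existing dest word */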
(*d32) : 0; - product ^= p00[tb&mask]; - product ^= p10[tb&mask]; - product ^= p20[tb&mask]; - product ^= p30[tb&mask]; - - tb >>= 8; - product ^= p01[tb&mask]; - product ^= p11[tb&mask]; - product ^= p21[tb&mask]; - product ^= p31[tb&mask]; - - tb >>= 8; - product ^= p02[tb&mask]; - product ^= p12[tb&mask]; - product ^= p22[tb&mask]; - product ^= p32[tb&mask]; - - tb >>= 8; - product ^= p03[tb&mask]; - product ^= p13[tb&mask]; - product ^= p23[tb&mask]; - product ^= p33[tb&mask]; - *d32 = product; - s32++; - d32++; - } -} - -static -void -gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld; - int i; - gf_val_32_t pp, v, v2, s, *s32, *d32, *top; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_2_32_lazy_data *) h->private; - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - ld->tables[i][0] = 0; - ld->tables[i][1] = v; - ld->tables[i][2] = v2; - ld->tables[i][3] = (v2 ^ v); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - } - ld->last_value = val; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - while (d32 != top) { - v = (xor) ? *d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&3]; - s >>= 2; - i++; - } - *d32 = v; - d32++; - s32++; - } -} - -static -void -gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, tindex; - gf_val_32_t pp, v, v2, s, *s32, *d32, *top; - __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - top = (gf_val_32_t *) (uld - (uld & 0xf)); - uld &= 0xf; - - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - - shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); - mask1 = _mm_set1_epi8(0x3); - mask2 = _mm_set1_epi8(0xc); - - while (d32 != top) { - pi = (xor) ? 
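/* seed the SSE accumulator from dest when xor-ing, from zero otherwise */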
_mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128(); - vi = _mm_load_si128((__m128i *) s32); - - tindex = 0; - for (i = 0; i < 4; i++) { - si = _mm_shuffle_epi8(vi, shuffler); - - xi = _mm_and_si128(si, mask1); - xi = _mm_slli_epi16(xi, 2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - vi = _mm_srli_epi32(vi, 8); - } - _mm_store_si128((__m128i *) d32, pi); - d32 += 4; - s32 += 4; - } - - while (uld > 0) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - uld -= 4; - } - - -#endif -} - -static -void -gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h; - struct gf_split_4_32_lazy_data *ld; - int i, j, k; - gf_val_32_t pp, v, s, *s32, *d32, *top; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - while (d32 != top) { - v = (xor) ? 
*d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xf]; - s >>= 4; - i++; - } - *d32 = v; - d32++; - s32++; - } -} - -static -void -gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, k, tindex; - gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop; - __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; - struct gf_split_4_32_lazy_data *ld; - uint8_t btable[16]; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - realtop = top; - - /* You need the size of this region to be a multiple of 64 bytes */ - bytes = (top - d32); - bytes -= (bytes & 0xf); - top = (d32 + bytes); - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? 
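/* multiply v by x: shift left, reducing by the primitive polynomial pp on overflow */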
((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - - if (xor) { - while (d32 != top) { - p0 = _mm_load_si128 ((__m128i *) d32); - p1 = _mm_load_si128 ((__m128i *) (d32+4)); - p2 = _mm_load_si128 ((__m128i *) (d32+8)); - p3 = _mm_load_si128 ((__m128i *) (d32+12)); - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = 
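/* each _mm_shuffle_epi8 below performs 16 parallel 4-bit table lookups */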
_mm_shuffle_epi8(tables[0][0], si); - p1 = _mm_shuffle_epi8(tables[0][1], si); - p2 = _mm_shuffle_epi8(tables[0][2], si); - p3 = _mm_shuffle_epi8(tables[0][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - - while (d32 < realtop) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - } - -#endif -} - - -static -void -gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, k, tindex; - gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16; - __m128i tv1, tv2, tv3, tv0; - struct gf_split_4_32_lazy_data *ld; - uint8_t btable[16]; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = 
h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - realtop = top; - - /* You need the size of this region to be a multiple of 64 bytes */ - bytes = (top - d32); - bytes -= (bytes & 0xf); - top = (d32 + bytes); - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - mask8 = _mm_set1_epi16(0xff); - mask16 = _mm_set1_epi32(0xffff); - - if (xor) { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - -/* printf("Val = %x\n", val); - MM_PRINT8("Old V0", v0); - MM_PRINT8("Old V1", v1); - MM_PRINT8("Old V2", v2); - MM_PRINT8("Old V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - -/* MM_PRINT8("Middle V0", v0); - MM_PRINT8("Middle V1", v1); - MM_PRINT8("Middle V2", v2); - MM_PRINT8("Middle V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - -/* MM_PRINT8("New V0", v0); - MM_PRINT8("New V1", v1); - MM_PRINT8("New V2", v2); - MM_PRINT8("New V3", v3); - printf("\n"); */ - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, 
_mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - -/* MM_PRINT8("Old P0", p0); - MM_PRINT8("Old P1", p1); - MM_PRINT8("Old P2", p2); - MM_PRINT8("Old P3", p3); - printf("\n"); */ - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - -/* MM_PRINT8("Middle P0", tv0); - MM_PRINT8("Middle P1", tv1); - MM_PRINT8("Middle P2", tv2); - MM_PRINT8("Middle P3", tv3); - printf("\n"); */ - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - -/* MM_PRINT8("New P0", p0); - MM_PRINT8("New P1", p1); - MM_PRINT8("New P2", p2); - MM_PRINT8("New P3", p3); - printf("\n"); - exit(1); */ - - v0 = _mm_load_si128 ((__m128i *) d32); - v1 = _mm_load_si128 ((__m128i *) (d32+4)); - v2 = _mm_load_si128 ((__m128i *) (d32+8)); - v3 = _mm_load_si128 ((__m128i *) (d32+12)); - - p0 = _mm_xor_si128(p0, v0); - p1 = _mm_xor_si128(p1, v1); - p2 = _mm_xor_si128(p2, v2); - p3 = _mm_xor_si128(p3, v3); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - -/* printf("Val = %x\n", val); - MM_PRINT8("Old V0", v0); - MM_PRINT8("Old V1", v1); - MM_PRINT8("Old V2", v2); - MM_PRINT8("Old V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - -/* MM_PRINT8("Middle V0", v0); - MM_PRINT8("Middle V1", v1); - MM_PRINT8("Middle V2", v2); - MM_PRINT8("Middle V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - 
p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - -/* MM_PRINT8("New V0", v0); - MM_PRINT8("New V1", v1); - MM_PRINT8("New V2", v2); - MM_PRINT8("New V3", v3); - printf("\n"); */ - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - -/* MM_PRINT8("Old P0", p0); - MM_PRINT8("Old P1", p1); - MM_PRINT8("Old P2", p2); - MM_PRINT8("Old P3", p3); - printf("\n"); */ - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - -/* MM_PRINT8("Middle P0", tv0); - MM_PRINT8("Middle P1", tv1); - MM_PRINT8("Middle P2", tv2); - MM_PRINT8("Middle P3", tv3); - printf("\n"); */ - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - -/* MM_PRINT8("New P0", p0); - MM_PRINT8("New P1", p1); - MM_PRINT8("New P2", p2); - MM_PRINT8("New P3", p3); - printf("\n"); - exit(1); */ - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) 
(d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - - while (d32 < realtop) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - } - - -#endif -} - -static -int gf_w32_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld2; - struct gf_split_4_32_lazy_data *ld4; - struct gf_split_8_8_data *d8; - uint32_t p, basep; - int i, j, exp; - - h = (gf_internal_t *) gf->scratch; - - /* Defaults */ - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; - gf->inverse.w32 = gf_w32_euclid; - - if (h->arg1 == 8 && h->arg2 == 8) { - gf->multiply.w32 = gf_w32_split_8_8_multiply; - gf->multiply_region.w32 = gf_w32_split_8_8_multiply_region; - d8 = (struct gf_split_8_8_data *) h->private; - basep = 1; - for (exp = 0; exp < 7; exp++) { - for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; - for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; - d8->tables[exp][1][1] = basep; - for (i = 2; i < 256; i++) { - if (i&1) { - p = d8->tables[exp][i^1][1]; - d8->tables[exp][i][1] = p ^ basep; - } else { - p = d8->tables[exp][i>>1][1]; - d8->tables[exp][i][1] = GF_MULTBY_TWO(p); - } - } - for (i = 1; i < 256; i++) { - p = d8->tables[exp][i][1]; - for (j = 1; j < 256; j++) { - if (j&1) { - d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; - } else { - d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); - } - } - } - for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); - } - } - if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { - ld2 = (struct gf_split_2_32_lazy_data *) h->private; - ld2->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - } - } - if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) { - ld4 = (struct gf_split_4_32_lazy_data *) h->private; - ld4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; - } - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } - } - return 1; -} - -static -gf_val_32_t -gf_w32_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t b0 = b & 0x0000ffff; - uint16_t b1 = (b & 0xffff0000) >> 16; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t a1b1; - - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - return ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ -static -gf_val_32_t 
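/* implements the closed-form inverse derived in the comment above */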
-gf_w32_composite_inverse(gf_t *gf, gf_val_32_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t c0, c1, d, tmp; - uint32_t c; - uint16_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w16(base_gf, a1); - c0 = base_gf->multiply.w16(base_gf, a1inv, GF_S_GF_16_2); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w16(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w16(base_gf, a1); - a0inv = base_gf->inverse.w16(base_gf, a0); - - d = base_gf->multiply.w16(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w16(base_gf, a1, a0inv) ^ base_gf->multiply.w16(base_gf, a0, a1inv) ^ GF_S_GF_16_2); - tmp = base_gf->inverse.w16(base_gf, tmp); - - d = base_gf->multiply.w16(base_gf, d, tmp); - - c0 = base_gf->multiply.w16(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w16(base_gf, d, a1inv); - } - - c = c0 | (c1 << 16); - - return c; -} - -static -gf_val_32_t -gf_w32_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_val_32_t binv; - - binv = gf_w32_composite_inverse(gf, b); - - return gf_w32_composite_multiply(gf, a, binv); -} - -static -void -gf_w32_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - } - } -} - -static -void -gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; 
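/* Editor's annotation (not part of the patch): the combines below apply the
   composite rule from gf_w32_composite_multiply above. With a = a1 x + a0 and
   b = b1 x + b0 over GF(2^16), and p(x) = x^2 + s x + 1, x^2 reduces to s x + 1,
   so a b = (a1 b0 ^ a0 b1 ^ s a1 b1) x + (a0 b0 ^ a1 b1), where s = GF_S_GF_16_2. */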
- - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - d32[i] ^= ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - d32[i] = ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - } - } -} - - - -static -void -gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_val_16_t val0 = val & 0x0000ffff; - gf_val_16_t val1 = (val & 0xffff0000) >> 16; - int sub_reg_size = bytes / 2; - - if (bytes % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1); - if (sub_reg_size % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1); - - if (!xor) { - memset(dest, 0, bytes); - } - - base_gf->multiply_region.w16(base_gf, src, dest, val0, sub_reg_size, xor); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1); - base_gf->multiply_region.w16(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w16(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1); -} - -static -int gf_w32_composite_init(gf_t *gf) -{ - struct gf_w16_logtable_data *ltd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_val_32_t a, b; - uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly; - int i; - - ltd = (struct gf_w16_logtable_data *) h->private; - - ltd->log_tbl[0] = 0; - - bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); - - ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]); - - b = 1; - for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) { - ltd->log_tbl[b] = (gf_val_16_t)i; - ltd->antilog_tbl[i] = (gf_val_16_t)b; - ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (gf_val_16_t)b; - b <<= 1; - if (b & GF_BASE_FIELD_SIZE) { - b = b ^ prim_poly; - } - } - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_BASE_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]]; - } - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; - } else { - if (h->region_type & GF_REGION_SINGLE_TABLE) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_table; - } else { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; - } - } - - gf->multiply.w32 = 
gf_w32_composite_multiply; - gf->divide.w32 = gf_w32_composite_divide; - gf->inverse.w32 = gf_w32_composite_inverse; - - return 1; -} - -int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - int ss, sa; - - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - - switch(mult_type) - { - case GF_MULT_SPLIT_TABLE: - if (arg1 == 8 && arg2 == 8){ - if (region_type != GF_REGION_DEFAULT) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; - } - if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type | ss) != ss) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; - } - if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } - } - return -1; - case GF_MULT_DEFAULT: - case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; - return sizeof(gf_internal_t); - break; - case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1; - if (arg1 == 2 && arg2 == 16 || arg2 == 2 && arg1 == 16) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } else { - return -1; - } - default: - return -1; - } -} - -int gf_w32_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x400007; - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - - switch(h->mult_type) { - case GF_MULT_DEFAULT: - case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_matrix; - } - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w32_divide_from_inverse; - } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w32_inverse_from_divide; - } - return 1; -} diff --git a/tmp.sh b/tmp.sh deleted file mode 100644 index 6bd92b2..0000000 --- a/tmp.sh +++ /dev/null @@ -1,15 +0,0 @@ -for i in 5 10 ; do - sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh - sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt - sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt - sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt - sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt - sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt - sh tmp2.sh 16 
diff --git a/tmp.sh b/tmp.sh
deleted file mode 100644
index 6bd92b2..0000000
--- a/tmp.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-for i in 5 10 ; do
-  sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh
-  sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,ALTMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 8 8 - - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,ALTMAP - >> tmp-$i-out.txt
-done
diff --git a/tmp.txt b/tmp.txt
deleted file mode 100644
index 468cf49..0000000
--- a/tmp.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-Tables[0] = 0000000000000000 3b60e7ccf8f4454e 76c1cf99f1e88a9c 4da12855091ccfd2 ed839f33e3d11538 d6e378ff1b255076 9b4250aa12399fa4 a022b766eacddaea db073e67c7a22a6b e067d9ab3f566f25 adc6f1fe364aa0f7 96a61632cebee5b9 3684a15424733f53 0de44698dc877a1d 40456ecdd59bb5cf 7b2589012d6ff081
-Tij 81 cf 1d 53 b9 f7 25 6b ea a4 76 38 d2 9c 4e 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tij 7b 40 0d 36 96 ad e0 db a0 9b d6 ed 4d 76 3b 00
-Tables[1] = 0000000000000000 b60e7ccf8f4454cd 6c1cf99f1e88a981 da12855091ccfd4c d839f33e3d115302 6e378ff1b25507cf b4250aa12399fa83 022b766eacddae4e b073e67c7a22a61f 067d9ab3f566f2d2 dc6f1fe364aa0f9e 6a61632cebee5b53 684a15424733f51d de44698dc877a1d0 0456ecdd59bb5c9c b2589012d6ff0851
-Tij 51 9c d0 1d 53 9e d2 1f 4e 83 cf 02 4c 81 cd 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tij b2 04 de 68 6a dc 06 b0 02 b4 6e d8 da 6c b6 00
-Tables[2] = 0000000000000000 60e7ccf8f4454c25 c1cf99f1e88a984a a12855091ccfd46f 839f33e3d115308f e378ff1b25507caa 4250aa12399fa8c5 22b766eacddae4e0 073e67c7a22a6105 67d9ab3f566f2d20 c6f1fe364aa0f94f a61632cebee5b56a 84a15424733f518a e44698dc877a1daf 456ecdd59bb5c9c0 2589012d6ff085e5
-Tij e5 c0 af 8a 6a 4f 20 05 e0 c5 aa 8f 6f 4a 25 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tables[3] = 0000000000000000 0e7ccf8f4454c20a 1cf99f1e88a98414 12855091ccfd461e 39f33e3d11530828 378ff1b25507ca22 250aa12399fa8c3c 2b766eacddae4e36 73e67c7a22a61050 7d9ab3f566f2d25a 6f1fe364aa0f9444 61632cebee5b564e 4a15424733f51878 44698dc877a1da72 56ecdd59bb5c9c6c 589012d6ff085e66
-Tij 66 6c 72 78 4e 44 5a 50 36 3c 22 28 1e 14 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tables[4] = 0000000000000000 e7ccf8f4454c20a0 cf99f1e88a98415b 2855091ccfd461fb 9f33e3d1153082ad 78ff1b25507ca20d 50aa12399fa8c3f6 b766eacddae4e356 3e67c7a22a610541 d9ab3f566f2d25e1 f1fe364aa0f9441a 1632cebee5b564ba a15424733f5187ec 4698dc877a1da74c 6ecdd59bb5c9c6b7 89012d6ff085e617
-Tij 17 b7 4c ec ba 1a e1 41 56 f6 0d ad fb 5b a0 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tables[5] = 0000000000000000 7ccf8f4454c20a82 f99f1e88a9841504 855091ccfd461f86 f33e3d1153082a13 8ff1b25507ca2091 0aa12399fa8c3f17 766eacddae4e3595 e67c7a22a610543d 9ab3f566f2d25ebf 1fe364aa0f944139 632cebee5b564bbb 15424733f5187e2e 698dc877a1da74ac ecdd59bb5c9c6b2a 9012d6ff085e61a8
-Tij a8 2a ac 2e bb 39 bf 3d 95 17 91 13 86 04 82 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tables[6] = 0000000000000000 ccf8f4454c20a861 99f1e88a984150d9 55091ccfd461f8b8 33e3d1153082a1a9 ff1b25507ca209c8 aa12399fa8c3f170 66eacddae4e35911 67c7a22a61054352 ab3f566f2d25eb33 fe364aa0f944138b 32cebee5b564bbea 5424733f5187e2fb 98dc877a1da74a9a cdd59bb5c9c6b222 012d6ff085e61a43
-Tij 43 22 9a fb ea 8b 33 52 11 70 c8 a9 b8 d9 61 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tables[7] = 0000000000000000 cf8f4454c20a86a4 9f1e88a984150d53 5091ccfd461f8bf7 3e3d1153082a1abd f1b25507ca209c19 a12399fa8c3f17ee 6eacddae4e35914a 7c7a22a61054357a b3f566f2d25eb3de e364aa0f94413829 2cebee5b564bbe8d 424733f5187e2fc7 8dc877a1da74a963 dd59bb5c9c6b2294 12d6ff085e61a430
-Tij 30 94 63 c7 8d 29 de 7a 4a ee 19 bd f7 53 a4 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tables[8] = 0000000000000000 f8f4454c20a86af4 f1e88a984150d5f3 091ccfd461f8bf07 e3d1153082a1abfd 1b25507ca209c109 12399fa8c3f17e0e eacddae4e35914fa c7a22a61054357e1 3f566f2d25eb3d15 364aa0f944138212 cebee5b564bbe8e6 24733f5187e2fc1c dc877a1da74a96e8 d59bb5c9c6b229ef 2d6ff085e61a431b
-Tij 1b ef e8 1c e6 12 15 e1 fa 0e 09 fd 07 f3 f4 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tables[9] = 0000000000000000 8f4454c20a86afd9 1e88a984150d5fa9 91ccfd461f8bf070 3d1153082a1abf52 b25507ca209c108b 2399fa8c3f17e0fb acddae4e35914f22 7a22a61054357ea4 f566f2d25eb3d17d 64aa0f944138210d ebee5b564bbe8ed4 4733f5187e2fc1f6 c877a1da74a96e2f 59bb5c9c6b229e5f d6ff085e61a43186
-Tij 86 5f 2f f6 d4 0d 7d a4 22 fb 8b 52 70 a9 d9 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tables[10] = 0000000000000000 f4454c20a86afd48 e88a984150d5fa8b 1ccfd461f8bf07c3 d1153082a1abf50d 25507ca209c10845 399fa8c3f17e0f86 cddae4e35914f2ce a22a61054357ea01 566f2d25eb3d1749 4aa0f9441382108a bee5b564bbe8edc2 733f5187e2fc1f0c 877a1da74a96e244 9bb5c9c6b229e587 6ff085e61a4318cf
-Tij cf 87 44 0c c2 8a 49 01 ce 86 45 0d c3 8b 48 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tables[11] = 0000000000000000 4454c20a86afd419 88a984150d5fa832 ccfd461f8bf07c2b 1153082a1abf507f 5507ca209c108466 99fa8c3f17e0f84d ddae4e35914f2c54 22a61054357ea0fe 66f2d25eb3d174e7 aa0f9441382108cc ee5b564bbe8edcd5 33f5187e2fc1f081 77a1da74a96e2498 bb5c9c6b229e58b3 ff085e61a4318caa
-Tij aa b3 98 81 d5 cc e7 fe 54 4d 66 7f 2b 32 19 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tables[12] = 0000000000000000 454c20a86afd41fc 8a984150d5fa83f8 cfd461f8bf07c204 153082a1abf507eb 507ca209c1084617 9fa8c3f17e0f8413 dae4e35914f2c5ef 2a61054357ea0fd6 6f2d25eb3d174e2a a0f9441382108c2e e5b564bbe8edcdd2 3f5187e2fc1f083d 7a1da74a96e249c1 b5c9c6b229e58bc5 f085e61a4318ca39
-Tij 39 c5 c1 3d d2 2e 2a d6 ef 13 17 eb 04 f8 fc 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tables[13] = 0000000000000000 54c20a86afd41fac a984150d5fa83f58 fd461f8bf07c20f4 53082a1abf507eab 07ca209c10846107 fa8c3f17e0f841f3 ae4e35914f2c5e5f a61054357ea0fd56 f2d25eb3d174e2fa 0f9441382108c20e 5b564bbe8edcdda2 f5187e2fc1f083fd a1da74a96e249c51 5c9c6b229e58bca5 085e61a4318ca309
-Tij 09 a5 51 fd a2 0e fa 56 5f f3 07 ab f4 58 ac 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tables[14] = 0000000000000000 4c20a86afd41fab7 984150d5fa83f56e d461f8bf07c20fd9 3082a1abf507eac7 7ca209c108461070 a8c3f17e0f841fa9 e4e35914f2c5e51e 61054357ea0fd58e 2d25eb3d174e2f39 f9441382108c20e0 b564bbe8edcdda57 5187e2fc1f083f49 1da74a96e249c5fe c9c6b229e58bca27 85e61a4318ca3090
-Tij 90 27 fe 49 57 e0 39 8e 1e a9 70 c7 d9 6e b7 00
-Tij 30 ca c5 3f da 20 2f d5 e5 1f 10 ea 0f f5 fa 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tables[15] = 0000000000000000 c20a86afd41fab1c 84150d5fa83f5623 461f8bf07c20fd3f 082a1abf507eac5d ca209c1084610741 8c3f17e0f841fa7e 4e35914f2c5e5162 1054357ea0fd58ba d25eb3d174e2f3a6 9441382108c20e99 564bbe8edcdda585 187e2fc1f083f4e7 da74a96e249c5ffb 9c6b229e58bca2c4 5e61a4318ca309d8
-Tij d8 c4 fb e7 85 99 a6 ba 62 7e 41 5d 3f 23 1c 00
-Tij 09 a2 5f f4 a5 0e f3 58 51 fa 07 ac fd 56 ab 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Val= 3b60e7ccf8f4454e
-v0 28 4f 14 e3 1b f7 ee 76 b9 31 47 0a ba 8b 70 fc
-v0 12 56 28 59 66 cd d2 d2 1c 91 30 26 a8 95 0a a9
-v0 ee 5d 14 e3 fb c8 45 23 a9 fd 8c f1 ff c9 2c 93
-v0 65 ce 82 f2 dc ec 6b e2 53 a3 9c fb 07 70 e7 ad
-v0 1b 87 3d 7b 4d 15 1d c2 d2 45 f3 03 4b e4 f4 9b
-v0 3b 01 2b c5 c5 d2 9d a9 68 7c a2 61 c9 5b 49 90
-v0 5d 13 7d ef eb f1 52 da a0 29 89 ef 08 f2 51 3b
-v0 17 05 b3 80 77 3a f2 5e 82 7a c9 39 84 df 8e bf
-
-p0 11 fc 47 f4 6c 01 44 ba ba 62 e7 3f ba fb ba 85
-p0 a6 fc 67 16 5f c3 95 fc 58 51 f4 fd 58 5f 58 a5
-p0 12 fc 1f b3 50 1e 3f 9a fd 5e 83 20 fd 9c fd dd
-p0 d9 fc 1e ee 22 42 10 7f a0 2c f0 7c a0 24 a0 dc
-p0 a2 fc 4c 30 41 ce ad eb 7e 4f c1 f0 7e 6e 7e 8e
-p0 8b fc 7c 7b 9f b5 38 67 35 91 2f 8b 35 a9 35 be
-p0 07 fc 89 1a 3b 21 fd db 54 35 7e 1f 54 74 54 4b
-p0 cf fc 94 5e 40 78 c2 31 10 4e 18 46 10 da 10 56
diff --git a/tmp2.sh b/tmp2.sh
deleted file mode 100644
index d98248f..0000000
--- a/tmp2.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-if [ $# -lt 4 ]; then
-  echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2
-  exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 1073741824 ]; do
-  iter=`echo $i | awk '{ print (1073741824/$1)*10 }'`
-  echo $i $iter $w $* `gf_time $w R -1 $i $iter $*`
-  i=`echo $i | awk '{ print $1*2 }'`
-done
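
The two scripts removed above were scratch drivers for gf_time: tmp2.sh doubles the buffer size from 1K to 1G and scales the iteration count so each point moves roughly 10 GB ((1073741824/size)*10 iterations), passing the remaining arguments through as the method specification. For a single such data point without the scripts, here is a hedged standalone sketch: it times multiply_region.w32 directly, assumes the Revision 1.0 gf_init_easy API, and assumes malloc returns buffers aligned well enough for the SSE region paths. It is an illustration, not a replacement for gf_time.

/* Sketch only: time one w=32 region multiply at a single buffer size,
   mirroring tmp2.sh's (1073741824/size)*10 iteration rule. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "gf_complete.h"

int main()
{
  gf_t gf;
  int size = 1024 * 1024;                   /* one point of the old sweep */
  int i, iters = (1073741824 / size) * 10;  /* ~10 GB total, as in tmp2.sh */
  char *src = malloc(size), *dest = malloc(size);
  clock_t t0;

  if (!gf_init_easy(&gf, 32)) return 1;
  memset(src, 0x5a, size);                  /* arbitrary source data */

  t0 = clock();
  for (i = 0; i < iters; i++) {
    /* dest = src * constant in GF(2^32); final 0 = overwrite, no xor */
    gf.multiply_region.w32(&gf, src, dest, 0x13245768, size, 0);
  }
  printf("%.2f MB/s\n",
         (double) size * iters / 1048576.0 /
         ((double) (clock() - t0) / CLOCKS_PER_SEC));
  free(src); free(dest);
  return 0;
}

The argument order visible in the deleted tmp2.sh invocation of gf_time (word size, test selector, seed, buffer size, iterations, method specification) remains the way to reproduce the full sweeps.
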