GF-Complete Release 1.0.

Please see the user's manual for details.
2013-10-09 10:36:37 -04:00 · 2013-10-09 10:36:37 -04:00 · 110523d6f3
parent 79a46d18b6
commit 110523d6f3
50 changed files with 7050 additions and 5481 deletions
--- a/29
+++ b/29
@ -1,24 +1,23 @@
 #
 # GNUmakefile for Galois field library
 #
-#
+# The default flags do *not* have the SSE instructions enabled.
+# Please cd to flag_tester and run which_compile_flags.sh to see which SSE instructions
+# your machine and compiler support, and which flags you should include below.
+
+CFLAGS = -O3 
+LDFLAGS = -O3 

 SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
       gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
       gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
-       gf_inline_time.c
+       gf_inline_time.c gf_example_5.c gf_example_6.c gf_example_7.c

 HDRS = gf_complete.h gf_int.h

 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
-              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
-
-CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
-LDFLAGS = -O3 -msse4 -maes -mpclmul
-
-# Use these if you don't have INTEL_PCLMUL
-# CFLAGS = -O3 -msse4 -DINTEL_SSE4
-# LDFLAGS = -O3 -msse4 
+              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time \
+              gf_example_5 gf_example_6 gf_example_7

 RM = /bin/rm -f

@ -45,6 +44,9 @@ gf_example_1: gf_example_1.o gf_complete.a
 gf_example_2: gf_example_2.o gf_complete.a
 gf_example_3: gf_example_3.o gf_complete.a
 gf_example_4: gf_example_4.o gf_complete.a
+gf_example_5: gf_example_5.o gf_complete.a
+gf_example_6: gf_example_6.o gf_complete.a
+gf_example_7: gf_example_7.o gf_complete.a
 gf_mult: gf_mult.o gf_complete.a
 gf_div: gf_div.o gf_complete.a
 gf_poly: gf_poly.o gf_complete.a
@ -54,7 +56,8 @@ clean:
 	$(RM) $(OBJS) gf_div.c

 spotless: clean
-	$(RM) *~ $(EXECUTABLES)
+	$(RM) *~ $(EXECUTABLES) which_compile_flags
+	$(RM) gf_complete.a

 gf_div.o: gf_complete.h gf_method.h
 gf_methods.o: gf_complete.h gf_method.h
@ -71,8 +74,12 @@ gf_example_1.o: gf_complete.h gf_rand.h
 gf_example_2.o: gf_complete.h gf_rand.h
 gf_example_3.o: gf_complete.h gf_rand.h
 gf_example_4.o: gf_complete.h gf_rand.h
+gf_example_5.o: gf_complete.h gf_rand.h
+gf_example_6.o: gf_complete.h gf_rand.h
+gf_example_7.o: gf_complete.h gf_rand.h
 gf_general.o: gf_complete.h gf_int.h gf_general.h gf_rand.h
 gf_mult.o: gf_complete.h gf_method.h
+gf.o: gf_complete.h gf_int.h
 gf_method.o: gf_complete.h

 gf_div.c: gf_mult.c
--- a/Log-Zero-for-w=8.odg
+++ b/Log-Zero-for-w=8.odg
--- a/Manual.pdf
+++ b/Manual.pdf
--- a/1
+++ b/1
@ -1 +0,0 @@
-This is a README file.
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,13 @@
-This is GF-Complete, Revision 0.1.  
+This is GF-Complete, Revision 1.0.  
+
+The user's manual is in the file Manual.pdf.  
+
+There are two online homes for GF-Complete:
+
+  - https://bitbucket.org/jimplank/gf-complete
+  - http://www.cs.utk.edu/~plank/plank/papers/CS-13-716.html
+
+When compiling this for the first time, cd to flag_tester, and
+do "sh which_compile_flags.sh xxx", where xxx is the compiler
+that you will use in the GNUmakefile.

-Please see http://www.cs.utk.edu/~plank/plank/papers/CS-13-703.html for the user's
-manual and other important documentation about this library, including more 
-recent revisions.
--- a/explanation.html
+++ b/explanation.html
@ -1,777 +0,0 @@
-<h3>Code structure as of 7/20/2012</h3>
-
-written by Jim.
-<p>
-Ok -- once again, I have messed with the structure.  My goal is flexible and efficient.
-It's similar to the stuff before, but better because it makes things like Euclid's 
-method much cleaner.
-<p>
-I think we're ready to hack.  
-<p>
-<p>
-<hr>
-<h3>Files</h3>
-<UL>
-<LI> <a href=GNUmakefile><b>GNUmakefile</b></a>: Makefile
-<LI> <a href=README><b>README</b></a>: Empty readme
-<LI> <a href=explanation.html><b>explanation.html</b></a>: This file.
-<LI> <a href=gf.c><b>gf.c</b></a>: Main gf routines
-<LI> <a href=gf.h><b>gf.h</b></a>: Main gf prototypes and typedefs
-<LI> <a href=gf_int.h><b>gf_int.h</b></a>: Prototypes and typedefs for common routines for the 
-    internal gf implementations.
-<LI> <a href=gf_method.c><b>gf_method.c</b></a>: Code to help parse argc/argv to define the method.
-    This way, various programs can be consistent with how they handle the command line.
-<LI> <a href=gf_method.h><b>gf_method.h</b></a>: Prototypes for ibid.
-<LI> <a href=gf_methods.c><b>gf_methods.c</b></a>: This program prints out how to define
-    the various methods on the command line.  My idea is to beef this up so that you can 
-    give it a method spec on the command line, and it will tell you whether it's valid, or
-    why it's invalid.  I haven't written that part yet.
-<LI> <a href=gf_mult.c><b>gf_mult.c</b></a>: Program to do single multiplication.
-<LI> <a href=gf_mult.c><b>gf_mult.c</b></a>: Program to do single divisions -- it's created
-    in the makefile with a sed script on gf_mult.c.
-<LI> <a href=gf_time.c><b>gf_time.c</b></a>: Time tester
-<LI> <a href=gf_unit.c><b>gf_unit.c</b></a>: Unit tester
-<LI> <a href=gf_54.c><b>gf_54.c</b></a>: A simple example program that multiplies 
-    5 and 4 in GF(2^4).
-<LI> <a href=gf_w4.c><b>gf_w4.c</b></a>: Implementation of code for <i>w</i> = 4.
-(For now, only SHIFT and LOG, plus EUCLID & MATRIX).
-<LI> <a href=gf_w8.c><b>gf_w8.c</b></a>: Implementation of code for <i>w</i> = 8.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w16.c><b>gf_w16.c</b></a>: Implementation of code for <i>w</i> = 16.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w32.c><b>gf_w32.c</b></a>: Implementation of code for <i>w</i> = 32.
-(For now, only SHIFT plus EUCLID & MATRIX).
-<LI> <a href=gf_w64.c><b>gf_w64.c</b></a>: Implementation of code for <i>w</i> = 64.
-(For now, only SHIFT and EUCLID.
-<LI> I don't have gf_w128.c or gf_gen.c yet.
-</UL>
-
-<hr>
-<h3>Prototypes and typedefs in gf.h</h3>
-
-The main structure that users will see is in <b>gf.h</b>, and it is of type
-<b>gf_t</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef struct gf {
-  gf_func_a_b    multiply;
-  gf_func_a_b    divide;
-  gf_func_a      inverse;
-  gf_region      multiply_region;
-  void           *scratch;
-} gf_t;
-</pre></td></table></center><p>
-    
-We can beef it up later with buf-buf or buf-acc.  The problem is that the paper is 
-already bloated, so right now, I want to keep it lean.
-<p>
-The types of the procedures are big unions, so that they work with the following
-types of arguments:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef uint8_t     gf_val_4_t;
-typedef uint8_t     gf_val_8_t;
-typedef uint16_t    gf_val_16_t;
-typedef uint32_t    gf_val_32_t;
-typedef uint64_t    gf_val_64_t;
-typedef uint64_t    *gf_val_128_t;
-typedef uint32_t    gf_val_gen_t;   /* The intent here is for general values <= 32 */
-</pre></td></table></center><p>
-
-To use one of these, you need to create one with <b>gf_init_easy()</b> or 
-<b>gf_init_hard()</b>.  Let's concentrate on the former:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_init_easy(gf_t *gf, int w, int mult_type);
-</pre></td></table></center><p>
-
-You pass it memory for a <b>gf_t</b>, a value of <b>w</b> and
-a variable that says how to do multiplication.  The valid values of <b>mult_type</b>
-are enumerated in <b>gf.h</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef enum {GF_MULT_DEFAULT,
-              GF_MULT_SHIFT,
-              GF_MULT_GROUP,
-              GF_MULT_BYTWO_p,
-              GF_MULT_BYTWO_b,
-              GF_MULT_TABLE,
-              GF_MULT_LOG_TABLE,
-              GF_MULT_SPLIT_TABLE,
-              GF_MULT_COMPOSITE } gf_mult_type_t;
-</pre></td></table></center><p>
-
-After creating the <b>gf_t</b>, you use its <b>multiply</b> method
-to multiply, using the union's fields to work with the various types.
-It looks easier than my explanation.  For example, suppose you wanted to multiply 5 and 4 in <i>GF(2<sup>4</sup>)</i>.
-You can do it as in 
-<b><a href=gf_54.c>gf_54.c</a></b>
-
-<p><center><table border=3 cellpadding=3><td><pre>
-#include "gf.h"
-
-main()
-{
-  gf_t gf;
-
-  gf_init_easy(&gf, 4, GF_MULT_DEFAULT);
-  printf("%d\n", gf.multiply.w4(&gf, 5, 4));
-  exit(0);
-}
-</pre></td></table></center><p>
-
-
-If you wanted to multiply in <i>GF(2<sup>8</sup>)</i>, then you'd have to use 8 as a parameter
-to <b>gf_init_easy</b>, and call the multiplier as <b>gf.mult.w8()</b>.
-<p>
-When you're done with your <b>gf_t</b>, you should call <b>gf_free()</b> on it so
-that it can free memory that it has allocated.  We'll talk more about memory later, but if you
-create your <b>gf_t</b> with <b>gf_init_easy</b>, then it calls <b>malloc()</b>, and 
-if you care about freeing memory, you'll have to call <b>gf_free()</b>.
-<p>
-
-<hr>
-<h3>Memory allocation</h3>
-
-Each implementation of a multiplication technique keeps around its
-own data.  For example, <b>GF_MULT_TABLE</b> keeps around 
-multiplication and division tables, and <b>GF_MULT_LOG</b> maintains log and
-antilog tables.  This data is stored in the pointer <b>scratch</b>.  My intent
-is that the memory that is there is all that's required.  In other
-words, the <b>multiply()</b>, <b>divide()</b>, <b>inverse()</b> and
-<b>multiply_region()</b> calls don't do any memory allocation.
-Moreover, <b>gf_init_easy()</b> only allocates one chunk of memory --
-the one in <b>scratch</b>.
-<p>
-If you don't want to have the initialization call allocate memory, you can use <b>gf_init_hard()</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_init_hard(gf_t *gf,
-                        int w,
-                        int mult_type,
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1,
-                        int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory);
-</pre></td></table></center><p>
-
-The first three parameters are the same as <b>gf_init_easy()</b>. 
-You can add additional arguments for performing <b>multiply_region</b>, and
-for performing division in the <b>region_type</b> and <b>divide_type</b>
-arguments.  Their values are also defined in <b>gf.h</b>.  You can 
-mix the <b>region_type</b> values (e.g. "DOUBLE" and "SSE"):
-
-<p><center><table border=3 cellpadding=3><td><pre>
-#define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
-
-typedef uint32_t gf_region_type_t;
-
-typedef enum { GF_DIVIDE_DEFAULT,
-               GF_DIVIDE_MATRIX,
-               GF_DIVIDE_EUCLID } gf_division_type_t;
-</pre></td></table></center><p>
-You can change
-the primitive polynomial with <b>prim_poly</b>, give additional arguments with 
-<b>arg1</b> and <b>arg2</b> and give a base Galois Field for composite fields.
-Finally, you can pass it a pointer to memory in <b>scratch_memory</b>.  That
-way, you can avoid having <b>gf_init_hard()</b> call <b>malloc()</b>.  
-<p>
-There is a procedure called <b>gf_scratch_size()</b> that lets you know the minimum
-size for <b>scratch_memory</b>, depending on <i>w</i>, the multiplication type
-and the arguments:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-extern int gf_scratch_size(int w,
-                           int mult_type,
-                           int region_type,
-                           int divide_type,
-                           int arg1,
-                           int arg2);
-</pre></td></table></center><p>
-
-You can specify default arguments in <b>gf_init_hard()</b>:
-<UL>
-<LI> <b>region_type</b> = <b>GF_REGION_DEFAULT</b> 
-<LI> <b>divide_type</b> = <b>GF_REGION_DEFAULT</b>
-<LI> <b>prim_poly</b> = 0
-<LI> <b>arg1</b> = 0
-<LI> <b>arg2</b> = 0
-<LI> <b>base_gf</b> = <b>NULL</b>
-<LI> <b>scratch_memory</b> = <b>NULL</b>
-</UL>
-If any argument is equal to its default, then default actions are taken (e.g. a 
-standard primitive polynomial is used, or memory is allocated for <b>scratch_memory</b>).
-In fact, <b>gf_init_easy()</b> simply calls <b>gf_init_hard()</b> with the default
-parameters.
-<p>
-<b>gf_free()</b> frees memory that was allocated with <b>gf_init_easy()</b>
-or <b>gf_init_hard()</b>.  The <b>recursive</b> parameter is in case you 
-use composite fields, and want to recursively free the base fields.
-If you pass <b>scratch_memory</b> to <b>gf_init_hard()</b>, then you typically
-don't need to call <b>gf_free()</b>.  It won't hurt to call it, though.
-
-<hr>
-<h3>gf_mult and gf_div</h3>
-
-For the moment, I have few things completely implemented, but that's because I want
-to be able to explain the structure, and how to specify methods.  In particular, for
-<i>w=4</i>, I have implemented <b>SHIFT</b> and <b>LOG</b>.  For <i>w=8, 16, 32, 64</i>
-I have implemented <b>SHIFT</b>.  For all <i>w &le; 32</i>, I have implemented both
-Euclid's algorithm for inversion, and the matrix method for inversion.  For
-<i>w=64</i>, it's just Euclid.  You can
-test these all with <b>gf_mult</b> and <b>gf_div</b>.  Here are a few calls:
-
-<pre>
-UNIX> <font color=darkred><b>gf_mult 7 11 4</b></font>                - Default
-4
-UNIX> <font color=darkred><b>gf_mult 7 11 4 SHIFT - -</b></font>      - Use shift
-4
-UNIX> <font color=darkred><b>gf_mult 7 11 4 LOG - -</b></font>        - Use logs
-4
-UNIX> <font color=darkred><b>gf_div 4 7 4</b></font>                  - Default
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - -</b></font>          - Use logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - EUCLID</b></font>     - Use Euclid instead of logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 LOG - MATRIX</b></font>     - Use Matrix inversion instead of logs
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - -</b></font>        - Default
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - EUCLID</b></font>   - Use Euclid (which is the default)
-11
-UNIX> <font color=darkred><b>gf_div 4 7 4 SHIFT - MATRIX</b></font>   - Use Matrix inversion instead of logs
-11
-UNIX> <font color=darkred><b>gf_mult 200 211 8</b></font>        - The remainder are shift/Euclid
-201
-UNIX> <font color=darkred><b>gf_div 201 211 8</b></font>
-200
-UNIX> <font color=darkred><b>gf_mult 60000 65111 16</b></font>
-63515
-UNIX> <font color=darkred><b>gf_div 63515 65111 16</b></font>
-60000
-UNIX> <font color=darkred><b>gf_mult abcd0001 9afbf788 32h</b></font>
-b0359681
-UNIX> <font color=darkred><b>gf_div b0359681 9afbf788 32h</b></font>
-abcd0001
-UNIX> <font color=darkred><b>gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h</b></font>
-3a7def35185bd571
-UNIX> <font color=darkred><b>gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h</b></font>
-3a7def35185bd571
-UNIX> <font color=darkred><b>gf_div 3a7def35185bd571 9afbf7887f6d8e5b 64h</b></font>
-abcd00018c8b8c8a
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-You can see all the methods with <b>gf_methods</b>.  We have a lot of implementing to do:
-
-<pre>
-UNIX> <font color=darkred><b>gf_methods</b></font>
-To specify the methods, do one of the following: 
-       - leave empty to use defaults
-       - use a single dash to use defaults
-       - specify MULTIPLY REGION DIVIDE
-
-Legal values of MULTIPLY:
-       SHIFT: shift
-       GROUP g_mult g_reduce: the Group technique - see the paper
-       BYTWO_p: BYTWO doubling the product.
-       BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)
-       TABLE: Full multiplication table
-       LOG:   Discrete logs
-       LOG_ZERO: Discrete logs with a large table for zeros
-       SPLIT g_a g_b: Split tables defined by g_a and g_b
-       COMPOSITE k l [METHOD]: Composite field, recursively specify the
-                               method of the base field in GF(2^l)
-
-Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'
-       -: Use defaults
-       SINGLE/DOUBLE/QUAD: Expand tables
-       LAZY: Lazily create table (only applies to TABLE and SPLIT)
-       SSE/NOSSE: Use 128-bit SSE instructions if you can
-       CAUCHY/ALTMAP/STDMAP: Use different memory mappings
-
-Legal values of DIVIDE:
-       -: Use defaults
-       MATRIX: Use matrix inversion
-       EUCLID: Use the extended Euclidian algorithm.
-
-See the user's manual for more information.
-There are many restrictions, so it is better to simply use defaults in most cases.
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-<hr>
-<h3>gf_unit and gf_time</h3>
-
-<b><a href=gf_unit.c>gf_unit.c</a></b> is a unit tester, and 
-<b><a href=gf_time.c>gf_time.c</a></b> is a time tester.
-
-They are called as follows:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-UNIX> <font color=darkred><b>gf_unit w tests seed [METHOD] </b></font>
-UNIX> <font color=darkred><b>gf_time w tests seed size(bytes) iterations [METHOD] </b></font>
-</pre></td></table></center><p>
-
-The <b>tests</b> parameter is one or more of the following characters:
-
-<UL>
-<LI>        A: Do all tests
-<LI>        S: Test only single operations (multiplication/division)
-<LI>        R: Test only region operations
-<LI>        V: Verbose Output
-</UL>
-
-<b>seed</b> is a seed for <b>srand48()</b> -- using -1 defaults to the current time.
-<p>
-For example, testing the defaults with w=4:
-
-<pre>
-UNIX> <font color=darkred><b>gf_unit 4 AV 1 LOG - -</b></font>
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-Testing buffer-constant, src != dest, xor = 0
-Testing buffer-constant, src != dest, xor = 1
-Testing buffer-constant, src == dest, xor = 0
-Testing buffer-constant, src == dest, xor = 1
-UNIX> <font color=darkred><b>gf_unit 4 AV 1 SHIFT - -</b></font>
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-No multiply_region.
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-There is no <b>multiply_region()</b> method defined for <b>SHIFT</b>.
-Thus, the procedures are <b>NULL</b> and the unit tester ignores them.
-<p>
-At the moment, I only have the unit tester working for w=4.
-<p>
-<b>gf_time</b> takes the size of an array (in bytes) and a number of iterations, and
-tests the speed of both single and region operations.  The tests are:
-
-<UL>
-<LI> A: All
-<LI> S: All Single Operations
-<LI> R: All Region Operations
-<LI> M: Single: Multiplications
-<LI> D: Single: Divisions
-<LI> I: Single: Inverses
-<LI> B: Region: Multipy_Region
-</UL> 
-
-Here are some examples with <b>SHIFT</b> and <b>LOG</b> on my mac.
-
-<pre>
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - -</b></font>
-Seed: 1
-Multiply:   0.538126 s      185.830 Mega-ops/s
-Divide:     0.520825 s      192.003 Mega-ops/s
-Inverse:    0.631198 s      158.429 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.478395 s      209.032 MB/s
-Buffer-Const,s!=d,xor=1:    0.524245 s      190.751 MB/s
-Buffer-Const,s==d,xor=0:    0.471851 s      211.931 MB/s
-Buffer-Const,s==d,xor=1:    0.528275 s      189.295 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - EUCLID</b></font>
-Seed: 1
-Multiply:   0.555512 s      180.014 Mega-ops/s
-Divide:     5.359434 s       18.659 Mega-ops/s
-Inverse:    4.911719 s       20.359 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.496097 s      201.573 MB/s
-Buffer-Const,s!=d,xor=1:    0.538536 s      185.689 MB/s
-Buffer-Const,s==d,xor=0:    0.485564 s      205.946 MB/s
-Buffer-Const,s==d,xor=1:    0.540227 s      185.107 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 LOG - MATRIX</b></font>
-Seed: 1
-Multiply:   0.544005 s      183.822 Mega-ops/s
-Divide:     7.602822 s       13.153 Mega-ops/s
-Inverse:    7.000564 s       14.285 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.474868 s      210.585 MB/s
-Buffer-Const,s!=d,xor=1:    0.527588 s      189.542 MB/s
-Buffer-Const,s==d,xor=0:    0.473130 s      211.358 MB/s
-Buffer-Const,s==d,xor=1:    0.529877 s      188.723 MB/s
-UNIX> <font color=darkred><b>gf_time 4 A 1 102400 1024 SHIFT - -</b></font>
-Seed: 1
-Multiply:   2.708842 s       36.916 Mega-ops/s
-Divide:     8.756882 s       11.420 Mega-ops/s
-Inverse:    5.695511 s       17.558 Mega-ops/s
-UNIX> <font color=darkred><b></b></font>
-</pre>
-
-At the moment, I only have the timer working for w=4.
-
-<hr>
-<h3>Walking you through <b>LOG</b></h3>
-
-To see how <b>scratch</b> is used to store data, let's look at what happens when 
-you call <b>gf_init_easy(&gf, 4, GF_MULT_LOG);</b>  
-First, <b>gf_init_easy()</b> calls <b>gf_init_hard()</b> with default parameters.
-This is in <b><a href=gf.c>gf.c</a></b>.
-<p>
-<b>gf_init_hard()</b>' first job is to set up the scratch.
-The scratch's type is <b>gf_internal_t</b>, defined in 
-<b><a href=gf_int.h>gf_int.h</a></b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-typedef struct {
-  int mult_type;
-  int region_type;
-  int divide_type;
-  int w;
-  uint64_t prim_poly;
-  int free_me;
-  int arg1;
-  int arg2;
-  gf_t *base_gf;
-  void *private;
-} gf_internal_t;
-</pre></td></table></center><p>
-
-All the fields are straightfoward, with the exception of <b>private</b>.  That is
-a <b>(void *)</b> which points to the implementation's private data.
-<p>
-Here's the code for 
-<b>gf_init_hard()</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_init_hard(gf_t *gf, int w, int mult_type, 
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1, int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory) 
-{
-  int sz;
-  gf_internal_t *h;
-
-
-  if (scratch_memory == NULL) {
-    sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-    if (sz &lt;= 0) return 0;
-    h = (gf_internal_t *) malloc(sz);
-    h-&gt;free_me = 1;
-  } else {
-    h = scratch_memory;
-    h-&gt;free_me = 0;
-  }
-  gf-&gt;scratch = (void *) h;
-  h-&gt;mult_type = mult_type;
-  h-&gt;region_type = region_type;
-  h-&gt;divide_type = divide_type;
-  h-&gt;w = w;
-  h-&gt;prim_poly = prim_poly;
-  h-&gt;arg1 = arg1;
-  h-&gt;arg2 = arg2;
-  h-&gt;base_gf = base_gf;
-  h-&gt;private = (void *) gf-&gt;scratch;
-  h-&gt;private += (sizeof(gf_internal_t));
-
-  switch(w) {
-    case 4: return gf_w4_init(gf);
-    case 8: return gf_w8_init(gf);
-    case 16: return gf_w16_init(gf);
-    case 32: return gf_w32_init(gf);
-    case 64: return gf_w64_init(gf);
-    case 128: return gf_dummy_init(gf);
-    default: return 0;
-  }
-}
-</pre></td></table></center><p>
-
-The first thing it does is determine if it has to allocate space for <b>scratch</b>.
-If it must, it uses <b>gf_scratch_size()</b> to figure out how big the space must be.
-It then sets <b>gf->scratch</b> to this space, and sets all of the fields of the
-scratch to the arguments in <b>gf_init_hard()</b>.  The <b>private</b> pointer is
-set to be the space just after the pointer <b>gf->private</b>.   Again, it is up to 
-<b>gf_scratch_size()</b> to make sure there is enough space for the scratch, and 
-for all of the private data needed by the implementation.
-<p>
-Once the scratch is set up, <b>gf_init_hard()</b> calls <b>gf_w4_init()</b>.  This is
-in <b><a href=gf_w4.c>gf_w4.c</a></b>, and it is a 
-simple dispatcher to the various initialization routines, plus it 
-sets <b>EUCLID</b> and <b>MATRIX</b> if need be:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_w4_init(gf_t *gf)
-{
-  gf_internal_t *h;
-
-  h = (gf_internal_t *) gf-&gt;scratch;
-  if (h-&gt;prim_poly == 0) h-&gt;prim_poly = 0x13;
-
-  gf-&gt;multiply.w4 = NULL;
-  gf-&gt;divide.w4 = NULL;
-  gf-&gt;inverse.w4 = NULL;
-  gf-&gt;multiply_region.w4 = NULL;
-
-  switch(h-&gt;mult_type) {
-    case GF_MULT_SHIFT:     if (gf_w4_shift_init(gf) == 0) return 0; break;
-    case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
-    case GF_MULT_DEFAULT:   if (gf_w4_log_init(gf) == 0) return 0; break;
-    default: return 0;
-  }
-  if (h-&gt;divide_type == GF_DIVIDE_EUCLID) {
-    gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
-    gf-&gt;inverse.w4 = gf_w4_euclid;
-  } else if (h-&gt;divide_type == GF_DIVIDE_MATRIX) {
-    gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
-    gf-&gt;inverse.w4 = gf_w4_matrix;
-  }
-
-  if (gf-&gt;inverse.w4 != NULL && gf-&gt;divide.w4 == NULL) {
-    gf-&gt;divide.w4 = gf_w4_divide_from_inverse;
-  }
-  if (gf-&gt;inverse.w4 == NULL && gf-&gt;divide.w4 != NULL) {
-    gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
-  }
-  return 1;
-}
-</pre></td></table></center><p>
-
-The code in <b>gf_w4_log_init()</b> sets up the log and antilog tables, and sets
-the <b>multiply.w4</b>, <b>divide.w4</b> etc routines to be the ones for logs.  The
-tables are put into <b>gf->scratch->private</b>, which is typecast to a <b>struct
-gf_logtable_data *</b>:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-struct gf_logtable_data {
-    gf_val_4_t      log_tbl[GF_FIELD_SIZE];
-    gf_val_4_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    gf_val_4_t      *antilog_tbl_div;
-};
-.......
-
-static 
-int gf_w4_log_init(gf_t *gf)
-{
-  gf_internal_t *h;
-  struct gf_logtable_data *ltd;
-  int i, b;
-
-  h = (gf_internal_t *) gf-&gt;scratch;
-  ltd = h-&gt;private;
-
-  ltd-&gt;log_tbl[0] = 0;
-
-  ltd-&gt;antilog_tbl_div = ltd-&gt;antilog_tbl + (GF_FIELD_SIZE-1);
-  b = 1;
-  for (i = 0; i &lt; GF_FIELD_SIZE-1; i++) {
-      ltd-&gt;log_tbl[b] = (gf_val_8_t)i;
-      ltd-&gt;antilog_tbl[i] = (gf_val_8_t)b;
-      ltd-&gt;antilog_tbl[i+GF_FIELD_SIZE-1] = (gf_val_8_t)b;
-      b &lt;&lt;= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h-&gt;prim_poly;
-      }
-  }
-    
-  gf-&gt;inverse.w4 = gf_w4_inverse_from_divide;
-  gf-&gt;divide.w4 = gf_w4_log_divide;
-  gf-&gt;multiply.w4 = gf_w4_log_multiply;
-  gf-&gt;multiply_region.w4 = gf_w4_log_multiply_region;
-  return 1;
-}
-</pre></td></table></center><p>
-
-And of course the individual routines use <b>h->private</b> to access the tables:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-static
-inline
-gf_val_8_t gf_w4_log_multiply (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
-  struct gf_logtable_data *ltd;
-    
-  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf-&gt;scratch))-&gt;private;
-  return (a == 0 || b == 0) ? 0 : ltd-&gt;antilog_tbl[(unsigned)(ltd-&gt;log_tbl[a] + ltd-&gt;log_tbl[b])];
-}
-</pre></td></table></center><p>
-
-Finally, it's important that the proper sizes are put into 
-<b>gf_w4_scratch_size()</b> for each implementation:
-
-<p><center><table border=3 cellpadding=3><td><pre>
-int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
-{
-  int region_tbl_size;
-  switch(mult_type)
-  {
-    case GF_MULT_DEFAULT:
-    case GF_MULT_LOG_TABLE:
-      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
-      break;
-    case GF_MULT_SHIFT:
-      return sizeof(gf_internal_t);
-      break;
-    default:
-      return -1;
-   }
-}
-</pre></td></table></center><p>
-I hope that's enough explanation for y'all to start implementing.  Let me know if you have
-problems -- thanks -- Jim
-
-<hr>
-The initial structure has been set for w=4, 8, 16, 32 and 64, with implementations of SHIFT and EUCLID, and for w <= 32, MATRIX.  There are some weird caveats:
-
-<UL>
-<LI> For w=32 and w=64, the primitive polynomial does not have the leading one.  
-<LI> I'd like for naming to be:
-<p>
-<UL>
-      <b>gf_w</b><i>w</i><b>_</b><i>technique</i></i><b>_</b><i>funcationality</i><b>()</b>.
-</UL>
-<p>
-For example, the log techniques for w=4 are:
-<pre>
-gf_w4_log_multiply()
-gf_w4_log_divide()
-gf_w4_log_multiply_region()
-gf_w4_log_init()
-</pre>
-<p>
-<LI> I'd also like a header block on implementations that says who wrote it.
-</UL>
-
-<hr>
-<h3>Things we need to Implement: <i>w=4</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Single TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Double TABLE, SSE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Quad TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Lazy Quad TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=8</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim </td> </tr>
-<tr> <td> Single TABLE </td> <td> Done - Kevin </td> </tr>
-<tr> <td> Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Lazy Double TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> Split 2 1 (Half) SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Composite, k=2 </td> <td> Done - Kevin (alt mapping not passing unit test) </td> </tr>
-<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
-<tr> <td> LOG ZERO</td> <td> Done - Jim</td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=16</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Lazy TABLE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 No-SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 SSE, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 4 16 SSE, lazy, alternate mapping </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 8 16, lazy </td> <td>Done - Jim</td> </tr>
-<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
-<tr> <td> LOG </td> <td> Done - Kevin </td> </tr>
-<tr> <td> LOG ZERO</td> <td> Done - Kevin </td> </tr>
-<tr> <td> Group 4 4 </td> <td>Done - Jim: I don't see a reason to implement others, although 4-8 will be faster, and 8 8 will have faster region ops.  They'll never beat SPLIT.</td> </tr>
-</table><p>
-
-<hr>
-<h3>Things we need to Implement: <i>w=32</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td>Done - Jim</td> </tr>
-<tr> <td> Split 2 32,lazy </td> <td>Done  - Jim</td> </tr>
-<tr> <td> Split 2 32, SSE, lazy </td> <td>Done  - Jim</td> </tr>
-<tr> <td> Split 4 32, lazy </td> <td>Done  - Jim</td> </tr>
-<tr> <td> Split 4 32, SSE,ALTMAP lazy </td> <td>Done  - Jim</td> </tr>
-<tr> <td> Split 4 32, SSE, lazy </td> <td>Done  - Jim</td> </tr>
-<tr> <td> Split 8 8 </td> <td>Done - Jim </td> </tr>
-<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
-<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
-<tr> <td> Composite, k=2, stdmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, altmap recursive </td> <td> Done - Kevin</td> </tr>
-<tr> <td> Composite, k=2, stdmap inline </td> <td> Done - Kevin</td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=64</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b </td> <td> - </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
-<tr> <td> Split 16 1 SSE, maybe lazy </td> <td> - </td> </tr>
-<tr> <td> Split 8 1 lazy </td> <td> - </td> </tr>
-<tr> <td> Split 8 8 </td> <td> - </td> </tr>
-<tr> <td> Split 8 8 lazy </td> <td> - </td> </tr>
-<tr> <td> Group </td> <td> - </td> </tr>
-<tr> <td> Composite, k=2, alternate mapping </td> <td> - </td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=128</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> SHIFT </td> <td> Done - Will </td> </tr>
-<tr> <td> BYTWO_p </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b </td> <td> - </td> </tr>
-<tr> <td> BYTWO_p, SSE </td> <td> - </td> </tr>
-<tr> <td> BYTWO_b, SSE </td> <td> - </td> </tr>
-<tr> <td> Split 32 1 SSE, maybe lazy </td> <td> - </td> </tr>
-<tr> <td> Split 16 1 lazy </td> <td> - </td> </tr>
-<tr> <td> Split 16 16 - Maybe that's insanity</td> <td> - </td> </tr>
-<tr> <td> Split 16 16 lazy </td> <td> - </td> </tr>
-<tr> <td> Group (SSE) </td> <td> - </td> </tr>
-<tr> <td> Composite, k=?, alternate mapping </td> <td> - </td> </tr>
-</table><p>
-<hr>
-<h3>Things we need to Implement: <i>w=general between 1 & 32</i></h3>
-
-<p><table border=3 cellpadding=2>
-<tr> <td> CAUCHY Region (SSE XOR)</td> <td> Done - Jim </td> </tr>
-<tr> <td> SHIFT </td> <td> Done - Jim </td> </tr>
-<tr> <td> TABLE </td> <td> Done - Jim </td> </tr>
-<tr> <td> LOG </td> <td> Done - Jim </td> </tr>
-<tr> <td> BYTWO_p </td> <td>Done - Jim</td> </tr>
-<tr> <td> BYTWO_b </td> <td>Done - Jim</td> </tr>
-<tr> <td> Group, g_s == g_r </td> <td>Done - Jim</td></tr>
-<tr> <td> Group, any g_s and g_r</td> <td>Done - Jim</td></tr>
-<tr> <td> Split - do we need it?</td> <td>Done - Jim</td></tr>
-<tr> <td> Composite - do we need it?</td> <td> - </td></tr>
-<tr> <td> Split - do we need it?</td> <td> - </td></tr>
-<tr> <td> Logzero?</td> <td> - </td></tr>
-</table><p>
--- a/flag_tester/README.txt
+++ b/flag_tester/README.txt
@ -0,0 +1,10 @@
+Run which_compile_flags.sh and it will print out the compile flags to use in
+  GNUmakefile. By default, this script uses "cc" as its compiler but you can
+  pass in the name of your compiler as an argument.
+
+EXAMPLE: "./which_compile_flags.sh clang"
+
+WARNING: the script executes its argument as a command ("clang" in the example
+above), so if you pass something like "rm", that is what gets run.  Also, make
+sure that the compiler you pass to which_compile_flags.sh is the same as the
+compiler used in GNUmakefile.
--- a/flag_tester/flag_test.c
+++ b/flag_tester/flag_test.c
@ -0,0 +1,120 @@
+/*
+ * flag_test.c - copied from whats_my_sse.c to output proper compile
+ *  flags for the GNUmakefile
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "intel_cpu_capabilities.h"
+
+/* Print the command-line synopsis on stderr and terminate with a failure status. */
+void usage()
+{
+  fputs("usage: flag_test <compiler name>\n", stderr);
+  exit(EXIT_FAILURE);
+}
+
+/*
+ * probe_flags - compile one probe program with the candidate options, run it,
+ *  and compare what it prints with the reference output shipped alongside.
+ *
+ *  cc:       compiler command, substituted verbatim into a shell command line
+ *  src:      probe source file
+ *  exe:      executable to create in the current directory
+ *  opts:     compiler options being tested
+ *  expected: file holding the output a working probe prints
+ *
+ *  Returns 1 when the probe built, ran and matched the reference; 0 otherwise.
+ *  Leaves the executable, temp.txt and diff.txt behind in the current directory.
+ */
+static int probe_flags(const char *cc, const char *src, const char *exe,
+                       const char *opts, const char *expected)
+{
+  char buf[1000];
+  FILE *file;
+  int n;
+
+  //a binary left over from an earlier run must not pass for a fresh build
+  remove(exe);
+
+  //cc comes from the command line, so refuse anything that does not fit
+  n = snprintf(buf, sizeof(buf), "%s %s -o %s %s 2> /dev/null", cc, src, exe, opts);
+  if(n < 0 || (size_t) n >= sizeof(buf))
+    return 0;
+  system(buf);
+
+  //no executable means the compiler rejected the source or the options
+  file = fopen(exe, "r");
+  if(file == NULL)
+    return 0;
+  fclose(file);
+
+  //run program and compare to the included output
+  n = snprintf(buf, sizeof(buf), "./%s > temp.txt 2> /dev/null", exe);
+  if(n < 0 || (size_t) n >= sizeof(buf))
+    return 0;
+  system(buf);
+
+  n = snprintf(buf, sizeof(buf), "diff %s temp.txt > diff.txt 2> /dev/null", expected);
+  if(n < 0 || (size_t) n >= sizeof(buf))
+    return 0;
+
+  //diff exits with 0 only when the files are identical; an empty diff.txt
+  //alone is not enough because diff also prints nothing when it fails
+  return system(buf) == 0;
+}
+
+/*
+ * main - print the CFLAGS and LDFLAGS lines to use in the GNUmakefile.
+ *
+ *  argv[1] is the compiler command to test with.  SSE4, SSSE3 and SSE2 are
+ *  tried in that order and the first level that cpu_has_feature() reports
+ *  and whose probe passes is used.  PCLMUL is only tried once one of those
+ *  levels has been accepted.
+ *
+ *  NOTE: argv[1] is handed to the shell through system(), so it is executed
+ *  as given.
+ */
+int main(int argc, char **argv)
+{
+  //make sure to extend these buffers if more flags are added to this program
+  char cflags[1000], ldflags[1000];
+  int sse_found = 0;
+
+  if(argc != 2)
+    usage();
+
+  strcpy(cflags, "CFLAGS = -O3");
+  strcpy(ldflags, "LDFLAGS = -O3");
+
+  if(cpu_has_feature(CPU_CAP_SSE42) &&
+     probe_flags(argv[1], "sse_test.c", "sse4", "-msse4 -DSSE4", "sse4_test.txt"))
+  {
+    strcat(cflags, " -msse4 -DINTEL_SSE4");
+    strcat(ldflags, " -msse4");
+    sse_found = 1;
+  }
+
+  if(cpu_has_feature(CPU_CAP_SSSE3) && !sse_found &&
+     probe_flags(argv[1], "sse_test.c", "ssse3", "-mssse3 -DSSSE3", "ssse3_test.txt"))
+  {
+    strcat(cflags, " -mssse3 -DINTEL_SSSE3");
+    strcat(ldflags, " -mssse3");
+    sse_found = 1;
+  }
+
+  if(cpu_has_feature(CPU_CAP_SSE2) && !sse_found &&
+     probe_flags(argv[1], "sse_test.c", "sse2", "-msse2 -DSSE2", "sse2_test.txt"))
+  {
+    strcat(cflags, " -msse2 -DINTEL_SSE2");
+    strcat(ldflags, " -msse2");
+    sse_found = 1;
+  }
+
+  if(cpu_has_feature(CPU_CAP_PCLMULQDQ) && sse_found &&
+     probe_flags(argv[1], "pclmul_test.c", "pclmul", "-maes -mpclmul", "pclmul_test.txt"))
+  {
+    strcat(cflags, " -maes -mpclmul -DINTEL_PCLMUL");
+    strcat(ldflags, " -maes -mpclmul");
+  }
+
+  printf("%s\n%s\n", cflags, ldflags);
+  return 0;
+}
--- a/flag_tester/intel_cpu_capabilities.h
+++ b/flag_tester/intel_cpu_capabilities.h
@ -16,7 +16,7 @@
 #define CPU_CPSSE               0x2000
 #define CPU_CAP_SSE3            (CPU_CPSSE | 0)
 #define CPU_CAP_PCLMULQDQ       (CPU_CPSSE | 1)
-#define CPU_CAP_SSSE3           (CPU_CPSSE | 10)
+#define CPU_CAP_SSSE3           (CPU_CPSSE | 9)
 #define CPU_CAP_SSE41           (CPU_CPSSE | 19)
 #define CPU_CAP_SSE42           (CPU_CPSSE | 20)
 #define CPU_CAP_AVX             (CPU_CPSSE | 28)
@ -25,7 +25,6 @@
        __asm__ __volatile__ ("cpuid":\
                              "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func));

-inline
 int
 cpu_has_feature (unsigned which)
 {
--- a/flag_tester/pclmul_test.c
+++ b/flag_tester/pclmul_test.c
@ -0,0 +1,40 @@
+#include <wmmintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* MM_PRINT8(s, r): print the label s left-justified in 20 columns, then the
+   16 bytes of the __m128i r as two-digit hex, highest-addressed byte first,
+   in groups of four, followed by a newline. */
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+
+
+/*
+ * Run a short fixed sequence of carry-less multiplies and SSE operations,
+ *  printing each intermediate register with MM_PRINT8.  The printed text is
+ *  what gets compared against the reference output, so the labels and the
+ *  order of the steps must not change.
+ */
+int main()
+{
+  uint32_t pp;
+  __m128i a, b, c;
+
+  a = _mm_set1_epi8(0x0D);
+  b = _mm_set_epi32(0,0,0,0x0A);
+  pp = 0x13;
+  MM_PRINT8("a", a);
+  MM_PRINT8("b", b);
+
+  /* carry-less product of the low 64-bit halves of a and b */
+  c = _mm_clmulepi64_si128(a, b, 0);
+  MM_PRINT8("a clm b", c);
+
+  /* keep the high nibble of every byte of the product and shift it down */
+  a = _mm_set1_epi8(0xf0);
+  MM_PRINT8("a", a);
+  b = _mm_and_si128(a, c);
+  b = _mm_srli_epi64(b, 4);
+  MM_PRINT8("shifted", b);
+
+
+  a = _mm_set_epi32(0,0,0,pp);
+  MM_PRINT8("PP", a);
+
+  b = _mm_clmulepi64_si128(a, b, 0);
+  MM_PRINT8("PP clm over", b);
+
+  c = _mm_xor_si128(c,b);
+  MM_PRINT8("Answer", c);
+
+  return 0;
+}
--- a/flag_tester/pclmul_test.txt
+++ b/flag_tester/pclmul_test.txt
@ -0,0 +1,8 @@
+a                      0d 0d 0d 0d   0d 0d 0d 0d   0d 0d 0d 0d   0d 0d 0d 0d
+b                      00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 0a
+a clm b                00 00 00 00   00 00 00 00   72 72 72 72   72 72 72 72
+a                      f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0
+shifted                00 00 00 00   00 00 00 00   07 07 07 07   07 07 07 07
+PP                     00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 13
+PP clm over            00 00 00 00   00 00 00 00   79 79 79 79   79 79 79 79
+Answer                 00 00 00 00   00 00 00 00   0b 0b 0b 0b   0b 0b 0b 0b
--- a/flag_tester/sse2_test.txt
+++ b/flag_tester/sse2_test.txt
@ -0,0 +1,30 @@
+a                      0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b                      10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c                      11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d                      12 11 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+a sl16                 3c 38 34 30   2c 28 24 20   1c 18 14 10   0c 08 04 00
+b sl32                 40 3c 38 34   30 2c 28 24   20 1c 18 14   10 0c 08 04
+c sl64                 44 40 3c 38   34 30 2c 28   24 20 1c 18   14 10 0c 08
+d sl128                10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 00 00
+a sr16                 0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b sr32                 10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c sr64                 11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d sr128                00 00 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+d = a^b                1f 01 03 01   07 01 03 01   0f 01 03 01   07 01 03 01
+d = a-b epi8           ff ff ff ff   ff ff ff ff   ff ff ff ff   ff ff ff ff
+d = a-b epi16          fe ff fe ff   fe ff fe ff   fe ff fe ff   fe ff fe ff
+d = a-b epi32          fe fe fe ff   fe fe fe ff   fe fe fe ff   fe fe fe ff
+d = a-b epi64          fe fe fe fe   fe fe fe ff   fe fe fe fe   fe fe fe ff
+d set_epi8             0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+d set_epi32            12 34 56 78   9a bc de f0   12 34 56 78   9a bc de f0
+d set1_epi64           f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0
+d set1_epi32           e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2
+d set1_epi16           af f3 af f3   af f3 af f3   af f3 af f3   af f3 af f3
+d set1_epi8            c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5
+d packus_epi16(d,d)    00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+c unpackhi(a,d)        00 0f 00 0e   00 0d 00 0c   00 0b 00 0a   00 09 00 08
+b unpacklo(c,a)        07 00 06 0b   05 00 04 0a   03 00 02 09   01 00 00 08
+d and(d,b)             00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
+d setzero              00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+c                      05 05 05 05   05 05 05 05   05 05 05 05   05 05 05 05
--- a/flag_tester/sse4_test.txt
+++ b/flag_tester/sse4_test.txt
@ -0,0 +1,35 @@
+a                      0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b                      10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c                      11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d                      12 11 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+a sl16                 3c 38 34 30   2c 28 24 20   1c 18 14 10   0c 08 04 00
+b sl32                 40 3c 38 34   30 2c 28 24   20 1c 18 14   10 0c 08 04
+c sl64                 44 40 3c 38   34 30 2c 28   24 20 1c 18   14 10 0c 08
+d sl128                10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 00 00
+a sr16                 0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b sr32                 10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c sr64                 11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d sr128                00 00 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+d = a^b                1f 01 03 01   07 01 03 01   0f 01 03 01   07 01 03 01
+d = a-b epi8           ff ff ff ff   ff ff ff ff   ff ff ff ff   ff ff ff ff
+d = a-b epi16          fe ff fe ff   fe ff fe ff   fe ff fe ff   fe ff fe ff
+d = a-b epi32          fe fe fe ff   fe fe fe ff   fe fe fe ff   fe fe fe ff
+d = a-b epi64          fe fe fe fe   fe fe fe ff   fe fe fe fe   fe fe fe ff
+d set_epi8             0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+d set_epi32            12 34 56 78   9a bc de f0   12 34 56 78   9a bc de f0
+d set1_epi64           f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0
+d set1_epi32           e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2
+d set1_epi16           af f3 af f3   af f3 af f3   af f3 af f3   af f3 af f3
+d set1_epi8            c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5
+d packus_epi16(d,d)    00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+c unpackhi(a,d)        00 0f 00 0e   00 0d 00 0c   00 0b 00 0a   00 09 00 08
+b unpacklo(c,a)        07 00 06 0b   05 00 04 0a   03 00 02 09   01 00 00 08
+d and(d,b)             00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
+d setzero              00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+d insert32 @ 2         00 00 00 00   ab cd 12 34   00 00 00 00   00 00 00 00
+extract_epi32 @ 2: abcd1234
+d insert64 @ 0         00 00 00 00   ab cd 12 34   fe dc ba 12   91 82 73 64
+extract_epi64 @ 0: fedcba1291827364
+c                      05 05 05 05   05 05 05 05   05 05 05 05   05 05 05 05
+a shuffle(b, c)        02 02 02 02   02 02 02 02   02 02 02 02   02 02 02 02
--- a/flag_tester/sse_test.c
+++ b/flag_tester/sse_test.c
@ -0,0 +1,142 @@
+#ifdef SSE4
+#define SSSE3
+#include <nmmintrin.h>
+#endif
+
+#ifdef SSSE3
+#define SSE2
+#include <tmmintrin.h>
+#endif
+
+#ifdef SSE2
+#include <emmintrin.h>
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+/* MM_PRINT8(s, r): print the label s left-justified in 20 columns, then the
+   16 bytes of the __m128i r as two-digit hex, highest-addressed byte first,
+   in groups of four, followed by a newline. */
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+
+/*
+ * Exercise a fixed sequence of SSE integer intrinsics and print every
+ *  intermediate register with MM_PRINT8.  The steps inside the SSE4 and
+ *  SSSE3 guards are compiled in only when that macro is defined.  The
+ *  printed text is meant to be compared against a reference file, so the
+ *  labels and the order of the steps must not change.
+ */
+int main()
+{
+  uint32_t u32;
+  uint64_t u64;
+  /* The __m128i member gives the byte buffer the alignment that
+     _mm_load_si128/_mm_store_si128 require; two slots cover the 19 bytes
+     (offsets 0..18) that the loads below read. */
+  union { __m128i align[2]; uint8_t bytes[32]; } mem;
+  uint8_t *ui8 = mem.bytes, i;
+  __m128i a, b, c, d;
+
+  for(i=0; i < 20; i++)
+    ui8[i] = i;
+
+  a = _mm_load_si128( (__m128i *) ui8 );
+  b = _mm_loadu_si128( (__m128i *) (ui8+1));
+  c = _mm_loadu_si128( (__m128i *) (ui8+2));
+  d = _mm_loadu_si128( (__m128i *) (ui8+3));
+
+  MM_PRINT8("a", a);
+  MM_PRINT8("b", b);
+  MM_PRINT8("c", c);
+  MM_PRINT8("d", d);
+
+  a = _mm_slli_epi16(a, 2);
+  b = _mm_slli_epi32(b, 2);
+  c = _mm_slli_epi64(c, 2);
+  d = _mm_slli_si128(d, 2);
+
+  MM_PRINT8("a sl16", a);
+  MM_PRINT8("b sl32", b);
+  MM_PRINT8("c sl64", c);
+  MM_PRINT8("d sl128", d);
+
+  a = _mm_srli_epi16(a, 2);
+  b = _mm_srli_epi32(b, 2);
+  c = _mm_srli_epi64(c, 2);
+  d = _mm_srli_si128(d, 2);
+
+  MM_PRINT8("a sr16", a);
+  MM_PRINT8("b sr32", b);
+  MM_PRINT8("c sr64", c);
+  MM_PRINT8("d sr128", d);
+
+  d = _mm_xor_si128(a, b);
+  MM_PRINT8("d = a^b", d);
+
+  d = _mm_sub_epi8(a, b);
+  MM_PRINT8("d = a-b epi8", d);
+
+  d = _mm_sub_epi16(a, b);
+  MM_PRINT8("d = a-b epi16", d);
+
+  d = _mm_sub_epi32(a, b);
+  MM_PRINT8("d = a-b epi32", d);
+
+  d = _mm_sub_epi64(a, b);
+  MM_PRINT8("d = a-b epi64", d);
+
+  d = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  MM_PRINT8("d set_epi8", d);
+
+  d = _mm_set_epi32(0x12345678, 0x9abcdef0, 0x12345678, 0x9abcdef0);
+  MM_PRINT8("d set_epi32", d);
+
+  d = _mm_set1_epi64x(0xF0F0F0F0F0F0F0F0ULL);
+  MM_PRINT8("d set1_epi64", d);
+
+  d = _mm_set1_epi32(0xe2e2e2e2);
+  MM_PRINT8("d set1_epi32", d);
+
+  d = _mm_set1_epi16(0xaff3);
+  MM_PRINT8("d set1_epi16", d);
+
+  d = _mm_set1_epi8(0xc5);
+  MM_PRINT8("d set1_epi8", d);
+
+  d = _mm_packus_epi16(d, d);
+  MM_PRINT8("d packus_epi16(d,d)", d);
+
+  c = _mm_unpackhi_epi8(a, d);
+  MM_PRINT8("c unpackhi(a,d)", c);
+
+  b = _mm_unpacklo_epi8(c, a);
+  MM_PRINT8("b unpacklo(c,a)", b);
+
+  d = _mm_and_si128(d, b);
+  MM_PRINT8("d and(d,b)", d);
+
+  _mm_store_si128( (__m128i *) ui8, a);
+  printf("a stored to mem: ");
+  for(i=0; i < 16; i++)
+    printf("%u ", ui8[i]);
+  printf("\n");
+
+  d = _mm_setzero_si128();
+  MM_PRINT8("d setzero", d);
+
+  u32 = 0xABCD1234;
+  u64 = 0xFEDCBA1291827364ULL;
+
+  #ifdef SSE4
+  d = _mm_insert_epi32(d, u32, 2);
+  MM_PRINT8("d insert32 @ 2", d);
+
+  /* clear first so the printed value can only come from the extract */
+  u32 = 0;
+  u32 = _mm_extract_epi32(d, 2);
+  printf("extract_epi32 @ 2: %x\n", u32);
+
+  d = _mm_insert_epi64(d, u64, 0);
+  MM_PRINT8("d insert64 @ 0", d);
+
+  u64 = 0;
+  u64 = _mm_extract_epi64(d, 0);
+  printf("extract_epi64 @ 0: %" PRIx64 "\n", u64);
+  #endif
+
+  c = _mm_set1_epi8(5);
+  MM_PRINT8("c", c);
+
+  #ifdef SSSE3
+  a = _mm_shuffle_epi8(b, c);
+  MM_PRINT8("a shuffle(b, c)", a);
+  #endif
+
+  return 0;
+}
--- a/flag_tester/ssse3_test.txt
+++ b/flag_tester/ssse3_test.txt
@ -0,0 +1,31 @@
+a                      0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b                      10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c                      11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d                      12 11 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+a sl16                 3c 38 34 30   2c 28 24 20   1c 18 14 10   0c 08 04 00
+b sl32                 40 3c 38 34   30 2c 28 24   20 1c 18 14   10 0c 08 04
+c sl64                 44 40 3c 38   34 30 2c 28   24 20 1c 18   14 10 0c 08
+d sl128                10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 00 00
+a sr16                 0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+b sr32                 10 0f 0e 0d   0c 0b 0a 09   08 07 06 05   04 03 02 01
+c sr64                 11 10 0f 0e   0d 0c 0b 0a   09 08 07 06   05 04 03 02
+d sr128                00 00 10 0f   0e 0d 0c 0b   0a 09 08 07   06 05 04 03
+d = a^b                1f 01 03 01   07 01 03 01   0f 01 03 01   07 01 03 01
+d = a-b epi8           ff ff ff ff   ff ff ff ff   ff ff ff ff   ff ff ff ff
+d = a-b epi16          fe ff fe ff   fe ff fe ff   fe ff fe ff   fe ff fe ff
+d = a-b epi32          fe fe fe ff   fe fe fe ff   fe fe fe ff   fe fe fe ff
+d = a-b epi64          fe fe fe fe   fe fe fe ff   fe fe fe fe   fe fe fe ff
+d set_epi8             0f 0e 0d 0c   0b 0a 09 08   07 06 05 04   03 02 01 00
+d set_epi32            12 34 56 78   9a bc de f0   12 34 56 78   9a bc de f0
+d set1_epi64           f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0   f0 f0 f0 f0
+d set1_epi32           e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2   e2 e2 e2 e2
+d set1_epi16           af f3 af f3   af f3 af f3   af f3 af f3   af f3 af f3
+d set1_epi8            c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5   c5 c5 c5 c5
+d packus_epi16(d,d)    00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+c unpackhi(a,d)        00 0f 00 0e   00 0d 00 0c   00 0b 00 0a   00 09 00 08
+b unpacklo(c,a)        07 00 06 0b   05 00 04 0a   03 00 02 09   01 00 00 08
+d and(d,b)             00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
+d setzero              00 00 00 00   00 00 00 00   00 00 00 00   00 00 00 00
+c                      05 05 05 05   05 05 05 05   05 05 05 05   05 05 05 05
+a shuffle(b, c)        02 02 02 02   02 02 02 02   02 02 02 02   02 02 02 02
--- a/flag_tester/whats_my_sse.c
+++ b/flag_tester/whats_my_sse.c
--- a/flag_tester/which_compile_flags.sh
+++ b/flag_tester/which_compile_flags.sh
@ -0,0 +1,19 @@
+if [ -n "$1" ]; then
+  CC=$1
+else
+  CC=cc
+fi
+
+$CC flag_test.c -o flag_test 2> /dev/null
+if [ -e "flag_test" ]; then
+  OUTPUT=`./flag_test $CC 2> /dev/null`
+  if [ -n "$OUTPUT" ]; then
+    echo "$OUTPUT"
+  else
+    printf "CFLAGS = -O3\nLDFLAGS = -O3\n"
+  fi
+else
+  printf "$CC failed to compile flag_test.c\n"
+fi
+
+rm sse4 sse2 ssse3 pclmul diff.txt flag_test temp.txt 2> /dev/null
--- a/gf.c
+++ b/gf.c
@ -8,6 +8,405 @@
 #include <stdio.h>
 #include <stdlib.h>

+int _gf_errno = GF_E_DEFAULT;
+
+/*
+ * gf_error - print to stderr, on a line of its own, the message that
+ *  corresponds to the current value of _gf_errno.  A value with no case
+ *  below prints "Undefined error.".  Takes no arguments and returns nothing;
+ *  _gf_errno is read but not modified.
+ */
+void gf_error()
+{
+  const char *s;
+
+  switch(_gf_errno) {
+    case GF_E_DEFAULT: s = "No Error."; break;
+    case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+    case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+    case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+    case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+    case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+    case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+    case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+    case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+    case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+    case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+    case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+    case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+    case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+    case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+    case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+    case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+    case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+    case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+    case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+    case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+    case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+    case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
+    case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+    case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+    case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+    case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+    case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+    case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+    case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+    case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+    case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
+    case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+    case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+    case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+    case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+    case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
+    case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
+    case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+    case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
+    case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+    case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+    case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+    /* gf_error_check() raises this for arg1 <= 0 or arg2 <= 0, so zero is rejected too */
+    case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be > 0."; break;
+    case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+    case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+    case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+    case GF_E_GR_SSE4: s = "With -m GROUP, w == 128, you need SSE4."; break;
+    case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+    case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
+    case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+    case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
+    case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
+    case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+    case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
+    case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
+    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+    case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
+    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
+    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
+    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
+    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
+    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
+    case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
+    case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+    case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+    case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+    case GF_E_UNK_REG: s = "Unknown region type."; break;
+    case GF_E_UNK_DIV: s = "Unknown division type."; break;
+    default: s = "Undefined error.";
+  }
+
+  fprintf(stderr, "%s\n", s);
+}
+
+/*
+ * gf_composite_get_default_poly - look up the built-in polynomial value that
+ *  goes with the field described by base->scratch.
+ *
+ *  The lookup is keyed on the field's w, on whether its mult_type is
+ *  GF_MULT_COMPOSITE, and on its prim_poly.  When the field is itself
+ *  composite (w of 16, 32 or 64), the function first recurses into
+ *  h->base_gf and requires the value found there to equal prim_poly.
+ *
+ *  Returns the table value, or 0 when no entry matches.
+ */
+uint64_t gf_composite_get_default_poly(gf_t *base) 
+{
+  gf_internal_t *h = (gf_internal_t *) base->scratch;
+  int composite = (h->mult_type == GF_MULT_COMPOSITE);
+  int sub;
+
+  switch (h->w) {
+    case 4:
+      /* no table entry for a composite field this small */
+      if (composite) return 0;
+      return (h->prim_poly == 0x13) ? 2 : 0;
+
+    case 8:
+      if (composite) return 0;
+      return (h->prim_poly == 0x11d) ? 3 : 0;
+
+    case 16:
+      if (!composite) {
+        if (h->prim_poly == 0x1100b) return 2;
+        if (h->prim_poly == 0x1002d) return 7;
+        return 0;
+      }
+      sub = gf_composite_get_default_poly(h->base_gf);
+      if (sub != h->prim_poly) return 0;
+      return (sub == 3) ? 0x105 : 0;
+
+    case 32:
+      if (!composite) {
+        if (h->prim_poly == 0x400007) return 2;
+        if (h->prim_poly == 0xc5) return 3;
+        return 0;
+      }
+      sub = gf_composite_get_default_poly(h->base_gf);
+      if (sub != h->prim_poly) return 0;
+      switch (sub) {
+        case 2:     return 0x10005;
+        case 7:     return 0x10008;
+        case 0x105: return 0x10002;
+        default:    return 0;
+      }
+
+    case 64:
+      if (!composite) {
+        return (h->prim_poly == 0x1bULL) ? 2 : 0;
+      }
+      sub = gf_composite_get_default_poly(h->base_gf);
+      if (sub != h->prim_poly) return 0;
+      switch (sub) {
+        case 3:       return 0x100000009ULL;
+        case 2:       return 0x100000004ULL;
+        case 0x10005: return 0x100000003ULL;
+        case 0x10002: return 0x100000005ULL;
+        case 0x10008: return 0x100000006ULL;  /* JSP: (0x100000003 works too,
+                                                 but I want to differentiate cases). */
+        default:      return 0;
+      }
+
+    default:
+      return 0;
+  }
+}
+
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+                   int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+  int sse4 = 0;
+  int sse3 = 0;
+  int sse2 = 0;
+  int pclmul = 0;
+  int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
+  uint64_t pp;
+  gf_internal_t *sub, *subsub, *subsubsub;
+
+  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+  rquad   = (region_type & GF_REGION_QUAD_TABLE);
+  rlazy   = (region_type & GF_REGION_LAZY);
+  rsse    = (region_type & GF_REGION_SSE);
+  rnosse  = (region_type & GF_REGION_NOSSE);
+  raltmap = (region_type & GF_REGION_ALTMAP);
+  rcauchy = (region_type & GF_REGION_CAUCHY);
+
+  if (divide_type != GF_DIVIDE_DEFAULT &&
+      divide_type != GF_DIVIDE_MATRIX && 
+      divide_type != GF_DIVIDE_EUCLID) {
+    _gf_errno = GF_E_UNK_DIV;
+    return 0;
+  }
+
+  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+          GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
+  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+  sse2 = 1;
+#endif
+
+#ifdef INTEL_SSSE3
+  sse3 = 1;
+#endif
+
+#ifdef INTEL_SSE4
+  sse4 = 1;
+#endif
+
+#ifdef INTEL_PCLMUL
+  pclmul = 1;
+#endif
+
+
+  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+    
+  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+    if ((poly >> (w+1)) != 0)                   { _gf_errno = GF_E_BADPOLY; return 0; }
+  }
+
+  if (mult_type == GF_MULT_DEFAULT) {
+    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+    if (arg1 != 0 || arg2 != 0)           { _gf_errno = GF_E_MDEFARG; return 0; }
+    return 1;
+  }
+  
+  if (rsse && rnosse)                                { _gf_errno = GF_E_SSE__NO; return 0; }
+  if (rcauchy && w > 32)                             { _gf_errno = GF_E_CAUGT32; return 0; }
+  if (rcauchy && region_type != GF_REGION_CAUCHY)    { _gf_errno = GF_E_CAUCHYB; return 0; }
+  if (rcauchy && mult_type == GF_MULT_COMPOSITE)     { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && 
+      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG1SET;
+    return 0;
+  }
+
+  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG2SET;
+    return 0;
+  }
+
+  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+  if (rdouble) {
+    if (rquad)                      { _gf_errno = GF_E_DOUQUAD; return 0; }
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+    if (w != 4 && w != 8)           { _gf_errno = GF_E_DOUBLEW; return 0; }
+    if (rsse || rnosse || raltmap)  { _gf_errno = GF_E_DOUBLEJ; return 0; }
+    if (rlazy && w == 4)            { _gf_errno = GF_E_DOUBLEL; return 0; }
+    return 1;
+  }
+
+  if (rquad) {
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+    if (w != 4)                     { _gf_errno = GF_E_QUAD__W; return 0; }
+    if (rsse || rnosse || raltmap)  { _gf_errno = GF_E_QUAD__J; return 0; }
+    return 1;
+  }
+
+  if (rlazy)                        { _gf_errno = GF_E_LAZY__X; return 0; }
+
+  if (mult_type == GF_MULT_SHIFT) {
+    if (raltmap)                    { _gf_errno = GF_E_ALTSHIF; return 0; }
+    if (rsse || rnosse)             { _gf_errno = GF_E_SSESHIF; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE) {
+    if (w != 4 && w != 8 && w != 16 &&
+        w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
+    if (w == 4 && (poly & 0xc))                    { _gf_errno = GF_E_CFM4POL; return 0; }
+    if (w == 8 && (poly & 0x80))                   { _gf_errno = GF_E_CFM8POL; return 0; }
+    if (w == 16 && (poly & 0xe000))                { _gf_errno = GF_E_CF16POL; return 0; }
+    if (w == 32 && (poly & 0xfe000000))            { _gf_errno = GF_E_CF32POL; return 0; }
+    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+    if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
+    if (rsse || rnosse)                            { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+    if (raltmap)                    { _gf_errno = GF_E_ALT_BY2; return 0; }
+    if (rsse && !sse2)              { _gf_errno = GF_E_BY2_SSE; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+                                     || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+    if (w > 27)                     { _gf_errno = GF_E_LOGBADW; return 0; }
+    if (raltmap || rsse || rnosse)  { _gf_errno = GF_E_LOG___J; return 0; }
+
+    if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+    if (w != 8 && w != 16)          { _gf_errno = GF_E_ZERBADW; return 0; }
+
+    if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+    if (w != 8)                     { _gf_errno = GF_E_ZEXBADW; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_GROUP) {
+    if (arg1 <= 0 || arg2 <= 0)                 { _gf_errno = GF_E_GR_ARGX; return 0; }
+    if (w == 4 || w == 8)                       { _gf_errno = GF_E_GR_W_48; return 0; }
+    if (w == 16 && (arg1 != 4 || arg2 != 4))     { _gf_errno = GF_E_GR_W_16; return 0; }
+    if (w == 128 && (arg1 != 4 || 
+       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+    if (w == 128 && !sse4)                      { _gf_errno = GF_E_GR_SSE4; return 0; }
+    if (arg1 > 27 || arg2 > 27)                 { _gf_errno = GF_E_GR_A_27; return 0; }
+    if (arg1 > w || arg2 > w)                   { _gf_errno = GF_E_GR_AR_W; return 0; }
+    if (raltmap || rsse || rnosse)              { _gf_errno = GF_E_GR____J; return 0; }
+    return 1;
+  }
+  
+  if (mult_type == GF_MULT_TABLE) {
+    if (w != 16 && w >= 15)                     { _gf_errno = GF_E_TABLE_W; return 0; }
+    if (w != 4 && (rsse || rnosse))             { _gf_errno = GF_E_TAB_SSE; return 0; }
+    if (rsse && !sse3)                          { _gf_errno = GF_E_TABSSE3; return 0; }
+    if (raltmap)                                { _gf_errno = GF_E_TAB_ALT; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_SPLIT_TABLE) {
+    if (arg1 > arg2) {
+      tmp = arg1;
+      arg1 = arg2;
+      arg2 = tmp;
+    }
+    if (w == 8) {
+      if (arg1 != 4 || arg2 != 8)               { _gf_errno = GF_E_SP_8_AR; return 0; }
+      if (rsse && !sse3)                        { _gf_errno = GF_E_SP_SSE3; return 0; }
+      if (raltmap)                              { _gf_errno = GF_E_SP_8__A; return 0; }
+    } else if (w == 16) {
+      if (arg1 == 4 && arg2 == 16) {
+        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+      } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) {
+        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_16_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_16_A; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_16AR; return 0; }
+    } else if (w == 32) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 32) ||
+          (arg1 == 16 && arg2 == 32)) {
+        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_32_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_32_A; return 0; }
+      } else if ((arg1 == 4 && arg2 == 32) ||
+          (arg1 == 4 && arg2 == 32)) {
+        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && arg1 != 4)               { _gf_errno = GF_E_SP_32_A; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_32AS; return 0; }
+        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP_32AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_32AR; return 0; }
+    } else if (w == 64) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 64) ||
+          (arg1 == 16 && arg2 == 64)) {
+        if (rsse || rnosse)                     { _gf_errno = GF_E_SP_64_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_64_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 64) {
+        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_64AS; return 0; }
+        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP_64AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_64AR; return 0; }
+    } else if (w == 128) {
+      if (arg1 == 8 && arg2 == 128) {
+        if (rsse || rnosse)                     { _gf_errno = GF_E_SP128_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP128_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 128) {
+        if (rsse && !sse3)                      { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP128AS; return 0; }
+        if (raltmap && rnosse)                  { _gf_errno = GF_E_SP128AS; return 0; }
+        if (!raltmap && rsse)                   { _gf_errno = GF_E_SP128AL; return 0; }
+      } else                                    { _gf_errno = GF_E_SP128AR; return 0; }
+    } else                                      { _gf_errno = GF_E_SPLIT_W; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_COMPOSITE) {
+    if (w != 8 && w != 16 && w != 32 
+               && w != 64 && w != 128)          { _gf_errno = GF_E_COMP__W; return 0; }
+    if ((poly >> (w/2)) != 0)                   { _gf_errno = GF_E_COMP_PP; return 0; }
+    if (divide_type != GF_DIVIDE_DEFAULT)       { _gf_errno = GF_E_DIVCOMP; return 0; }
+    if (arg1 != 2)                              { _gf_errno = GF_E_COMP_A2; return 0; }
+    if (rsse || rnosse)                         { _gf_errno = GF_E_COMP_SS; return 0; }
+    if (base != NULL) {
+      sub = (gf_internal_t *) base->scratch;
+      if (sub->w != w/2)                      { _gf_errno = GF_E_BASE__W; return 0; }
+      if (poly == 0) {
+        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+      }
+    }
+    return 1;
+  }
+
+  _gf_errno = GF_E_UNKNOWN; 
+  return 0;
+}
+
 int gf_scratch_size(int w, 
                    int mult_type, 
                    int region_type, 
@ -15,6 +414,8 @@ int gf_scratch_size(int w,
                    int arg1, 
                    int arg2)
 {
+  if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
+
  switch(w) {
    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
@ -26,16 +427,31 @@ int gf_scratch_size(int w,
  }
 }

-int gf_dummy_init(gf_t *gf)
+/* Reports the size in bytes of an already-initialized gf_t: sizeof(gf_t)
+   plus the scratch size that gf_scratch_size() computes from the parameters
+   stored in the gf_internal_t at gf->scratch.  When the field is a
+   GF_MULT_COMPOSITE field, the size of its base field (h->base_gf) is
+   added by a recursive call. */
+
+extern int gf_size(gf_t *gf)
 {
-  return 0;
+  gf_internal_t *h;
+  int s;
+
+  s = sizeof(gf_t);
+  h = (gf_internal_t *) gf->scratch;   /* gf->scratch is viewed as the gf_internal_t header */
+  s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+  if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);   /* Composite: count the base field too */
+  return s;
 }

+
 int gf_init_easy(gf_t *gf, int w)
 {
+  /* Convenience wrapper: forwards to gf_init_hard() for word size w with the
+     default multiplication, region and division types, zeros for the three
+     integer arguments, and NULL for both the base field and the scratch
+     memory.  The return value of gf_init_hard() is passed back unchanged. */
-  return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
+  return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 
+                      0, 0, 0, NULL, NULL);
 }

+/* Allen: What's going on here is this function is putting info into the
+       scratch mem of gf, and then calling the relevant REAL init
+       func for the word size.  Probably done this way to consolidate
+       those aspects of initialization that don't rely on word size,
+       and then take care of word-size-specific stuff. */
+
 int gf_init_hard(gf_t *gf, int w, int mult_type, 
                        int region_type,
                        int divide_type,
@ -46,11 +462,14 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
 {
  int sz;
  gf_internal_t *h;
-  
+ 
+  if (gf_error_check(w, mult_type, region_type, divide_type, 
+                     arg1, arg2, prim_poly, base_gf) == 0) return 0;
+
  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-
-  if (sz <= 0) return 0;
-
+  if (sz <= 0) return 0;  /* This shouldn't happen, as all errors should get caught
+                             in gf_error_check() */
+  
  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    h->free_me = 1;
@ -71,8 +490,6 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
  h->private += (sizeof(gf_internal_t));
  gf->extract_word.w32 = NULL;

-  //printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type);
-
  switch(w) {
    case 4: return gf_w4_init(gf);
    case 8: return gf_w8_init(gf);
@ -94,6 +511,7 @@ int gf_free(gf_t *gf, int recursive)
    free(h->base_gf);
  }
  if (h->free_me) free(h);
+  return 0; /* Making compiler happy */
 }

 void gf_alignment_error(char *s, int a)
@ -105,9 +523,9 @@ void gf_alignment_error(char *s, int a)
 }

 static 
-void gf_invert_binary_matrix(int *mat, int *inv, int rows) {
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
  int cols, i, j, k;
-  int tmp;
+  uint32_t tmp;

  cols = rows;

@ -172,34 +590,6 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
  return inv[0];
 }

-/*
-void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
-{
-  uint64_t p, ta, shift, tb;
-  uint64_t *s64, *d64
-
-  s64 = rd->s_start;
-  d64 = rd->d_start;
-  
-  while (s64 < (uint64_t *) rd->s_top) {
-    p = (rd->xor) ? *d64 : 0;
-    ta = *s64;
-
-    shift = 0;
-    while (ta != 0) {
-      tb = base[ta&0xffff];
-      p ^= (tb << shift);
-      ta >>= 16;
-      shift += 16;
-    }
-
-    *d64 = p;
-    d64++;
-    s64++;
-  }
-}
-*/
-
 void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
 {
  uint64_t a, prod;
@ -226,8 +616,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
-      *s64++;
-      *d64++;
+      s64++;
+      d64++;
    }
  } else {
    while (d64 != top) {
@ -243,8 +633,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
-      *s64++;
-      *d64++;
+      s64++;
+      d64++;
    }
  }
 }
@ -307,9 +697,71 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v
  }
 }

-/* If align>16, you align to 16 bytes, but make sure that within the aligned region bytes is a multiple of align.  However, you make sure that the region itself is a multiple of align. 
+/* JSP - The purpose of this procedure is to error check alignment,
+   and to set up the region operation so that it can best leverage
+   large words.

-   If align = -1, then this is cauchy.  You need to make sure that bytes is a multiple of w. */
+   It stores its information in rd.
+
+   Assuming you're not doing Cauchy coding, (see below for that),
+   then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
+   should change that).
+
+   src and dest must then be aligned on ceil(w/8)-byte boundaries.
+   Moreover, bytes must be a multiple of ceil(w/8).  If the variable
+   align is equal to ceil(w/8), then we will set s_start = src,
+   d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+   And we return -- the implementation will go ahead and do the
+   multiplication on individual words (e.g. using discrete logs).
+
+   If align is greater than ceil(w/8), then the implementation needs
+   to work on groups of "align" bytes.  For example, suppose you are
+   implementing BYTWO, without SSE. Then you will be doing the region
+   multiplication in units of 8 bytes, so align = 8. Or, suppose you
+   are doing a Quad table in GF(2^4). You will be doing the region
+   multiplication in units of 2 bytes, so align = 2. Or, suppose you
+   are doing split multiplication with SSE operations in GF(2^8).
+   Then align = 16. Worse yet, suppose you are doing split
+   multiplication with SSE operations in GF(2^16), with or without
+   ALTMAP. Then, you will be doing the multiplication on 256 bits at
+   a time.  So align = 32.
+
+   When align does not equal ceil(w/8), we split the region
+   multiplication into three parts.  We are going to make s_start be
+   the first address greater than or equal to src that is a multiple
+   of align.  s_top is going to be the largest address <= src+bytes
+   such that (s_top - s_start) is a multiple of align.  We do the
+   same with d_start and d_top.  When we say that "src and dest must
+   be aligned with respect to each other," we mean that s_start-src
+   must equal d_start-dest.
+
+   Now, the region multiplication is done in three parts -- the part
+   between src and s_start must be done using single words.
+   Similarly, the part between s_top and src+bytes must also be done
+   using single words.  The part between s_start and s_top will be
+   done in chunks of "align" bytes.
+
+   One final thing -- if align > 16, then s_start and d_start will be
+   aligned on a 16 byte boundary.  Perhaps we should have two
+   variables: align and chunksize.  Then we'd have s_start & d_start
+   aligned to "align", and have s_top-s_start be a multiple of
+   chunksize.  That may be less confusing, but it would be a big
+   change.
+
+   Finally, if align = -1, then we are doing Cauchy multiplication,
+   using only XOR's.  In this case, we're not going to care about
+   alignment because we are just doing XOR's.  Instead, the only
+   thing we care about is that bytes must be a multiple of w.
+
+   This is not to say that alignment doesn't matter in performance
+   with XOR's.  See that discussion in gf_multby_one().
+
+   After you call gf_set_region_data(), the procedure
+   gf_do_initial_region_alignment() calls gf->multiply.w32() on
+   everything between src and s_start.  The procedure
+   gf_do_final_region_alignment() calls gf->multiply.w32() on
+   everything between s_top and src+bytes.
+   */

 void gf_set_region_data(gf_region_data *rd,
  gf_t *gf,
@ -326,7 +778,7 @@ void gf_set_region_data(gf_region_data *rd,
  uint32_t a;
  unsigned long uls, uld;

-  if (gf == NULL) {
+  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
    wb = 1;
  } else {
    h = gf->scratch;
@ -347,7 +799,7 @@ void gf_set_region_data(gf_region_data *rd,

  a = (align <= 16) ? align : 16;

-  if (align == -1) { /* This is cauchy.  Error check bytes, then set up the pointers
+  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
                        so that there are no alignment regions. */
    if (bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
@ -386,14 +838,14 @@ void gf_set_region_data(gf_region_data *rd,
  }

  uls %= a;
-  if (uls != 0) uls = (align-uls);
+  if (uls != 0) uls = (a-uls);
  rd->s_start = rd->src + uls;
  rd->d_start = rd->dest + uls;
  bytes -= uls;
-
  bytes -= (bytes % align);
  rd->s_top = rd->s_start + bytes;
  rd->d_top = rd->d_start + bytes;
+
 }

 void gf_do_initial_region_alignment(gf_region_data *rd)
@ -413,25 +865,76 @@ void gf_multby_zero(void *dest, int bytes, int xor)
  return;
 }

+/* JSP - gf_multby_one tries to do this in the most efficient way
+   possible.  If xor = 0, then simply call memcpy() since that
+   should be optimized by the system.  Otherwise, try to do the xor
+   in the following order:
+
+   If src and dest are aligned with respect to each other on 16-byte
+   boundaries and you have SSE instructions, then use aligned SSE
+   instructions.
+
+   If they aren't but you still have SSE instructions, use unaligned
+   SSE instructions.
+
+   If there are no SSE instructions, but they are aligned with
+   respect to each other on 8-byte boundaries, then do them with
+   uint64_t's.
+
+   Otherwise, call gf_unaligned_xor(), which does the following:
+   align a destination pointer along an 8-byte boundary, and then
+   memcpy 64 bytes (8*UNALIGNED_BUFSIZE) at a time from the src
+   pointer to an array of uint64_t's.  I'm not sure if that's the
+   best -- probably needs testing, but this seems like it could be
+   a black hole.
+ */
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
 void gf_multby_one(void *src, void *dest, int bytes, int xor) 
 {
-#ifdef   INTEL_SSE4
+#ifdef   INTEL_SSE2
  __m128i ms, md;
 #endif
+  unsigned long uls, uld;
  uint8_t *s8, *d8, *dtop8;
  uint64_t *s64, *d64, *dtop64;
  int abytes;
-
  gf_region_data rd;
+
  if (!xor) {
    memcpy(dest, src, bytes);
    return;
  }
+  uls = (unsigned long) src;
+  uld = (unsigned long) dest;

-#ifdef   INTEL_SSE4
+#ifdef   INTEL_SSE2
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
-  abytes = bytes & 0xfffffff0;
+  if (uls % 16 == uld % 16) {
+    gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+    while (s8 != rd.s_start) {
+      *d8 ^= *s8;
+      d8++;
+      s8++;
+    }
+    while (s8 < (uint8_t *) rd.s_top) {
+      ms = _mm_load_si128 ((__m128i *)(s8));
+      md = _mm_load_si128 ((__m128i *)(d8));
+      md = _mm_xor_si128(md, ms);
+      _mm_store_si128((__m128i *)(d8), md);
+      s8 += 16;
+      d8 += 16;
+    }
+    while (s8 != (uint8_t *) src + bytes) {
+      *d8 ^= *s8;
+      d8++;
+      s8++;
+    }
+    return;
+  }
+
+  abytes = (bytes & 0xfffffff0);

  while (d8 < (uint8_t *) dest + abytes) {
    ms = _mm_loadu_si128 ((__m128i *)(s8));
@ -449,8 +952,11 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
  return;
 #endif

-  /* If you don't have SSE, you'd better be aligned..... */
-
+  if (uls % 8 != uld % 8) {
+    gf_unaligned_xor(src, dest, bytes);
+    return;
+  }
+  
  gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
@ -480,3 +986,47 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
  }
  return;
 }
+
+#define UNALIGNED_BUFSIZE (8)
+
+/* XORs bytes bytes of src into dest when src and dest are not aligned with
+   respect to each other.  The destination is walked in three phases:
+   single bytes up to rd.d_start, chunks of 8*UNALIGNED_BUFSIZE bytes up to
+   rd.d_top (each source chunk is first memcpy'd into an aligned local
+   buffer), and single bytes for whatever remains before dest+bytes. */
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+  uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
+  int i;
+  gf_region_data rd;
+  uint8_t *s8, *d8, *dend8;
+
+  /* JSP - call gf_set_region_data(), but use dest in both places.  This is
+     because I only want to set up dest.  If I used src, gf_set_region_data()
+     would fail because src and dest are not aligned to each other wrt 
+     8-byte pointers.  I know this will actually align d_start to 16 bytes.
+     If I change gf_set_region_data() to split alignment & chunksize, then 
+     I could do this correctly. */
+
+  gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+  dend8 = d8 + bytes;   /* Cast first: arithmetic on void * is not standard C */
+
+  /* Head: byte-wise up to d_start.  The extra bound on dend8 matters for
+     short regions, where d_start can lie beyond the end of the region. */
+
+  while (d8 < (uint8_t *) rd.d_start && d8 < dend8) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+
+  /* The whole region was consumed by the head loop (or was empty); do not
+     fall into the chunked loop, whose bounds assume d8 reached d_start. */
+
+  if (d8 == dend8) return;
+  
+  d64 = (uint64_t *) d8;
+  while (d64 < (uint64_t *) rd.d_top) {
+    memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);   /* src may be misaligned, so copy it out first */
+    s8 += 8*UNALIGNED_BUFSIZE;
+    for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
+      *d64 ^= scopy[i];
+      d64++;
+    }
+  }
+  
+  /* Tail: whatever is left after the last full chunk. */
+
+  d8 = (uint8_t *) d64;
+  while (d8 < dend8) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+}
--- a/gf_54.c
+++ b/gf_54.c
@ -1,29 +0,0 @@
-/*
- * Multiplies four and five in GF(2^4).
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "gf_complete.h"
-
-main()
-{
-  gf_t gf;
-  void *scratch;
-  int size;
-
-  size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE,
-                             GF_REGION_SSE | GF_REGION_ALTMAP,
-                             GF_DIVIDE_DEFAULT,
-                             16, 4);
-  if (size == -1) exit(1); /* It failed. That shouldn't happen*/
-  scratch = (void *) malloc(size);
-  if (scratch == NULL) { perror("malloc"); exit(1); }
-  if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE,
-                             GF_REGION_SSE | GF_REGION_ALTMAP,
-                             GF_DIVIDE_DEFAULT,
-                             0, 16, 4, NULL, scratch)) exit(1);
-  printf("Yo\n");
-}
--- a/gf_add.c
+++ b/gf_add.c
@ -16,7 +16,7 @@ void usage(char *s)
  fprintf(stderr, "       If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n");
  fprintf(stderr, "\n");
  fprintf(stderr, "       legal w are: 1-32, 64 and 128\n");
-  fprintf(stderr, "           128 is hex only (i.e. '128' will be an error - do '128h')\n");
+  fprintf(stderr, "       128 is hex only (i.e. '128' will be an error - do '128h')\n");

  if (s != NULL) fprintf(stderr, "%s", s);
  exit(1);
--- a/gf_complete.h
+++ b/gf_complete.h
@ -4,22 +4,30 @@
 #pragma once
 #include <stdint.h>

-#ifdef  INTEL_SSE4
-#include <nmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
+#ifdef INTEL_SSE4
+  #define INTEL_SSSE3
+  #include <nmmintrin.h>
 #endif

-#ifdef  INTEL_PCLMUL
-#include <wmmintrin.h>
+#ifdef INTEL_SSSE3
+  #define INTEL_SSE2
+  #include <tmmintrin.h>
 #endif

-/* This does either memcpy or xor, depending on "xor" */
+#ifdef INTEL_SSE2
+  #include <emmintrin.h>
+#endif

-extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+#ifdef INTEL_PCLMUL
+  #include <wmmintrin.h>
+  #ifdef INTEL_SSE4
+    #define INTEL_SSE4_PCLMUL
+  #endif
+  #ifdef INTEL_SSSE3
+    #define INTEL_SSSE3_PCLMUL
+  #endif
+#endif

-#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
-#define GF_W128_EQUAL(val1, val2) ((val1[0] == val2[0]) && (val1[1] == val2[1]))

 /* These are the different ways to perform multiplication.
   Not all are implemented for all values of w.
@ -27,30 +35,30 @@ extern void gf_multby_one(void *src, void *dest, int bytes, int xor);

 typedef enum {GF_MULT_DEFAULT,   
              GF_MULT_SHIFT,   
+              GF_MULT_CARRY_FREE,   
              GF_MULT_GROUP,   
              GF_MULT_BYTWO_p,
              GF_MULT_BYTWO_b,
              GF_MULT_TABLE,   
              GF_MULT_LOG_TABLE,   
+              GF_MULT_LOG_ZERO,
+              GF_MULT_LOG_ZERO_EXT,
              GF_MULT_SPLIT_TABLE,   
              GF_MULT_COMPOSITE } gf_mult_type_t;

 /* These are the different ways to optimize region 
-   operations.  They are bits because you can compose them:
-   You can mix SINGLE/DOUBLE/QUAD, LAZY, SSE/NOSSE, STDMAP/ALTMAP/CAUCHY.
+   operations.  They are bits because you can compose them.
   Certain optimizations only apply to certain gf_mult_type_t's.  
   Again, please see documentation for how to use these */
   
 #define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE   (0x2)
+#define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSSE        (0x10)
+#define GF_REGION_ALTMAP       (0x20)
+#define GF_REGION_CAUCHY       (0x40)

 typedef uint32_t gf_region_type_t;

@ -74,6 +82,9 @@ typedef uint32_t    gf_val_32_t;
 typedef uint64_t    gf_val_64_t;
 typedef uint64_t   *gf_val_128_t;

+extern int _gf_errno;
+extern void gf_error();
+
 typedef struct gf *GFP;

 typedef union gf_func_a_b {
@ -109,8 +120,21 @@ typedef struct gf {
  void           *scratch;
 } gf_t;
    
+/* Initializes the GF to defaults.  Pass it a pointer to a gf_t.
+   Returns 0 on failure, 1 on success. */
+
 extern int gf_init_easy(GFP gf, int w);

+/* Initializes the GF changing the defaults.
+   Returns 0 on failure, 1 on success.
+   Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use one of gf_mult_type_t gf_divide_type_t .  
+   For region_type, OR together the GF_REGION_xxx's defined above.  
+   Use 0 as prim_poly for defaults.  Otherwise, the leading 1 is optional.
+   Use NULL for scratch_memory to have init_hard allocate memory.  Otherwise,
+   use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
 extern int gf_init_hard(GFP gf, 
                        int w, 
                        int mult_type, 
@ -122,6 +146,9 @@ extern int gf_init_hard(GFP gf,
                        GFP base_gf,
                        void *scratch_memory);

+/* Determines the size for scratch_memory.  
+   Returns 0 on failure and non-zero on success. */
+
 extern int gf_scratch_size(int w, 
                           int mult_type, 
                           int region_type, 
@ -129,25 +156,32 @@ extern int gf_scratch_size(int w,
                           int arg1, 
                           int arg2);

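+/* This reports the total size in bytes of a gf_t that has already been
+   created: sizeof(gf_t) plus its gf_scratch_size(), plus the size of the
+   base field when the gf_t is a composite field. */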
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+   If recursive = 1, then it calls itself recursively on base_gf. */
+
 extern int gf_free(GFP gf, int recursive);

 /* This is support for inline single multiplications and divisions.
   I know it's yucky, but if you've got to be fast, you've got to be fast.
-   We'll support inlines for w=4, w=8 and w=16.  
+   We support inlining for w=4, w=8 and w=16.  

   To use inline multiplication and division with w=4 or 8, you should use the 
   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
-   will return NULL. */
+   will return NULL. Similarly, with w=16, the gf_t must be LOG */

 uint8_t *gf_w4_get_mult_table(GFP gf);
 uint8_t *gf_w4_get_div_table(GFP gf);

-#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|b])
+#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)])

 uint8_t *gf_w8_get_mult_table(GFP gf);
 uint8_t *gf_w8_get_div_table(GFP gf);

-#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) a)<<8)|b])
+#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)])

 uint16_t *gf_w16_get_log_table(GFP gf);
 uint16_t *gf_w16_get_mult_alog_table(GFP gf);
--- a/gf_example_5.c
+++ b/gf_example_5.c
@ -0,0 +1,73 @@
+/*
+ * gf_example_5.c
+ *
+ * Demonstrating altmap and extract_word
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+/* Writes the one-line usage message to stderr and exits with status 1.
+   The argument s is accepted but never read. */
+
+void usage(char *s)
+{
+  static const char message[] = "usage: gf_example_5\n";
+
+  fputs(message, stderr);
+  exit(1);
+}
+
+/* Demonstrates ALTMAP and extract_word in GF(2^16): multiplies a region of
+   30 random 16-bit words by 0x1234 using SPLIT 16,4 with GF_REGION_ALTMAP,
+   dumps the raw contents of both buffers, and then prints each word and its
+   product as recovered through gf.extract_word.w32().  Returns 0 on
+   success; exits with status 1 if initialization or allocation fails. */
+
+int main(int argc, char **argv)
+{
+  uint16_t *a, *b;
+  int i, j;
+  gf_t gf;
+
+  if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, 
+                   0, 16, 4, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard failed\n");
+    exit(1);
+  }
+
+  a = (uint16_t *) malloc(200);
+  b = (uint16_t *) malloc(200);
+  if (a == NULL || b == NULL) {   /* Do not offset or dereference a failed allocation */
+    perror("malloc");
+    exit(1);
+  }
+
+  /* Skip the first 6 words (12 bytes) of each buffer -- presumably so the
+     regions do not start on the allocator's alignment boundary. */
+
+  a += 6;
+  b += 6;
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1);
+
+  gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  /* Raw memory contents, ten words per row. */
+
+  for (i = 0; i < 30; i += 10) {
+    printf("\n");
+    printf("  ");
+    for (j = 0; j < 10; j++) printf(" %4d", i+j);
+    printf("\n");
+
+    printf("a:");
+    for (j = 0; j < 10; j++) printf(" %04x", a[i+j]);
+    printf("\n");
+
+    printf("b:");
+    for (j = 0; j < 10; j++) printf(" %04x", b[i+j]);
+    printf("\n");
+    printf("\n");
+  }
+
+  /* Words as seen through extract_word, printed in two columns (i and i+15). */
+
+  for (i = 0; i < 15; i ++) {
+    printf("Word %2d: 0x%04x * 0x1234 = 0x%04x    ", i,
+           gf.extract_word.w32(&gf, a, 30*2, i),
+           gf.extract_word.w32(&gf, b, 30*2, i));
+    printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15,
+           gf.extract_word.w32(&gf, a, 30*2, i+15),
+           gf.extract_word.w32(&gf, b, 30*2, i+15));
+  }
+
+  return 0;
+}
--- a/gf_example_6.c
+++ b/gf_example_6.c
@ -0,0 +1,79 @@
+/*
+ * gf_example_6.c
+ *
+ * Demonstrating altmap and extract_word
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+/* Writes the one-line usage message to stderr and exits with status 1.
+   The argument s is accepted but never read. */
+
+void usage(char *s)
+{
+  static const char message[] = "usage: gf_example_6\n";
+
+  fputs(message, stderr);
+  exit(1);
+}
+
+/* Demonstrates ALTMAP and extract_word with a composite field: builds
+   GF(2^32) as a GF_MULT_COMPOSITE field over a GF(2^16) LOG_TABLE base,
+   multiplies a region of 30 random 32-bit words by 0x12345678, dumps the raw
+   contents of both buffers, and then prints each word and its product as
+   recovered through gf.extract_word.w32().  Returns 0 on success; exits
+   with status 1 if initialization or allocation fails. */
+
+int main(int argc, char **argv)
+{
+  uint32_t *a, *b;
+  int i, j;
+  gf_t gf, gf_16;
+
+  if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                   0, 0, 0, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard (6) failed\n");
+    exit(1);
+  }
+
+  if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, 
+                   0, 2, 0, &gf_16, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard (32) failed\n");
+    exit(1);
+  }
+
+  a = (uint32_t *) malloc(200);
+  b = (uint32_t *) malloc(200);
+  if (a == NULL || b == NULL) {   /* Do not offset or dereference a failed allocation */
+    perror("malloc");
+    exit(1);
+  }
+
+  /* Skip the first 3 words (12 bytes) of each buffer -- presumably so the
+     regions do not start on the allocator's alignment boundary. */
+
+  a += 3;
+  b += 3;
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1);
+
+  gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  /* Raw memory contents, ten words per row. */
+
+  for (i = 0; i < 30; i += 10) {
+    printf("\n");
+    printf("  ");
+    for (j = 0; j < 10; j++) printf(" %8d", i+j);
+    printf("\n");
+
+    printf("a:");
+    for (j = 0; j < 10; j++) printf(" %08x", a[i+j]);
+    printf("\n");
+
+    printf("b:");
+    for (j = 0; j < 10; j++) printf(" %08x", b[i+j]);
+    printf("\n");
+    printf("\n");
+  }
+
+  /* Words as seen through extract_word, printed in two columns (i and i+15). */
+
+  for (i = 0; i < 15; i ++) {
+    printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x    ", i,
+           gf.extract_word.w32(&gf, a, 30*4, i),
+           gf.extract_word.w32(&gf, b, 30*4, i));
+    printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15,
+           gf.extract_word.w32(&gf, a, 30*4, i+15),
+           gf.extract_word.w32(&gf, b, 30*4, i+15));
+  }
+
+  return 0;
+}
--- a/gf_example_7.c
+++ b/gf_example_7.c
@ -0,0 +1,70 @@
+/*
+ * gf_example_7.c
+ *
+ * Demonstrating extract_word and Cauchy
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+/* Writes the one-line usage message to stderr and exits with status 1.
+   The argument s is accepted but never read. */
+
+void usage(char *s)
+{
+  static const char message[] = "usage: gf_example_7\n";
+
+  fputs(message, stderr);
+  exit(1);
+}
+
+/* Demonstrates extract_word with GF_REGION_CAUCHY in GF(2^3): multiplies a
+   3-byte region by 5, prints both buffers as bytes and as bit strings, and
+   then prints eight words and their products as recovered through
+   gf.extract_word.w32().  Returns 0 on success; exits with status 1 if
+   initialization or allocation fails. */
+
+int main(int argc, char **argv)
+{
+  uint8_t *a, *b;
+  int i, j;
+  gf_t gf;
+
+  if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard failed\n");
+    exit(1);
+  }
+
+  a = (uint8_t *) malloc(3);
+  b = (uint8_t *) malloc(3);
+  if (a == NULL || b == NULL) {   /* Do not dereference a failed allocation */
+    perror("malloc");
+    exit(1);
+  }
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1);
+
+  gf.multiply_region.w32(&gf, a, b, 5, 3, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  printf("\n");
+  printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]);
+  printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]);
+  printf("\n");
+
+  /* Each byte as eight bits, most significant bit first. */
+
+  printf("a bits:");
+  for (i = 0; i < 3; i++) {
+    printf(" ");
+    for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0');
+  }
+  printf("\n");
+
+  printf("b bits:");
+  for (i = 0; i < 3; i++) {
+    printf(" ");
+    for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0');
+  }
+  printf("\n");
+
+  /* 3 bytes = 24 bits = eight 3-bit words. */
+
+  printf("\n");
+  for (i = 0; i < 8; i++) {
+    printf("Word %2d: %d * 5 = %d\n", i,
+           gf.extract_word.w32(&gf, a, 3, i),
+           gf.extract_word.w32(&gf, b, 3, i));
+  }
+
+  return 0;
+}
--- a/gf_general.c
+++ b/gf_general.c
@ -95,12 +95,20 @@ void gf_general_set_random(gf_general_t *v, int w, int zero_ok)
  }
 }

+/* Formats v as text into s for a field of width w.
+
+   For w <= 64 the value is written in hexadecimal when hex is nonzero and in
+   unsigned decimal otherwise.  The stored values are unsigned, so the decimal
+   conversions must be unsigned too; a signed conversion would print any value
+   with its top bit set as a negative number.  The w > 64 branch shown here
+   always writes hexadecimal.  s must be large enough for the result; no
+   length is checked. */
+
-void gf_general_val_to_s(gf_general_t *v, int w, char *s)
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex)
 {
  if (w <= 32) {
-    sprintf(s, "%x", v->w32);
+    if (hex) {
+      sprintf(s, "%x", v->w32);
+    } else {
+      sprintf(s, "%u", v->w32);
+    }
  } else if (w <= 64) {
-    sprintf(s, "%llx", (long long unsigned int) v->w64);
+    if (hex) {
+      sprintf(s, "%llx", (long long unsigned int) v->w64);
+    } else {
+      sprintf(s, "%llu", (long long unsigned int) v->w64);
+    }
  } else {
    if (v->w128[0] == 0) {
      sprintf(s, "%llx", (long long unsigned int) v->w128[1]);
@ -111,6 +119,64 @@ void gf_general_val_to_s(gf_general_t *v, int w, char *s)
  }
 }

+/* Parses the string s into v for a field of width w.
+
+   s is read as hexadecimal when hex is nonzero and as unsigned decimal
+   otherwise; decimal is rejected when w > 64.  For w < 32 the parsed value
+   must fit in w bits.  For w > 64, s may hold at most 32 hex digits: the
+   last 16 are stored in v->w128[1] and any digits before them in v->w128[0].
+
+   Returns 1 on success and 0 on failure.  While splitting a long 128-bit
+   value, s is modified temporarily, but it is always restored before the
+   function returns. */
+
+int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex)
+{
+  int l;
+  int save;
+  int ok;
+
+  if (w <= 32) {
+    /* sscanf returns EOF, not 0, on empty input, so require exactly one conversion. */
+    if (hex) {
+      if (sscanf(s, "%x", &(v->w32)) != 1) return 0;
+    } else {
+      if (sscanf(s, "%u", &(v->w32)) != 1) return 0;
+    }
+    if (w == 32) return 1;
+    /* Unsigned constant: shifting a signed 1 into bit 31 is undefined, and the
+       unsigned form handles w == 31 without a special case. */
+    if (v->w32 & ~((1U << w)-1)) return 0;
+    return 1;
+  } else if (w <= 64) {
+    if (hex) return (sscanf(s, "%llx", (long long unsigned int *) &(v->w64)) == 1);
+    return (sscanf(s, "%llu", (long long unsigned int *) &(v->w64)) == 1);
+  } else {
+    if (!hex) return 0;
+    l = strlen(s);
+    if (l <= 16) {
+      v->w128[0] = 0;
+      return (sscanf(s, "%llx", (long long unsigned int *) &(v->w128[1])) == 1);
+    } else {
+      if (l > 32) return 0;
+      /* Cut the string just before its last 16 digits to parse the high half,
+         then restore the saved character BEFORE parsing the low half; the low
+         half starts at that very position and would otherwise read as empty. */
+      save = s[l-16];
+      s[l-16] = '\0';
+      ok = (sscanf(s, "%llx", (long long unsigned int *) &(v->w128[0])) == 1);
+      s[l-16] = save;
+      if (!ok) return 0;
+      return (sscanf(s+(l-16), "%llx", (long long unsigned int *) &(v->w128[1])) == 1);
+    }
+  }
+}
+    
+/* Adds a and b in the field described by gf and stores the sum in c.
+   The sum is the bitwise XOR of the operands, taken on whichever member of
+   gf_general_t matches the field width read from gf's scratch data. */
+
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  int width = ((gf_internal_t *) gf->scratch)->w;
+  int half;
+
+  /* Widest case first: both 64-bit halves, index 0 then index 1. */
+  if (width > 64) {
+    for (half = 0; half < 2; half++) {
+      c->w128[half] = a->w128[half] ^ b->w128[half];
+    }
+    return;
+  }
+
+  if (width > 32) {
+    c->w64 = a->w64 ^ b->w64;
+    return;
+  }
+
+  c->w32 = a->w32 ^ b->w32;
+}
+  
 void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
 {
  gf_internal_t *h;
@ -229,19 +295,19 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o

    if (!gf_general_are_equal(&ft, &sb, w)) {
      
-      printf("Problem with region multiply (all values in hex):\n");
-      printf("   Target address base: 0x%lx.  Word 0x%x of 0x%x.  Xor: %d\n", 
+      fprintf(stderr,"Problem with region multiply (all values in hex):\n");
+      fprintf(stderr,"   Target address base: 0x%lx.  Word 0x%x of 0x%x.  Xor: %d\n", 
                 (unsigned long) final_target, i, words, xor);
-      gf_general_val_to_s(a, w, sa);
-      gf_general_val_to_s(&oa, w, soa);
-      gf_general_val_to_s(&ot, w, sot);
-      gf_general_val_to_s(&ft, w, sft);
-      gf_general_val_to_s(&sb, w, ssb);
-      printf("   Value: %s\n", sa);
-      printf("   Original source word: %s\n", soa);
-      if (xor) printf("   XOR with target word: %s\n", sot);
-      printf("   Product word: %s\n", sft);
-      printf("   It should be: %s\n", ssb);
+      gf_general_val_to_s(a, w, sa, 1);
+      gf_general_val_to_s(&oa, w, soa, 1);
+      gf_general_val_to_s(&ot, w, sot, 1);
+      gf_general_val_to_s(&ft, w, sft, 1);
+      gf_general_val_to_s(&sb, w, ssb, 1);
+      fprintf(stderr,"   Value: %s\n", sa);
+      fprintf(stderr,"   Original source word: %s\n", soa);
+      if (xor) fprintf(stderr,"   XOR with target word: %s\n", sot);
+      fprintf(stderr,"   Product word: %s\n", sft);
+      fprintf(stderr,"   It should be: %s\n", ssb);
      exit(0);
    }
  }
@ -251,7 +317,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
 {
  void *top;
  gf_general_t g;
-  uint8_t *r8;
+  uint8_t *r8, *r8a;
  uint16_t *r16;
  uint32_t *r32;
  uint64_t *r64;
@ -263,6 +329,8 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
     However, don't allow for zeros in rb, because that will screw up
     division.
     
+     When w is 4, you fill the regions with random 4-bit words in each byte.
+
     Otherwise, treat every four bytes as an uint32_t
     and fill it with a random value mod (1 << w).
   */
@ -296,6 +364,17 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
      }
      rb += (w/8);
    }
+  } else if (w == 4) {
+    r8a = (uint8_t *) ra;
+    r8 = (uint8_t *) rb;
+    while (r8 < (uint8_t *) top) {
+      gf_general_set_random(&g, w, 1);
+      *r8a = g.w32;
+      gf_general_set_random(&g, w, 0);
+      *r8 = g.w32;
+      r8a++;
+      r8++;
+    }
  } else {
    r32 = (uint32_t *) ra;
    for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1);
@ -306,7 +385,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)

 /* This sucks, but in order to time, you really need to avoid putting ifs in 
   the inner loops.  So, I'm doing a separate timing test for each w: 
-   8, 16, 32, 64, 128 and everything else.  Fortunately, the "everything else"
+   (4 & 8), 16, 32, 64, 128 and everything else.  Fortunately, the "everything else"
   tests can be equivalent to w=32.

   I'm also putting the results back into ra, because otherwise, the optimizer might
@ -327,7 +406,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha
  w = h->w;
  top = ra + size;

-  if (w == 8) {
+  if (w == 8 || w == 4) {
    r8a = (uint8_t *) ra; 
    r8b = (uint8_t *) rb; 
    top8 = (uint8_t *) top;
--- a/gf_general.h
+++ b/gf_general.h
@ -32,10 +32,12 @@ int gf_general_is_zero(gf_general_t *v, int w);
 int gf_general_is_one(gf_general_t *v, int w);
 int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w);

-void gf_general_val_to_s(gf_general_t *v, int w, char *s);
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex);
+int  gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex);

 void gf_general_set_random(gf_general_t *v, int w, int zero_ok);

+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
 void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
 void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
 void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b);
--- a/gf_inline_time.c
+++ b/gf_inline_time.c
@ -9,6 +9,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <time.h>
+#include <sys/time.h>

 #include "gf_complete.h"
 #include "gf_rand.h"
--- a/gf_int.h
+++ b/gf_int.h
@ -51,11 +51,15 @@ extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divid
 void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor);
 gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index);

-
 extern void gf_alignment_error(char *s, int a);

 extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp);

+/* This returns the correct default for prim_poly when base is used as the base
+   field for COMPOSITE.  It returns 0 if we don't have a default prim_poly. */
+
+extern uint64_t gf_composite_get_default_poly(gf_t *base);
+
 /* This structure lets you define a region multiply.  It helps because you can handle
   unaligned portions of the data with the procedures below, which really cleans
   up the code. */
@ -96,3 +100,97 @@ extern void gf_do_final_region_alignment(gf_region_data *rd);
 extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base);

 extern void gf_multby_zero(void *dest, int bytes, int xor);
+extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+
+/* Error codes describing why a field could not be created.  The comment on
+   each code gives the condition that triggers it.  create_gf_from_argv() stores
+   these in _gf_errno when argument parsing fails; the remaining codes are
+   presumably set by the gf_init_hard() checks they describe (confirm in gf.c).
+   GF_E_DEFAULT is the "no specific error recorded" value: gf_methods.c treats a
+   failure that leaves _gf_errno == GF_E_DEFAULT as an unlabeled failure. */
+
+typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
+              GF_E_MDEFREG, /* Reg != Default && Mult == Default */
+              GF_E_MDEFARG, /* Args != Default && Mult == Default */
+              GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
+              GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
+              GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
+              GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
+              GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
+              GF_E_CAUGT32, /* Reg == CAUCHY && w > 32 */
+              GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
+              GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */
+              GF_E_MATRIXW, /* Div == MATRIX && w > 32 */
+              GF_E_BAD___W, /* Illegal w */
+              GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */
+              GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */
+              GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */
+              GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */
+              GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */
+              GF_E_QUAD__W, /* Reg == QUAD && w != 4 */
+              GF_E_QUAD__J, /* Reg == QUAD && other Reg */
+              GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD */
+              GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
+              GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
+              GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
+              GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
+              GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
+              GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
+              GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
+              GF_E_LOGBADW, /* Mult == LOGx, w too big */
+              GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */
+              GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */
+              GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */
+              GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */
+              GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */
+              GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
+              GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
+              GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
+              GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4  */
+              GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
+              GF_E_GR_AR_W, /* Mult == GROUP, either arg > w  */
+              GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
+              GF_E_TABLE_W, /* Mult == TABLE, w too big */
+              GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
+              GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
+              GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
+              GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
+              GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */
+              GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */
+              GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */
+              GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */
+              GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128)  */
+              GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */
+              GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */
+              GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */
+              GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */
+              GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */
+              GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */
+              GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */
+              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+              GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+              GF_E_COMP__W, /* Mult == COMP, Bad w. */
+              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+              GF_E_UNKNOWN, /* Unknown mult_type. */
+              GF_E_UNK_REG, /* Unknown region_type. */
+              GF_E_UNK_DIV, /* Unknown divide_type. */
+              GF_E_CFM___W, /* Mult == CFM,  Bad w. */
+              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_FEWARGS, /* Too few args in argc/argv. */
+              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+              GF_E_TWOMULT, /* In create_from... two -m's. */
+              GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+              GF_E_DEFAULT } gf_error_type_t;
+
--- a/gf_method.c
+++ b/gf_method.c
@ -11,179 +11,172 @@
 #include <time.h>

 #include "gf_complete.h"
+#include "gf_int.h"
 #include "gf_method.h"

-void methods_to_stderr()
-{
-  fprintf(stderr, "To specify the methods, do one of the following: \n");
-  fprintf(stderr, "       - leave empty to use defaults\n");
-  fprintf(stderr, "       - use a single dash to use defaults\n");
-  fprintf(stderr, "       - specify MULTIPLY REGION DIVIDE\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of MULTIPLY:\n");
-  fprintf(stderr, "       SHIFT: shift\n");
-  fprintf(stderr, "       GROUP g_mult g_reduce: the Group technique - see the paper\n");
-  fprintf(stderr, "       BYTWO_p: BYTWO doubling the product.\n");
-  fprintf(stderr, "       BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)\n");
-  fprintf(stderr, "       TABLE: Full multiplication table\n");
-  fprintf(stderr, "       LOG:   Discrete logs\n");
-  fprintf(stderr, "       LOG_ZERO: Discrete logs with a large table for zeros\n");
-  fprintf(stderr, "       LOG_ZERO_EXT: Discrete logs with an extra large table for zeros\n");
-  fprintf(stderr, "       SPLIT g_a g_b: Split tables defined by g_a and g_b\n");
-  fprintf(stderr, "       COMPOSITE k rec METHOD: Composite field.  GF((2^l)^k), l=w/k.\n");
-  fprintf(stderr, "                               rec = 0 means inline single multiplication\n");
-  fprintf(stderr, "                               rec = 1 means recursive single multiplication\n");
-  fprintf(stderr, "                               METHOD is the method of the base field in GF(2^l)\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'\n");
-  fprintf(stderr, "       -: Use defaults\n");
-  fprintf(stderr, "       SINGLE/DOUBLE/QUAD: Expand tables\n");
-  fprintf(stderr, "       LAZY: Lazily create table (only applies to TABLE and SPLIT)\n");
-  fprintf(stderr, "       SSE/NOSSE: Use 128-bit SSE instructions if you can\n");
-  fprintf(stderr, "       CAUCHY/ALTMAP/STDMAP: Use different memory mappings\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of DIVIDE:\n");
-  fprintf(stderr, "       -: Use defaults\n");
-  fprintf(stderr, "       MATRIX: Use matrix inversion\n");
-  fprintf(stderr, "       EUCLID: Use the extended Euclidian algorithm.\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "See the user's manual for more information.\n");
-  fprintf(stderr, "There are many restrictions, so it is better to simply use defaults in most cases.\n");
-}
-
 int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
 {
  int mult_type, divide_type, region_type;
-  uint32_t prim_poly = 0;
  int arg1, arg2, subrg_size;
+  uint64_t prim_poly;
  gf_t *base;
  char *crt, *x, *y;

-  if (argc <= starting || strcmp(argv[starting], "-") == 0) {
-    if (!gf_init_easy(gf, w)) return 0;
-    return (argc <= starting) ? starting : starting+1;
-  }
-
+  mult_type = GF_MULT_DEFAULT;
  region_type = GF_REGION_DEFAULT;
  divide_type = GF_DIVIDE_DEFAULT;
-
-  arg1 = 0;
-  arg2 = 0;
  prim_poly = 0;
  base = NULL;
-  subrg_size = 0;
-  
-  if (argc < starting+3) return 0;
-
-  if (strcmp(argv[starting], "SHIFT") == 0) {
-    mult_type = GF_MULT_SHIFT;
-    starting++;
-  } else if (strcmp(argv[starting], "GROUP") == 0) {
-    mult_type = GF_MULT_GROUP;
-    if (argc < starting+5) return 0;
-    if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
-        sscanf(argv[starting+2], "%d", &arg2) == 0 ||
-        arg1 <= 0 || arg2 <= 0 || arg1 >= w || arg2 >= w) return 0;
-    starting += 3;
-  } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
-    mult_type = GF_MULT_BYTWO_p;
-    starting++;
-  } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
-    mult_type = GF_MULT_BYTWO_b;
-    starting++;
-  } else if (strcmp(argv[starting], "TABLE") == 0) {
-    mult_type = GF_MULT_TABLE;
-    starting++;
-  } else if (strcmp(argv[starting], "LOG") == 0) {
-    mult_type = GF_MULT_LOG_TABLE;
-    starting++;
-  } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
-    mult_type = GF_MULT_LOG_TABLE;
-    arg1 = 1;
-    starting++;
-  } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
-    mult_type = GF_MULT_LOG_TABLE;
-    arg1 = 2;
-    starting++;
-  } else if (strcmp(argv[starting], "SPLIT") == 0) {
-    mult_type = GF_MULT_SPLIT_TABLE;
-    if (argc < starting+5) return 0;
-    if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
-        sscanf(argv[starting+2], "%d", &arg2) == 0 ||
-        arg1 <= 0 || arg2 <= 0 || w % arg1 != 0 || w % arg2 != 0) return 0;
-    starting += 3;
-  } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
-    mult_type = GF_MULT_COMPOSITE;
-    if (argc < starting+6) return 0;
-    if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
-        sscanf(argv[starting+2], "%d", &arg2) == 0 ||
-        arg1 <= 1 || w %arg1 != 0 || ((arg2 | 1) != 1)) return 0;
-    base = (gf_t *) malloc(sizeof(gf_t));
-    starting = create_gf_from_argv(base, w/arg1, argc, argv, starting+3);
-    if (starting == 0) { free(base); return 0; }
-  } else {
-    return 0;
-  }
-
-  if (argc < starting+2) {
-    if (base != NULL) gf_free(base, 1);
-    return 0;
-  }
-
-  if (strcmp(argv[starting], "-") == 0) {
-    region_type = GF_REGION_DEFAULT;
-  } else {
-    crt = strdup(argv[starting]);
-    region_type = 0;
-    x = crt;
-    do { 
-      y = strchr(x, ','); 
-      if (y != NULL) *y = '\0';
-      if (strcmp(x, "DOUBLE") == 0) {
-        region_type |= GF_REGION_DOUBLE_TABLE;
-      } else if (strcmp(x, "QUAD") == 0) {
-        region_type |= GF_REGION_QUAD_TABLE;
-      } else if (strcmp(x, "SINGLE") == 0) {
-        region_type |= GF_REGION_SINGLE_TABLE;
-      } else if (strcmp(x, "LAZY") == 0) {
-        region_type |= GF_REGION_LAZY;
-      } else if (strcmp(x, "SSE") == 0) {
-        region_type |= GF_REGION_SSE;
-      } else if (strcmp(x, "NOSSE") == 0) {
-        region_type |= GF_REGION_NOSSE;
-      } else if (strcmp(x, "CAUCHY") == 0) {
-        region_type |= GF_REGION_CAUCHY;
-      } else if (strcmp(x, "ALTMAP") == 0) {
-        region_type |= GF_REGION_ALTMAP;
-      } else if (strcmp(x, "STDMAP") == 0) {
-        region_type |= GF_REGION_STDMAP;
+  arg1 = 0;
+  arg2 = 0;
+  while (1) {
+    if (argc > starting) {
+      if (strcmp(argv[starting], "-m") == 0) {
+        starting++;
+        if (mult_type != GF_MULT_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWOMULT;
+          return 0;
+        }
+        if (strcmp(argv[starting], "SHIFT") == 0) {
+          mult_type = GF_MULT_SHIFT;
+          starting++;
+        } else if (strcmp(argv[starting], "CARRY_FREE") == 0) {
+          mult_type = GF_MULT_CARRY_FREE;
+          starting++;
+        } else if (strcmp(argv[starting], "GROUP") == 0) {
+          mult_type = GF_MULT_GROUP;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_GROUPAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+              sscanf(argv[starting+2], "%d", &arg2) == 0) {
+            _gf_errno = GF_E_GROUPNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
+          mult_type = GF_MULT_BYTWO_p;
+          starting++;
+        } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
+          mult_type = GF_MULT_BYTWO_b;
+          starting++;
+        } else if (strcmp(argv[starting], "TABLE") == 0) {
+          mult_type = GF_MULT_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG") == 0) {
+          mult_type = GF_MULT_LOG_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
+          mult_type = GF_MULT_LOG_ZERO;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
+          mult_type = GF_MULT_LOG_ZERO_EXT;
+          starting++;
+        } else if (strcmp(argv[starting], "SPLIT") == 0) {
+          mult_type = GF_MULT_SPLIT_TABLE;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_SPLITAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0 ||
+              sscanf(argv[starting+2], "%d", &arg2) == 0) {
+            _gf_errno = GF_E_SPLITNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
+          mult_type = GF_MULT_COMPOSITE;
+          if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; }
+          if (sscanf(argv[starting+1], "%d", &arg1) == 0) {
+            _gf_errno = GF_E_COMP_A2;
+            return 0;
+          }
+          starting += 2;
+          base = (gf_t *) malloc(sizeof(gf_t));
+          starting = create_gf_from_argv(base, w/arg1, argc, argv, starting);
+          if (starting == 0) {
+            free(base);
+            return 0;
+          }
+        } else {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_UNKNOWN;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-r") == 0) {
+        starting++;
+        if (strcmp(argv[starting], "DOUBLE") == 0) {
+          region_type |= GF_REGION_DOUBLE_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "QUAD") == 0) {
+          region_type |= GF_REGION_QUAD_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LAZY") == 0) {
+          region_type |= GF_REGION_LAZY;
+          starting++;
+        } else if (strcmp(argv[starting], "SSE") == 0) {
+          region_type |= GF_REGION_SSE;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSSE") == 0) {
+          region_type |= GF_REGION_NOSSE;
+          starting++;
+        } else if (strcmp(argv[starting], "CAUCHY") == 0) {
+          region_type |= GF_REGION_CAUCHY;
+          starting++;
+        } else if (strcmp(argv[starting], "ALTMAP") == 0) {
+          region_type |= GF_REGION_ALTMAP;
+          starting++;
+        } else {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_UNK_REG;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-p") == 0) {
+        starting++;
+        if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_POLYSPC;
+          return 0;
+        }
+        starting++;
+      } else if (strcmp(argv[starting], "-d") == 0) {
+        starting++;
+        if (divide_type != GF_DIVIDE_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWO_DIV;
+          return 0;
+        } else if (strcmp(argv[starting], "EUCLID") == 0) {
+          divide_type = GF_DIVIDE_EUCLID;
+          starting++;
+        } else if (strcmp(argv[starting], "MATRIX") == 0) {
+          divide_type = GF_DIVIDE_MATRIX;
+          starting++;
+        } else {
+          _gf_errno = GF_E_UNK_DIV;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-") == 0) {
+         /*
+         printf("Scratch size: %d\n", gf_scratch_size(w, 
+                                      mult_type, region_type, divide_type, arg1, arg2));
+         */
+        if (gf_init_hard(gf, w, mult_type, region_type, divide_type, 
+                         prim_poly, arg1, arg2, base, NULL) == 0) {
+          if (base != NULL) gf_free(base, 1);
+          return 0;
+        } else
+          return starting + 1;
      } else {
        if (base != NULL) gf_free(base, 1);
-        free(crt);
+        _gf_errno = GF_E_UNKFLAG;
        return 0;
      }
-      if (y != NULL) x = y+1;
-    } while (y != NULL);
-    free(crt);
+    } else {
+      if (base != NULL) gf_free(base, 1);
+      _gf_errno = GF_E_FEWARGS;
+      return 0;
+    }
  }
-
-  starting++;
-
-  if (strcmp(argv[starting], "-") == 0) {
-    divide_type = GF_DIVIDE_DEFAULT;
-  } else if (strcmp(argv[starting], "MATRIX") == 0) {
-    divide_type = GF_DIVIDE_MATRIX;
-  } else if (strcmp(argv[starting], "EUCLID") == 0) {
-    divide_type = GF_DIVIDE_EUCLID;
-  } else {
-    if (base != NULL) gf_free(base, 1);
-    return 0;
-  }
-  starting++;
-
-  if (!gf_init_hard(gf, w, mult_type, region_type, divide_type, prim_poly, arg1, arg2, base, NULL)) {
-    if (base != NULL) gf_free(base, 1);
-    return 0;
-  }
-  return starting;
 }
--- a/gf_method.h
+++ b/gf_method.h
@ -8,8 +8,9 @@

 #include "gf_complete.h"

-/* This prints out the error string defining the methods that you can put on argv*/
-extern void methods_to_stderr();
+/* Parses argv starting at "starting".  
+   
+   Returns 0 on failure.
+   On success, it returns one past the last argument it read in argv. */

-/* Parses argv starting at "starting" */
 extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);
--- a/gf_methods.c
+++ b/gf_methods.c
@ -11,58 +11,26 @@

 #include "gf_complete.h"
 #include "gf_method.h"
+#include "gf_int.h"

-#define NMULTS (15)
-static char *mults[NMULTS] = { "SHIFT", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
-                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE-0", "COMPOSITE-1" };
+#define NMULTS (16)
+static char *mults[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
+                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
+                               "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };

-#define NREGIONS (96) 
-static char *regions[NREGIONS] = { "-", "SINGLE", "DOUBLE", "QUAD",
-"LAZY", "SINGLE,LAZY", "DOUBLE,LAZY", "QUAD,LAZY", "SSE",
-"SINGLE,SSE", "DOUBLE,SSE", "QUAD,SSE", "LAZY,SSE",
-"SINGLE,LAZY,SSE", "DOUBLE,LAZY,SSE", "QUAD,LAZY,SSE", "NOSSE",
-"SINGLE,NOSSE", "DOUBLE,NOSSE", "QUAD,NOSSE", "LAZY,NOSSE",
-"SINGLE,LAZY,NOSSE", "DOUBLE,LAZY,NOSSE", "QUAD,LAZY,NOSSE",
-"STDMAP", "SINGLE,STDMAP", "DOUBLE,STDMAP", "QUAD,STDMAP",
-"LAZY,STDMAP", "SINGLE,LAZY,STDMAP", "DOUBLE,LAZY,STDMAP",
-"QUAD,LAZY,STDMAP", "SSE,STDMAP", "SINGLE,SSE,STDMAP",
-"DOUBLE,SSE,STDMAP", "QUAD,SSE,STDMAP", "LAZY,SSE,STDMAP",
-"SINGLE,LAZY,SSE,STDMAP", "DOUBLE,LAZY,SSE,STDMAP",
-"QUAD,LAZY,SSE,STDMAP", "NOSSE,STDMAP", "SINGLE,NOSSE,STDMAP",
-"DOUBLE,NOSSE,STDMAP", "QUAD,NOSSE,STDMAP", "LAZY,NOSSE,STDMAP",
-"SINGLE,LAZY,NOSSE,STDMAP", "DOUBLE,LAZY,NOSSE,STDMAP",
-"QUAD,LAZY,NOSSE,STDMAP", "ALTMAP", "SINGLE,ALTMAP", "DOUBLE,ALTMAP",
-"QUAD,ALTMAP", "LAZY,ALTMAP", "SINGLE,LAZY,ALTMAP",
-"DOUBLE,LAZY,ALTMAP", "QUAD,LAZY,ALTMAP", "SSE,ALTMAP",
-"SINGLE,SSE,ALTMAP", "DOUBLE,SSE,ALTMAP", "QUAD,SSE,ALTMAP",
-"LAZY,SSE,ALTMAP", "SINGLE,LAZY,SSE,ALTMAP",
-"DOUBLE,LAZY,SSE,ALTMAP", "QUAD,LAZY,SSE,ALTMAP", "NOSSE,ALTMAP",
-"SINGLE,NOSSE,ALTMAP", "DOUBLE,NOSSE,ALTMAP", "QUAD,NOSSE,ALTMAP",
-"LAZY,NOSSE,ALTMAP", "SINGLE,LAZY,NOSSE,ALTMAP",
-"DOUBLE,LAZY,NOSSE,ALTMAP", "QUAD,LAZY,NOSSE,ALTMAP", "CAUCHY",
-"SINGLE,CAUCHY", "DOUBLE,CAUCHY", "QUAD,CAUCHY", "LAZY,CAUCHY",
-"SINGLE,LAZY,CAUCHY", "DOUBLE,LAZY,CAUCHY", "QUAD,LAZY,CAUCHY",
-"SSE,CAUCHY", "SINGLE,SSE,CAUCHY", "DOUBLE,SSE,CAUCHY",
-"QUAD,SSE,CAUCHY", "LAZY,SSE,CAUCHY", "SINGLE,LAZY,SSE,CAUCHY",
-"DOUBLE,LAZY,SSE,CAUCHY", "QUAD,LAZY,SSE,CAUCHY", "NOSSE,CAUCHY",
-"SINGLE,NOSSE,CAUCHY", "DOUBLE,NOSSE,CAUCHY", "QUAD,NOSSE,CAUCHY",
-"LAZY,NOSSE,CAUCHY", "SINGLE,LAZY,NOSSE,CAUCHY",
-"DOUBLE,LAZY,NOSSE,CAUCHY", "QUAD,LAZY,NOSSE,CAUCHY" };
+#define NREGIONS (7) 
+static char *regions[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE", 
+                                   "ALTMAP", "CAUCHY" };

-#define NDIVS (3)
-static char *divides[NDIVS] = { "-", "MATRIX", "EUCLID" }; 
+#define NDIVS (2)
+static char *divides[NDIVS] = { "MATRIX", "EUCLID" }; 

-int main()
+int main() 
 {
-  int m, r, d, w, i, sa, j;
-  char *argv[20];
+  int m, r, d, w, i, sa, j, k, reset;
+  char *argv[50];
  gf_t gf;
  char divs[200], ks[10], ls[10];
-
-  methods_to_stderr();
-
-  printf("\n");
-  printf("Implemented Methods: \n\n");
  
  for (i = 2; i < 8; i++) {
    w = (1 << i);
@ -70,9 +38,14 @@ int main()
    if (create_gf_from_argv(&gf, w, 1, argv, 0) > 0) {
      printf("w=%d: -\n", w);
      gf_free(&gf, 1);
+    } else if (_gf_errno == GF_E_DEFAULT) {
+      fprintf(stderr, "Unlabeled failed method: w=%d: -\n", 2);
+      exit(1);
    }
+
    for (m = 0; m < NMULTS; m++) {
      sa = 0;
+      argv[sa++] = "-m";
      if (strcmp(mults[m], "GROUP44") == 0) {
        argv[sa++] = "GROUP";
        argv[sa++] = "4";
@ -96,46 +69,66 @@ int main()
        sprintf(ls, "%d", w);
        argv[sa++] = ls;
        argv[sa++] = "8";
+      } else if (strcmp(mults[m], "SPLIT16") == 0) {
+        argv[sa++] = "SPLIT";
+        sprintf(ls, "%d", w);
+        argv[sa++] = ls;
+        argv[sa++] = "16";
      } else if (strcmp(mults[m], "SPLIT88") == 0) {
        argv[sa++] = "SPLIT";
        argv[sa++] = "8";
        argv[sa++] = "8";
-      } else if (strcmp(mults[m], "COMPOSITE-0") == 0) {
+      } else if (strcmp(mults[m], "COMPOSITE") == 0) {
        argv[sa++] = "COMPOSITE";
        argv[sa++] = "2";
-        argv[sa++] = "0";
-        argv[sa++] = "-";
-      } else if (strcmp(mults[m], "COMPOSITE-1") == 0) {
-        argv[sa++] = "COMPOSITE";
-        argv[sa++] = "2";
-        argv[sa++] = "1";
        argv[sa++] = "-";
      } else {
        argv[sa++] = mults[m];
      }
-      for (r = 0; r < NREGIONS; r++) {
-        argv[sa++] = regions[r]; 
-        strcpy(divs, "");
-        for (d = 0; d < NDIVS; d++) {
-          argv[sa++] = divides[d];
-/*          printf("w=%d:", w);
-          for (j = 0; j < sa; j++) printf(" %s", argv[j]);
-          printf("\n"); */
-          if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) {
-            strcat(divs, "|");
-            strcat(divs, divides[d]);
-            gf_free(&gf, 1);
-          } 
-          sa--;
+      reset = sa;
+      for (r = 0; r < (1 << NREGIONS); r++) {
+        sa = reset;
+        for (k = 0; k < NREGIONS; k++) {
+          if (r & 1 << k) {
+            argv[sa++] = "-r";
+            argv[sa++] = regions[k];
+          }
        }
-        if (strlen(divs) > 0) {
+        argv[sa++] = "-";
+        if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) {
          printf("w=%d:", w);
          for (j = 0; j < sa; j++) printf(" %s", argv[j]);
-          printf(" %s\n", divs+1);
+          printf("\n");
+          gf_free(&gf, 1);
+        } else if (_gf_errno == GF_E_DEFAULT) {
+          fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+          for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]);
+          fprintf(stderr, "\n");
+          exit(1);
        }
        sa--;
+        for (d = 0; d < NDIVS; d++) {
+          argv[sa++] = "-d";
+          argv[sa++] = divides[d];
+          /*          printf("w=%d:", w);
+                      for (j = 0; j < sa; j++) printf(" %s", argv[j]);
+                      printf("\n"); */
+          argv[sa++] = "-";
+          if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) {
+            printf("w=%d:", w);
+            for (j = 0; j < sa; j++) printf(" %s", argv[j]);
+            printf("\n");
+            gf_free(&gf, 1);
+          } else if (_gf_errno == GF_E_DEFAULT) {
+            fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+            for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]);
+            fprintf(stderr, "\n");
+            exit(1);
+          } 
+          sa-=3;
+        }
      }
-      sa--;
    }
  }
+  return 0;
 }
--- a/gf_mult.c
+++ b/gf_mult.c
@ -12,105 +12,53 @@

 #include "gf_complete.h"
 #include "gf_method.h"
+#include "gf_general.h"

-void usage(char *s)
+/* Print the gf_mult usage line to stderr, followed by a diagnostic chosen by
+   `why`, then exit with status 1.  Recognized codes: 'W' (bad w), 'A' (bad a),
+   'B' (bad b) and 'M' (bad method; the details are printed by gf_error()).
+   Any other value prints the usage line only. */
+void usage(int why)
 {
  fprintf(stderr, "usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n");
-  fprintf(stderr, "       If w has an h on the end, treat a, b and the product as hexadecimal (no 0x required)\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "       legal w are: 1-32, 64 and 128\n");
-  fprintf(stderr, "           128 is hex only (i.e. '128' will be an error - do '128h')\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "       For method specification, type gf_methods\n");
-
-  if (s != NULL) fprintf(stderr, "%s", s);
+  if (why == 'W') {
+    fprintf(stderr, "Bad w.\n");
+    fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n");
+    fprintf(stderr, "Append 'h' to w to treat a, b and the product as hexadecimal.\n");
+    fprintf(stderr, "w=128 is hex only (i.e. '128' will be an error - do '128h')\n");
+  }
+  if (why == 'A') fprintf(stderr, "Bad a\n");
+  if (why == 'B') fprintf(stderr, "Bad b\n");
+  if (why == 'M') {
+    fprintf(stderr, "Bad Method Specification: ");
+    gf_error();
+  }
  exit(1);
 }

-int read_128(char *s, uint64_t *v)
-{
-  int l, t;
-  char save;
-
-  l = strlen(s);
-  if (l > 32) return 0;
-
-  if (l > 16) {
-    if (sscanf(s + (l-16), "%llx", (long long unsigned int *) &(v[1])) == 0) return 0;
-    save = s[l-16];
-    s[l-16] = '\0';
-    t = sscanf(s, "%llx", (long long unsigned int *) &(v[0]));
-    s[l-16] = save;
-    return t;
-  } else {
-    v[0] = 0;
-    return sscanf(s, "%llx", (long long unsigned int *)&(v[1]));
-  }
-  return 1;
-}
-
-void print_128(uint64_t *v) 
-{
-  if (v[0] > 0) {
-    printf("%llx", (long long unsigned int) v[0]);
-    printf("%016llx", (long long unsigned int) v[1]);
-  } else {
-    printf("%llx", (long long unsigned int) v[1]);
-  }
-  printf("\n");
-}
-
-
 int main(int argc, char **argv)
 {
-  int hex, al, bl, w;
-  uint32_t a, b, c, top;
-  uint64_t a64, b64, c64;
-  uint64_t a128[2], b128[2], c128[2];
-  char *format;
+  int hex, w;
  gf_t gf;
+  gf_general_t a, b, c;
+  char output[50];

-  if (argc < 4) usage(NULL);
-  if (sscanf(argv[3], "%d", &w) == 0) usage("Bad w\n");
+  if (argc < 4) usage(' ');

-  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w");
+  if (sscanf(argv[3], "%d", &w) == 0) usage('W');
+  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W');

  hex = (strchr(argv[3], 'h') != NULL);
-  if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("\nBad Method\n");
+  if (!hex && w == 128) usage('W');

-  if (!hex && w == 128) usage(NULL);
- 
-  if (w <= 32) {
-    format = (hex) ? "%x" : "%u";
-    if (sscanf(argv[1], format, &a) == 0) usage("Bad a\n");
-    if (sscanf(argv[2], format, &b) == 0) usage("Bad b\n");
-
-    if (w < 32) {
-      top = (w == 31) ? 0x80000000 : (1 << w);
-      if (w != 32 && a >= top) usage("a is too large\n");
-      if (w != 32 && b >= top) usage("b is too large\n");
-    }
-  
-    c = gf.multiply.w32(&gf, a, b);
-    printf(format, c);
-    printf("\n");
-
-  } else if (w == 64) {
-    format = (hex) ? "%llx" : "%llu";
-    if (sscanf(argv[1], format, &a64) == 0) usage("Bad a\n");
-    if (sscanf(argv[2], format, &b64) == 0) usage("Bad b\n");
-    c64 = gf.multiply.w64(&gf, a64, b64);
-
-    printf(format, c64);
-    printf("\n");
-
-  } else if (w == 128) {
-
-    if (read_128(argv[1], a128) == 0) usage("Bad a\n");
-    if (read_128(argv[2], b128) == 0) usage("Bad b\n");
-    gf.multiply.w128(&gf, a128, b128, c128);
-
-    print_128(c128);
+  if (argc == 4) {
+    if (gf_init_easy(&gf, w) == 0) usage('M');
+  } else {
+    if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M');
  }
+ 
+  if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A');
+  if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B');
+
+  gf_general_multiply(&gf, &a, &b, &c);
+  gf_general_val_to_s(&c, w, output, hex);
+  
+  printf("%s\n", output);
  exit(0);
 }
--- a/gf_poly.c
+++ b/gf_poly.c
@ -1,560 +1,268 @@
 /*
- * gf_poly.c - program to help find primitive polynomials in composite fields
+   gf_poly.c - program to help find irreducible polynomials in composite fields,
+   using the Ben-Or algorithm.  
+  
+   James S. Plank
+  
+   Please see the following paper for a 
+   description of the Ben-Or algorithm:
+
+   author    S. Gao and D. Panario
+   title     Tests and Constructions of Irreducible Polynomials over Finite Fields
+   booktitle Foundations of Computational Mathematics
+   year      1997
+   publisher Springer Verlag
+   pages     346-361
+
+  The basic technique is this.  You have a polynomial f(x) whose coefficients are
+  in a base field GF(2^w).  The polynomial is of degree n.  You need to do the 
+  following for all i from 1 to n/2:
+
+  Construct x^(2^w)^i modulo f.  That will be a polynomial of maximum degree n-1
+  with coefficients in GF(2^w).  You construct that polynomial by starting with x
+  and squaring it w times, each time taking the result modulo f.  Then you 
+  multiply that by itself i times, again each time taking the result modulo f.
+
+  When you're done, you need to "subtract" x -- since addition = subtraction = 
+  XOR, that means XOR x.  
+
+  Now, find the GCD of that last polynomial and f, using Euclid's algorithm.  If
+  the GCD is not one, then f is reducible.  If the GCD is one for every one of
+  those i, then f is irreducible.
+
+  In this code, I am using a gf_general_t to represent elements of GF(2^w).  This
+  is so that I can use base fields that are GF(2^64) or GF(2^128). 
+   
+  I have two main procedures.  The first is x_to_q_to_i_minus_x, which calculates
+  x^(2^w)^i - x, putting the result into a gf_general_t * called retval.
+
+  The second is gcd_one, which takes a polynomial of degree n and a second one
+  of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1.
+
+  These can be made faster (e.g. calculate x^(2^w) once and store it).
 */

 #include "gf_complete.h"
 #include "gf_method.h"
+#include "gf_general.h"
+#include "gf_int.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

-#define GF_POLY_COEF_MASK8 0xff
-#define GF_POLY_COEF_MASK16 0xffff
-#define GF_POLY_COEF_MASK32 0xffffffff
-#define GF_POLY_COEF_MASK64 0xffffffffffffffff
+char *BM = "Bad Method: ";

-#define LLUI (long long unsigned int)
-
-struct gf_poly_coef_s;
-
-typedef struct gf_poly_coef_s {
-  uint64_t coef;
-  uint64_t power;
-  struct gf_poly_coef_s *next;
-} gf_poly_coef_t;
-
-typedef struct gf_poly_s {
-  gf_poly_coef_t *leading_coef;
-  uint64_t num_coefs;
-  gf_t *coef_gf;
-  int w;
-} gf_poly_t;
-
-static uint64_t gf_add(int w, uint64_t a, uint64_t b)
+void usage(char *s)
 {
-  if (w == 8) {
-    return (a & GF_POLY_COEF_MASK8) ^ (b & GF_POLY_COEF_MASK8);
-  } else if (w == 16) {
-    return (a & GF_POLY_COEF_MASK16) ^ (b & GF_POLY_COEF_MASK16);
-  } else if (w == 32) {
-    return (a & GF_POLY_COEF_MASK32) ^ (b & GF_POLY_COEF_MASK32);
-  } else if (w == 64) {
-    return (a & GF_POLY_COEF_MASK64) ^ (b & GF_POLY_COEF_MASK64);
-  }
-}
-
-static uint64_t gf_mult(int w, gf_t* gf, uint64_t a, uint64_t b)
-{
-  if (w <= 32) {
-    return gf->multiply.w32(gf, a, b); 
-  } else if (w == 64) {
-    return gf->multiply.w64(gf, a, b); 
-  }
-}
-
-static uint64_t gf_divide(int w, gf_t* gf, uint64_t a, uint64_t b)
-{
-  if (w <= 32) {
-    return gf->divide.w32(gf, a, b); 
-  } else if (w == 64) {
-    return gf->divide.w64(gf, a, b); 
-  }
-}
-
-static uint64_t gf_inverse(int w, gf_t* gf, uint64_t a)
-{
-  if (w <= 32) {
-    return gf->inverse.w32(gf, a); 
-  } else if (w == 64) {
-    return gf->inverse.w64(gf, a);
-  }
-}
-
-gf_poly_t* gf_poly_init(int w, gf_t *gf)
-{
-  gf_poly_t *gf_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t));
-
-  if (gf_poly == NULL || gf == NULL) {
-    return NULL;
-  }
-
-  gf_poly->leading_coef = NULL;
-  gf_poly->num_coefs = 0;
-  gf_poly->coef_gf = gf;
-  gf_poly->w = w;
-
-  return gf_poly;
-}
-
-void gf_poly_print(gf_poly_t *gf_poly, char *message)
-{
-  gf_poly_coef_t *tmp;
-
-  if (gf_poly == NULL) {
-    fprintf(stderr, "0 * x^0\n");
-    return;
-  }
-
-  tmp = gf_poly->leading_coef;
-
-  while (tmp != NULL) {
-    printf("%llu * x^%llu", LLUI tmp->coef, LLUI tmp->power);
-    tmp = tmp->next;
-    if (tmp) {
-      printf(" + ");
-    }
-  }
-
-  if (message != NULL) {
-    printf(": %s\n", message);
-  }
-}
-
-gf_poly_t* gf_poly_copy(gf_poly_t *poly)
-{
-  gf_poly_t *new_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t));
-  gf_poly_coef_t *tmp = poly->leading_coef;
-
-  if (new_poly == NULL) {
-    return NULL;
-  }
-
-  new_poly->leading_coef = NULL;
-  new_poly->num_coefs = 0;
-  new_poly->coef_gf = poly->coef_gf;
-  new_poly->w = poly->w;
-  
-  while (tmp != NULL) {
-    gf_poly_add_coef(new_poly, tmp->coef, tmp->power);
-
-    tmp = tmp->next;
-  }
-
-  return new_poly;
-}
-
-void gf_poly_clear(gf_poly_t* a)
-{
-  while (a->leading_coef != NULL) {
-    gf_poly_coef_t *tmp = a->leading_coef;
-    
-    a->leading_coef = tmp->next;
-
-    free(tmp);
-  }
-}
-
-void gf_poly_free(gf_poly_t **a)
-{
-  gf_poly_clear(*a);
-  free(*a); 
-  *a = NULL;
-}
-
-gf_poly_coef_t* gf_poly_create_node(uint64_t coef, uint64_t power)
-{
-  gf_poly_coef_t* node = (gf_poly_coef_t*)malloc(sizeof(gf_poly_coef_t));
-
-  if (node == NULL) {
-    return NULL;
-  }
-
-  node->coef = coef;
-  node->power = power;
-  node->next = NULL;
-
-  return node;
-}
-
-int gf_poly_remove_node(gf_poly_t *gf_poly, uint64_t power)
-{
-  gf_poly_coef_t* iter = gf_poly->leading_coef;
-
-  if (iter->power == power) {
-    gf_poly->leading_coef = iter->next;   
-    free(iter);
-    return 0;
-  }
-
-  while (iter->next != NULL) {
-    if (iter->next->power == power) {
-      gf_poly_coef_t* tmp = iter->next;
-      iter->next = iter->next->next;
-      free(tmp);
-      return 0;
-    }
-    iter = iter->next;
-  }
-
-  return -1;
-}
-
-int gf_poly_add_coef(gf_poly_t *gf_poly, uint64_t coef_val, uint64_t power)
-{
-  gf_poly_coef_t* node;
-  gf_poly_coef_t* iter = gf_poly->leading_coef;
-
-  /*
-   * The new node has the highest power, or there are no terms
-   */
-  if (gf_poly->leading_coef == NULL || gf_poly->leading_coef->power < power) {
-    node = gf_poly_create_node(coef_val, power);
-    node->next = gf_poly->leading_coef;
-    gf_poly->leading_coef = node;
-    return 0;
-  }
-
-  /*
-   * The new node is of the same power, add the coefs
-   */
-  if (gf_poly->leading_coef->power == power) {
-    gf_poly->leading_coef->coef = gf_add(gf_poly->w, gf_poly->leading_coef->coef, coef_val);   
-    if (gf_poly->leading_coef->coef == 0) {
-      gf_poly_remove_node(gf_poly, power);
-    }
-    return 0;
-  }
-
-  while (iter->next != NULL) {
-    if (iter->next->power == power) {
-      iter->next->coef = gf_add(gf_poly->w, iter->next->coef, coef_val);   
-
-      if (iter->next->coef == 0) {
-        gf_poly_remove_node(gf_poly, power);
-      }
-
-      return 0;
-    }
-    if (iter->next->power < power) {
-      node = gf_poly_create_node(coef_val, power);
-      node->next = iter->next;
-      iter->next = node;
-      return 0;
-    }
-    iter = iter->next;
-  }
-  
-  /*
-   * The power passed in is lower than any in the existing poly
-   */
-  node = gf_poly_create_node(coef_val, power);
-  iter->next = node;
-
-  return 0;
-}
-
-/*
- * Compute a+b and store in a
- */
-int gf_poly_add(gf_poly_t* a, gf_poly_t* b)
-{
-  gf_poly_coef_t* iter = b->leading_coef;
-
-  while (iter != NULL) {
-    gf_poly_add_coef(a, iter->coef, iter->power);
-    iter = iter->next; 
-  }
-
-  return 0;
-}
-
-/*
- * Compute a*b and store in a
- */
-int gf_poly_mult(gf_poly_t* a, gf_poly_t* b)
-{
-  gf_poly_coef_t* a_iter = a->leading_coef;
-
-  /*
-   * Remove one node at a time from 'a', starting with
-   * highest power.  Multiply the removed (coef,power)
-   * by every entry of 'b,' adding each product into 'a.'
-   */
-  while (a_iter != NULL) {
-    gf_poly_coef_t* tmp = a_iter;
-    gf_poly_coef_t* b_iter = b->leading_coef;
-
-    uint64_t a_power = a_iter->power;
-    uint64_t a_coef = a_iter->coef;
-    a_iter = a_iter->next;
-    gf_poly_remove_node(a, tmp->power);
-
-    while (b_iter != NULL) {
-      uint64_t new_power = b_iter->power + a_power;
-      uint64_t new_coef = gf_mult(a->w, a->coef_gf, b_iter->coef, a_coef);
-
-      gf_poly_add_coef(a, new_coef, new_power);
-
-      b_iter = b_iter->next;
-    }
-  }
-  return 0;
-}
-
-/*
- * Compute a % b and store in a
- */
-int gf_poly_reduce(gf_poly_t* a, gf_poly_t* b)
-{
-   gf_poly_t* c = gf_poly_init(a->w, a->coef_gf);
-   gf_poly_coef_t* a_iter = a->leading_coef;
-   gf_poly_coef_t* b_iter = b->leading_coef;
-
-  /*
-   * Reduce until the degree of 'a' is less than
-   * the degree of 'b.'  At that point 'a' will 
-   * contain the remainder of a / b.
-   */
-  while (a_iter && (a_iter->power >= b_iter->power)) {
-
-    /*
-     * Get the degree and leading coef of the current
-     * 'b'.
-     */
-    uint64_t reduce_power = a_iter->power - b_iter->power;
-    uint64_t reduce_coef = gf_divide(a->w, a->coef_gf, a_iter->coef, b_iter->coef);
-
-    /*
-     * Create a poly that will get rid of leading power
-     * of 'b' when added: c*x^(n-m)*b(x), where c 
-     * is the leading coef of 'a', n is the deg of 'a'
-     * and m is the degree of 'b'.
-     */
-    gf_poly_add_coef(c, reduce_coef, reduce_power);
-    gf_poly_mult(c, b);
-    
-    /*
-     * Add the newly created poly, which will reduce 
-     * a(x) by at least one term (leading term).
-     */
-    gf_poly_add(a, c);
-    
-    gf_poly_clear(c); 
-   
-    /*
-     * Grab the new leading term of 'a'
-     */ 
-    a_iter = a->leading_coef;
-  }
-}
-
-/*
- * Get the GCD of a and b, return the result
- */
-gf_poly_t* gf_poly_gcd(gf_poly_t* a, gf_poly_t* b)
-{
-  gf_poly_t *r1, *r2;
-  gf_poly_t* tmp_swp;
-
-  if (a->leading_coef == NULL || b->leading_coef == NULL) {
-    return NULL;
-  }
-
-  if (a->leading_coef->power > b->leading_coef->power) {
-    r1 = a;
-    r2 = b;
-  } else {
-    r1 = b;
-    r2 = a;
-  }
-
-  while ( 1 ) {
-    if (r2->leading_coef == NULL) {
-      break;
-    }
-    if (r2->leading_coef->power == 0 && r2->leading_coef->coef <= 1) {
-      break;
-    }
-
-    gf_poly_reduce(r1, r2);
-    tmp_swp = r1;
-    r1 = r2;
-    r2 = tmp_swp;
-  }
-
-  return r1;
-}
-
-/*
- * The Ben-Or algorithm for determining irreducibility
- */
-int gf_poly_is_irred(gf_poly_t* poly)
-{
-  gf_poly_t *gcd;
-  gf_poly_t *prod_of_irred;
-  uint64_t prod_of_irred_power = ((unsigned long long) 1) << poly->w;
-  int n = poly->leading_coef->power / 2;
-  int i;
-  int ret = 0;
-  gf_poly_t *a = gf_poly_copy(poly);
-
-  prod_of_irred = gf_poly_init(a->w, a->coef_gf);
-
-
-  for (i = 1; i <= n; i++) {
-    gf_poly_add_coef(prod_of_irred, 1, prod_of_irred_power);
-    gf_poly_add_coef(prod_of_irred, 1, 1);
-  
-    gf_poly_reduce(prod_of_irred, a); 
-    
-    gcd = gf_poly_gcd(a, prod_of_irred); 
-
-    /*
-     * It is irreducible if it is not the product of 
-     * non-trivial factors (non-constant).  Therefore,
-     * the GCD of the poly and prod_of_irred should be
-     * a constant (0 or 0-degree polynomial).
-     */ 
-    if (gcd == NULL) {
-      ret = -1;
-      break;
-    } else if (gcd->leading_coef->power != 0) {
-      ret = -1;
-      break;
-    } else if (gcd->leading_coef->power == 0) {
-      ret = 0;
-      break;
+  fprintf(stderr, "usage: gf_poly w(base-field) method power:coef [ power:coef .. ]\n");
+  fprintf(stderr, "\n");
+  fprintf(stderr, "       use - for the default method.\n");
+  fprintf(stderr, "       use 0x in front of the coefficient if it's in hex\n");
+  fprintf(stderr, "       \n");
+  fprintf(stderr, "       For example, to test whether x^2 + 2x + 1 is irreducible\n");
+  fprintf(stderr, "       in GF(2^16), the call is:\n");
+  fprintf(stderr, "       \n");
+  fprintf(stderr, "       gf_poly 16 - 2:1 1:2 0:1\n");
+  fprintf(stderr, "       \n");
+  fprintf(stderr, "       See the user's manual for more information.\n");
+  if (s != NULL) {
+    fprintf(stderr, "\n");
+    if (s == BM) {
+      fprintf(stderr, "%s", s);
+      gf_error();
    } else {
-      ret = -1;
-      break;
+      fprintf(stderr, "%s\n", s);
    }
-    
-    // Need if to avoid a overflow error
-    if ((i + 1) <= n) {
-      prod_of_irred_power *= prod_of_irred_power;
+  }
+  exit(1);
+}
+
+/* gcd_one - use Euclid's algorithm to decide whether gcd(poly, prod) is one.
+
+   poly has n+1 coefficients and prod has n coefficients; index i of each
+   array is the coefficient of x^i.  Both are duplicated into scratch arrays
+   (by adding zero to each coefficient), so the caller's arrays are not
+   modified.
+
+   Returns 1 when a remainder turns out to be a nonzero constant, and 0 when
+   a remainder is the zero polynomial (the GCD then has degree >= 1). */
+int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod)
+{
+  gf_general_t *a, *b, zero, factor, p;
+  int i, j, da, db;
+
+  gf_general_set_zero(&zero, w);
+
+  /* a needs n+1 elements.  The parentheses are essential: without them the
+     size is n elements plus one byte, and a[n] lies outside the allocation. */
+  a = (gf_general_t *) malloc(sizeof(gf_general_t) * (n+1));
+  b = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+  for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i);
+  for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i);
+
+  da = n;
+  while (1) {
+    /* db = degree of b, or -1 when b is the zero polynomial. */
+    for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ;
+    if (db <= 0) {
+      /* Finished either way; release the scratch copies before returning. */
+      free(a);
+      free(b);
+      return (db == 0);
+    }
+    /* a = a mod b: cancel a's coefficients from the top down to degree db. */
+    for (j = da; j >= db; j--) {
+      if (!gf_general_is_zero(a+j, w)) {
+        gf_general_divide(gf, a+j, b+db, &factor);
+        for (i = 0; i <= db; i++) {
+          gf_general_multiply(gf, b+i, &factor, &p); 
+          gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db));
+        }
+      }
+    }
+    /* Swap a and b for the next round.  Only the low n entries are exchanged;
+       a[n] was cancelled by the reduction above. */
+    for (i = 0; i < n; i++) {
+      gf_general_add(gf, a+i, &zero, &p);
+      gf_general_add(gf, b+i, &zero, a+i);
+      gf_general_add(gf, &p, &zero, b+i);
    }
-    gf_poly_clear(prod_of_irred);
  }

-  gf_poly_free(&a);
-
-  return ret;
 }

-int is_suitible_s(int w, gf_t *gf, uint64_t s)
+/* x_to_q_to_i_minus_x - build the polynomial that is later handed to
+   gcd_one() together with poly.
+
+   poly holds n+1 coefficients (index k is the coefficient of x^k); the
+   reduction loops below cancel a high coefficient by adding factor * poly,
+   which only works when poly[n] is one.  retval receives n coefficients.
+
+   Steps: (1) start with x and square it logq times modulo poly;
+          (2) start with 1 and multiply it by the result of (1) i times,
+              again modulo poly;
+          (3) add one to the x^1 coefficient of the result.
+
+   NOTE(review): step (2) yields (x^(2^logq))^i = x^(i * 2^logq), whereas the
+   file header describes x^(2^w)^i.  The two agree for i == 1 -- confirm the
+   intended value for i > 1.
+
+   NOTE(review): x_to_q+1 and retval+1 are written unconditionally, so n >= 2
+   is presumably required -- confirm against the callers. */
+void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval)
 {
-  uint64_t num_elems = ((unsigned long long) 1) << w;
-  uint64_t i = 2;
-  uint64_t i_inv;
+  gf_general_t x;
+  gf_general_t *x_to_q;
+  gf_general_t *product;
+  gf_general_t p, zero, factor;
+  int j, k, lq;
+  char buf[20];

-  for (; i < num_elems; i++) {
-    i_inv = gf_inverse(w, gf, i);
-    if ((i ^ i_inv) == s) {
-      fprintf(stderr, "Bailed on %llu ^ %llu = %llu\n", LLUI i, LLUI i_inv, LLUI s);
-      return -1;
+  gf_general_set_zero(&zero, w);
+  product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2);
+  x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+  /* x_to_q starts out as the polynomial x. */
+  for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w);
+  gf_general_set_one(x_to_q+1, w);
+
+  /* Step 1: square x_to_q logq times, reducing modulo poly each time. */
+  for (lq = 0; lq < logq; lq++) {
+    for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+    for (j = 0; j < n; j++) {
+      for (k = 0; k < n; k++) {
+        gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p);
+        gf_general_add(gf, product+(j+k), &p, product+(j+k));
+      }
    }
-    if (i % 1000000000 == 0) fprintf(stderr, "Processed %llu\n", LLUI i);
+    /* Reduce: cancel each coefficient of degree >= n by adding factor * x^(j-n) * poly. */
+    for (j = n*2-1; j >= n; j--) {
+      if (!gf_general_is_zero(product+j, w)) {
+        gf_general_add(gf, product+j, &zero, &factor);
+        for (k = 0; k <= n; k++) {
+          gf_general_multiply(gf, poly+k, &factor, &p);
+          gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+        }
+      }
+    }
+    for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j);
+  }
+  /* retval starts out as the constant polynomial 1. */
+  for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w);
+  gf_general_set_one(retval, w);
+
+  /* Step 2: multiply retval by x_to_q, i times, reducing modulo poly each time. */
+  while (i > 0) {
+    for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+    for (j = 0; j < n; j++) {
+      for (k = 0; k < n; k++) {
+        gf_general_multiply(gf, x_to_q+j, retval+k, &p);
+        gf_general_add(gf, product+(j+k), &p, product+(j+k));
+      }
+    }
+    for (j = n*2-1; j >= n; j--) {
+      if (!gf_general_is_zero(product+j, w)) {
+        gf_general_add(gf, product+j, &zero, &factor);
+        for (k = 0; k <= n; k++) {
+          gf_general_multiply(gf, poly+k, &factor, &p);
+          gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+        }
+      }
+    }
+    for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j);
+    i--;
  }

-  return 0;
+  /* Step 3: add one to the coefficient of x^1. */
+  gf_general_set_one(&x, w);
+  gf_general_add(gf, &x, retval+1, retval+1);
+
+  free(product);
+  free(x_to_q);
 }

-static void
-usage(char *cmd)
-{
-  fprintf(stderr, "%s w <GF args> S <s value>\n", cmd);
-  fprintf(stderr, "\t will build a trinomial x^2+S*x+1\n");
-  fprintf(stderr, "OR\n");
-  fprintf(stderr, "%s w <GF args> G coef1,power1 <coef2,power2> ... <coefn,powern>\n", cmd);
-  fprintf(stderr, "\t will build a polynomial coef1^(power1) + ... + coefn^(powern)\n");
-  fprintf(stderr, "Example: ./gf_poly 8 - - - G 1,2 2,1 1,0\n");
-  fprintf(stderr, "\t will build a polynomial x^2+2*x+1 with coefs from GF(2^8)\n");
-}
-
-/*
- * Find irred poly of form x^2+sx+1
- * a_n*x^n + a_(n-1)*x^(n-1) + ...
- *
- * Terms are specified as: a_i,i a_j,j, ... where 
- * i is the degree of the term and a_i is the coef
- *
- */
+/* gf_poly w method power:coef [ power:coef .. ]
+
+   Parses the polynomial given on the command line, prints it, and then runs
+   the Ben-Or test: for i = 1 .. n/2 it builds the test polynomial with
+   x_to_q_to_i_minus_x() and reports "Reducible." as soon as gcd_one() returns
+   0.  If every round passes it prints "Irreducible.".  Non-monic polynomials
+   are rejected before the test. */
 int main(int argc, char **argv)
 {
+  int w, i, power, n, ap, success, j;
  gf_t gf;
-  int ret;
-  int w;
-  int i;
-  uint64_t irred_coef_s;
-  gf_poly_t *irred_poly;
-  char *term;
+  gf_general_t *poly, *prod;
+  char *string, *ptr;
+  char buf[100];

-  bzero(&gf, sizeof(gf_t)); 
+  if (argc < 4) usage(NULL);

-  if (argc < 4) {
-    usage(argv[0]);
-    return -1;
-  }
-  
-  w = atoi(argv[1]);
-  
-  ret = create_gf_from_argv(&gf, w, argc, argv, 3);
+  if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w.");
+  ap = create_gf_from_argv(&gf, w, argc, argv, 2);

-  if (ret <= 0) {
-    fprintf(stderr, "Could not create a GF\n");
-    return -1;
-  }
-    
-  irred_poly = gf_poly_init(w, &gf);
+  if (ap == 0) usage(BM);

-  i = ret + 1;
+  if (ap == argc) usage("No powers/coefficients given.");

-  if (strlen(argv[i]) > 1) {
-    usage(argv[0]); 
-    exit(1);
-  }
-
-  if (argv[i][0] == 'S') {
-    i++;
-    irred_coef_s = (uint64_t)strtoull(argv[i], NULL, 10);
-  
-    /*
-     * If this is a trinomial of the form x^2+s*x+1, then
-     * we can do a quick pre-check to see if this may be
-     * an irreducible polynomial.
-     */
-    if (is_suitible_s(w, &gf, irred_coef_s) < 0) {
-      fprintf(stderr, "%llu is not a suitable coeffient!\n", LLUI irred_coef_s);
-      return -1;
-    } else {
-      fprintf(stderr, "%llu IS A suitable coeffient!\n", LLUI irred_coef_s);
+  /* First pass: validate every power:coef argument and find the degree n. */
+  n = -1;
+  for (i = ap; i < argc; i++) {
+    if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      /* Room for the argument plus the fixed message text.  The +100 must be
+         outside strlen(): strlen(argv[i]+100) reads past the end of the argument. */
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
    }
+    if (power < 0) usage("Can't have negative powers\n");
+    if (power > n) n = power;
+  }

+  poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+  for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+  prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);

-    gf_poly_add_coef(irred_poly, 1, 2);
-    gf_poly_add_coef(irred_poly, irred_coef_s, 1);
-    gf_poly_add_coef(irred_poly, 1, 0);
+  /* Second pass: store each coefficient; a leading "0x" selects hexadecimal. */
+  for (i = ap; i < argc; i++) {
+    sscanf(argv[i], "%d:", &power);
+    ptr = strchr(argv[i], ':');
+    ptr++;
+    if (strncmp(ptr, "0x", 2) == 0) {
+      success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
+    } else {
+      success = gf_general_s_to_val(poly+power, w, ptr, 0);
+    }
+    if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+  }

-  } else if (argv[i][0] == 'G') {
-    term = argv[++i];
-
-
-    while (term != NULL) {
-      uint64_t coef = strtoull(strtok(term, ","), NULL, 10);
-      uint64_t power = strtoull(strtok(NULL, ","), NULL, 10);
-    
-      gf_poly_add_coef(irred_poly, coef, power);
-    
-      if (i < argc) {
-        term = argv[++i];
+  /* Echo the polynomial, highest power first, skipping zero coefficients. */
+  printf("Poly:");
+  for (power = n; power >= 0; power--) {
+    if (!gf_general_is_zero(poly+power, w)) {
+      printf("%s", (power == n) ? " " : " + ");
+      if (!gf_general_is_one(poly+power, w)) {
+        gf_general_val_to_s(poly+power, w, buf, 1);
+        if (n > 0) {
+          printf("(0x%s)", buf);
+        } else {
+          printf("0x%s", buf);
+        }
+      }
+      if (power == 0) {
+        if (gf_general_is_one(poly+power, w)) printf("1");
+      } else if (power == 1) {
+        printf("x");
      } else {
-        break;
+        printf("x^%d", power);
      }
    }
-  } else {
-    usage(argv[0]);
-    exit(1);
+  }
+  printf("\n");
+
+  if (!gf_general_is_one(poly+n, w)) {
+    printf("\n");
+    printf("Can't do Ben-Or, because the polynomial is not monic.\n");
+    exit(0);
+  }
+
+  for (i = 1; i <= n/2; i++) {
+    x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod); 
+    if (!gcd_one(&gf, w, n, poly, prod)) {
+      printf("Reducible.\n");
+      exit(0);
+    }
  }
  
-  gf_poly_print(irred_poly, " specified via the command line\n");
-
-  ret = gf_poly_is_irred(irred_poly);
-
-  if (ret < 0) {
-    gf_poly_print(irred_poly, " IS NOT irreducible\n");
-  } else {
-    gf_poly_print(irred_poly, " IS irreducible\n");
-  }
-
-  return 0;
+  printf("Irreducible.\n");
+  exit(0);
 }
--- a/gf_time.c
+++ b/gf_time.c
@ -9,7 +9,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
-#include <time.h>
+#include <sys/time.h>

 #include "gf_complete.h"
 #include "gf_method.h"
@ -43,10 +43,14 @@ void problem(char *s)
  exit(1);
 }

+char *BM = "Bad Method: ";
+
 void usage(char *s)
 {
  fprintf(stderr, "usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n");
  fprintf(stderr, "\n");
+  fprintf(stderr, "does unit testing in GF(2^w)\n");
+  fprintf(stderr, "\n");
  fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n");
  fprintf(stderr, "\n");
  fprintf(stderr, "Tests may be any combination of:\n");
@ -63,9 +67,12 @@ void usage(char *s)
  fprintf(stderr, "\n");
  fprintf(stderr, "Use -1 for time(0) as a seed.\n");
  fprintf(stderr, "\n");
-  fprintf(stderr, "For method specification, type gf_methods\n");
-  fprintf(stderr, "\n");
-  if (s != NULL) fprintf(stderr, "%s\n", s);
+  if (s == BM) {
+    fprintf(stderr, "%s", BM);
+    gf_error();
+  } else if (s != NULL) {
+    fprintf(stderr, "%s\n", s);
+  }
  exit(1);
 }

@ -84,9 +91,15 @@ int main(int argc, char **argv)
  time_t t0;
  uint8_t *ra, *rb;
  gf_general_t a;
+
  
  if (argc < 6) usage(NULL);
-  if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n");
+  
+  if (sscanf(argv[1], "%d", &w) == 0){
+    usage("Bad w[-pp]\n");
+  }
+
+  
  if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n");
  if (sscanf(argv[4], "%d", &size) == 0) usage("Bad size\n");
  if (sscanf(argv[5], "%d", &iterations) == 0) usage("Bad iterations\n");
@ -99,7 +112,7 @@ int main(int argc, char **argv)
  if ((w > 32 && w != 64 && w != 128) || w < 0) usage("Bad w");
  if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n");
  
-  if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage("Bad Method");
+  if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM);

  strcpy(tests, "");
  for (i = 0; i < argv[2][i] != '\0'; i++) {
--- a/gf_unit.c
+++ b/gf_unit.c
@ -10,6 +10,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <time.h>
+#include <signal.h>

 #include "gf_complete.h"
 #include "gf_int.h"
@ -18,6 +19,8 @@
 #include "gf_general.h"

 #define REGION_SIZE (16384) 
+#define RMASK (0x00000000ffffffffLL)
+#define LMASK (0xffffffff00000000LL)

 void problem(char *s)
 {
@ -26,11 +29,14 @@ void problem(char *s)
  exit(1);
 }

+char *BM = "Bad Method: ";
+
 void usage(char *s)
 {
  fprintf(stderr, "usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n");
-  fprintf(stderr, "\n");
+  fprintf(stderr, "\n");    
  fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n");
+  fprintf(stderr, "           128 is hex only (i.e. '128' will be an error - do '128h')\n");
  fprintf(stderr, "\n");
  fprintf(stderr, "Tests may be any combination of:\n");
  fprintf(stderr, "       A: All\n");
@ -40,16 +46,28 @@ void usage(char *s)
  fprintf(stderr, "\n");
  fprintf(stderr, "Use -1 for time(0) as a seed.\n");
  fprintf(stderr, "\n");
-  fprintf(stderr, "For method specification, type gf_methods\n");
-  fprintf(stderr, "\n");
-  if (s != NULL) fprintf(stderr, "%s\n", s);
+  if (s == BM) {
+    fprintf(stderr, "%s", BM);
+    gf_error();
+  } else if (s != NULL) {
+    fprintf(stderr, "%s\n", s);
+  }
  exit(1);
 }

+/* Handler installed for SIGSEGV: writes a fixed message to stderr, flushes
+   stdout, and terminates with exit status 2.  The signal number is unused. */
+void SigHandler(int v)
+{
+  (void) v;
+  fputs("Problem: SegFault!\n", stderr);
+  fflush(stdout);
+  exit(2);
+}
+
 int main(int argc, char **argv)
 {
+  signal(SIGSEGV, SigHandler);
+
  int w, i, verbose, single, region, tested, top;
-  int start, end, xor;
+  int s_start, d_start, bytes, xor, alignment_test;
  gf_t   gf, gf_def;
  time_t t0;
  gf_internal_t *h;
@ -61,15 +79,21 @@ int main(int argc, char **argv)
  char *ra, *rb, *rc, *rd, *target;
  int align;

+
  if (argc < 4) usage(NULL);
-  if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n");
+
+  if (sscanf(argv[1], "%d", &w) == 0){
+    usage("Bad w\n");
+  }
+
  if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n");
  if (t0 == -1) t0 = time(0);
  MOA_Seed(t0);

  if (w > 32 && w != 64 && w != 128) usage("Bad w");
  
-  if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("Bad Method");
+  if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage(BM);
+  printf("Size (bytes): %d\n", gf_size(&gf));

  for (i = 0; i < strlen(argv[2]); i++) {
    if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n");
@ -83,10 +107,18 @@ int main(int argc, char **argv)
  ai = (gf_general_t *) malloc(sizeof(gf_general_t));
  bi = (gf_general_t *) malloc(sizeof(gf_general_t));

-  ra = (char *) malloc(sizeof(char)*REGION_SIZE);
-  rb = (char *) malloc(sizeof(char)*REGION_SIZE);
-  rc = (char *) malloc(sizeof(char)*REGION_SIZE);
-  rd = (char *) malloc(sizeof(char)*REGION_SIZE);
+  //15 bytes extra to make sure it's 16byte aligned
+  ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+
+  //this still assumes 8 byte aligned pointer from malloc
+  //(which is usual on 32-bit machines)
+  ra += (uint64_t)ra & 0xf;
+  rb += (uint64_t)rb & 0xf;
+  rc += (uint64_t)rc & 0xf;
+  rd += (uint64_t)rd & 0xf;

  if (w <= 32) {
    mask = 0;
@ -97,8 +129,9 @@ int main(int argc, char **argv)
  single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL);
  region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL);

-  if (!gf_init_easy(&gf_def, w)) problem("No default for this value of w");
-  
+  if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+      (h->mult_type != GF_MULT_COMPOSITE) ? h->prim_poly : 0, 0, 0, NULL, NULL))
+    problem("No default for this value of w");
  if (w == 4) {
    mult4 = gf_w4_get_mult_table(&gf);
    div4 = gf_w4_get_div_table(&gf);
@ -129,21 +162,71 @@ int main(int argc, char **argv)
      if (w <= 10) {
        a->w32 = i % (1 << w);
        b->w32 = (i >> w);
-      } else if (i < 10) {
-        gf_general_set_zero(a, w);
-        gf_general_set_random(b, w, 1);
-      } else if (i < 20) {
-        gf_general_set_random(a, w, 1);
-        gf_general_set_zero(b, w);
-      } else if (i < 30) {
-        gf_general_set_one(a, w);
-        gf_general_set_random(b, w, 1);
-      } else if (i < 40) {
-        gf_general_set_random(a, w, 1);
-        gf_general_set_one(b, w);
+
+      //Allen: the following conditions were being run 10 times each. That didn't seem like nearly enough to
+      //me for these special cases, so I converted to doing this mod stuff to easily make the number of times
+      //run both larger and proportional to the total size of the run.
      } else {
-        gf_general_set_random(a, w, 1);
-        gf_general_set_random(b, w, 1);
+        switch (i % 32)
+        {
+          case 0: 
+            gf_general_set_zero(a, w);
+            gf_general_set_random(b, w, 1);
+            break;
+          case 1:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_zero(b, w);
+            break;
+          case 2:
+            gf_general_set_one(a, w);
+            gf_general_set_random(b, w, 1);
+            break;
+          case 3:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_one(b, w);
+            break;
+          default:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_random(b, w, 1);
+        }
+      }
+
+      //Allen: the following special cases for w=64 are based on the code below for w=128.
+      //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64
+      //involve splitting it in two. I think they're less likely to give errors than the 128-bit case
+      //though, because the 128 bit case is always split in two.
+      //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+      if (w == 64) {
+        switch (i % 32)
+        {
+          case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break;
+          case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break;
+          case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+          case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+          case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        }
+      }
+
+      //Allen: for w=128, we have important special cases where one half or the other of the number is all
+      //zeros. The probability of hitting such a number randomly is 2^-64, so if we don't force these cases
+      //we'll probably never hit them. This could be implemented more efficiently by changing the set-random
+      //function for w=128, but I think this is easier to follow.
+      //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+      if (w == 128) {
+        switch (i % 32)
+        {
+          case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break;
+          case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break;
+          case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+          case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+          case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+        }
      }

      tested = 0;
@ -195,10 +278,10 @@ int main(int argc, char **argv)
        gf_general_multiply(&gf_def, a, b, d);

        if (!gf_general_are_equal(c, d, w)) {
-          gf_general_val_to_s(a, w, as);
-          gf_general_val_to_s(b, w, bs);
-          gf_general_val_to_s(c, w, cs);
-          gf_general_val_to_s(d, w, ds);
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
+          gf_general_val_to_s(d, w, ds, 1);
          printf("Error in single multiplication (all numbers in hex):\n\n");
          printf("  gf.multiply(gf, %s, %s) = %s\n", as, bs, cs);
          printf("  The default gf multiplier returned %s\n", ds);
@ -216,9 +299,9 @@ int main(int argc, char **argv)
        if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) ||
            (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) ||
            (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) {
-          gf_general_val_to_s(a, w, as);
-          gf_general_val_to_s(b, w, bs);
-          gf_general_val_to_s(c, w, cs);
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
          printf("Error in single multiplication (all numbers in hex):\n\n");
          printf("  gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs);
 ;
@ -229,9 +312,9 @@ int main(int argc, char **argv)
      /* Dumb check to make sure that it's not returning numbers that are too big: */

      if (w < 32 && (c->w32 & mask) != c->w32) {
-        gf_general_val_to_s(a, w, as);
-        gf_general_val_to_s(b, w, bs);
-        gf_general_val_to_s(c, w, cs);
+        gf_general_val_to_s(a, w, as, 1);
+        gf_general_val_to_s(b, w, bs, 1);
+        gf_general_val_to_s(c, w, cs, 1);
        printf("Error in single multiplication (all numbers in hex):\n\n");
        printf("  gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs);
        exit(1);
@ -242,10 +325,10 @@ int main(int argc, char **argv)
      if (!gf_general_is_zero(a, w)) {
        gf_general_divide(&gf, c, a, d);
        if (!gf_general_are_equal(b, d, w)) {
-          gf_general_val_to_s(a, w, as);
-          gf_general_val_to_s(b, w, bs);
-          gf_general_val_to_s(c, w, cs);
-          gf_general_val_to_s(d, w, ds);
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
+          gf_general_val_to_s(d, w, ds, 1);
          printf("Error in single multiplication/division (all numbers in hex):\n\n");
          printf("  gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds);
          exit(1);
@ -257,40 +340,82 @@ int main(int argc, char **argv)

  if (region) {
    if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); }
-    for (i = 0; i < 1000; i++) {
-      if (i < 20) {
-        gf_general_set_zero(a, w);
-      } else if (i < 40) {
-        gf_general_set_one(a, w);
-      } else if (i < 60) {
-        gf_general_set_two(a, w);
-      } else {
-        gf_general_set_random(a, w, 1);
+    for (i = 0; i < 1024; i++) {
+      //Allen: changing to a switch thing as with the single ops to make things proportional
+      switch (i % 32)
+      {
+        case 0:
+          gf_general_set_zero(a, w);
+          break;
+        case 1:
+          gf_general_set_one(a, w);
+          break;
+        case 2:
+          gf_general_set_two(a, w);
+          break;
+        default:
+          gf_general_set_random(a, w, 1);
      }
      MOA_Fill_Random_Region(ra, REGION_SIZE);
      MOA_Fill_Random_Region(rb, REGION_SIZE);
-      xor = i%2;
+      xor = (i/32)%2;
      align = w/8;
      if (align == 0) align = 1;
      if (align > 16) align = 16;
+
+      /* JSP - Cauchy test.  When w < 32 & it doesn't equal 4, 8 or 16, the default is
+         equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing
+         three alignments here:
+
+         1. Anything goes -- no alignment guaranteed.
+         2. Perfect alignment.  Here src and dest must be aligned wrt each other,
+            and bytes must be a multiple of 16*w.  
+         3. Imperfect alignment.  Here we'll have src and dest be aligned wrt each 
+            other, but bytes is simply a multiple of w.  That means some XOR's will
+            be aligned, and some won't.
+       */
+
      if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) {
-        start = MOA_Random_W(5, 1);
-        end = REGION_SIZE - MOA_Random_W(5, 1);
+        alignment_test = (i%3);
+        
+        s_start = MOA_Random_W(5, 1);
+        if (alignment_test == 0) {
+          d_start = MOA_Random_W(5, 1);
+        } else {
+          d_start = s_start;
+        }
+
+        bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start;
+        bytes -= MOA_Random_W(5, 1);
+        if (alignment_test == 1) {
+          bytes -= (bytes % (w*16));
+        } else {
+          bytes -= (bytes % w);
+        }
+
        target = rb;
-        while ((end-start)%w != 0) end--;
+ 
+      /* JSP - Otherwise, we're testing a non-cauchy test, and alignment
+        must be more strict.  We have to make sure that the regions are
+        aligned wrt each other on 16-byte pointers.  */
+
      } else {
-        start = MOA_Random_W(5, 1) * align;
-        end = REGION_SIZE - (MOA_Random_W(5, 1) * align);
+        s_start = MOA_Random_W(5, 1) * align;
+        d_start = s_start;
+        bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1);
+        bytes -= (bytes % align);
+
        if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
          target = rb ;
        } else {
-          target = ((i%4)/2) ? rb : ra;
+          target = (i/64)%2 ? rb : ra;
        }
      }
+
      memcpy(rc, ra, REGION_SIZE);
      memcpy(rd, target, REGION_SIZE);
-      gf_general_do_region_multiply(&gf, a, ra+start, target+start, end-start, xor);
-      gf_general_do_region_check(&gf, a, rc+start, rd+start, target+start, end-start, xor);
+      gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor);
+      gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor);
    }
  }
 }
--- a/gf_w128.c
+++ b/gf_w128.c
--- a/gf_w16.c
+++ b/gf_w16.c
--- a/gf_w32.c
+++ b/gf_w32.c
--- a/gf_w4.c
+++ b/gf_w4.c
@ -100,7 +100,6 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
  y_im1 = 0;

  while (e_i != 1) {
-
    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;
@ -108,6 +107,7 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

@ -146,6 +146,110 @@ gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
  return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
 }

+
+/* Shift-and-xor multiplication of two field elements: build the carry-less
+   product of a and b one bit of a at a time, then fold every bit at or above
+   GF_FIELD_WIDTH back down with the field's primitive polynomial.
+
+   gf - field descriptor; its scratch area supplies prim_poly.
+   a  - first operand.
+   b  - second operand.
+
+   Returns the reduced product. The accumulator is only 8 bits wide, so this
+   presumably relies on a and b being valid GF_FIELD_WIDTH-bit elements
+   (GF_FIELD_WIDTH is defined elsewhere in the file) -- confirm with callers. */
+static
+inline
+gf_val_32_t
+gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  uint8_t poly = scratch->prim_poly;
+  uint8_t acc = 0;
+  uint8_t bit;
+
+  /* Carry-less multiply: xor in a shifted copy of b for each set bit of a. */
+  for (bit = 0; bit < GF_FIELD_WIDTH; bit++) {
+    if ((a >> bit) & 1) acc ^= (b << bit);
+  }
+
+  /* Reduce from the highest possible product bit (2*width-2) down to the
+     first bit that does not fit in the field. */
+  bit = GF_FIELD_WIDTH*2-2;
+  while (bit >= GF_FIELD_WIDTH) {
+    if ((acc >> bit) & 1) acc ^= (poly << (bit-GF_FIELD_WIDTH));
+    bit--;
+  }
+
+  return acc;
+}
+
+/* Ben: This function works, but it is 33% slower than the normal shift mult
+
+   Multiplies a4 by b4 using the carry-less multiply intrinsic
+   (_mm_clmulepi64_si128) followed by a single reduction step with the
+   field's primitive polynomial.
+
+   gf - field descriptor; its scratch area supplies prim_poly.
+   a4 - first operand.
+   b4 - second operand.
+
+   Returns the reduced product. When INTEL_SSE4_PCLMUL is not defined the
+   whole computation is compiled out and the function returns 0.
+ */
+
+static
+inline
+gf_val_32_t
+gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_val_32_t rv = 0;  /* stays 0 if the PCLMUL path below is compiled out */
+
+#ifdef INTEL_SSE4_PCLMUL
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
+  b = _mm_insert_epi32 (a, b4, 0);  /* a is reused only as a base whose lane 0 is overwritten with b4 */
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));  /* keep only the low 5 bits of the polynomial */
+
+  /* Do the initial multiply: carry-less product of the low 64-bit halves
+     of a and b (selector 0). */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only
+     have to do the reduction only once, because (w-2)/z == 1. Where
+     z is equal to the number of zeros after the leading 1.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_epi64 shifts the result to the right by 4 bits. This allows
+     us to multiply the prim_poly by the leading bits of the result. We
+     then xor the result of that operation back with the result. */
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+#endif
+  return rv;
+}
+
+/* Region multiply built on top of the single-element multiply
+   (gf->multiply.w32).
+
+   Each source byte is treated as two 4-bit elements: the low nibble and the
+   high nibble are each multiplied by val and re-packed into one byte, which
+   is either stored to or xor-ed into the destination.
+
+   gf    - field descriptor providing multiply.w32.
+   src   - source region.
+   dest  - destination region.
+   val   - multiplier; 0 and 1 are delegated to gf_multby_zero and
+           gf_multby_one.
+   bytes - region length in bytes.
+   xor   - non-zero to xor the products into dest, zero to overwrite dest.
+
+   The bytes between d_start and d_top of the region descriptor are handled
+   here; the pieces outside that range are left to the
+   gf_do_initial/final_region_alignment helpers (defined elsewhere). */
+static
+void
+gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int 
+    xor)
+{
+  gf_region_data region;
+  uint8_t *in, *out, *out_end;
+  uint8_t packed;
+
+  /* Trivial multipliers need no per-nibble work. */
+  switch (val) {
+    case 0: gf_multby_zero(dest, bytes, xor); return;
+    case 1: gf_multby_one(src, dest, bytes, xor); return;
+    default: break;
+  }
+
+  gf_set_region_data(&region, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&region);
+
+  in = (uint8_t *) region.s_start;
+  out = (uint8_t *) region.d_start;
+  out_end = (uint8_t *) region.d_top;
+
+  for (; out < out_end; in++, out++) {
+    /* Multiply both nibbles of the source byte and re-pack them. */
+    packed = (gf->multiply.w32(gf, val, (*in & 0xf)) | 
+             ((gf->multiply.w32(gf, val, (*in >> 4))) << 4));
+    if (xor) {
+      *out ^= packed;
+    } else {
+      *out = packed;
+    }
+  }
+
+  gf_do_final_region_alignment(&region);
+}
+
 /* ------------------------------------------------------------
  IMPLEMENTATION: LOG_TABLE: 

@ -220,18 +324,28 @@ int gf_w4_log_init(gf_t *gf)
  h = (gf_internal_t *) gf->scratch;
  ltd = h->private;

-  ltd->log_tbl[0] = 0;
+  for (i = 0; i < GF_FIELD_SIZE; i++)
+    ltd->log_tbl[i]=0;

  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
  b = 1;
-  for (i = 0; i < GF_FIELD_SIZE-1; i++) {
-      ltd->log_tbl[b] = i;
-      ltd->antilog_tbl[i] = b;
-      ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
+  i = 0;
+  do {
+    if (ltd->log_tbl[b] != 0 && i != 0) {
+      fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
+      return 0;
+    }
+    ltd->log_tbl[b] = i;
+    ltd->antilog_tbl[i] = b;
+    ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
+    b <<= 1;
+    i++;
+    if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+  } while (b != 1);
+
+  if (i != GF_FIELD_SIZE - 1) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
  }
    
  gf->inverse.w32 = gf_w4_inverse_from_divide;
@ -300,7 +414,7 @@ static
 void 
 gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSSE3
  gf_region_data rd;
  uint8_t *base, *sptr, *dptr, *top;
  __m128i  tl, loset, h4, r, va, th;
@ -351,37 +465,17 @@ int gf_w4_single_table_init(gf_t *gf)
  gf_internal_t *h;
  struct gf_single_table_data *std;
  int a, b, prod, loga, logb;
-  uint8_t log_tbl[GF_FIELD_SIZE];
-  uint8_t antilog_tbl[GF_FIELD_SIZE*2];
-  int sse;

-  sse = 0;
-#ifdef INTEL_SSE4
-  sse = 1;
-#endif

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_single_table_data *)h->private;

-  b = 1;
-  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
-      log_tbl[b] = a;
-      antilog_tbl[a] = b;
-      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
-  }
-
  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
-    loga = log_tbl[a];
    for (b = 1; b < GF_FIELD_SIZE; b++) {
-      logb = log_tbl[b];
-      prod = antilog_tbl[loga+logb];
+      prod = gf_w4_shift_multiply(gf, a, b);
      std->mult[a][b] = prod;
      std->div[prod][b] = a;
    }
@ -390,11 +484,16 @@ int gf_w4_single_table_init(gf_t *gf)
  gf->inverse.w32 = NULL;
  gf->divide.w32 = gf_w4_single_table_divide;
  gf->multiply.w32 = gf_w4_single_table_multiply;
-  if ((h->region_type & GF_REGION_SSE) || (h->mult_type == GF_MULT_DEFAULT && sse)) {  
-    gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
-  } else {
+  #ifdef INTEL_SSSE3
+    if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY))
+      gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
+    else
+      gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
+  #else
    gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
-  }
+    if (h->region_type & GF_REGION_SSE) return 0;
+  #endif
+
  return 1;
 }

@ -458,32 +557,17 @@ int gf_w4_double_table_init(gf_t *gf)
  gf_internal_t *h;
  struct gf_double_table_data *std;
  int a, b, c, prod, loga, logb, ab;
-  uint8_t log_tbl[GF_FIELD_SIZE];
-  uint8_t antilog_tbl[GF_FIELD_SIZE*2];
  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_double_table_data *)h->private;

-  b = 1;
-  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
-      log_tbl[b] = a;
-      antilog_tbl[a] = b;
-      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
-  }
-
  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
-    loga = log_tbl[a];
    for (b = 1; b < GF_FIELD_SIZE; b++) {
-      logb = log_tbl[b];
-      prod = antilog_tbl[loga+logb];
+      prod = gf_w4_shift_multiply(gf, a, b);
      mult[a][b] = prod;
      std->div[prod][b] = a;
    }
@ -600,32 +684,17 @@ int gf_w4_quad_table_init(gf_t *gf)
  gf_internal_t *h;
  struct gf_quad_table_data *std;
  int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd;
-  uint8_t log_tbl[GF_FIELD_SIZE];
-  uint8_t antilog_tbl[GF_FIELD_SIZE*2];
  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];

  h = (gf_internal_t *) gf->scratch;
  std = (struct gf_quad_table_data *)h->private;

-  b = 1;
-  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
-      log_tbl[b] = a;
-      antilog_tbl[a] = b;
-      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
-  }
-
  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
-    loga = log_tbl[a];
    for (b = 1; b < GF_FIELD_SIZE; b++) {
-      logb = log_tbl[b];
-      prod = antilog_tbl[loga+logb];
+      prod = gf_w4_shift_multiply(gf, a, b);
      mult[a][b] = prod;
      std->div[prod][b] = a;
    }
@ -702,13 +771,18 @@ int gf_w4_table_init(gf_t *gf)
 {
  int rt;
  gf_internal_t *h;
+  int issse3 = 0;
+
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#endif

  h = (gf_internal_t *) gf->scratch;
  rt = (h->region_type);
-  if (rt == 0 || rt == GF_REGION_CAUCHY) rt |= GF_REGION_SINGLE_TABLE;
-  if (rt & GF_REGION_SINGLE_TABLE) {
-    return gf_w4_single_table_init(gf);
-  } else if (rt & GF_REGION_DOUBLE_TABLE) {
+
+  if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE;
+
+  if (rt & GF_REGION_DOUBLE_TABLE) {
    return gf_w4_double_table_init(gf);
  } else if (rt & GF_REGION_QUAD_TABLE) {
    if (rt & GF_REGION_LAZY) {
@ -717,7 +791,9 @@ int gf_w4_table_init(gf_t *gf)
      return gf_w4_quad_table_init(gf);
    }
    return gf_w4_double_table_init(gf);
-  } 
+  } else {
+    return gf_w4_single_table_init(gf);
+  }
  return 0;
 }

@ -842,7 +918,7 @@ static
 void
 gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *s8, *d8;
  uint8_t vrev;
@ -895,7 +971,7 @@ static
 void 
 gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_bytwo_data *btd;
@ -960,7 +1036,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -986,7 +1062,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1014,7 +1090,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1041,7 +1117,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1071,7 +1147,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1099,7 +1175,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1127,7 +1203,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1156,7 +1232,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1185,7 +1261,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1215,7 +1291,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1245,7 +1321,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1274,7 +1350,7 @@ static
 void
 gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  int i;
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
@ -1303,7 +1379,7 @@ static
 void 
 gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
 {
-#ifdef   INTEL_SSE4
+#ifdef INTEL_SSE2
  uint8_t *d8, *s8, tb;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_bytwo_data *btd;
@ -1853,114 +1929,107 @@ int gf_w4_bytwo_init(gf_t *gf)

  if (h->mult_type == GF_MULT_BYTWO_p) {
    gf->multiply.w32 = gf_w4_bytwo_p_multiply;
-    if (h->region_type == GF_REGION_SSE) {
-      gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
-    } else {
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSSE)
+        gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
+    #else
      gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
-    }
+      if (h->region_type & GF_REGION_SSE)
+        return 0;
+    #endif
  } else {
    gf->multiply.w32 = gf_w4_bytwo_b_multiply;
-    if (h->region_type == GF_REGION_SSE) {
-      gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
-    } else {
+    #ifdef INTEL_SSE2
+      if (h->region_type & GF_REGION_NOSSE)
+        gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
+      else
+        gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
+    #else
      gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
-    }
+      if (h->region_type & GF_REGION_SSE)
+        return 0;
+    #endif
  }
-  gf->inverse.w32 = gf_w4_euclid;
  return 1;
 }


-/* ------------------------------------------------------------
-   JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
-   include it for completeness.  It does have the feature that it requires no
-   extra memory.  
-*/
-
-static
-inline
-gf_val_32_t
-gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+static  /* Initializer for GF_MULT_CARRY_FREE: installs gf_w4_clm_multiply as
+           the single-element multiply and returns 1 when built with
+           INTEL_SSE4_PCLMUL; returns 0 otherwise. */
+int gf_w4_cfm_init(gf_t *gf)
 {
-  uint8_t product, i, pp;
  gf_internal_t *h;
-  
+  /* NOTE(review): h is loaded below but never read in this function. */
  h = (gf_internal_t *) gf->scratch;
-  pp = h->prim_poly;

-  product = 0;
-
-  for (i = 0; i < GF_FIELD_WIDTH; i++) { 
-    if (a & (1 << i)) product ^= (b << i);
-  }
-  for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) {
-    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); 
-  }
-  return product;
+#ifdef INTEL_SSE4_PCLMUL
+  gf->multiply.w32 = gf_w4_clm_multiply;
+  return 1;
+#endif
+  return 0;  /* reached only when INTEL_SSE4_PCLMUL is undefined: carry-free multiply unavailable */
 }

 static 
 int gf_w4_shift_init(gf_t *gf)
 {
  gf->multiply.w32 = gf_w4_shift_multiply;
-  gf->inverse.w32 = gf_w4_euclid;
  return 1;
 }

+/* JSP: I'm putting all error-checking into gf_error_check(), so you don't 
+   have to do error checking in scratch_size or in init
+
+   gf_w4_scratch_size returns the number of bytes of scratch space needed
+   for the requested configuration: sizeof(gf_internal_t) plus the private
+   data structure of the selected method. The table and log-table methods
+   add 64 extra bytes (presumably alignment slack -- confirm). An
+   unrecognized mult_type yields 0.
+
+   mult_type   - GF_MULT_* selector that picks the switch arm.
+   region_type - GF_REGION_* flags; only consulted for DEFAULT/TABLE.
+   divide_type, arg1, arg2 - not consulted by this version.
+
+   NOTE(review): region_tbl_size and ss are declared but no longer used
+   after this change.
+   NOTE(review): for DEFAULT/TABLE the GF_REGION_CAUCHY early return yields
+   the single-table size before the no-SSSE3 default override is applied,
+   while gf_w4_table_init ORs in GF_REGION_DOUBLE_TABLE regardless of
+   CAUCHY -- confirm the two agree for that combination. */
+
 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
 {
  int region_tbl_size;
-  int sss;
  int ss;
+  int issse3 = 0;

-  sss = (GF_REGION_SINGLE_TABLE | GF_REGION_SSE | GF_REGION_NOSSE);
-  ss = (GF_REGION_SSE | GF_REGION_NOSSE);
+#ifdef INTEL_SSSE3
+  issse3 = 1;
+#endif

  switch(mult_type)
  {
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
-      if (arg1 != 0 || arg2 != 0) return -1;
-      if (region_type != GF_REGION_CAUCHY) {
-        if ((region_type | ss) != ss || (region_type & ss) == ss) return -1;
-      }
      return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
      break;
    case GF_MULT_DEFAULT:
    case GF_MULT_TABLE:
-      if (arg1 != 0 || arg2 != 0) return -1;
-      if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) {
+      if (region_type == GF_REGION_CAUCHY) {
        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
      }
-      if (mult_type == GF_MULT_DEFAULT || region_type == 0) region_type = GF_REGION_SINGLE_TABLE;
-      if (region_type & GF_REGION_SINGLE_TABLE) {
-        if ((region_type | sss) != sss) return -1;
-        if ((region_type & sss) == sss) return -1;
-        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
-      } else if (region_type & GF_REGION_DOUBLE_TABLE) {
-        if (region_type != GF_REGION_DOUBLE_TABLE) return -1;
+      /* Without SSSE3 the default method is sized as the double table. */
+      if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE;
+
+      if (region_type & GF_REGION_DOUBLE_TABLE) {
        return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
      } else if (region_type & GF_REGION_QUAD_TABLE) {
-        if ((region_type | GF_REGION_LAZY) != (GF_REGION_QUAD_TABLE | GF_REGION_LAZY)) return -1;
        if ((region_type & GF_REGION_LAZY) == 0) {
          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
        } else {
          return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
        }
+      } else {
+        return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
      }
-      return -1;  
      break;
+
    case GF_MULT_LOG_TABLE:
-      if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1;
      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
      break;
+    case GF_MULT_CARRY_FREE:
+      return sizeof(gf_internal_t);
+      break;
    case GF_MULT_SHIFT:
-      if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1;
      return sizeof(gf_internal_t);
      break;
    default:
-      return -1;
+      return 0;  /* unknown mult_type: reported as 0 (the removed line returned -1) */
   }
+  return 0;  /* not reached: every switch arm returns; presumably kept to silence missing-return warnings */
 }

 int
@ -1970,7 +2039,7 @@ gf_w4_init (gf_t *gf)

  h = (gf_internal_t *) gf->scratch;
  if (h->prim_poly == 0) h->prim_poly = 0x13;
-
+  h->prim_poly |= 0x10;
  gf->multiply.w32 = NULL;
  gf->divide.w32 = NULL;
  gf->inverse.w32 = NULL;
@ -1978,13 +2047,13 @@ gf_w4_init (gf_t *gf)
  gf->extract_word.w32 = gf_w4_extract_word;

  switch(h->mult_type) {
-    case GF_MULT_SHIFT:     if (gf_w4_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:      if (gf_w4_shift_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:   
-    case GF_MULT_BYTWO_b:   
-      if (gf_w4_bytwo_init(gf) == 0) return 0; break;
-    case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_b:    if (gf_w4_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE:  if (gf_w4_log_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:   
-    case GF_MULT_TABLE:     if (gf_w4_table_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:      if (gf_w4_table_init(gf) == 0) return 0; break;
    default: return 0;
  }

@ -1996,17 +2065,22 @@ gf_w4_init (gf_t *gf)
    gf->inverse.w32 = gf_w4_matrix;
  }

-  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+  if (gf->divide.w32 == NULL) {
    gf->divide.w32 = gf_w4_divide_from_inverse;
+    if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid;
  }
-  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
-    gf->inverse.w32 = gf_w4_inverse_from_divide;
-  }
+
+  if (gf->inverse.w32 == NULL)  gf->inverse.w32 = gf_w4_inverse_from_divide;

  if (h->region_type == GF_REGION_CAUCHY) {
    gf->multiply_region.w32 = gf_wgen_cauchy_region;
    gf->extract_word.w32 = gf_wgen_extract_word;
  }
+
+  if (gf->multiply_region.w32 == NULL) {
+    gf->multiply_region.w32 = gf_w4_multiply_region_from_single;
+  }
+
  return 1;
 }

--- a/gf_w64.c
+++ b/gf_w64.c
--- a/gf_w8.c
+++ b/gf_w8.c
--- a/gf_wgen.c
+++ b/gf_wgen.c
@ -93,6 +93,7 @@ gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b)
    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

@ -223,7 +224,7 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  pp = h->prim_poly;

  prod = 0;
-  pmask = (1 << (h->w)-1);
+  pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/
  amask = pmask;

  while (amask != 0) {
@ -508,16 +509,11 @@ int gf_wgen_table_8_init(gf_t *gf)
  }
    
  for (a = 1; a < (1 << w); a++) {
-    b = 1;
-    p = a;
-    do {
+    for (b = 1; b < (1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
      std->mult[(a<<w)|b] = p;
-      std->div[(p<<w)|b] = a;
-      b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1);
-      b &= ((1 << w)-1);
-      p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1);
-      p &= ((1 << w)-1);
-    } while (b != 1);
+      std->div[(p<<w)|a] = b;
+    }
  }

  gf->multiply.w32 = gf_wgen_table_8_multiply;
@ -572,18 +568,13 @@ int gf_wgen_table_16_init(gf_t *gf)
    std->div[a] = 0;
    std->div[a<<w] = 0;
  }
-    
+  
  for (a = 1; a < (1 << w); a++) {
-    b = 1;
-    p = a;
-    do {
+    for (b = 1; b < (1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
      std->mult[(a<<w)|b] = p;
-      std->div[(p<<w)|b] = a;
-      b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1);
-      b &= ((1 << w)-1);
-      p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1);
-      p &= ((1 << w)-1);
-    } while (b != 1);
+      std->div[(p<<w)|a] = b;
+    }
  }

  gf->multiply.w32 = gf_wgen_table_16_multiply;
@ -599,6 +590,11 @@ int gf_wgen_table_init(gf_t *gf)
  h = (gf_internal_t *) gf->scratch;
  if (h->w <= 8) return gf_wgen_table_8_init(gf);
  if (h->w <= 14) return gf_wgen_table_16_init(gf);
+
+  /* Returning zero to make the compiler happy, but this won't get 
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
 }

 static
@ -640,6 +636,7 @@ int gf_wgen_log_8_init(gf_t *gf)
  struct gf_wgen_log_w8_data *std;
  int w;
  uint32_t a, i;
+  int check = 0;
  
  h = (gf_internal_t *) gf->scratch;
  w = h->w;
@ -649,17 +646,27 @@ int gf_wgen_log_8_init(gf_t *gf)
  std->anti = std->log + (1<<h->w);
  std->danti = std->anti + (1<<h->w)-1;
  
-  i = 0;
+  for (i = 0; i < (1 << w); i++)
+    std->log[i] = 0;
+
  a = 1;
-  do {
+  for(i=0; i < (1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
    std->log[a] = i;
    std->anti[i] = a;
    std->danti[i] = a;
-    i++;
-    a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
-    a &= ((1 << w)-1);
-  } while (a != 1);
-  
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
  gf->multiply.w32 = gf_wgen_log_8_multiply;
  gf->divide.w32 = gf_wgen_log_8_divide;
  return 1;
@ -704,6 +711,7 @@ int gf_wgen_log_16_init(gf_t *gf)
  struct gf_wgen_log_w16_data *std;
  int w;
  uint32_t a, i;
+  int check = 0;
  
  h = (gf_internal_t *) gf->scratch;
  w = h->w;
@ -712,17 +720,28 @@ int gf_wgen_log_16_init(gf_t *gf)
  std->log = &(std->base);
  std->anti = std->log + (1<<h->w);
  std->danti = std->anti + (1<<h->w)-1;
-  
-  i = 0;
+ 
+  for (i = 0; i < (1 << w); i++)
+    std->log[i] = 0;
+
  a = 1;
-  do {
+  for(i=0; i < (1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
    std->log[a] = i;
    std->anti[i] = a;
    std->danti[i] = a;
-    i++;
-    a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
-    a &= ((1 << w)-1);
-  } while (a != 1);
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf);
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
  
  gf->multiply.w32 = gf_wgen_log_16_multiply;
  gf->divide.w32 = gf_wgen_log_16_divide;
@ -768,7 +787,8 @@ int gf_wgen_log_32_init(gf_t *gf)
  struct gf_wgen_log_w32_data *std;
  int w;
  uint32_t a, i;
-  
+  int check = 0;
+
  h = (gf_internal_t *) gf->scratch;
  w = h->w;
  std = (struct gf_wgen_log_w32_data *) h->private;
@ -777,17 +797,27 @@ int gf_wgen_log_32_init(gf_t *gf)
  std->anti = std->log + (1<<h->w);
  std->danti = std->anti + (1<<h->w)-1;
  
-  i = 0;
+  for (i = 0; i < (1 << w); i++)
+    std->log[i] = 0;
+
  a = 1;
-  do {
+  for(i=0; i < (1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
    std->log[a] = i;
    std->anti[i] = a;
    std->danti[i] = a;
-    i++;
-    a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1);
-    a &= ((1 << w)-1);
-  } while (a != 1);
-  
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
  gf->multiply.w32 = gf_wgen_log_32_multiply;
  gf->divide.w32 = gf_wgen_log_32_divide;
  return 1;
@ -802,15 +832,16 @@ int gf_wgen_log_init(gf_t *gf)
  if (h->w <= 8) return gf_wgen_log_8_init(gf);
  if (h->w <= 16) return gf_wgen_log_16_init(gf);
  if (h->w <= 32) return gf_wgen_log_32_init(gf); 
+
+  /* Returning zero to make the compiler happy, but this won't get 
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
 }

 int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2)
 {

-  if (w > 32 || w < 0) return -1;
-
-  if ((region_type | GF_REGION_CAUCHY) != GF_REGION_CAUCHY) return -1;
-
  switch(mult_type)
  {
    case GF_MULT_DEFAULT: 
@ -828,40 +859,37 @@ int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type,
    case GF_MULT_SHIFT:
    case GF_MULT_BYTWO_b:
    case GF_MULT_BYTWO_p:
-      if (arg1 != 0 || arg2 != 0) return -1;
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_GROUP:
-      if (arg1 <= 0 || arg2 <= 0) return -1;
      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
               sizeof(uint32_t) * (1 << arg1) +
               sizeof(uint32_t) * (1 << arg2) + 64;
      break;

    case GF_MULT_TABLE: 
-      if (arg1 != 0 || arg2 != 0) return -1;
      if (w <= 8) {
        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
               sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
      } else if (w < 15) {
        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w16_data) +
               sizeof(uint16_t)*(1 << w)*(1<<w)*2 + 64;
-      } else return -1;
+      } 
+      return 0;
    case GF_MULT_LOG_TABLE: 
-      if (arg1 != 0 || arg2 != 0) return -1;
      if (w <= 8) {
        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w8_data) +
               sizeof(uint8_t)*(1 << w)*3;
      } else if (w <= 16) {
        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
               sizeof(uint16_t)*(1 << w)*3;
-      } else if (w <= 29) {
+      } else if (w <= 27) {
        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w32_data) +
               sizeof(uint32_t)*(1 << w)*3;
-      } else return -1;
-
+      } else 
+      return 0;
    default:
-      return -1;
+      return 0;
   }
 }

@ -935,6 +963,13 @@ int gf_wgen_init(gf_t *gf)
      case 32: h->prim_poly = 00020000007; break;
      default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1);
    }
+  } else {
+    if (h->w == 32) {
+      h->prim_poly &= 0xffffffff;
+    } else {
+      h->prim_poly |= (1 << h->w);
+      if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0;
+    }
  }

  gf->multiply.w32 = NULL;
@ -950,7 +985,7 @@ int gf_wgen_init(gf_t *gf)
      } else if (h->w <= 16) {
        if (gf_wgen_log_init(gf) == 0) return 0; 
      } else {
-        if (gf_wgen_group_init(gf) == 0) return 0; 
+        if (gf_wgen_bytwo_p_init(gf) == 0) return 0; 
      }
      break;
    case GF_MULT_SHIFT:     if (gf_wgen_shift_init(gf) == 0) return 0; break;
--- a/release-files.txt
+++ b/release-files.txt
@ -1,31 +0,0 @@
-License.txt
-README.txt
-GNUmakefile
-gf.c
-gf_add.c
-gf_complete.h
-gf_div.c
-gf_example_1.c
-gf_example_2.c
-gf_example_3.c
-gf_example_4.c
-gf_general.c
-gf_general.h
-gf_int.h
-gf_method.c
-gf_method.h
-gf_methods.c
-gf_mult.c
-gf_poly.c
-gf_rand.c
-gf_rand.h
-gf_time.c
-gf_unit.c
-gf_w128.c
-gf_w16.c
-gf_w32.c
-gf_w4.c
-gf_w64.c
-gf_w8.c
-gf_wgen.c
-whats_my_sse.c
--- a/tests.txt
+++ b/tests.txt
--- a/tmp-10-out.txt
+++ b/tmp-10-out.txt
--- a/tmp-time-test.sh
+++ b/tmp-time-test.sh
@ -1,14 +0,0 @@
-if [ $# -lt 4 ]; then
-  echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2
-  exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 134217728 ]; do
-  iter=`echo $i | awk '{ print (134217728/$1)*1 }'`
-  echo $i $iter $w $* `./gf_time $w G -1 $i $iter $* | head -n 3 | tail -n 2`
-  i=`echo $i | awk '{ print $1*2 }'`
-done
-
--- a/tmp.c
+++ b/tmp.c
--- a/tmp.sh
+++ b/tmp.sh
@ -1,15 +0,0 @@
-for i in 5 10 ; do
-  sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh
-  sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,ALTMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 8 8 - - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,ALTMAP - >> tmp-$i-out.txt
-done
--- a/tmp.txt
+++ b/tmp.txt
@ -1,162 +0,0 @@
-Tables[0] = 0000000000000000 3b60e7ccf8f4454e 76c1cf99f1e88a9c 4da12855091ccfd2 ed839f33e3d11538 d6e378ff1b255076 9b4250aa12399fa4 a022b766eacddaea db073e67c7a22a6b e067d9ab3f566f25 adc6f1fe364aa0f7 96a61632cebee5b9 3684a15424733f53 0de44698dc877a1d 40456ecdd59bb5cf 7b2589012d6ff081
-Tij            81 cf 1d 53   b9 f7 25 6b   ea a4 76 38   d2 9c 4e 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tij            2d d5 dc 24   ce 36 3f c7   ea 12 1b e3   09 f1 f8 00
-Tij            01 cd 98 54   32 fe ab 67   66 aa ff 33   55 99 cc 00
-Tij            89 6e 46 a1   16 f1 d9 3e   b7 50 78 9f   28 cf e7 00
-Tij            25 45 e4 84   a6 c6 67 07   22 42 e3 83   a1 c1 60 00
-Tij            7b 40 0d 36   96 ad e0 db   a0 9b d6 ed   4d 76 3b 00
-Tables[1] = 0000000000000000 b60e7ccf8f4454cd 6c1cf99f1e88a981 da12855091ccfd4c d839f33e3d115302 6e378ff1b25507cf b4250aa12399fa83 022b766eacddae4e b073e67c7a22a61f 067d9ab3f566f2d2 dc6f1fe364aa0f9e 6a61632cebee5b53 684a15424733f51d de44698dc877a1d0 0456ecdd59bb5c9c b2589012d6ff0851
-Tij            51 9c d0 1d   53 9e d2 1f   4e 83 cf 02   4c 81 cd 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tij            d6 59 c8 47   eb 64 f5 7a   ac 23 b2 3d   91 1e 8f 00
-Tij            12 dd 8d 42   2c e3 b3 7c   6e a1 f1 3e   50 9f cf 00
-Tij            90 ec 69 15   63 1f 9a e6   76 0a 8f f3   85 f9 7c 00
-Tij            58 56 44 4a   61 6f 7d 73   2b 25 37 39   12 1c 0e 00
-Tij            b2 04 de 68   6a dc 06 b0   02 b4 6e d8   da 6c b6 00
-Tables[2] = 0000000000000000 60e7ccf8f4454c25 c1cf99f1e88a984a a12855091ccfd46f 839f33e3d115308f e378ff1b25507caa 4250aa12399fa8c5 22b766eacddae4e0 073e67c7a22a6105 67d9ab3f566f2d20 c6f1fe364aa0f94f a61632cebee5b56a 84a15424733f518a e44698dc877a1daf 456ecdd59bb5c9c0 2589012d6ff085e5
-Tij            e5 c0 af 8a   6a 4f 20 05   e0 c5 aa 8f   6f 4a 25 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tij            2d d5 dc 24   ce 36 3f c7   ea 12 1b e3   09 f1 f8 00
-Tij            01 cd 98 54   32 fe ab 67   66 aa ff 33   55 99 cc 00
-Tij            89 6e 46 a1   16 f1 d9 3e   b7 50 78 9f   28 cf e7 00
-Tij            25 45 e4 84   a6 c6 67 07   22 42 e3 83   a1 c1 60 00
-Tables[3] = 0000000000000000 0e7ccf8f4454c20a 1cf99f1e88a98414 12855091ccfd461e 39f33e3d11530828 378ff1b25507ca22 250aa12399fa8c3c 2b766eacddae4e36 73e67c7a22a61050 7d9ab3f566f2d25a 6f1fe364aa0f9444 61632cebee5b564e 4a15424733f51878 44698dc877a1da72 56ecdd59bb5c9c6c 589012d6ff085e66
-Tij            66 6c 72 78   4e 44 5a 50   36 3c 22 28   1e 14 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tij            d6 59 c8 47   eb 64 f5 7a   ac 23 b2 3d   91 1e 8f 00
-Tij            12 dd 8d 42   2c e3 b3 7c   6e a1 f1 3e   50 9f cf 00
-Tij            90 ec 69 15   63 1f 9a e6   76 0a 8f f3   85 f9 7c 00
-Tij            58 56 44 4a   61 6f 7d 73   2b 25 37 39   12 1c 0e 00
-Tables[4] = 0000000000000000 e7ccf8f4454c20a0 cf99f1e88a98415b 2855091ccfd461fb 9f33e3d1153082ad 78ff1b25507ca20d 50aa12399fa8c3f6 b766eacddae4e356 3e67c7a22a610541 d9ab3f566f2d25e1 f1fe364aa0f9441a 1632cebee5b564ba a15424733f5187ec 4698dc877a1da74c 6ecdd59bb5c9c6b7 89012d6ff085e617
-Tij            17 b7 4c ec   ba 1a e1 41   56 f6 0d ad   fb 5b a0 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tij            2d d5 dc 24   ce 36 3f c7   ea 12 1b e3   09 f1 f8 00
-Tij            01 cd 98 54   32 fe ab 67   66 aa ff 33   55 99 cc 00
-Tij            89 6e 46 a1   16 f1 d9 3e   b7 50 78 9f   28 cf e7 00
-Tables[5] = 0000000000000000 7ccf8f4454c20a82 f99f1e88a9841504 855091ccfd461f86 f33e3d1153082a13 8ff1b25507ca2091 0aa12399fa8c3f17 766eacddae4e3595 e67c7a22a610543d 9ab3f566f2d25ebf 1fe364aa0f944139 632cebee5b564bbb 15424733f5187e2e 698dc877a1da74ac ecdd59bb5c9c6b2a 9012d6ff085e61a8
-Tij            a8 2a ac 2e   bb 39 bf 3d   95 17 91 13   86 04 82 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tij            d6 59 c8 47   eb 64 f5 7a   ac 23 b2 3d   91 1e 8f 00
-Tij            12 dd 8d 42   2c e3 b3 7c   6e a1 f1 3e   50 9f cf 00
-Tij            90 ec 69 15   63 1f 9a e6   76 0a 8f f3   85 f9 7c 00
-Tables[6] = 0000000000000000 ccf8f4454c20a861 99f1e88a984150d9 55091ccfd461f8b8 33e3d1153082a1a9 ff1b25507ca209c8 aa12399fa8c3f170 66eacddae4e35911 67c7a22a61054352 ab3f566f2d25eb33 fe364aa0f944138b 32cebee5b564bbea 5424733f5187e2fb 98dc877a1da74a9a cdd59bb5c9c6b222 012d6ff085e61a43
-Tij            43 22 9a fb   ea 8b 33 52   11 70 c8 a9   b8 d9 61 00
-Tij            1a b2 4a e2   bb 13 eb 43   59 f1 09 a1   f8 50 a8 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tij            2d d5 dc 24   ce 36 3f c7   ea 12 1b e3   09 f1 f8 00
-Tij            01 cd 98 54   32 fe ab 67   66 aa ff 33   55 99 cc 00
-Tables[7] = 0000000000000000 cf8f4454c20a86a4 9f1e88a984150d53 5091ccfd461f8bf7 3e3d1153082a1abd f1b25507ca209c19 a12399fa8c3f17ee 6eacddae4e35914a 7c7a22a61054357a b3f566f2d25eb3de e364aa0f94413829 2cebee5b564bbe8d 424733f5187e2fc7 8dc877a1da74a963 dd59bb5c9c6b2294 12d6ff085e61a430
-Tij            30 94 63 c7   8d 29 de 7a   4a ee 19 bd   f7 53 a4 00
-Tij            a4 22 a9 2f   be 38 b3 35   91 17 9c 1a   8b 0d 86 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tij            d6 59 c8 47   eb 64 f5 7a   ac 23 b2 3d   91 1e 8f 00
-Tij            12 dd 8d 42   2c e3 b3 7c   6e a1 f1 3e   50 9f cf 00
-Tables[8] = 0000000000000000 f8f4454c20a86af4 f1e88a984150d5f3 091ccfd461f8bf07 e3d1153082a1abfd 1b25507ca209c109 12399fa8c3f17e0e eacddae4e35914fa c7a22a61054357e1 3f566f2d25eb3d15 364aa0f944138212 cebee5b564bbe8e6 24733f5187e2fc1c dc877a1da74a96e8 d59bb5c9c6b229ef 2d6ff085e61a431b
-Tij            1b ef e8 1c   e6 12 15 e1   fa 0e 09 fd   07 f3 f4 00
-Tij            43 29 96 fc   e8 82 3d 57   14 7e c1 ab   bf d5 6a 00
-Tij            1a b2 4a e2   bb 13 eb 43   59 f1 09 a1   f8 50 a8 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tij            2d d5 dc 24   ce 36 3f c7   ea 12 1b e3   09 f1 f8 00
-Tables[9] = 0000000000000000 8f4454c20a86afd9 1e88a984150d5fa9 91ccfd461f8bf070 3d1153082a1abf52 b25507ca209c108b 2399fa8c3f17e0fb acddae4e35914f22 7a22a61054357ea4 f566f2d25eb3d17d 64aa0f944138210d ebee5b564bbe8ed4 4733f5187e2fc1f6 c877a1da74a96e2f 59bb5c9c6b229e5f d6ff085e61a43186
-Tij            86 5f 2f f6   d4 0d 7d a4   22 fb 8b 52   70 a9 d9 00
-Tij            31 9e 6e c1   8e 21 d1 7e   4f e0 10 bf   f0 5f af 00
-Tij            a4 22 a9 2f   be 38 b3 35   91 17 9c 1a   8b 0d 86 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tij            d6 59 c8 47   eb 64 f5 7a   ac 23 b2 3d   91 1e 8f 00
-Tables[10] = 0000000000000000 f4454c20a86afd48 e88a984150d5fa8b 1ccfd461f8bf07c3 d1153082a1abf50d 25507ca209c10845 399fa8c3f17e0f86 cddae4e35914f2ce a22a61054357ea01 566f2d25eb3d1749 4aa0f9441382108a bee5b564bbe8edc2 733f5187e2fc1f0c 877a1da74a96e244 9bb5c9c6b229e587 6ff085e61a4318cf
-Tij            cf 87 44 0c   c2 8a 49 01   ce 86 45 0d   c3 8b 48 00
-Tij            18 e5 e2 1f   ed 10 17 ea   f2 0f 08 f5   07 fa fd 00
-Tij            43 29 96 fc   e8 82 3d 57   14 7e c1 ab   bf d5 6a 00
-Tij            1a b2 4a e2   bb 13 eb 43   59 f1 09 a1   f8 50 a8 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tij            6f 9b 87 73   be 4a 56 a2   cd 39 25 d1   1c e8 f4 00
-Tables[11] = 0000000000000000 4454c20a86afd419 88a984150d5fa832 ccfd461f8bf07c2b 1153082a1abf507f 5507ca209c108466 99fa8c3f17e0f84d ddae4e35914f2c54 22a61054357ea0fe 66f2d25eb3d174e7 aa0f9441382108cc ee5b564bbe8edcd5 33f5187e2fc1f081 77a1da74a96e2498 bb5c9c6b229e58b3 ff085e61a4318caa
-Tij            aa b3 98 81   d5 cc e7 fe   54 4d 66 7f   2b 32 19 00
-Tij            8c 58 24 f0   dc 08 74 a0   2c f8 84 50   7c a8 d4 00
-Tij            31 9e 6e c1   8e 21 d1 7e   4f e0 10 bf   f0 5f af 00
-Tij            a4 22 a9 2f   be 38 b3 35   91 17 9c 1a   8b 0d 86 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tij            ff bb 77 33   ee aa 66 22   dd 99 55 11   cc 88 44 00
-Tables[12] = 0000000000000000 454c20a86afd41fc 8a984150d5fa83f8 cfd461f8bf07c204 153082a1abf507eb 507ca209c1084617 9fa8c3f17e0f8413 dae4e35914f2c5ef 2a61054357ea0fd6 6f2d25eb3d174e2a a0f9441382108c2e e5b564bbe8edcdd2 3f5187e2fc1f083d 7a1da74a96e249c1 b5c9c6b229e58bc5 f085e61a4318ca39
-Tij            39 c5 c1 3d   d2 2e 2a d6   ef 13 17 eb   04 f8 fc 00
-Tij            ca 8b 49 08   cd 8c 4e 0f   c5 84 46 07   c2 83 41 00
-Tij            18 e5 e2 1f   ed 10 17 ea   f2 0f 08 f5   07 fa fd 00
-Tij            43 29 96 fc   e8 82 3d 57   14 7e c1 ab   bf d5 6a 00
-Tij            1a b2 4a e2   bb 13 eb 43   59 f1 09 a1   f8 50 a8 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tij            f0 b5 7a 3f   e5 a0 6f 2a   da 9f 50 15   cf 8a 45 00
-Tables[13] = 0000000000000000 54c20a86afd41fac a984150d5fa83f58 fd461f8bf07c20f4 53082a1abf507eab 07ca209c10846107 fa8c3f17e0f841f3 ae4e35914f2c5e5f a61054357ea0fd56 f2d25eb3d174e2fa 0f9441382108c20e 5b564bbe8edcdda2 f5187e2fc1f083fd a1da74a96e249c51 5c9c6b229e58bca5 085e61a4318ca309
-Tij            09 a5 51 fd   a2 0e fa 56   5f f3 07 ab   f4 58 ac 00
-Tij            a3 bc 9c 83   dd c2 e2 fd   5e 41 61 7e   20 3f 1f 00
-Tij            8c 58 24 f0   dc 08 74 a0   2c f8 84 50   7c a8 d4 00
-Tij            31 9e 6e c1   8e 21 d1 7e   4f e0 10 bf   f0 5f af 00
-Tij            a4 22 a9 2f   be 38 b3 35   91 17 9c 1a   8b 0d 86 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Tij            08 5c a1 f5   5b 0f f2 a6   ae fa 07 53   fd a9 54 00
-Tables[14] = 0000000000000000 4c20a86afd41fab7 984150d5fa83f56e d461f8bf07c20fd9 3082a1abf507eac7 7ca209c108461070 a8c3f17e0f841fa9 e4e35914f2c5e51e 61054357ea0fd58e 2d25eb3d174e2f39 f9441382108c20e0 b564bbe8edcdda57 5187e2fc1f083f49 1da74a96e249c5fe c9c6b229e58bca27 85e61a4318ca3090
-Tij            90 27 fe 49   57 e0 39 8e   1e a9 70 c7   d9 6e b7 00
-Tij            30 ca c5 3f   da 20 2f d5   e5 1f 10 ea   0f f5 fa 00
-Tij            ca 8b 49 08   cd 8c 4e 0f   c5 84 46 07   c2 83 41 00
-Tij            18 e5 e2 1f   ed 10 17 ea   f2 0f 08 f5   07 fa fd 00
-Tij            43 29 96 fc   e8 82 3d 57   14 7e c1 ab   bf d5 6a 00
-Tij            1a b2 4a e2   bb 13 eb 43   59 f1 09 a1   f8 50 a8 00
-Tij            e6 c6 a7 87   64 44 25 05   e3 c3 a2 82   61 41 20 00
-Tij            85 c9 1d 51   b5 f9 2d 61   e4 a8 7c 30   d4 98 4c 00
-Tables[15] = 0000000000000000 c20a86afd41fab1c 84150d5fa83f5623 461f8bf07c20fd3f 082a1abf507eac5d ca209c1084610741 8c3f17e0f841fa7e 4e35914f2c5e5162 1054357ea0fd58ba d25eb3d174e2f3a6 9441382108c20e99 564bbe8edcdda585 187e2fc1f083f4e7 da74a96e249c5ffb 9c6b229e58bca2c4 5e61a4318ca309d8
-Tij            d8 c4 fb e7   85 99 a6 ba   62 7e 41 5d   3f 23 1c 00
-Tij            09 a2 5f f4   a5 0e f3 58   51 fa 07 ac   fd 56 ab 00
-Tij            a3 bc 9c 83   dd c2 e2 fd   5e 41 61 7e   20 3f 1f 00
-Tij            8c 58 24 f0   dc 08 74 a0   2c f8 84 50   7c a8 d4 00
-Tij            31 9e 6e c1   8e 21 d1 7e   4f e0 10 bf   f0 5f af 00
-Tij            a4 22 a9 2f   be 38 b3 35   91 17 9c 1a   8b 0d 86 00
-Tij            61 6b 74 7e   4b 41 5e 54   35 3f 20 2a   1f 15 0a 00
-Tij            5e 9c da 18   56 94 d2 10   4e 8c ca 08   46 84 c2 00
-Val= 3b60e7ccf8f4454e
-v0             28 4f 14 e3   1b f7 ee 76   b9 31 47 0a   ba 8b 70 fc
-v0             12 56 28 59   66 cd d2 d2   1c 91 30 26   a8 95 0a a9
-v0             ee 5d 14 e3   fb c8 45 23   a9 fd 8c f1   ff c9 2c 93
-v0             65 ce 82 f2   dc ec 6b e2   53 a3 9c fb   07 70 e7 ad
-v0             1b 87 3d 7b   4d 15 1d c2   d2 45 f3 03   4b e4 f4 9b
-v0             3b 01 2b c5   c5 d2 9d a9   68 7c a2 61   c9 5b 49 90
-v0             5d 13 7d ef   eb f1 52 da   a0 29 89 ef   08 f2 51 3b
-v0             17 05 b3 80   77 3a f2 5e   82 7a c9 39   84 df 8e bf
-
-p0             11 fc 47 f4   6c 01 44 ba   ba 62 e7 3f   ba fb ba 85
-p0             a6 fc 67 16   5f c3 95 fc   58 51 f4 fd   58 5f 58 a5
-p0             12 fc 1f b3   50 1e 3f 9a   fd 5e 83 20   fd 9c fd dd
-p0             d9 fc 1e ee   22 42 10 7f   a0 2c f0 7c   a0 24 a0 dc
-p0             a2 fc 4c 30   41 ce ad eb   7e 4f c1 f0   7e 6e 7e 8e
-p0             8b fc 7c 7b   9f b5 38 67   35 91 2f 8b   35 a9 35 be
-p0             07 fc 89 1a   3b 21 fd db   54 35 7e 1f   54 74 54 4b
-p0             cf fc 94 5e   40 78 c2 31   10 4e 18 46   10 da 10 56
--- a/tmp2.sh
+++ b/tmp2.sh
@ -1,13 +0,0 @@
-if [ $# -lt 4 ]; then
-  echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2
-  exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 1073741824 ]; do
-  iter=`echo $i | awk '{ print (1073741824/$1)*10 }'`
-  echo $i $iter $w $* `gf_time $w R -1 $i $iter $*`
-  i=`echo $i | awk '{ print $1*2 }'`
-done