diff --git a/GNUmakefile b/GNUmakefile
index e722c01..bb7ead8 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -5,19 +5,20 @@
 
 SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
        gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
-       gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c
+       gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
+       gf_inline_time.c
 
 HDRS = gf_complete.h gf_int.h
 
 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
-              gf_example_1 gf_example_2 gf_example_3 gf_example_4
+              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
 
-CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
-LDFLAGS = -O3 -msse4 -maes -mpclmul
+# CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
+# LDFLAGS = -O3 -msse4 -maes -mpclmul
 
 # Use these if you don't have INTEL_PCLMUL
-# CFLAGS = -O3 -msse4 -DINTEL_SSE4
-# LDFLAGS = -O3 -msse4 
+ CFLAGS = -O3 -msse4 -DINTEL_SSE4
+ LDFLAGS = -O3 -msse4 
 
 RM = /bin/rm -f
 
@@ -38,6 +39,7 @@ gf_complete.a: $(LIBOBJS)
 
 gf_methods: gf_methods.o gf_complete.a
 gf_time: gf_time.o gf_complete.a
+gf_inline_time: gf_inline_time.o gf_complete.a
 gf_unit: gf_unit.o gf_complete.a
 gf_example_1: gf_example_1.o gf_complete.a
 gf_example_2: gf_example_2.o gf_complete.a
@@ -57,6 +59,7 @@ spotless: clean
 gf_div.o: gf_complete.h gf_method.h
 gf_methods.o: gf_complete.h gf_method.h
 gf_time.o: gf_complete.h gf_method.h gf_rand.h gf_general.h
+gf_inline_time.o: gf_complete.h gf_rand.h
 gf_wgen.o: gf_int.h gf_complete.h
 gf_w4.o: gf_int.h gf_complete.h
 gf_w8.o: gf_int.h gf_complete.h
diff --git a/gf_complete.h b/gf_complete.h
index 2336cfc..ac6688e 100644
--- a/gf_complete.h
+++ b/gf_complete.h
@@ -130,3 +130,28 @@ extern int gf_scratch_size(int w,
                            int arg2);
 
 extern int gf_free(GFP gf, int recursive);
+
+/* This is support for inline single multiplications and divisions.
+   I know it's yucky, but if you've got to be fast, you've got to be fast.
+   We'll support inlines for w=4, w=8 and w=16.  
+
+   To use inline multiplication and division with w=4 or 8, you should use the 
+   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
+   will return NULL. */
+
+uint8_t *gf_w4_get_mult_table(GFP gf);
+uint8_t *gf_w4_get_div_table(GFP gf);
+/* table: non-NULL result of a getter above; the entry for (a, b) is at (a << 4) | b. */
+#define GF_W4_INLINE_MULTDIV(table, a, b) ((table)[((a)<<4)|(b)])
+
+uint8_t *gf_w8_get_mult_table(GFP gf);
+uint8_t *gf_w8_get_div_table(GFP gf);
+/* table: non-NULL result of a getter above; the entry for (a, b) is at (a << 8) | b. */
+#define GF_W8_INLINE_MULTDIV(table, a, b) ((table)[(((uint32_t) (a))<<8)|(b)])
+
+uint16_t *gf_w16_get_log_table(GFP gf);
+uint16_t *gf_w16_get_mult_alog_table(GFP gf);
+uint16_t *gf_w16_get_div_alog_table(GFP gf);
+/* Wrapped in outer parentheses so the ?: survives inside larger expressions (e.g. x ^ MULT(...)); a and b are evaluated more than once. */
+#define GF_W16_INLINE_MULT(log, alog, a, b) (((a) == 0 || (b) == 0) ? 0 : ((alog)[(uint32_t)(log)[a]+(uint32_t)(log)[b]]))
+#define GF_W16_INLINE_DIV(log, alog, a, b) (((a) == 0 || (b) == 0) ? 0 : ((alog)[(int)(log)[a]-(int)(log)[b]]))
diff --git a/gf_time.c b/gf_time.c
index 1c72bcd..8313b05 100644
--- a/gf_time.c
+++ b/gf_time.c
@@ -16,8 +16,6 @@
 #include "gf_rand.h"
 #include "gf_general.h"
 
-#define REGION_SIZE (4096)
-
 void
 timer_start (double *t)
 {
diff --git a/gf_unit.c b/gf_unit.c
index 4eb3d2a..8fe253c 100644
--- a/gf_unit.c
+++ b/gf_unit.c
@@ -54,6 +54,8 @@ int main(int argc, char **argv)
   time_t t0;
   gf_internal_t *h;
   gf_general_t *a, *b, *c, *d, *ai, *bi;
+  uint8_t a8, b8, c8, *mult4, *div4, *mult8, *div8;
+  uint16_t a16, b16, c16, d16, *log16, *alog16;
   char as[50], bs[50], cs[50], ds[50], ais[50], bis[50];
   uint32_t mask;
   char *ra, *rb, *rc, *rd, *target;
@@ -97,6 +99,21 @@ int main(int argc, char **argv)
 
   if (!gf_init_easy(&gf_def, w)) problem("No default for this value of w");
   
+  if (w == 4) {
+    mult4 = gf_w4_get_mult_table(&gf);
+    div4 = gf_w4_get_div_table(&gf);
+  }
+
+  if (w == 8) {
+    mult8 = gf_w8_get_mult_table(&gf);
+    div8 = gf_w8_get_div_table(&gf);
+  }
+
+  if (w == 16) {
+    log16 = gf_w16_get_log_table(&gf);
+    alog16 = gf_w16_get_mult_alog_table(&gf);
+  }
+
   if (verbose) printf("Seed: %ld\n", t0);
 
   if (single) {
@@ -132,6 +149,45 @@ int main(int argc, char **argv)
       tested = 0;
       gf_general_multiply(&gf, a, b, c);
       
+      /* If w is 4, 8 or 16, then there are inline multiplication/division methods.  
+         Test them here. */
+
+      if (w == 4 && mult4 != NULL) {
+        a8 = a->w32;
+        b8 = b->w32;
+        c8 = GF_W4_INLINE_MULTDIV(mult4, a8, b8);
+        if (c8 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a8, b8, c8, c->w32);
+          exit(1);
+        }
+      }
+
+      if (w == 8 && mult8 != NULL) {
+        a8 = a->w32;
+        b8 = b->w32;
+        c8 = GF_W8_INLINE_MULTDIV(mult8, a8, b8);
+        if (c8 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a8, b8, c8, c->w32);
+          exit(1);
+        }
+      }
+
+      if (w == 16 && log16 != NULL) {
+        a16 = a->w32;
+        b16 = b->w32;
+        c16 = GF_W16_INLINE_MULT(log16, alog16, a16, b16);
+        if (c16 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a16, b16, c16, c->w32);
+          printf("%d %d\n", log16[a16], log16[b16]);
+          top = log16[a16] + log16[b16];
+          printf("%d %d\n", top, alog16[top]);
+          exit(1);
+        }
+      }
+
       /* If this is not composite, then first test against the default: */
 
       if (h->mult_type != GF_MULT_COMPOSITE) {
diff --git a/gf_w16.c b/gf_w16.c
index 5752415..d6fffc3 100644
--- a/gf_w16.c
+++ b/gf_w16.c
@@ -17,9 +17,10 @@
 #define GF_S_GF_8_2 (63)
 
 struct gf_logtable_data {
-    int              log_tbl[GF_FIELD_SIZE];
+    uint16_t      log_tbl[GF_FIELD_SIZE];  /* handed out as uint16_t * by gf_w16_get_log_table() */
     uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
     uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;  /* set in gf_w16_log_init(); gf_w16_log_divide() indexes it with log_tbl[a] - log_tbl[b] */
 };
 
 struct gf_zero_logtable_data {
@@ -308,7 +309,7 @@ gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
   struct gf_logtable_data *ltd;
 
   ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
-  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
+  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]];
 }
 
 static
@@ -322,8 +323,8 @@ gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
   if (a == 0 || b == 0) return 0;
   ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
 
-  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
-  return (ltd->antilog_tbl[log_sum]);
+  log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b];
+  return (ltd->d_antilog[log_sum]);
 }
 
 static
@@ -347,6 +348,7 @@ int gf_w16_log_init(gf_t *gf)
   ltd = h->private;
 
   ltd->log_tbl[0] = 0;
+  ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE;  /* bias so log_tbl[a] - log_tbl[b], which may be negative, indexes antilog_tbl */
 
   b = 1;
   for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
@@ -1945,3 +1947,44 @@ int gf_w16_init(gf_t *gf)
   }
   return 1;
 }
+
+/* Inline setup functions */
+
+/* Inline support: returns the log table passed to GF_W16_INLINE_MULT/DIV.
+   Yields NULL unless gf multiplies with gf_w16_log_multiply. */
+uint16_t *gf_w16_get_log_table(gf_t *gf)
+{
+  struct gf_logtable_data *tables;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) {
+    return NULL;
+  }
+  tables = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (uint16_t *) tables->log_tbl;
+}
+
+/* Inline support: returns the antilog table that GF_W16_INLINE_MULT indexes
+   with log[a] + log[b].  Yields NULL unless gf multiplies with
+   gf_w16_log_multiply. */
+uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
+{
+  gf_internal_t *internal = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) {
+    return NULL;
+  }
+  return (uint16_t *) ((struct gf_logtable_data *) internal->private)->antilog_tbl;
+}
+
+/* Inline support: returns the d_antilog pointer, which gf_w16_log_divide
+   indexes with log[a] - log[b] (the same index GF_W16_INLINE_DIV computes).
+   Yields NULL unless gf multiplies with gf_w16_log_multiply. */
+uint16_t *gf_w16_get_div_alog_table(gf_t *gf)
+{
+  gf_internal_t *internal = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) {
+    return NULL;
+  }
+  return (uint16_t *) ((struct gf_logtable_data *) internal->private)->d_antilog;
+}
diff --git a/gf_w4.c b/gf_w4.c
index b80da4d..1175e01 100644
--- a/gf_w4.c
+++ b/gf_w4.c
@@ -146,7 +146,6 @@ gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
   return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly);
 }
 
-
 /* ------------------------------------------------------------
   IMPLEMENTATION: LOG_TABLE: 
 
@@ -2010,3 +2009,32 @@ gf_w4_init (gf_t *gf)
   }
   return 1;
 }
+
+/* Inline setup functions */
+
+/* Inline support: returns the w=4 multiplication table for use with
+   GF_W4_INLINE_MULTDIV(), or NULL unless gf multiplies via a single table. */
+uint8_t *gf_w4_get_mult_table(gf_t *gf)
+{
+  struct gf_single_table_data *tables;
+
+  if (gf->multiply.w32 != gf_w4_single_table_multiply) {
+    return NULL;
+  }
+  tables = (struct gf_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (uint8_t *) tables->mult;
+}
+    
+/* Inline support: returns the w=4 division table for use with
+   GF_W4_INLINE_MULTDIV(), or NULL unless gf multiplies via a single table. */
+uint8_t *gf_w4_get_div_table(gf_t *gf)
+{
+  struct gf_single_table_data *tables;
+
+  if (gf->multiply.w32 != gf_w4_single_table_multiply) {
+    return NULL;
+  }
+  tables = (struct gf_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (uint8_t *) tables->div;
+}
+
diff --git a/gf_w8.c b/gf_w8.c
index 0dbd472..306f911 100644
--- a/gf_w8.c
+++ b/gf_w8.c
@@ -1971,3 +1971,40 @@ int gf_w8_init(gf_t *gf)
 
   return 1;
 }
+
+
+/* Inline setup functions */
+
+/* Inline support: returns the w=8 multiplication table that
+   GF_W8_INLINE_MULTDIV() indexes.  Only the default and the single-table
+   multiply routines are recognized here; for any other routine the
+   result is NULL. */
+uint8_t *gf_w8_get_mult_table(gf_t *gf)
+{
+  gf_internal_t *internal = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    return (uint8_t *) ((struct gf_w8_default_data *) internal->private)->multtable;
+  }
+  if (gf->multiply.w32 == gf_w8_table_multiply) {
+    return (uint8_t *) ((struct gf_w8_single_table_data *) internal->private)->multtable;
+  }
+  return NULL;
+}
+
+/* Inline support: returns the w=8 division table for GF_W8_INLINE_MULTDIV(),
+   or NULL unless gf uses the default or single-table multiply routine. */
+uint8_t *gf_w8_get_div_table(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    /* Default-method data is its own struct; a divtable field is assumed here -- confirm. */
+    struct gf_w8_default_data *ftd = (struct gf_w8_default_data *) h->private;
+    return (uint8_t *) ftd->divtable;
+  }
+  if (gf->multiply.w32 == gf_w8_table_multiply) {
+    return (uint8_t *) ((struct gf_w8_single_table_data *) h->private)->divtable;
+  }
+  return NULL;
+}