diff --git a/GNUmakefile b/GNUmakefile
index 0f35276..80cd3d3 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,24 +1,23 @@
 #
 # GNUmakefile for Galois field library
 #
-#
+# The default flags do *not* have the SSE instructions enabled.
+# Please cd to flag_tester and run which_compile_flags.sh to see which SSE instructions
+# your machine and compiler support, and which flags you should include below.
+
+CFLAGS = -O3
+LDFLAGS = -O3
 SRCS = gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c gf_wgen.c gf.c gf_unit.c \
        gf_time.c gf_mult.c gf_method.c gf_methods.c gf_div.c gf_rand.c gf_general.c \
        gf_poly.c gf_example_1.c gf_add.c gf_example_2.c gf_example_3.c gf_example_4.c \
-       gf_inline_time.c
+       gf_inline_time.c gf_example_5.c gf_example_6.c gf_example_7.c
 HDRS = gf_complete.h gf_int.h
 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
-              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time
-
-CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
-LDFLAGS = -O3 -msse4 -maes -mpclmul
-
-# Use these if you don't have INTEL_PCLMUL
-# CFLAGS = -O3 -msse4 -DINTEL_SSE4
-# LDFLAGS = -O3 -msse4
+              gf_example_1 gf_example_2 gf_example_3 gf_example_4 gf_inline_time \
+              gf_example_5 gf_example_6 gf_example_7
 RM = /bin/rm -f
@@ -45,6 +44,9 @@ gf_example_1: gf_example_1.o gf_complete.a
 gf_example_2: gf_example_2.o gf_complete.a
 gf_example_3: gf_example_3.o gf_complete.a
 gf_example_4: gf_example_4.o gf_complete.a
+gf_example_5: gf_example_5.o gf_complete.a
+gf_example_6: gf_example_6.o gf_complete.a
+gf_example_7: gf_example_7.o gf_complete.a
 gf_mult: gf_mult.o gf_complete.a
 gf_div: gf_div.o gf_complete.a
 gf_poly: gf_poly.o gf_complete.a
@@ -54,7 +56,8 @@ clean:
 	$(RM) $(OBJS) gf_div.c
 spotless: clean
-	$(RM) *~ $(EXECUTABLES)
+	$(RM) *~ $(EXECUTABLES) which_compile_flags
+	$(RM) gf_complete.a
 gf_div.o: gf_complete.h gf_method.h
 gf_methods.o: gf_complete.h gf_method.h
@@ -71,8 +74,12 @@ gf_example_1.o: gf_complete.h gf_rand.h
 gf_example_2.o: gf_complete.h gf_rand.h
 gf_example_3.o: gf_complete.h gf_rand.h
 gf_example_4.o: gf_complete.h gf_rand.h
+gf_example_5.o: gf_complete.h gf_rand.h
+gf_example_6.o: gf_complete.h gf_rand.h
+gf_example_7.o: gf_complete.h gf_rand.h
 gf_general.o: gf_complete.h gf_int.h gf_general.h gf_rand.h
 gf_mult.o: gf_complete.h gf_method.h
+gf.o: gf_complete.h gf_int.h
 gf_method.o: gf_complete.h
 gf_div.c: gf_mult.c
diff --git a/Log-Zero-for-w=8.odg b/Log-Zero-for-w=8.odg
deleted file mode 100644
index 138a673..0000000
Binary files a/Log-Zero-for-w=8.odg and /dev/null differ
diff --git a/Manual.pdf b/Manual.pdf
new file mode 100644
index 0000000..fdc9756
Binary files /dev/null and b/Manual.pdf differ
diff --git a/README b/README
deleted file mode 100644
index 4169e1c..0000000
--- a/README
+++ /dev/null
@@ -1 +0,0 @@
-This is a README file.
diff --git a/README.txt b/README.txt
index 91fecc5..0726922 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,13 @@
-This is GF-Complete, Revision 0.1.
+This is GF-Complete, Revision 1.0.
+
+The user's manual is in the file Manual.pdf.
+
+There are two online homes for GF-Complete:
+
+ - https://bitbucket.org/jimplank/gf-complete
+ - http://www.cs.utk.edu/~plank/plank/papers/CS-13-716.html
+
+When compiling this for the first time, cd to flag_tester, and
+do "sh which_compile_flags.sh xxx", where xxx is the compiler
+that you will use in the GNUmakefile.
-
-Please see http://www.cs.utk.edu/~plank/plank/papers/CS-13-703.html for the user's
-manual and other important documentation about this library, including more
-recent revisions.
diff --git a/explanation.html b/explanation.html
deleted file mode 100644
index 72f03d0..0000000
--- a/explanation.html
+++ /dev/null
@@ -1,777 +0,0 @@
-

Code structure as of 7/20/2012

-written by Jim.

-Ok -- once again, I have messed with the structure. My goals are flexibility and
-efficiency. It's similar to the stuff before, but better because it makes things
-like Euclid's method much cleaner.

-I think we're ready to hack.


Files


Prototypes and typedefs in gf.h

-The main structure that users will see is in gf.h, and it is of type
-gf_t:

-typedef struct gf {
-  gf_func_a_b    multiply;
-  gf_func_a_b    divide;
-  gf_func_a      inverse;
-  gf_region      multiply_region;
-  void           *scratch;
-} gf_t;
-

-We can beef it up later with buf-buf or buf-acc. The problem is that the paper is
-already bloated, so right now, I want to keep it lean.

-The types of the procedures are big unions, so that they work with the following
-types of arguments:

-typedef uint8_t     gf_val_4_t;
-typedef uint8_t     gf_val_8_t;
-typedef uint16_t    gf_val_16_t;
-typedef uint32_t    gf_val_32_t;
-typedef uint64_t    gf_val_64_t;
-typedef uint64_t    *gf_val_128_t;
-typedef uint32_t    gf_val_gen_t;   /* The intent here is for general values <= 32 */
-

-To use one of these, you need to create one with gf_init_easy() or
-gf_init_hard(). Let's concentrate on the former:

-extern int gf_init_easy(gf_t *gf, int w, int mult_type);
-

-You pass it memory for a gf_t, a value of w and
-a variable that says how to do multiplication. The valid values of mult_type
-are enumerated in gf.h:

-typedef enum {GF_MULT_DEFAULT,
-              GF_MULT_SHIFT,
-              GF_MULT_GROUP,
-              GF_MULT_BYTWO_p,
-              GF_MULT_BYTWO_b,
-              GF_MULT_TABLE,
-              GF_MULT_LOG_TABLE,
-              GF_MULT_SPLIT_TABLE,
-              GF_MULT_COMPOSITE } gf_mult_type_t;
-

-After creating the gf_t, you use its multiply method
-to multiply, using the union's fields to work with the various types.
-It looks easier than my explanation. For example, suppose you wanted to
-multiply 5 and 4 in GF(2^4). You can do it as in
-gf_54.c:

-#include <stdio.h>
-#include <stdlib.h>
-#include "gf.h"
-
-int main()
-{
-  gf_t gf;
-
-  gf_init_easy(&gf, 4, GF_MULT_DEFAULT);
-  printf("%d\n", gf.multiply.w4(&gf, 5, 4));
-  exit(0);
-}
-

-If you wanted to multiply in GF(2^8), then you'd have to use 8 as a parameter
-to gf_init_easy, and call the multiplier as gf.multiply.w8().

-When you're done with your gf_t, you should call gf_free() on it so
-that it can free memory that it has allocated. We'll talk more about memory
-later, but if you create your gf_t with gf_init_easy, then it calls malloc(),
-and if you care about freeing memory, you'll have to call gf_free().
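-For instance, here is a minimal sketch of the full lifecycle. It assumes the
-two-argument gf_free(gf, recursive) described later in this document; treat
-it as illustrative rather than canonical:
-
-#include "gf.h"
-
-int main()
-{
-  gf_t gf;
-
-  if (gf_init_easy(&gf, 4, GF_MULT_DEFAULT) == 0) return 1;
-  /* ... use gf.multiply.w4(&gf, a, b), etc. ... */
-  gf_free(&gf, 0);   /* 0 = don't recursively free base fields */
-  return 0;
-}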


Memory allocation

-Each implementation of a multiplication technique keeps around its
-own data. For example, GF_MULT_TABLE keeps around
-multiplication and division tables, and GF_MULT_LOG maintains log and
-antilog tables. This data is stored in the pointer scratch. My intent
-is that the memory that is there is all that's required. In other
-words, the multiply(), divide(), inverse() and
-multiply_region() calls don't do any memory allocation.
-Moreover, gf_init_easy() only allocates one chunk of memory --
-the one in scratch.

-If you don't want to have the initialization call allocate memory, you can
-use gf_init_hard():

-extern int gf_init_hard(gf_t *gf,
-                        int w,
-                        int mult_type,
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1,
-                        int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory);
-

-The first three parameters are the same as gf_init_easy().
-You can add additional arguments for performing multiply_region, and
-for performing division in the region_type and divide_type
-arguments. Their values are also defined in gf.h. You can
-mix the region_type values (e.g. "DOUBLE" and "SSE"):

-#define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
-
-typedef uint32_t gf_region_type_t;
-
-typedef enum { GF_DIVIDE_DEFAULT,
-               GF_DIVIDE_MATRIX,
-               GF_DIVIDE_EUCLID } gf_division_type_t;
-

-You can change
-the primitive polynomial with prim_poly, give additional arguments with
-arg1 and arg2 and give a base Galois Field for composite fields.
-Finally, you can pass it a pointer to memory in scratch_memory. That
-way, you can avoid having gf_init_hard() call malloc().

-There is a procedure called gf_scratch_size() that lets you know the minimum
-size for scratch_memory, depending on w, the multiplication type
-and the arguments:

-extern int gf_scratch_size(int w,
-                           int mult_type,
-                           int region_type,
-                           int divide_type,
-                           int arg1,
-                           int arg2);
-

-You can specify default arguments in gf_init_hard().
-If any argument is equal to its default, then default actions are taken (e.g. a
-standard primitive polynomial is used, or memory is allocated for scratch_memory).
-In fact, gf_init_easy() simply calls gf_init_hard() with the default
-parameters.
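-As a sketch, that forwarding presumably looks like the following (the real
-code lives in gf.c; treat this as illustrative):
-
-int gf_init_easy(gf_t *gf, int w, int mult_type)
-{
-  /* 0's select the default polynomial and arguments; NULL scratch_memory
-     makes gf_init_hard() malloc() the scratch itself. */
-  return gf_init_hard(gf, w, mult_type, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
-                      0, 0, 0, NULL, NULL);
-}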

-gf_free() frees memory that was allocated with gf_init_easy()
-or gf_init_hard(). The recursive parameter is in case you
-use composite fields, and want to recursively free the base fields.
-If you pass scratch_memory to gf_init_hard(), then you typically
-don't need to call gf_free(). It won't hurt to call it, though.
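-To make the no-malloc path concrete, here is a sketch; the method and
-argument values are hypothetical -- use whatever combination you actually
-need:
-
-int sz;
-void *scratch;
-gf_t gf;
-
-sz = gf_scratch_size(4, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
-scratch = malloc(sz);                  /* or point into memory you already manage */
-gf_init_hard(&gf, 4, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
-             0, 0, 0, NULL, scratch);  /* prim_poly/arg1/arg2 = 0 mean defaults */
-/* ... use gf; we own scratch, so no gf_free() is required ... */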



gf_mult and gf_div

-For the moment, I have only a few things completely implemented, but that's
-because I want to be able to explain the structure, and how to specify
-methods. In particular, for w=4, I have implemented SHIFT and LOG. For
-w=8, 16, 32, 64 I have implemented SHIFT. For all w ≤ 32, I have implemented
-both Euclid's algorithm for inversion, and the matrix method for inversion.
-For w=64, it's just Euclid. You can test these all with gf_mult and gf_div.
-Here are a few calls:
-UNIX> gf_mult 7 11 4                - Default
-4
-UNIX> gf_mult 7 11 4 SHIFT - -      - Use shift
-4
-UNIX> gf_mult 7 11 4 LOG - -        - Use logs
-4
-UNIX> gf_div 4 7 4                  - Default
-11
-UNIX> gf_div 4 7 4 LOG - -          - Use logs
-11
-UNIX> gf_div 4 7 4 LOG - EUCLID     - Use Euclid instead of logs
-11
-UNIX> gf_div 4 7 4 LOG - MATRIX     - Use Matrix inversion instead of logs
-11
-UNIX> gf_div 4 7 4 SHIFT - -        - Default
-11
-UNIX> gf_div 4 7 4 SHIFT - EUCLID   - Use Euclid (which is the default)
-11
-UNIX> gf_div 4 7 4 SHIFT - MATRIX   - Use Matrix inversion instead of logs
-11
-UNIX> gf_mult 200 211 8        - The remaining calls use shift/Euclid
-201
-UNIX> gf_div 201 211 8
-200
-UNIX> gf_mult 60000 65111 16
-63515
-UNIX> gf_div 63515 65111 16
-60000
-UNIX> gf_mult abcd0001 9afbf788 32h
-b0359681
-UNIX> gf_div b0359681 9afbf788 32h
-abcd0001
-UNIX> gf_mult abcd00018c8b8c8a 9afbf7887f6d8e5b 64h
-3a7def35185bd571
-UNIX> gf_div 3a7def35185bd571 9afbf7887f6d8e5b 64h
-abcd00018c8b8c8a
-UNIX> 
-
-You can see all the methods with gf_methods. We have a lot of implementing to do:
-UNIX> gf_methods
-To specify the methods, do one of the following: 
-       - leave empty to use defaults
-       - use a single dash to use defaults
-       - specify MULTIPLY REGION DIVIDE
-
-Legal values of MULTIPLY:
-       SHIFT: shift
-       GROUP g_mult g_reduce: the Group technique - see the paper
-       BYTWO_p: BYTWO doubling the product.
-       BYTWO_b: BYTWO doubling b (more efficient than BYTWO_p)
-       TABLE: Full multiplication table
-       LOG:   Discrete logs
-       LOG_ZERO: Discrete logs with a large table for zeros
-       SPLIT g_a g_b: Split tables defined by g_a and g_b
-       COMPOSITE k l [METHOD]: Composite field, recursively specify the
-                               method of the base field in GF(2^l)
-
-Legal values of REGION: Specify multiples with commas e.g. 'DOUBLE,LAZY'
-       -: Use defaults
-       SINGLE/DOUBLE/QUAD: Expand tables
-       LAZY: Lazily create table (only applies to TABLE and SPLIT)
-       SSE/NOSSE: Use 128-bit SSE instructions if you can
-       CAUCHY/ALTMAP/STDMAP: Use different memory mappings
-
-Legal values of DIVIDE:
-       -: Use defaults
-       MATRIX: Use matrix inversion
-       EUCLID: Use the extended Euclidean algorithm.
-
-See the user's manual for more information.
-There are many restrictions, so it is better to simply use defaults in most cases.
-UNIX> 
-

gf_unit and gf_time

-gf_unit.c is a unit tester, and
-gf_time.c is a time tester.
-They are called as follows:

-UNIX> gf_unit w tests seed [METHOD] 
-UNIX> gf_time w tests seed size(bytes) iterations [METHOD] 
-

-The tests parameter is one or more characters specifying which tests to run
-(for example, the AV used below).

-seed is a seed for srand48() -- using -1 defaults to the current time.

-For example, testing the defaults with w=4:

-UNIX> gf_unit 4 AV 1 LOG - -
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-Testing buffer-constant, src != dest, xor = 0
-Testing buffer-constant, src != dest, xor = 1
-Testing buffer-constant, src == dest, xor = 0
-Testing buffer-constant, src == dest, xor = 1
-UNIX> gf_unit 4 AV 1 SHIFT - -
-Seed: 1
-Testing single multiplications/divisions.
-Testing Inversions.
-No multiply_region.
-UNIX> 
-
-There is no multiply_region() method defined for SHIFT.
-Thus, the procedures are NULL and the unit tester ignores them.
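-Callers can rely on the same convention. A hedged sketch (I am assuming the
-region call's argument order is (gf, src, dest, val, bytes, xor); check gf.h
-for the real signature):
-
-if (gf.multiply_region.w4 != NULL) {
-  gf.multiply_region.w4(&gf, src, dest, val, bytes, 0);  /* xor = 0: overwrite dest */
-} else {
-  /* fall back to multiplying word-by-word with gf.multiply.w4() */
-}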

-At the moment, I only have the unit tester working for w=4.

-gf_time takes the size of an array (in bytes) and a number of iterations, and
-tests the speed of both single and region operations. The tests it runs are
-visible in the output below.

-Here are some examples with SHIFT and LOG on my mac.
-UNIX> gf_time 4 A 1 102400 1024 LOG - -
-Seed: 1
-Multiply:   0.538126 s      185.830 Mega-ops/s
-Divide:     0.520825 s      192.003 Mega-ops/s
-Inverse:    0.631198 s      158.429 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.478395 s      209.032 MB/s
-Buffer-Const,s!=d,xor=1:    0.524245 s      190.751 MB/s
-Buffer-Const,s==d,xor=0:    0.471851 s      211.931 MB/s
-Buffer-Const,s==d,xor=1:    0.528275 s      189.295 MB/s
-UNIX> gf_time 4 A 1 102400 1024 LOG - EUCLID
-Seed: 1
-Multiply:   0.555512 s      180.014 Mega-ops/s
-Divide:     5.359434 s       18.659 Mega-ops/s
-Inverse:    4.911719 s       20.359 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.496097 s      201.573 MB/s
-Buffer-Const,s!=d,xor=1:    0.538536 s      185.689 MB/s
-Buffer-Const,s==d,xor=0:    0.485564 s      205.946 MB/s
-Buffer-Const,s==d,xor=1:    0.540227 s      185.107 MB/s
-UNIX> gf_time 4 A 1 102400 1024 LOG - MATRIX
-Seed: 1
-Multiply:   0.544005 s      183.822 Mega-ops/s
-Divide:     7.602822 s       13.153 Mega-ops/s
-Inverse:    7.000564 s       14.285 Mega-ops/s
-Buffer-Const,s!=d,xor=0:    0.474868 s      210.585 MB/s
-Buffer-Const,s!=d,xor=1:    0.527588 s      189.542 MB/s
-Buffer-Const,s==d,xor=0:    0.473130 s      211.358 MB/s
-Buffer-Const,s==d,xor=1:    0.529877 s      188.723 MB/s
-UNIX> gf_time 4 A 1 102400 1024 SHIFT - -
-Seed: 1
-Multiply:   2.708842 s       36.916 Mega-ops/s
-Divide:     8.756882 s       11.420 Mega-ops/s
-Inverse:    5.695511 s       17.558 Mega-ops/s
-UNIX> 
-
-At the moment, I only have the timer working for w=4.

Walking you through LOG

-To see how scratch is used to store data, let's look at what happens when
-you call gf_init_easy(&gf, 4, GF_MULT_LOG_TABLE).
-First, gf_init_easy() calls gf_init_hard() with default parameters.
-This is in gf.c.

-gf_init_hard()'s first job is to set up the scratch.
-The scratch's type is gf_internal_t, defined in
-gf_int.h:

-typedef struct {
-  int mult_type;
-  int region_type;
-  int divide_type;
-  int w;
-  uint64_t prim_poly;
-  int free_me;
-  int arg1;
-  int arg2;
-  gf_t *base_gf;
-  void *private;
-} gf_internal_t;
-

-All the fields are straightforward, with the exception of private. That is
-a (void *) which points to the implementation's private data.
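-The access pattern, which appears verbatim in the routines below, is simply:
-
-gf_internal_t *h = (gf_internal_t *) gf->scratch;
-struct gf_logtable_data *ltd = (struct gf_logtable_data *) h->private;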

-Here's the code for gf_init_hard():

-int gf_init_hard(gf_t *gf, int w, int mult_type, 
-                        int region_type,
-                        int divide_type,
-                        uint64_t prim_poly,
-                        int arg1, int arg2,
-                        gf_t *base_gf,
-                        void *scratch_memory) 
-{
-  int sz;
-  gf_internal_t *h;
-
-
-  if (scratch_memory == NULL) {
-    sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
-    if (sz <= 0) return 0;
-    h = (gf_internal_t *) malloc(sz);
-    h->free_me = 1;
-  } else {
-    h = scratch_memory;
-    h->free_me = 0;
-  }
-  gf->scratch = (void *) h;
-  h->mult_type = mult_type;
-  h->region_type = region_type;
-  h->divide_type = divide_type;
-  h->w = w;
-  h->prim_poly = prim_poly;
-  h->arg1 = arg1;
-  h->arg2 = arg2;
-  h->base_gf = base_gf;
-  h->private = (void *) gf->scratch;
-  h->private += (sizeof(gf_internal_t));
-
-  switch(w) {
-    case 4: return gf_w4_init(gf);
-    case 8: return gf_w8_init(gf);
-    case 16: return gf_w16_init(gf);
-    case 32: return gf_w32_init(gf);
-    case 64: return gf_w64_init(gf);
-    case 128: return gf_dummy_init(gf);
-    default: return 0;
-  }
-}
-

-The first thing it does is determine if it has to allocate space for scratch.
-If it must, it uses gf_scratch_size() to figure out how big the space must be.
-It then sets gf->scratch to this space, and sets all of the fields of the
-scratch to the arguments of gf_init_hard(). The private pointer is
-set to the space just after the gf_internal_t struct at the front of
-gf->scratch. Again, it is up to
-gf_scratch_size() to make sure there is enough space for the scratch, and
-for all of the private data needed by the implementation.
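-In other words, the single allocation is laid out as pictured below; this is
-just a diagram of the pointer arithmetic above, not a new data structure:
-
-/*
- *  gf->scratch
- *  |
- *  v
- *  +-----------------+---------------------------------------+
- *  | gf_internal_t   | implementation's private data         |
- *  +-----------------+---------------------------------------+
- *                    ^
- *                    h->private = gf->scratch + sizeof(gf_internal_t)
- */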

-Once the scratch is set up, gf_init_hard() calls gf_w4_init(). This is
-in gf_w4.c, and it is a
-simple dispatcher to the various initialization routines, plus it
-sets EUCLID and MATRIX if need be:

-int gf_w4_init(gf_t *gf)
-{
-  gf_internal_t *h;
-
-  h = (gf_internal_t *) gf->scratch;
-  if (h->prim_poly == 0) h->prim_poly = 0x13;
-
-  gf->multiply.w4 = NULL;
-  gf->divide.w4 = NULL;
-  gf->inverse.w4 = NULL;
-  gf->multiply_region.w4 = NULL;
-
-  switch(h->mult_type) {
-    case GF_MULT_SHIFT:     if (gf_w4_shift_init(gf) == 0) return 0; break;
-    case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break;
-    case GF_MULT_DEFAULT:   if (gf_w4_log_init(gf) == 0) return 0; break;
-    default: return 0;
-  }
-  if (h->divide_type == GF_DIVIDE_EUCLID) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-    gf->inverse.w4 = gf_w4_euclid;
-  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-    gf->inverse.w4 = gf_w4_matrix;
-  }
-
-  if (gf->inverse.w4 != NULL && gf->divide.w4 == NULL) {
-    gf->divide.w4 = gf_w4_divide_from_inverse;
-  }
-  if (gf->inverse.w4 == NULL && gf->divide.w4 != NULL) {
-    gf->inverse.w4 = gf_w4_inverse_from_divide;
-  }
-  return 1;
-}
-

-The code in gf_w4_log_init() sets up the log and antilog tables, and sets
-the multiply.w4, divide.w4 etc. routines to be the ones for logs. The
-tables are put into gf->scratch->private, which is typecast to a struct
-gf_logtable_data *:

-struct gf_logtable_data {
-    gf_val_4_t      log_tbl[GF_FIELD_SIZE];
-    gf_val_4_t      antilog_tbl[GF_FIELD_SIZE * 2];
-    gf_val_4_t      *antilog_tbl_div;
-};
-.......
-
-static 
-int gf_w4_log_init(gf_t *gf)
-{
-  gf_internal_t *h;
-  struct gf_logtable_data *ltd;
-  int i, b;
-
-  h = (gf_internal_t *) gf->scratch;
-  ltd = h->private;
-
-  ltd->log_tbl[0] = 0;
-
-  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
-  b = 1;
-  for (i = 0; i < GF_FIELD_SIZE-1; i++) {
-      ltd->log_tbl[b] = (gf_val_8_t)i;
-      ltd->antilog_tbl[i] = (gf_val_8_t)b;
-      ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = (gf_val_8_t)b;
-      b <<= 1;
-      if (b & GF_FIELD_SIZE) {
-          b = b ^ h->prim_poly;
-      }
-  }
-    
-  gf->inverse.w4 = gf_w4_inverse_from_divide;
-  gf->divide.w4 = gf_w4_log_divide;
-  gf->multiply.w4 = gf_w4_log_multiply;
-  gf->multiply_region.w4 = gf_w4_log_multiply_region;
-  return 1;
-}
-

-And of course the individual routines use h->private to access the tables:

-static
-inline
-gf_val_8_t gf_w4_log_multiply (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
-  struct gf_logtable_data *ltd;
-    
-  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
-  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
-}
-
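-The division routine isn't shown, but the point of antilog_tbl_div (set above
-to the middle of the doubled antilog table) is that a negative difference of
-logs still indexes valid memory. Presumably it looks roughly like this sketch:
-
-static
-inline
-gf_val_8_t gf_w4_log_divide (gf_t *gf, gf_val_8_t a, gf_val_8_t b)
-{
-  struct gf_logtable_data *ltd;
-
-  ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
-  /* log(a) - log(b) may be negative; antilog_tbl_div absorbs the offset. */
-  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl_div[ltd->log_tbl[a] - ltd->log_tbl[b]];
-}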

-Finally, it's important that the proper sizes are put into
-gf_w4_scratch_size() for each implementation:

-int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
-{
-  int region_tbl_size;
-  switch(mult_type)
-  {
-    case GF_MULT_DEFAULT:
-    case GF_MULT_LOG_TABLE:
-      return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
-      break;
-    case GF_MULT_SHIFT:
-      return sizeof(gf_internal_t);
-      break;
-    default:
-      return -1;
-   }
-}
-

-I hope that's enough explanation for y'all to start implementing. Let me know
-if you have problems -- thanks -- Jim


-The initial structure has been set for w=4, 8, 16, 32 and 64, with
-implementations of SHIFT and EUCLID, and for w <= 32, MATRIX. There are
-some weird caveats, though.

Things we need to Implement: w=4

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Single TABLE Done - Jim
Double TABLE Done - Jim
Double TABLE, SSE Done - Jim
Quad TABLE Done - Jim
Lazy Quad TABLE Done - Jim
LOG Done - Jim


Things we need to Implement: w=8

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Single TABLE Done - Kevin
Double TABLE Done - Jim
Lazy Double TABLE Done - Jim
Split 2 1 (Half) SSE Done - Jim
Composite, k=2 Done - Kevin (alt mapping not passing unit test)
LOG Done - Kevin
LOG ZERO Done - Jim


Things we need to Implement: w=16

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Lazy TABLE Done - Jim
Split 4 16 No-SSE, lazy Done - Jim
Split 4 16 SSE, lazy Done - Jim
Split 4 16 SSE, lazy, alternate mapping Done - Jim
Split 8 16, lazy Done - Jim
Composite, k=2, stdmap recursive Done - Kevin
Composite, k=2, altmap recursive Done - Kevin
Composite, k=2, stdmap inline Done - Kevin
LOG Done - Kevin
LOG ZERO Done - Kevin
Group 4 4 Done - Jim: I don't see a reason to implement others, although 4-8 will be faster, and 8 8 will have faster region ops. They'll never beat SPLIT.


Things we need to Implement: w=32

SHIFT Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
BYTWO_p, SSE Done - Jim
BYTWO_b, SSE Done - Jim
Split 2 32,lazy Done - Jim
Split 2 32, SSE, lazy Done - Jim
Split 4 32, lazy Done - Jim
Split 4 32, SSE,ALTMAP lazy Done - Jim
Split 4 32, SSE, lazy Done - Jim
Split 8 8 Done - Jim
Group, g_s == g_r Done - Jim
Group, any g_s and g_r Done - Jim
Composite, k=2, stdmap recursive Done - Kevin
Composite, k=2, altmap recursive Done - Kevin
Composite, k=2, stdmap inline Done - Kevin


Things we need to Implement: w=64

SHIFT Done - Jim
BYTWO_p -
BYTWO_b -
BYTWO_p, SSE -
BYTWO_b, SSE -
Split 16 1 SSE, maybe lazy -
Split 8 1 lazy -
Split 8 8 -
Split 8 8 lazy -
Group -
Composite, k=2, alternate mapping -


Things we need to Implement: w=128

SHIFT Done - Will
BYTWO_p -
BYTWO_b -
BYTWO_p, SSE -
BYTWO_b, SSE -
Split 32 1 SSE, maybe lazy -
Split 16 1 lazy -
Split 16 16 - Maybe that's insanity -
Split 16 16 lazy -
Group (SSE) -
Composite, k=?, alternate mapping -


Things we need to Implement: w=general between 1 & 32

CAUCHY Region (SSE XOR) Done - Jim
SHIFT Done - Jim
TABLE Done - Jim
LOG Done - Jim
BYTWO_p Done - Jim
BYTWO_b Done - Jim
Group, g_s == g_r Done - Jim
Group, any g_s and g_r Done - Jim
Split - do we need it? Done - Jim
Composite - do we need it? -
Split - do we need it? -
Logzero? -

diff --git a/flag_tester/README.txt b/flag_tester/README.txt new file mode 100644 index 0000000..19101ff --- /dev/null +++ b/flag_tester/README.txt @@ -0,0 +1,10 @@ +Run which_compile_flags.sh and it will print out the compile flags to use in + GNUmakefile. By default, this script uses "cc" as its compiler but you can + pass in the name of your compiler as an argument. + +EXAMPLE: "./which_compile_flags.sh clang" + +This script will run "clang" in the above example so be warned that if you type +something like "rm" for that argument, you get what you asked for. Also, make +sure that the compiler that you pass to which_compile_flags.sh is the same as +the compiler in GNUmakefile. diff --git a/flag_tester/flag_test.c b/flag_tester/flag_test.c new file mode 100644 index 0000000..cecf472 --- /dev/null +++ b/flag_tester/flag_test.c @@ -0,0 +1,120 @@ +/* + * flag_test.c - copied from whats_my_sse.c to output proper compile + * flags for the GNUmakefile + * + */ + +#include +#include +#include +#include "intel_cpu_capabilities.h" + +void usage() +{ + fprintf(stderr, "usage: flag_test \n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + //make sure to extend these buffers if more flags are added to this program + char cflags[1000], ldflags[1000], buf[1000]; + FILE *file; + char sse_found = 0; + + if(argc != 2) + usage(); + + sprintf(cflags, "CFLAGS = -O3"); + sprintf(ldflags, "LDFLAGS = -O3"); + + if(cpu_has_feature(CPU_CAP_SSE42)) + { + sprintf(buf, "%s sse_test.c -o sse4 -msse4 -DSSE4 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("sse4", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./sse4 > temp.txt 2> /dev/null"); + system("diff sse4_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -msse4 -DINTEL_SSE4"); + strcat(ldflags, " -msse4"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_SSSE3) && !sse_found) + { + sprintf(buf, "%s sse_test.c -o ssse3 -mssse3 -DSSSE3 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("ssse3", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./ssse3 > temp.txt 2> /dev/null"); + system("diff ssse3_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -mssse3 -DINTEL_SSSE3"); + strcat(ldflags, " -mssse3"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_SSE2) && !sse_found) + { + sprintf(buf, "%s sse_test.c -o sse2 -msse2 -DSSE2 2> /dev/null", argv[1]); + system(buf); + if(file = fopen("sse2", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./sse2 > temp.txt 2> /dev/null"); + system("diff sse2_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -msse2 -DINTEL_SSE2"); + strcat(ldflags, " -msse2"); + sse_found = 1; + } + fclose(file); + } + } + + if(cpu_has_feature(CPU_CAP_PCLMULQDQ) && sse_found) + { + sprintf(buf, "%s pclmul_test.c -o pclmul -maes -mpclmul 2> /dev/null" + , argv[1]); + system(buf); + if(file = fopen("pclmul", "r")) + { + fclose(file); + + //run program and compare to the included output + system("./pclmul > temp.txt 2> /dev/null"); + system("diff pclmul_test.txt temp.txt > diff.txt 2> /dev/null"); + file = fopen("diff.txt", "r"); + if(fgetc(file) == EOF) + { + strcat(cflags, " -maes -mpclmul -DINTEL_PCLMUL"); + 
strcat(ldflags, " -maes -mpclmul"); + } + fclose(file); + } + } + + printf("%s\n%s\n", cflags, ldflags); +} diff --git a/intel_cpu_capabilities.h b/flag_tester/intel_cpu_capabilities.h similarity index 95% rename from intel_cpu_capabilities.h rename to flag_tester/intel_cpu_capabilities.h index 5fe0fea..6d1bbeb 100644 --- a/intel_cpu_capabilities.h +++ b/flag_tester/intel_cpu_capabilities.h @@ -16,7 +16,7 @@ #define CPU_CPSSE 0x2000 #define CPU_CAP_SSE3 (CPU_CPSSE | 0) #define CPU_CAP_PCLMULQDQ (CPU_CPSSE | 1) -#define CPU_CAP_SSSE3 (CPU_CPSSE | 10) +#define CPU_CAP_SSSE3 (CPU_CPSSE | 9) #define CPU_CAP_SSE41 (CPU_CPSSE | 19) #define CPU_CAP_SSE42 (CPU_CPSSE | 20) #define CPU_CAP_AVX (CPU_CPSSE | 28) @@ -25,7 +25,6 @@ __asm__ __volatile__ ("cpuid":\ "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func)); -inline int cpu_has_feature (unsigned which) { diff --git a/flag_tester/pclmul_test.c b/flag_tester/pclmul_test.c new file mode 100644 index 0000000..bdae184 --- /dev/null +++ b/flag_tester/pclmul_test.c @@ -0,0 +1,40 @@ +#include +#include +#include + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + + +int main() +{ + uint64_t answer; + uint32_t pp; + __m128i a, b, c; + + a = _mm_set1_epi8(0x0D); + b = _mm_set_epi32(0,0,0,0x0A); + pp = 0x13; + MM_PRINT8("a", a); + MM_PRINT8("b", b); + + c = _mm_clmulepi64_si128(a, b, 0); + MM_PRINT8("a clm b", c); + + a = _mm_set1_epi8(0xf0); + MM_PRINT8("a", a); + b = _mm_and_si128(a, c); + b = _mm_srli_epi64(b, 4); + MM_PRINT8("shifted", b); + + + a = _mm_set_epi32(0,0,0,pp); + MM_PRINT8("PP", a); + + b = _mm_clmulepi64_si128(a, b, 0); + MM_PRINT8("PP clm over", b); + + c = _mm_xor_si128(c,b); + MM_PRINT8("Answer", c); + //answer = _mm_extract_epi64(c, 0); + //printf("%llx\n", answer); +} diff --git a/flag_tester/pclmul_test.txt b/flag_tester/pclmul_test.txt new file mode 100644 index 0000000..6102f94 --- /dev/null +++ b/flag_tester/pclmul_test.txt @@ -0,0 +1,8 @@ +a 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d 0d +b 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0a +a clm b 00 00 00 00 00 00 00 00 72 72 72 72 72 72 72 72 +a f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +shifted 00 00 00 00 00 00 00 00 07 07 07 07 07 07 07 07 +PP 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 13 +PP clm over 00 00 00 00 00 00 00 00 79 79 79 79 79 79 79 79 +Answer 00 00 00 00 00 00 00 00 0b 0b 0b 0b 0b 0b 0b 0b diff --git a/flag_tester/sse2_test.txt b/flag_tester/sse2_test.txt new file mode 100644 index 0000000..f79b6e0 --- /dev/null +++ b/flag_tester/sse2_test.txt @@ -0,0 +1,30 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b 
epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 diff --git a/flag_tester/sse4_test.txt b/flag_tester/sse4_test.txt new file mode 100644 index 0000000..3f6d7ec --- /dev/null +++ b/flag_tester/sse4_test.txt @@ -0,0 +1,35 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +d insert32 @ 2 00 00 00 00 ab cd 12 34 00 00 00 00 00 00 00 00 +extract_epi32 @ 2: abcd1234 +d insert64 @ 0 00 00 00 00 ab cd 12 34 fe dc ba 12 91 82 73 64 +extract_epi64 @ 0: fedcba1291827364 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 +a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 diff --git a/flag_tester/sse_test.c b/flag_tester/sse_test.c new file mode 100644 index 0000000..e40cf25 --- /dev/null +++ b/flag_tester/sse_test.c @@ -0,0 +1,142 @@ +#ifdef SSE4 +#define SSSE3 +#include +#endif + +#ifdef SSSE3 +#define SSE2 +#include +#endif + +#ifdef SSE2 +#include +#endif + +#include +#include +#include + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-20s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; 
ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +int main() +{ + uint32_t u32; + uint64_t u64; + uint8_t *ui8 = malloc(20), i; + __m128i a, b, c, d; + + for(i=0; i < 20; i++) + ui8[i] = i; + + a = _mm_load_si128( (__m128i *) ui8 ); + b = _mm_loadu_si128( (__m128i *) (ui8+1)); + c = _mm_loadu_si128( (__m128i *) (ui8+2)); + d = _mm_loadu_si128( (__m128i *) (ui8+3)); + + MM_PRINT8("a", a); + MM_PRINT8("b", b); + MM_PRINT8("c", c); + MM_PRINT8("d", d); + + a = _mm_slli_epi16(a, 2); + b = _mm_slli_epi32(b, 2); + c = _mm_slli_epi64(c, 2); + d = _mm_slli_si128(d, 2); + + MM_PRINT8("a sl16", a); + MM_PRINT8("b sl32", b); + MM_PRINT8("c sl64", c); + MM_PRINT8("d sl128", d); + + a = _mm_srli_epi16(a, 2); + b = _mm_srli_epi32(b, 2); + c = _mm_srli_epi64(c, 2); + d = _mm_srli_si128(d, 2); + + MM_PRINT8("a sr16", a); + MM_PRINT8("b sr32", b); + MM_PRINT8("c sr64", c); + MM_PRINT8("d sr128", d); + + d = _mm_xor_si128(a, b); + MM_PRINT8("d = a^b", d); + + d = _mm_sub_epi8(a, b); + MM_PRINT8("d = a-b epi8", d); + + d = _mm_sub_epi16(a, b); + MM_PRINT8("d = a-b epi16", d); + + d = _mm_sub_epi32(a, b); + MM_PRINT8("d = a-b epi32", d); + + d = _mm_sub_epi64(a, b); + MM_PRINT8("d = a-b epi64", d); + + d = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + MM_PRINT8("d set_epi8", d); + + d = _mm_set_epi32(0x12345678, 0x9abcdef0, 0x12345678, 0x9abcdef0); + MM_PRINT8("d set_epi32", d); + + d = _mm_set1_epi64x(0xF0F0F0F0F0F0F0F0ULL); + MM_PRINT8("d set1_epi64", d); + + d = _mm_set1_epi32(0xe2e2e2e2); + MM_PRINT8("d set1_epi32", d); + + d = _mm_set1_epi16(0xaff3); + MM_PRINT8("d set1_epi16", d); + + d = _mm_set1_epi8(0xc5); + MM_PRINT8("d set1_epi8", d); + + d = _mm_packus_epi16(d, d); + MM_PRINT8("d packus_epi16(d,d)", d); + + c = _mm_unpackhi_epi8(a, d); + MM_PRINT8("c unpackhi(a,d)", c); + + b = _mm_unpacklo_epi8(c, a); + MM_PRINT8("b unpacklo(c,a)", b); + + d = _mm_and_si128(d, b); + MM_PRINT8("d and(d,b)", d); + + _mm_store_si128( (__m128i *) ui8, a); + printf("a stored to mem: "); + for(i=0; i < 16; i++) + printf("%u ", ui8[i]); + printf("\n"); + + d = _mm_setzero_si128(); + MM_PRINT8("d setzero", d); + + u32 = 0xABCD1234; + u64 = 0xFEDCBA1291827364ULL; + + #ifdef SSE4 + d = _mm_insert_epi32(d, u32, 2); + MM_PRINT8("d insert32 @ 2", d); + + u32 = 0; + u32 = _mm_extract_epi32(d, 2); + printf("extract_epi32 @ 2: %x\n", u32); + + d = _mm_insert_epi64(d, u64, 0); + MM_PRINT8("d insert64 @ 0", d); + + u64 = 0; + u64 = _mm_extract_epi64(d, 0); + printf("extract_epi64 @ 0: %" PRIx64 "\n", u64); + #endif + + c = _mm_set1_epi8(5); + MM_PRINT8("c", c); + + #ifdef SSSE3 + a = _mm_shuffle_epi8(b, c); + MM_PRINT8("a shuffle(b, c)", a); + #endif + +} diff --git a/flag_tester/ssse3_test.txt b/flag_tester/ssse3_test.txt new file mode 100644 index 0000000..17bee1a --- /dev/null +++ b/flag_tester/ssse3_test.txt @@ -0,0 +1,31 @@ +a 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +a sl16 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 00 +b sl32 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 04 +c sl64 44 40 3c 38 34 30 2c 28 24 20 1c 18 14 10 0c 08 +d sl128 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 00 00 +a sr16 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +b sr32 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 +c sr64 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 +d sr128 00 00 10 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 +d = 
a^b 1f 01 03 01 07 01 03 01 0f 01 03 01 07 01 03 01 +d = a-b epi8 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff +d = a-b epi16 fe ff fe ff fe ff fe ff fe ff fe ff fe ff fe ff +d = a-b epi32 fe fe fe ff fe fe fe ff fe fe fe ff fe fe fe ff +d = a-b epi64 fe fe fe fe fe fe fe ff fe fe fe fe fe fe fe ff +d set_epi8 0f 0e 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 +d set_epi32 12 34 56 78 9a bc de f0 12 34 56 78 9a bc de f0 +d set1_epi64 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 f0 +d set1_epi32 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 e2 +d set1_epi16 af f3 af f3 af f3 af f3 af f3 af f3 af f3 af f3 +d set1_epi8 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 c5 +d packus_epi16(d,d) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c unpackhi(a,d) 00 0f 00 0e 00 0d 00 0c 00 0b 00 0a 00 09 00 08 +b unpacklo(c,a) 07 00 06 0b 05 00 04 0a 03 00 02 09 01 00 00 08 +d and(d,b) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +a stored to mem: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +d setzero 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +c 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 05 +a shuffle(b, c) 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 diff --git a/whats_my_sse.c b/flag_tester/whats_my_sse.c similarity index 100% rename from whats_my_sse.c rename to flag_tester/whats_my_sse.c diff --git a/flag_tester/which_compile_flags.sh b/flag_tester/which_compile_flags.sh new file mode 100755 index 0000000..f39c609 --- /dev/null +++ b/flag_tester/which_compile_flags.sh @@ -0,0 +1,19 @@ +if [ -n "$1" ]; then + CC=$1 +else + CC=cc +fi + +$CC flag_test.c -o flag_test 2> /dev/null +if [ -e "flag_test" ]; then + OUTPUT=`./flag_test $CC 2> /dev/null` + if [ -n "$OUTPUT" ]; then + echo "$OUTPUT" + else + printf "CFLAGS = -O3\nLDFLAGS = -O3\n" + fi +else + printf "$CC failed to compile flag_test.c\n" +fi + +rm sse4 sse2 ssse3 pclmul diff.txt flag_test temp.txt 2> /dev/null diff --git a/gf.c b/gf.c index 4304e1d..b027473 100644 --- a/gf.c +++ b/gf.c @@ -8,6 +8,405 @@ #include #include +int _gf_errno = GF_E_DEFAULT; + +void gf_error() +{ + char *s; + + switch(_gf_errno) { + case GF_E_DEFAULT: s = "No Error."; break; + case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; + case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; + case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; + case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; + case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; + case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; + case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; + case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; + case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; + case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; + case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; + case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; + case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; + case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; + case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; + case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. 
(Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; + case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; + case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; + case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; + case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; + case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; + case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; + case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; + case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; + case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; + case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; + case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; + case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; + case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; + case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; + case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; + case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; + case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; + case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; + case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; + case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; + case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; + case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; + case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; + case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; + case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; + case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; + case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; + case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; + case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; + case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; + case GF_E_GR_SSE4: s = "With -m GROUP, w == 128, you need SSE4."; break; + case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; + case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; + case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; 
break; + case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break; + case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; + case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; + case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; + case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; + case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; + case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; + case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; + case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; + case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; + case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; + case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; + case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; + case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; + case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; + case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; + case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; + case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; + case GF_E_UNK_REG: s = "Unknown region type."; break; + case GF_E_UNK_DIV: s = "Unknown division type."; break; + default: s = "Undefined error."; + } + + fprintf(stderr, "%s\n", s); +} + +uint64_t gf_composite_get_default_poly(gf_t *base) +{ + gf_internal_t *h; + int rv; + + h = (gf_internal_t *) base->scratch; + if (h->w == 4) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x13) return 2; + return 0; + } + if (h->w == 8) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x11d) return 3; + return 0; + } + if (h->w == 16) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x105; + return 0; + } else { + if (h->prim_poly == 0x1100b) return 2; + if (h->prim_poly == 0x1002d) return 7; + return 0; + } + } + if (h->w == 32) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 2) return 0x10005; + if (rv == 7) return 0x10008; + if (rv == 0x105) return 0x10002; + return 0; + } else { + if (h->prim_poly == 0x400007) return 2; + if (h->prim_poly == 
0xc5) return 3; + return 0; + } + } + if (h->w == 64) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x100000009ULL; + if (rv == 2) return 0x100000004ULL; + if (rv == 0x10005) return 0x100000003ULL; + if (rv == 0x10002) return 0x100000005ULL; + if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, + but I want to differentiate cases). */ + return 0; + } else { + if (h->prim_poly == 0x1bULL) return 2; + return 0; + } + } + return 0; +} + +int gf_error_check(int w, int mult_type, int region_type, int divide_type, + int arg1, int arg2, uint64_t poly, gf_t *base) +{ + int sse4 = 0; + int sse3 = 0; + int sse2 = 0; + int pclmul = 0; + int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; + uint64_t pp; + gf_internal_t *sub, *subsub, *subsubsub; + + rdouble = (region_type & GF_REGION_DOUBLE_TABLE); + rquad = (region_type & GF_REGION_QUAD_TABLE); + rlazy = (region_type & GF_REGION_LAZY); + rsse = (region_type & GF_REGION_SSE); + rnosse = (region_type & GF_REGION_NOSSE); + raltmap = (region_type & GF_REGION_ALTMAP); + rcauchy = (region_type & GF_REGION_CAUCHY); + + if (divide_type != GF_DIVIDE_DEFAULT && + divide_type != GF_DIVIDE_MATRIX && + divide_type != GF_DIVIDE_EUCLID) { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + + tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | + GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); + if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } + +#ifdef INTEL_SSE2 + sse2 = 1; +#endif + +#ifdef INTEL_SSSE3 + sse3 = 1; +#endif + +#ifdef INTEL_SSE4 + sse4 = 1; +#endif + +#ifdef INTEL_PCLMUL + pclmul = 1; +#endif + + + if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } + + if (mult_type != GF_MULT_COMPOSITE && w < 64) { + if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } + } + + if (mult_type == GF_MULT_DEFAULT) { + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } + if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } + if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } + return 1; + } + + if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } + if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } + if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } + if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } + + if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && + mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG1SET; + return 0; + } + + if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG2SET; + return 0; + } + + if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } + + if (rdouble) { + if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } + if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } + if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } + return 1; + } + + if (rquad) { + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } + if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } + if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } + return 1; + } + 
+ if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } + + if (mult_type == GF_MULT_SHIFT) { + if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } + return 1; + } + + if (mult_type == GF_MULT_CARRY_FREE) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } + if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } + if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } + if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } + if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + + if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { + if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } + if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } + return 1; + } + + if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO + || mult_type == GF_MULT_LOG_ZERO_EXT ) { + if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } + + if (mult_type == GF_MULT_LOG_TABLE) return 1; + + if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } + + if (mult_type == GF_MULT_LOG_ZERO) return 1; + + if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } + return 1; + } + + if (mult_type == GF_MULT_GROUP) { + if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } + if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } + if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } + if (w == 128 && (arg1 != 4 || + (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } + if (w == 128 && !sse4) { _gf_errno = GF_E_GR_SSE4; return 0; } + if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } + if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } + if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } + return 1; + } + + if (mult_type == GF_MULT_TABLE) { + if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } + if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } + return 1; + } + + if (mult_type == GF_MULT_SPLIT_TABLE) { + if (arg1 > arg2) { + tmp = arg1; + arg1 = arg2; + arg2 = tmp; + } + if (w == 8) { + if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } + } else if (w == 16) { + if (arg1 == 4 && arg2 == 16) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + } else if (arg1 == 8 && (arg2 == 16 || arg2 == 8)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } + } else { _gf_errno = GF_E_SP_16AR; return 0; } + } else if (w == 32) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 32) || + (arg1 == 16 && arg2 == 32)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } + } else if ((arg1 == 4 && arg2 == 32) || + (arg1 == 4 && 
arg2 == 32)) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && arg1 != 4) { _gf_errno = GF_E_SP_32_A; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } + } else { _gf_errno = GF_E_SP_32AR; return 0; } + } else if (w == 64) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 64) || + (arg1 == 16 && arg2 == 64)) { + if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } + } else if (arg1 == 4 && arg2 == 64) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } + } else { _gf_errno = GF_E_SP_64AR; return 0; } + } else if (w == 128) { + if (arg1 == 8 && arg2 == 128) { + if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } + } else if (arg1 == 4 && arg2 == 128) { + if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } + if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } + if (!raltmap && rsse) { _gf_errno = GF_E_SP128AL; return 0; } + } else { _gf_errno = GF_E_SP128AR; return 0; } + } else { _gf_errno = GF_E_SPLIT_W; return 0; } + return 1; + } + + if (mult_type == GF_MULT_COMPOSITE) { + if (w != 8 && w != 16 && w != 32 + && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } + if ((poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } + if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } + if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } + if (base != NULL) { + sub = (gf_internal_t *) base->scratch; + if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } + if (poly == 0) { + if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } + } + } + return 1; + } + + _gf_errno = GF_E_UNKNOWN; + return 0; +} + int gf_scratch_size(int w, int mult_type, int region_type, @@ -15,6 +414,8 @@ int gf_scratch_size(int w, int arg1, int arg2) { + if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; + switch(w) { case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); @@ -26,16 +427,31 @@ int gf_scratch_size(int w, } } -int gf_dummy_init(gf_t *gf) +extern int gf_size(gf_t *gf) { - return 0; + gf_internal_t *h; + int s; + + s = sizeof(gf_t); + h = (gf_internal_t *) gf->scratch; + s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); + if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf); + return s; } + int gf_init_easy(gf_t *gf, int w) { - return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL); + return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL); } +/* Allen: What's going on here is this function is putting info into the + scratch mem of gf, and then calling the relevant REAL init + func for the word size. Probably done this way to consolidate + those aspects of initialization that don't rely on word size, + and then take care of word-size-specific stuff. 
*/ + int gf_init_hard(gf_t *gf, int w, int mult_type, int region_type, int divide_type, @@ -46,11 +462,14 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, { int sz; gf_internal_t *h; - + + if (gf_error_check(w, mult_type, region_type, divide_type, + arg1, arg2, prim_poly, base_gf) == 0) return 0; + sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); - - if (sz <= 0) return 0; - + if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught + in gf_error_check() */ + if (scratch_memory == NULL) { h = (gf_internal_t *) malloc(sz); h->free_me = 1; @@ -71,8 +490,6 @@ int gf_init_hard(gf_t *gf, int w, int mult_type, h->private += (sizeof(gf_internal_t)); gf->extract_word.w32 = NULL; - //printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type); - switch(w) { case 4: return gf_w4_init(gf); case 8: return gf_w8_init(gf); @@ -94,6 +511,7 @@ int gf_free(gf_t *gf, int recursive) free(h->base_gf); } if (h->free_me) free(h); + return 0; /* Making compiler happy */ } void gf_alignment_error(char *s, int a) @@ -105,9 +523,9 @@ void gf_alignment_error(char *s, int a) } static -void gf_invert_binary_matrix(int *mat, int *inv, int rows) { +void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { int cols, i, j, k; - int tmp; + uint32_t tmp; cols = rows; @@ -172,34 +590,6 @@ uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) return inv[0]; } -/* -void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) -{ - uint64_t p, ta, shift, tb; - uint64_t *s64, *d64 - - s64 = rd->s_start; - d64 = rd->d_start; - - while (s64 < (uint64_t *) rd->s_top) { - p = (rd->xor) ? *d64 : 0; - ta = *s64; - - shift = 0; - while (ta != 0) { - tb = base[ta&0xffff]; - p ^= (tb << shift); - ta >>= 16; - shift += 16; - } - - *d64 = p; - d64++; - s64++; - } -} -*/ - void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) { uint64_t a, prod; @@ -226,8 +616,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) prod ^= base[a >> 48]; prod ^= *d64; *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } } else { while (d64 != top) { @@ -243,8 +633,8 @@ void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) prod <<= 16; prod ^= base[a >> 48]; *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } } } @@ -307,9 +697,71 @@ static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, v } } -/* If align>16, you align to 16 bytes, but make sure that within the aligned region bytes is a multiple of align. However, you make sure that the region itself is a multiple of align. +/* JSP - The purpose of this procedure is to error check alignment, + and to set up the region operation so that it can best leverage + large words. - If align = -1, then this is cauchy. You need to make sure that bytes is a multiple of w. */ + It stores its information in rd. + + Assuming you're not doing Cauchy coding, (see below for that), + then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably + should change that). + + src and dest must then be aligned on ceil(w/8)-byte boundaries. + Moreover, bytes must be a multiple of ceil(w/8). If the variable + align is equal to ceil(w/8), then we will set s_start = src, + d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). + And we return -- the implementation will go ahead and do the + multiplication on individual words (e.g. using discrete logs). 
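+
+   (A worked instance of that simplest case: w = 16, so ceil(w/8) = 2.
+   src, dest and bytes must all be multiples of 2, and with align = 2 we
+   get s_start = src, d_start = dest and s_top = src + bytes, so the
+   implementation just multiplies one 16-bit word at a time.)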
+
+   If align is greater than ceil(w/8), then the implementation needs
+   to work on groups of "align" bytes.  For example, suppose you are
+   implementing BYTWO, without SSE.  Then you will be doing the region
+   multiplication in units of 8 bytes, so align = 8.  Or, suppose you
+   are doing a Quad table in GF(2^4).  You will be doing the region
+   multiplication in units of 2 bytes, so align = 2.  Or, suppose you
+   are doing split multiplication with SSE operations in GF(2^8).
+   Then align = 16.  Worse yet, suppose you are doing split
+   multiplication with SSE operations in GF(2^16), with or without
+   ALTMAP.  Then, you will be doing the multiplication on 256 bits at
+   a time.  So align = 32.
+
+   When align does not equal ceil(w/8), we split the region
+   multiplication into three parts.  We are going to make s_start be
+   the first address greater than or equal to src that is a multiple
+   of align.  s_top is going to be the largest address <= src+bytes
+   such that (s_top - s_start) is a multiple of align.  We do the
+   same with d_start and d_top.  When we say that "src and dest must
+   be aligned with respect to each other," we mean that s_start-src
+   must equal d_start-dest.
+
+   Now, the region multiplication is done in three parts -- the part
+   between src and s_start must be done using single words.
+   Similarly, the part between s_top and src+bytes must also be done
+   using single words.  The part between s_start and s_top will be
+   done in chunks of "align" bytes.
+
+   One final thing -- if align > 16, then s_start and d_start will be
+   aligned on a 16 byte boundary.  Perhaps we should have two
+   variables: align and chunksize.  Then we'd have s_start & d_start
+   aligned to "align", and have s_top-s_start be a multiple of
+   chunksize.  That may be less confusing, but it would be a big
+   change.
+
+   Finally, if align = -1, then we are doing Cauchy multiplication,
+   using only XOR's.  In this case, we're not going to care about
+   alignment because we are just doing XOR's.  Instead, the only
+   thing we care about is that bytes must be a multiple of w.
+
+   This is not to say that alignment doesn't matter in performance
+   with XOR's.  See that discussion in gf_multby_one().
+
+   After you call gf_set_region_data(), the procedure
+   gf_do_initial_region_alignment() calls gf->multiply.w32() on
+   everything between src and s_start.  The procedure
+   gf_do_final_region_alignment() calls gf->multiply.w32() on
+   everything between s_top and src+bytes.
+ */

void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
@@ -326,7 +778,7 @@ void gf_set_region_data(gf_region_data *rd,
   uint32_t a;
   unsigned long uls, uld;

-  if (gf == NULL) {
+  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
     wb = 1;
   } else {
     h = gf->scratch;
@@ -347,7 +799,7 @@ void gf_set_region_data(gf_region_data *rd,

   a = (align <= 16) ? align : 16;

-  if (align == -1) { /* This is cauchy.  Error check bytes, then set up the pointers
+  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
                         so that there are no alignment regions.
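+                        (That is, s_start = src, d_start = dest and
+                        s_top = src + bytes; the XOR-only Cauchy code then
+                        walks the whole region with no single-word cleanup.)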
*/ if (bytes % h->w != 0) { fprintf(stderr, "Error in region multiply operation.\n"); @@ -386,14 +838,14 @@ void gf_set_region_data(gf_region_data *rd, } uls %= a; - if (uls != 0) uls = (align-uls); + if (uls != 0) uls = (a-uls); rd->s_start = rd->src + uls; rd->d_start = rd->dest + uls; bytes -= uls; - bytes -= (bytes % align); rd->s_top = rd->s_start + bytes; rd->d_top = rd->d_start + bytes; + } void gf_do_initial_region_alignment(gf_region_data *rd) @@ -413,25 +865,76 @@ void gf_multby_zero(void *dest, int bytes, int xor) return; } +/* JSP - gf_multby_one tries to do this in the most efficient way + possible. If xor = 0, then simply call memcpy() since that + should be optimized by the system. Otherwise, try to do the xor + in the following order: + + If src and dest are aligned with respect to each other on 16-byte + boundaries and you have SSE instructions, then use aligned SSE + instructions. + + If they aren't but you still have SSE instructions, use unaligned + SSE instructions. + + If there are no SSE instructions, but they are aligned with + respect to each other on 8-byte boundaries, then do them with + uint64_t's. + + Otherwise, call gf_unaligned_xor(), which does the following: + align a destination pointer along an 8-byte boundary, and then + memcpy 32 bytes at a time from the src pointer to an array of + doubles. I'm not sure if that's the best -- probably needs + testing, but this seems like it could be a black hole. + */ + +static void gf_unaligned_xor(void *src, void *dest, int bytes); + void gf_multby_one(void *src, void *dest, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 __m128i ms, md; #endif + unsigned long uls, uld; uint8_t *s8, *d8, *dtop8; uint64_t *s64, *d64, *dtop64; int abytes; - gf_region_data rd; + if (!xor) { memcpy(dest, src, bytes); return; } + uls = (unsigned long) src; + uld = (unsigned long) dest; -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 s8 = (uint8_t *) src; d8 = (uint8_t *) dest; - abytes = bytes & 0xfffffff0; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; + } + + abytes = (bytes & 0xfffffff0); while (d8 < (uint8_t *) dest + abytes) { ms = _mm_loadu_si128 ((__m128i *)(s8)); @@ -449,8 +952,11 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) return; #endif - /* If you don't have SSE, you'd better be aligned..... */ - + if (uls % 8 != uld % 8) { + gf_unaligned_xor(src, dest, bytes); + return; + } + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); s8 = (uint8_t *) src; d8 = (uint8_t *) dest; @@ -480,3 +986,47 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) } return; } + +#define UNALIGNED_BUFSIZE (8) + +static void gf_unaligned_xor(void *src, void *dest, int bytes) +{ + uint64_t scopy[UNALIGNED_BUFSIZE], *d64; + int i; + gf_region_data rd; + uint8_t *s8, *d8; + + /* JSP - call gf_set_region_data(), but use dest in both places. This is + because I only want to set up dest. If I used src, gf_set_region_data() + would fail because src and dest are not aligned to each other wrt + 8-byte pointers. I know this will actually align d_start to 16 bytes. 
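+     In practice that is harmless here: the byte loop below simply does a
+     few extra single-byte XOR's before d8 reaches rd.d_start.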
+ If I change gf_set_region_data() to split alignment & chunksize, then + I could do this correctly. */ + + gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + while (d8 < (uint8_t *) rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + + d64 = (uint64_t *) d8; + while (d64 < (uint64_t *) rd.d_top) { + memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); + s8 += 8*UNALIGNED_BUFSIZE; + for (i = 0; i < UNALIGNED_BUFSIZE; i++) { + *d64 ^= scopy[i]; + d64++; + } + } + + d8 = (uint8_t *) d64; + while (d8 < (uint8_t *) (dest+bytes)) { + *d8 ^= *s8; + d8++; + s8++; + } +} diff --git a/gf_54.c b/gf_54.c deleted file mode 100644 index fc37783..0000000 --- a/gf_54.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Multiplies four and five in GF(2^4). - */ - -#include -#include -#include - -#include "gf_complete.h" - -main() -{ - gf_t gf; - void *scratch; - int size; - - size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE, - GF_REGION_SSE | GF_REGION_ALTMAP, - GF_DIVIDE_DEFAULT, - 16, 4); - if (size == -1) exit(1); /* It failed. That shouldn't happen*/ - scratch = (void *) malloc(size); - if (scratch == NULL) { perror("malloc"); exit(1); } - if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, - GF_REGION_SSE | GF_REGION_ALTMAP, - GF_DIVIDE_DEFAULT, - 0, 16, 4, NULL, scratch)) exit(1); - printf("Yo\n"); -} diff --git a/gf_add.c b/gf_add.c index 78d443f..545b4b7 100644 --- a/gf_add.c +++ b/gf_add.c @@ -16,7 +16,7 @@ void usage(char *s) fprintf(stderr, " If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n"); fprintf(stderr, "\n"); fprintf(stderr, " legal w are: 1-32, 64 and 128\n"); - fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); if (s != NULL) fprintf(stderr, "%s", s); exit(1); diff --git a/gf_complete.h b/gf_complete.h index ac6688e..de3b753 100644 --- a/gf_complete.h +++ b/gf_complete.h @@ -4,22 +4,30 @@ #pragma once #include -#ifdef INTEL_SSE4 -#include -#include -#include +#ifdef INTEL_SSE4 + #define INTEL_SSSE3 + #include #endif -#ifdef INTEL_PCLMUL -#include +#ifdef INTEL_SSSE3 + #define INTEL_SSE2 + #include #endif -/* This does either memcpy or xor, depending on "xor" */ +#ifdef INTEL_SSE2 + #include +#endif -extern void gf_multby_one(void *src, void *dest, int bytes, int xor); +#ifdef INTEL_PCLMUL + #include + #ifdef INTEL_SSE4 + #define INTEL_SSE4_PCLMUL + #endif + #ifdef INTEL_SSSE3 + #define INTEL_SSSE3_PCLMUL + #endif +#endif -#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0) -#define GF_W128_EQUAL(val1, val2) ((val1[0] == val2[0]) && (val1[1] == val2[1])) /* These are the different ways to perform multiplication. Not all are implemented for all values of w. @@ -27,30 +35,30 @@ extern void gf_multby_one(void *src, void *dest, int bytes, int xor); typedef enum {GF_MULT_DEFAULT, GF_MULT_SHIFT, + GF_MULT_CARRY_FREE, GF_MULT_GROUP, GF_MULT_BYTWO_p, GF_MULT_BYTWO_b, GF_MULT_TABLE, GF_MULT_LOG_TABLE, + GF_MULT_LOG_ZERO, + GF_MULT_LOG_ZERO_EXT, GF_MULT_SPLIT_TABLE, GF_MULT_COMPOSITE } gf_mult_type_t; /* These are the different ways to optimize region - operations. They are bits because you can compose them: - You can mix SINGLE/DOUBLE/QUAD, LAZY, SSE/NOSSE, STDMAP/ALTMAP/CAUCHY. + operations. They are bits because you can compose them. Certain optimizations only apply to certain gf_mult_type_t's. 
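+   For example, one legal composition (it is the one the deleted gf_54.c
+   used, and gf_error_check() in gf.c still accepts it) is:
+
+     GF_REGION_SSE | GF_REGION_ALTMAP    /* with w=16, SPLIT 16 4 */
+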
Again, please see documentation for how to use these */

#define GF_REGION_DEFAULT      (0x0)
-#define GF_REGION_SINGLE_TABLE (0x1)
-#define GF_REGION_DOUBLE_TABLE (0x2)
-#define GF_REGION_QUAD_TABLE   (0x4)
-#define GF_REGION_LAZY         (0x8)
-#define GF_REGION_SSE          (0x10)
-#define GF_REGION_NOSSE        (0x20)
-#define GF_REGION_STDMAP       (0x40)
-#define GF_REGION_ALTMAP       (0x80)
-#define GF_REGION_CAUCHY       (0x100)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE   (0x2)
+#define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSSE        (0x10)
+#define GF_REGION_ALTMAP       (0x20)
+#define GF_REGION_CAUCHY       (0x40)

typedef uint32_t gf_region_type_t;
@@ -74,6 +82,9 @@
typedef uint32_t    gf_val_32_t;
typedef uint64_t    gf_val_64_t;
typedef uint64_t   *gf_val_128_t;

+extern int _gf_errno;
+extern void gf_error();
+
typedef struct gf *GFP;

typedef union gf_func_a_b {
@@ -109,8 +120,21 @@ typedef struct gf {
   void *scratch;
 } gf_t;

+/* Initializes the GF to defaults.  Pass it a pointer to a gf_t.
+   Returns 0 on failure, 1 on success. */
+
extern int gf_init_easy(GFP gf, int w);

+/* Initializes the GF changing the defaults.
+   Returns 0 on failure, 1 on success.
+   Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use one of the gf_mult_type_t /
+   gf_divide_type_t values.
+   For region_type, OR together the GF_REGION_xxx's defined above.
+   Use 0 as prim_poly for defaults.  Otherwise, the leading 1 is optional.
+   Use NULL for scratch_memory to have init_hard allocate memory.  Otherwise,
+   use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
extern int gf_init_hard(GFP gf,
                        int w,
                        int mult_type,
@@ -122,6 +146,9 @@ extern int gf_init_hard(GFP gf,
                        GFP base_gf,
                        void *scratch_memory);

+/* Determines the size for scratch_memory.
+   Returns 0 on failure and non-zero on success. */
+
extern int gf_scratch_size(int w,
                           int mult_type,
                           int region_type,
@@ -129,25 +156,32 @@ extern int gf_scratch_size(int w,
                           int arg1,
                           int arg2);

+/* This reports the gf_scratch_size of a gf_t that has already been created */
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+   If recursive = 1, then it calls itself recursively on base_gf. */
+
extern int gf_free(GFP gf, int recursive);

/* This is support for inline single multiplications and divisions.
   I know it's yucky, but if you've got to be fast, you've got to be
-  fast.  We'll support inlines for w=4, w=8 and w=16.
+  fast.  We support inlining for w=4, w=8 and w=16.

   To use inline multiplication and division with w=4 or 8, you should use the
   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
-  will return NULL. */
+  will return NULL.
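+
+  A sketch of the intended use for w=4, with names from this header
+  (it assumes gf was built with a single-table method):
+
+    uint8_t *mt = gf_w4_get_mult_table(&gf);
+    if (mt != NULL) prod = GF_W4_INLINE_MULTDIV(mt, a, b);
+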
Similarly, with w=16, the gf_t must be LOG */ uint8_t *gf_w4_get_mult_table(GFP gf); uint8_t *gf_w4_get_div_table(GFP gf); -#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|b]) +#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)]) uint8_t *gf_w8_get_mult_table(GFP gf); uint8_t *gf_w8_get_div_table(GFP gf); -#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) a)<<8)|b]) +#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)]) uint16_t *gf_w16_get_log_table(GFP gf); uint16_t *gf_w16_get_mult_alog_table(GFP gf); diff --git a/gf_example_5.c b/gf_example_5.c new file mode 100644 index 0000000..3e303a3 --- /dev/null +++ b/gf_example_5.c @@ -0,0 +1,73 @@ +/* + * gf_example_5.c + * + * Demonstrating altmap and extract_word + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_5\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint16_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 16, 4, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint16_t *) malloc(200); + b = (uint16_t *) malloc(200); + + a += 6; + b += 6; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1); + + gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %4d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %04x", a[i+j]); + printf("\n"); + + printf("b:"); + for (j = 0; j < 10; j++) printf(" %04x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x ", i, + gf.extract_word.w32(&gf, a, 30*2, i), + gf.extract_word.w32(&gf, b, 30*2, i)); + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15, + gf.extract_word.w32(&gf, a, 30*2, i+15), + gf.extract_word.w32(&gf, b, 30*2, i+15)); + } +} diff --git a/gf_example_6.c b/gf_example_6.c new file mode 100644 index 0000000..86dda11 --- /dev/null +++ b/gf_example_6.c @@ -0,0 +1,79 @@ +/* + * gf_example_6.c + * + * Demonstrating altmap and extract_word + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_6\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t *a, *b; + int i, j; + gf_t gf, gf_16; + + if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard (6) failed\n"); + exit(1); + } + + if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 2, 0, &gf_16, NULL) == 0) { + fprintf(stderr, "gf_init_hard (32) failed\n"); + exit(1); + } + + a = (uint32_t *) malloc(200); + b = (uint32_t *) malloc(200); + + a += 3; + b += 3; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1); + + gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %8d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %08x", a[i+j]); + printf("\n"); + + 
printf("b:"); + for (j = 0; j < 10; j++) printf(" %08x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x ", i, + gf.extract_word.w32(&gf, a, 30*4, i), + gf.extract_word.w32(&gf, b, 30*4, i)); + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15, + gf.extract_word.w32(&gf, a, 30*4, i+15), + gf.extract_word.w32(&gf, b, 30*4, i+15)); + } +} diff --git a/gf_example_7.c b/gf_example_7.c new file mode 100644 index 0000000..445ae20 --- /dev/null +++ b/gf_example_7.c @@ -0,0 +1,70 @@ +/* + * gf_example_7.c + * + * Demonstrating extract_word and Cauchy + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_7\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint8_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint8_t *) malloc(3); + b = (uint8_t *) malloc(3); + + MOA_Seed(0); + + for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1); + + gf.multiply_region.w32(&gf, a, b, 5, 3, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + printf("\n"); + printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]); + printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]); + printf("\n"); + + printf("a bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("b bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("\n"); + for (i = 0; i < 8; i++) { + printf("Word %2d: %d * 5 = %d\n", i, + gf.extract_word.w32(&gf, a, 3, i), + gf.extract_word.w32(&gf, b, 3, i)); + } +} diff --git a/gf_general.c b/gf_general.c index ac0c236..02efdc7 100644 --- a/gf_general.c +++ b/gf_general.c @@ -95,12 +95,20 @@ void gf_general_set_random(gf_general_t *v, int w, int zero_ok) } } -void gf_general_val_to_s(gf_general_t *v, int w, char *s) +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex) { if (w <= 32) { - sprintf(s, "%x", v->w32); + if (hex) { + sprintf(s, "%x", v->w32); + } else { + sprintf(s, "%d", v->w32); + } } else if (w <= 64) { - sprintf(s, "%llx", (long long unsigned int) v->w64); + if (hex) { + sprintf(s, "%llx", (long long unsigned int) v->w64); + } else { + sprintf(s, "%lld", (long long unsigned int) v->w64); + } } else { if (v->w128[0] == 0) { sprintf(s, "%llx", (long long unsigned int) v->w128[1]); @@ -111,6 +119,64 @@ void gf_general_val_to_s(gf_general_t *v, int w, char *s) } } +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex) +{ + int l; + int save; + + if (w <= 32) { + if (hex) { + if (sscanf(s, "%x", &(v->w32)) == 0) return 0; + } else { + if (sscanf(s, "%d", &(v->w32)) == 0) return 0; + } + if (w == 32) return 1; + if (w == 31) { + if (v->w32 & (1 << 31)) return 0; + return 1; + } + if (v->w32 & ~((1 << w)-1)) return 0; + return 1; + } else if (w <= 64) { + if (hex) return (sscanf(s, "%llx", &(v->w64)) == 1); + return (sscanf(s, "%lld", &(v->w64)) == 1); + } else { + if (!hex) return 0; + l = strlen(s); + if (l <= 16) { + v->w128[0] = 0; + return (sscanf(s, "%llx", &(v->w128[1])) == 1); + } else { + if (l > 32) return 0; + save = s[l-16]; + s[l-16] = '\0'; + if (sscanf(s, "%llx", 
&(v->w128[0])) == 0) { + s[l-16] = save; + return 0; + } + return (sscanf(s+(l-16), "%llx", &(v->w128[1])) == 1); + } + } +} + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = a->w32 ^ b->w32; + } else if (w <= 64) { + c->w64 = a->w64 ^ b->w64; + } else { + c->w128[0] = a->w128[0] ^ b->w128[0]; + c->w128[1] = a->w128[1] ^ b->w128[1]; + } +} + void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) { gf_internal_t *h; @@ -229,19 +295,19 @@ void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *o if (!gf_general_are_equal(&ft, &sb, w)) { - printf("Problem with region multiply (all values in hex):\n"); - printf(" Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n", + fprintf(stderr,"Problem with region multiply (all values in hex):\n"); + fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. Xor: %d\n", (unsigned long) final_target, i, words, xor); - gf_general_val_to_s(a, w, sa); - gf_general_val_to_s(&oa, w, soa); - gf_general_val_to_s(&ot, w, sot); - gf_general_val_to_s(&ft, w, sft); - gf_general_val_to_s(&sb, w, ssb); - printf(" Value: %s\n", sa); - printf(" Original source word: %s\n", soa); - if (xor) printf(" XOR with target word: %s\n", sot); - printf(" Product word: %s\n", sft); - printf(" It should be: %s\n", ssb); + gf_general_val_to_s(a, w, sa, 1); + gf_general_val_to_s(&oa, w, soa, 1); + gf_general_val_to_s(&ot, w, sot, 1); + gf_general_val_to_s(&ft, w, sft, 1); + gf_general_val_to_s(&sb, w, ssb, 1); + fprintf(stderr," Value: %s\n", sa); + fprintf(stderr," Original source word: %s\n", soa); + if (xor) fprintf(stderr," XOR with target word: %s\n", sot); + fprintf(stderr," Product word: %s\n", sft); + fprintf(stderr," It should be: %s\n", ssb); exit(0); } } @@ -251,7 +317,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) { void *top; gf_general_t g; - uint8_t *r8; + uint8_t *r8, *r8a; uint16_t *r16; uint32_t *r32; uint64_t *r64; @@ -263,6 +329,8 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) However, don't allow for zeros in rb, because that will screw up division. + When w is 4, you fill the regions with random 4-bit words in each byte. + Otherwise, treat every four bytes as an uint32_t and fill it with a random value mod (1 << w). */ @@ -296,6 +364,17 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) } rb += (w/8); } + } else if (w == 4) { + r8a = (uint8_t *) ra; + r8 = (uint8_t *) rb; + while (r8 < (uint8_t *) top) { + gf_general_set_random(&g, w, 1); + *r8a = g.w32; + gf_general_set_random(&g, w, 0); + *r8 = g.w32; + r8a++; + r8++; + } } else { r32 = (uint32_t *) ra; for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1); @@ -306,7 +385,7 @@ void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) /* This sucks, but in order to time, you really need to avoid putting ifs in the inner loops. So, I'm doing a separate timing test for each w: - 8, 16, 32, 64, 128 and everything else. Fortunately, the "everything else" + (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else" tests can be equivalent to w=32. 
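+   Each specialized loop has the same shape; roughly, for the w=4/8 case
+   (the others differ only in pointer and value widths):
+
+     while (r8a < top8) {
+       *r8a = gf->multiply.w32(gf, *r8a, *r8b);
+       r8a++; r8b++;
+     }
+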
I'm also putting the results back into ra, because otherwise, the optimizer might @@ -327,7 +406,7 @@ int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, cha w = h->w; top = ra + size; - if (w == 8) { + if (w == 8 || w == 4) { r8a = (uint8_t *) ra; r8b = (uint8_t *) rb; top8 = (uint8_t *) top; diff --git a/gf_general.h b/gf_general.h index 0848f36..b257348 100644 --- a/gf_general.h +++ b/gf_general.h @@ -32,10 +32,12 @@ int gf_general_is_zero(gf_general_t *v, int w); int gf_general_is_one(gf_general_t *v, int w); int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); -void gf_general_val_to_s(gf_general_t *v, int w, char *s); +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); void gf_general_set_random(gf_general_t *v, int w, int zero_ok); +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); diff --git a/gf_inline_time.c b/gf_inline_time.c index d52c814..55709cd 100644 --- a/gf_inline_time.c +++ b/gf_inline_time.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "gf_complete.h" #include "gf_rand.h" diff --git a/gf_int.h b/gf_int.h index bd544bc..bdff2a2 100644 --- a/gf_int.h +++ b/gf_int.h @@ -51,11 +51,15 @@ extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divid void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); - extern void gf_alignment_error(char *s, int a); extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); +/* This returns the correct default for prim_poly when base is used as the base + field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ + +extern uint64_t gf_composite_get_default_poly(gf_t *base); + /* This structure lets you define a region multiply. It helps because you can handle unaligned portions of the data with the procedures below, which really cleans up the code. 
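+
+   The usual pattern inside a region-multiply implementation is (a sketch,
+   using the helpers declared below):
+
+     gf_region_data rd;
+     gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+     gf_do_initial_region_alignment(&rd);
+     ... main loop over rd.s_start .. rd.s_top, 8 bytes at a time ...
+     gf_do_final_region_alignment(&rd);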
*/ @@ -96,3 +100,97 @@ extern void gf_do_final_region_alignment(gf_region_data *rd); extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); extern void gf_multby_zero(void *dest, int bytes, int xor); +extern void gf_multby_one(void *src, void *dest, int bytes, int xor); + +typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ + GF_E_MDEFREG, /* Reg != Default && Mult == Default */ + GF_E_MDEFARG, /* Args != Default && Mult == Default */ + GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ + GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ + GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ + GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ + GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ + GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ + GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ + GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ + GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ + GF_E_BAD___W, /* Illegal w */ + GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ + GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ + GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ + GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ + GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ + GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ + GF_E_QUAD__J, /* Reg == QUAD && other Reg */ + GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ + GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ + GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ + GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ + GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ + GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ + GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ + GF_E_LOGBADW, /* Mult == LOGx, w too big*/ + GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ + GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ + GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ + GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ + GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ + GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ + GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ + GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ + GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4 */ + GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ + GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ + GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ + GF_E_TABLE_W, /* Mult == TABLE, w too big */ + GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ + GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ + GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ + GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ + GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ + GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ + GF_E_SP128_A, /* Mult == SPLIT, w=128, SSE only with 4/128 */ + GF_E_SP128_S, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ + GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ + GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */ + GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ + GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ + GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ + GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ + GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ 
+              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+              GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
+              GF_E_COMP__W, /* Mult == COMP, Bad w. */
+              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+              GF_E_UNKNOWN, /* Unknown mult_type. */
+              GF_E_UNK_REG, /* Unknown region_type. */
+              GF_E_UNK_DIV, /* Unknown divide_type. */
+              GF_E_CFM___W, /* Mult == CFM, Bad w. */
+              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_FEWARGS, /* Too few args in argc/argv. */
+              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+              GF_E_TWOMULT, /* In create_from... two -m's. */
+              GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+              GF_E_DEFAULT } gf_error_type_t;
+
diff --git a/gf_method.c b/gf_method.c
index f65c4e3..bc9bd35 100644
--- a/gf_method.c
+++ b/gf_method.c
@@ -11,179 +11,172 @@
 #include

 #include "gf_complete.h"
+#include "gf_int.h"
 #include "gf_method.h"

-void methods_to_stderr()
-{
-  fprintf(stderr, "To specify the methods, do one of the following: \n");
-  fprintf(stderr, " - leave empty to use defaults\n");
-  fprintf(stderr, " - use a single dash to use defaults\n");
-  fprintf(stderr, " - specify MULTIPLY REGION DIVIDE\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of MULTIPLY:\n");
-  fprintf(stderr, " SHIFT: shift\n");
-  fprintf(stderr, " GROUP g_mult g_reduce: the Group technique - see the paper\n");
-  fprintf(stderr, " BYTWO_p: BYTWO doubling the product.\n");
-  fprintf(stderr, " BYTWO_b: BYTWO doubling b (more efficient thatn BYTWO_p)\n");
-  fprintf(stderr, " TABLE: Full multiplication table\n");
-  fprintf(stderr, " LOG: Discrete logs\n");
-  fprintf(stderr, " LOG_ZERO: Discrete logs with a large table for zeros\n");
-  fprintf(stderr, " LOG_ZERO_EXT: Discrete logs with an extra large table for zeros\n");
-  fprintf(stderr, " SPLIT g_a g_b: Split tables defined by g_a and g_b\n");
-  fprintf(stderr, " COMPOSITE k rec METHOD: Composite field.  GF((2^l)^k), l=w/k.\n");
-  fprintf(stderr, "   rec = 0 means inline single multiplication\n");
-  fprintf(stderr, "   rec = 1 means recursive single multiplication\n");
-  fprintf(stderr, "   METHOD is the method of the base field in GF(2^l)\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "Legal values of REGION: Specify multiples with commas e.g.
'DOUBLE,LAZY'\n"); - fprintf(stderr, " -: Use defaults\n"); - fprintf(stderr, " SINGLE/DOUBLE/QUAD: Expand tables\n"); - fprintf(stderr, " LAZY: Lazily create table (only applies to TABLE and SPLIT)\n"); - fprintf(stderr, " SSE/NOSSE: Use 128-bit SSE instructions if you can\n"); - fprintf(stderr, " CAUCHY/ALTMAP/STDMAP: Use different memory mappings\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Legal values of DIVIDE:\n"); - fprintf(stderr, " -: Use defaults\n"); - fprintf(stderr, " MATRIX: Use matrix inversion\n"); - fprintf(stderr, " EUCLID: Use the extended Euclidian algorithm.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "See the user's manual for more information.\n"); - fprintf(stderr, "There are many restrictions, so it is better to simply use defaults in most cases.\n"); -} - int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) { int mult_type, divide_type, region_type; - uint32_t prim_poly = 0; int arg1, arg2, subrg_size; + uint64_t prim_poly; gf_t *base; char *crt, *x, *y; - if (argc <= starting || strcmp(argv[starting], "-") == 0) { - if (!gf_init_easy(gf, w)) return 0; - return (argc <= starting) ? starting : starting+1; - } - + mult_type = GF_MULT_DEFAULT; region_type = GF_REGION_DEFAULT; divide_type = GF_DIVIDE_DEFAULT; - - arg1 = 0; - arg2 = 0; prim_poly = 0; base = NULL; - subrg_size = 0; - - if (argc < starting+3) return 0; - - if (strcmp(argv[starting], "SHIFT") == 0) { - mult_type = GF_MULT_SHIFT; - starting++; - } else if (strcmp(argv[starting], "GROUP") == 0) { - mult_type = GF_MULT_GROUP; - if (argc < starting+5) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 0 || arg2 <= 0 || arg1 >= w || arg2 >= w) return 0; - starting += 3; - } else if (strcmp(argv[starting], "BYTWO_p") == 0) { - mult_type = GF_MULT_BYTWO_p; - starting++; - } else if (strcmp(argv[starting], "BYTWO_b") == 0) { - mult_type = GF_MULT_BYTWO_b; - starting++; - } else if (strcmp(argv[starting], "TABLE") == 0) { - mult_type = GF_MULT_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG") == 0) { - mult_type = GF_MULT_LOG_TABLE; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { - mult_type = GF_MULT_LOG_TABLE; - arg1 = 1; - starting++; - } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { - mult_type = GF_MULT_LOG_TABLE; - arg1 = 2; - starting++; - } else if (strcmp(argv[starting], "SPLIT") == 0) { - mult_type = GF_MULT_SPLIT_TABLE; - if (argc < starting+5) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 0 || arg2 <= 0 || w % arg1 != 0 || w % arg2 != 0) return 0; - starting += 3; - } else if (strcmp(argv[starting], "COMPOSITE") == 0) { - mult_type = GF_MULT_COMPOSITE; - if (argc < starting+6) return 0; - if (sscanf(argv[starting+1], "%d", &arg1) == 0 || - sscanf(argv[starting+2], "%d", &arg2) == 0 || - arg1 <= 1 || w %arg1 != 0 || ((arg2 | 1) != 1)) return 0; - base = (gf_t *) malloc(sizeof(gf_t)); - starting = create_gf_from_argv(base, w/arg1, argc, argv, starting+3); - if (starting == 0) { free(base); return 0; } - } else { - return 0; - } - - if (argc < starting+2) { - if (base != NULL) gf_free(base, 1); - return 0; - } - - if (strcmp(argv[starting], "-") == 0) { - region_type = GF_REGION_DEFAULT; - } else { - crt = strdup(argv[starting]); - region_type = 0; - x = crt; - do { - y = strchr(x, ','); - if (y != NULL) *y = '\0'; - if (strcmp(x, "DOUBLE") == 0) { - region_type |= 
GF_REGION_DOUBLE_TABLE; - } else if (strcmp(x, "QUAD") == 0) { - region_type |= GF_REGION_QUAD_TABLE; - } else if (strcmp(x, "SINGLE") == 0) { - region_type |= GF_REGION_SINGLE_TABLE; - } else if (strcmp(x, "LAZY") == 0) { - region_type |= GF_REGION_LAZY; - } else if (strcmp(x, "SSE") == 0) { - region_type |= GF_REGION_SSE; - } else if (strcmp(x, "NOSSE") == 0) { - region_type |= GF_REGION_NOSSE; - } else if (strcmp(x, "CAUCHY") == 0) { - region_type |= GF_REGION_CAUCHY; - } else if (strcmp(x, "ALTMAP") == 0) { - region_type |= GF_REGION_ALTMAP; - } else if (strcmp(x, "STDMAP") == 0) { - region_type |= GF_REGION_STDMAP; + arg1 = 0; + arg2 = 0; + while (1) { + if (argc > starting) { + if (strcmp(argv[starting], "-m") == 0) { + starting++; + if (mult_type != GF_MULT_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWOMULT; + return 0; + } + if (strcmp(argv[starting], "SHIFT") == 0) { + mult_type = GF_MULT_SHIFT; + starting++; + } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { + mult_type = GF_MULT_CARRY_FREE; + starting++; + } else if (strcmp(argv[starting], "GROUP") == 0) { + mult_type = GF_MULT_GROUP; + if (argc < starting + 3) { + _gf_errno = GF_E_GROUPAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_GROUPNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "BYTWO_p") == 0) { + mult_type = GF_MULT_BYTWO_p; + starting++; + } else if (strcmp(argv[starting], "BYTWO_b") == 0) { + mult_type = GF_MULT_BYTWO_b; + starting++; + } else if (strcmp(argv[starting], "TABLE") == 0) { + mult_type = GF_MULT_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG") == 0) { + mult_type = GF_MULT_LOG_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { + mult_type = GF_MULT_LOG_ZERO; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { + mult_type = GF_MULT_LOG_ZERO_EXT; + starting++; + } else if (strcmp(argv[starting], "SPLIT") == 0) { + mult_type = GF_MULT_SPLIT_TABLE; + if (argc < starting + 3) { + _gf_errno = GF_E_SPLITAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_SPLITNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "COMPOSITE") == 0) { + mult_type = GF_MULT_COMPOSITE; + if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; } + if (sscanf(argv[starting+1], "%d", &arg1) == 0) { + _gf_errno = GF_E_COMP_A2; + return 0; + } + starting += 2; + base = (gf_t *) malloc(sizeof(gf_t)); + starting = create_gf_from_argv(base, w/arg1, argc, argv, starting); + if (starting == 0) { + free(base); + return 0; + } + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNKNOWN; + return 0; + } + } else if (strcmp(argv[starting], "-r") == 0) { + starting++; + if (strcmp(argv[starting], "DOUBLE") == 0) { + region_type |= GF_REGION_DOUBLE_TABLE; + starting++; + } else if (strcmp(argv[starting], "QUAD") == 0) { + region_type |= GF_REGION_QUAD_TABLE; + starting++; + } else if (strcmp(argv[starting], "LAZY") == 0) { + region_type |= GF_REGION_LAZY; + starting++; + } else if (strcmp(argv[starting], "SSE") == 0) { + region_type |= GF_REGION_SSE; + starting++; + } else if (strcmp(argv[starting], "NOSSE") == 0) { + region_type |= GF_REGION_NOSSE; + starting++; + } else if (strcmp(argv[starting], "CAUCHY") == 0) { + region_type |= GF_REGION_CAUCHY; + starting++; + } else if (strcmp(argv[starting], "ALTMAP") == 
0) { + region_type |= GF_REGION_ALTMAP; + starting++; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNK_REG; + return 0; + } + } else if (strcmp(argv[starting], "-p") == 0) { + starting++; + if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_POLYSPC; + return 0; + } + starting++; + } else if (strcmp(argv[starting], "-d") == 0) { + starting++; + if (divide_type != GF_DIVIDE_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWO_DIV; + return 0; + } else if (strcmp(argv[starting], "EUCLID") == 0) { + divide_type = GF_DIVIDE_EUCLID; + starting++; + } else if (strcmp(argv[starting], "MATRIX") == 0) { + divide_type = GF_DIVIDE_MATRIX; + starting++; + } else { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + } else if (strcmp(argv[starting], "-") == 0) { + /* + printf("Scratch size: %d\n", gf_scratch_size(w, + mult_type, region_type, divide_type, arg1, arg2)); + */ + if (gf_init_hard(gf, w, mult_type, region_type, divide_type, + prim_poly, arg1, arg2, base, NULL) == 0) { + if (base != NULL) gf_free(base, 1); + return 0; + } else + return starting + 1; } else { if (base != NULL) gf_free(base, 1); - free(crt); + _gf_errno = GF_E_UNKFLAG; return 0; } - if (y != NULL) x = y+1; - } while (y != NULL); - free(crt); + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_FEWARGS; + return 0; + } } - - starting++; - - if (strcmp(argv[starting], "-") == 0) { - divide_type = GF_DIVIDE_DEFAULT; - } else if (strcmp(argv[starting], "MATRIX") == 0) { - divide_type = GF_DIVIDE_MATRIX; - } else if (strcmp(argv[starting], "EUCLID") == 0) { - divide_type = GF_DIVIDE_EUCLID; - } else { - if (base != NULL) gf_free(base, 1); - return 0; - } - starting++; - - if (!gf_init_hard(gf, w, mult_type, region_type, divide_type, prim_poly, arg1, arg2, base, NULL)) { - if (base != NULL) gf_free(base, 1); - return 0; - } - return starting; } diff --git a/gf_method.h b/gf_method.h index c7df540..ff29f25 100644 --- a/gf_method.h +++ b/gf_method.h @@ -8,8 +8,9 @@ #include "gf_complete.h" -/* This prints out the error string defining the methods that you can put on argv*/ -extern void methods_to_stderr(); +/* Parses argv starting at "starting". + + Returns 0 on failure. + On success, it returns one past the last argument it read in argv. 
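+
+   Example (a sketch): to build GF(2^16) with SPLIT 16 4 and ALTMAP, parse
+   an argument vector shaped like
+
+     char *args[] = { "-m", "SPLIT", "16", "4", "-r", "ALTMAP", "-" };
+     ...
+     if (create_gf_from_argv(&gf, 16, 7, args, 0) == 0) gf_error();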
*/

-/* Parses argv starting at "starting" */
extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);

diff --git a/gf_methods.c b/gf_methods.c
index 13aeb8e..c4db5f5 100644
--- a/gf_methods.c
+++ b/gf_methods.c
@@ -11,58 +11,26 @@
 #include "gf_complete.h"
 #include "gf_method.h"
+#include "gf_int.h"

-#define NMULTS (15)
-static char *mults[NMULTS] = { "SHIFT", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
-                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE-0", "COMPOSITE-1" };
+#define NMULTS (16)
+static char *mults[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
+                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
+                               "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };

-#define NREGIONS (96)
-static char *regions[NREGIONS] = { "-", "SINGLE", "DOUBLE", "QUAD",
-"LAZY", "SINGLE,LAZY", "DOUBLE,LAZY", "QUAD,LAZY", "SSE",
-"SINGLE,SSE", "DOUBLE,SSE", "QUAD,SSE", "LAZY,SSE",
-"SINGLE,LAZY,SSE", "DOUBLE,LAZY,SSE", "QUAD,LAZY,SSE", "NOSSE",
-"SINGLE,NOSSE", "DOUBLE,NOSSE", "QUAD,NOSSE", "LAZY,NOSSE",
-"SINGLE,LAZY,NOSSE", "DOUBLE,LAZY,NOSSE", "QUAD,LAZY,NOSSE",
-"STDMAP", "SINGLE,STDMAP", "DOUBLE,STDMAP", "QUAD,STDMAP",
-"LAZY,STDMAP", "SINGLE,LAZY,STDMAP", "DOUBLE,LAZY,STDMAP",
-"QUAD,LAZY,STDMAP", "SSE,STDMAP", "SINGLE,SSE,STDMAP",
-"DOUBLE,SSE,STDMAP", "QUAD,SSE,STDMAP", "LAZY,SSE,STDMAP",
-"SINGLE,LAZY,SSE,STDMAP", "DOUBLE,LAZY,SSE,STDMAP",
-"QUAD,LAZY,SSE,STDMAP", "NOSSE,STDMAP", "SINGLE,NOSSE,STDMAP",
-"DOUBLE,NOSSE,STDMAP", "QUAD,NOSSE,STDMAP", "LAZY,NOSSE,STDMAP",
-"SINGLE,LAZY,NOSSE,STDMAP", "DOUBLE,LAZY,NOSSE,STDMAP",
-"QUAD,LAZY,NOSSE,STDMAP", "ALTMAP", "SINGLE,ALTMAP", "DOUBLE,ALTMAP",
-"QUAD,ALTMAP", "LAZY,ALTMAP", "SINGLE,LAZY,ALTMAP",
-"DOUBLE,LAZY,ALTMAP", "QUAD,LAZY,ALTMAP", "SSE,ALTMAP",
-"SINGLE,SSE,ALTMAP", "DOUBLE,SSE,ALTMAP", "QUAD,SSE,ALTMAP",
-"LAZY,SSE,ALTMAP", "SINGLE,LAZY,SSE,ALTMAP",
-"DOUBLE,LAZY,SSE,ALTMAP", "QUAD,LAZY,SSE,ALTMAP", "NOSSE,ALTMAP",
-"SINGLE,NOSSE,ALTMAP", "DOUBLE,NOSSE,ALTMAP", "QUAD,NOSSE,ALTMAP",
-"LAZY,NOSSE,ALTMAP", "SINGLE,LAZY,NOSSE,ALTMAP",
-"DOUBLE,LAZY,NOSSE,ALTMAP", "QUAD,LAZY,NOSSE,ALTMAP", "CAUCHY",
-"SINGLE,CAUCHY", "DOUBLE,CAUCHY", "QUAD,CAUCHY", "LAZY,CAUCHY",
-"SINGLE,LAZY,CAUCHY", "DOUBLE,LAZY,CAUCHY", "QUAD,LAZY,CAUCHY",
-"SSE,CAUCHY", "SINGLE,SSE,CAUCHY", "DOUBLE,SSE,CAUCHY",
-"QUAD,SSE,CAUCHY", "LAZY,SSE,CAUCHY", "SINGLE,LAZY,SSE,CAUCHY",
-"DOUBLE,LAZY,SSE,CAUCHY", "QUAD,LAZY,SSE,CAUCHY", "NOSSE,CAUCHY",
-"SINGLE,NOSSE,CAUCHY", "DOUBLE,NOSSE,CAUCHY", "QUAD,NOSSE,CAUCHY",
-"LAZY,NOSSE,CAUCHY", "SINGLE,LAZY,NOSSE,CAUCHY",
-"DOUBLE,LAZY,NOSSE,CAUCHY", "QUAD,LAZY,NOSSE,CAUCHY" };
+#define NREGIONS (7)
+static char *regions[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE",
+                                   "ALTMAP", "CAUCHY" };

-#define NDIVS (3)
-static char *divides[NDIVS] = { "-", "MATRIX", "EUCLID" };
+#define NDIVS (2)
+static char *divides[NDIVS] = { "MATRIX", "EUCLID" };

-int main()
+int main()
{
-  int m, r, d, w, i, sa, j;
-  char *argv[20];
+  int m, r, d, w, i, sa, j, k, reset;
+  char *argv[50];
   gf_t gf;
   char divs[200], ks[10], ls[10];
-
-  methods_to_stderr();
-
-  printf("\n");
-  printf("Implemented Methods: \n\n");

   for (i = 2; i < 8; i++) {
     w = (1 << i);
@@ -70,9 +38,14 @@
     if (create_gf_from_argv(&gf, w, 1, argv, 0) > 0) {
       printf("w=%d: -\n", w);
       gf_free(&gf, 1);
+    } else if (_gf_errno == GF_E_DEFAULT) {
+      fprintf(stderr, "Unlabeled failed method: w=%d: -\n", w);
+      exit(1);
     }
+
     for (m = 0; m < NMULTS; m++) {
       sa = 0;
+      argv[sa++] =
"-m"; if (strcmp(mults[m], "GROUP44") == 0) { argv[sa++] = "GROUP"; argv[sa++] = "4"; @@ -96,46 +69,66 @@ int main() sprintf(ls, "%d", w); argv[sa++] = ls; argv[sa++] = "8"; + } else if (strcmp(mults[m], "SPLIT16") == 0) { + argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + argv[sa++] = ls; + argv[sa++] = "16"; } else if (strcmp(mults[m], "SPLIT88") == 0) { argv[sa++] = "SPLIT"; argv[sa++] = "8"; argv[sa++] = "8"; - } else if (strcmp(mults[m], "COMPOSITE-0") == 0) { + } else if (strcmp(mults[m], "COMPOSITE") == 0) { argv[sa++] = "COMPOSITE"; argv[sa++] = "2"; - argv[sa++] = "0"; - argv[sa++] = "-"; - } else if (strcmp(mults[m], "COMPOSITE-1") == 0) { - argv[sa++] = "COMPOSITE"; - argv[sa++] = "2"; - argv[sa++] = "1"; argv[sa++] = "-"; } else { argv[sa++] = mults[m]; } - for (r = 0; r < NREGIONS; r++) { - argv[sa++] = regions[r]; - strcpy(divs, ""); - for (d = 0; d < NDIVS; d++) { - argv[sa++] = divides[d]; -/* printf("w=%d:", w); - for (j = 0; j < sa; j++) printf(" %s", argv[j]); - printf("\n"); */ - if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { - strcat(divs, "|"); - strcat(divs, divides[d]); - gf_free(&gf, 1); - } - sa--; + reset = sa; + for (r = 0; r < (1 << NREGIONS); r++) { + sa = reset; + for (k = 0; k < NREGIONS; k++) { + if (r & 1 << k) { + argv[sa++] = "-r"; + argv[sa++] = regions[k]; + } } - if (strlen(divs) > 0) { + argv[sa++] = "-"; + if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { printf("w=%d:", w); for (j = 0; j < sa; j++) printf(" %s", argv[j]); - printf(" %s\n", divs+1); + printf("\n"); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]); + fprintf(stderr, "\n"); + exit(1); } sa--; + for (d = 0; d < NDIVS; d++) { + argv[sa++] = "-d"; + argv[sa++] = divides[d]; + /* printf("w=%d:", w); + for (j = 0; j < sa; j++) printf(" %s", argv[j]); + printf("\n"); */ + argv[sa++] = "-"; + if (create_gf_from_argv(&gf, w, sa, argv, 0) > 0) { + printf("w=%d:", w); + for (j = 0; j < sa; j++) printf(" %s", argv[j]); + printf("\n"); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", argv[j]); + fprintf(stderr, "\n"); + exit(1); + } + sa-=3; + } } - sa--; } } + return 0; } diff --git a/gf_mult.c b/gf_mult.c index dc85cc6..c93a4f9 100644 --- a/gf_mult.c +++ b/gf_mult.c @@ -12,105 +12,53 @@ #include "gf_complete.h" #include "gf_method.h" +#include "gf_general.h" -void usage(char *s) +void usage(int why) { fprintf(stderr, "usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n"); - fprintf(stderr, " If w has an h on the end, treat a, b and the product as hexadecimal (no 0x required)\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " legal w are: 1-32, 64 and 128\n"); - fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " For method specification, type gf_methods\n"); - - if (s != NULL) fprintf(stderr, "%s", s); + if (why == 'W') { + fprintf(stderr, "Bad w.\n"); + fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n"); + fprintf(stderr, "Append 'h' to w to treat a, b and the product as hexadecimal.\n"); + fprintf(stderr, "w=128 is hex only (i.e. 
'128' will be an error - do '128h')\n"); + } + if (why == 'A') fprintf(stderr, "Bad a\n"); + if (why == 'B') fprintf(stderr, "Bad b\n"); + if (why == 'M') { + fprintf(stderr, "Bad Method Specification: "); + gf_error(); + } exit(1); } -int read_128(char *s, uint64_t *v) -{ - int l, t; - char save; - - l = strlen(s); - if (l > 32) return 0; - - if (l > 16) { - if (sscanf(s + (l-16), "%llx", (long long unsigned int *) &(v[1])) == 0) return 0; - save = s[l-16]; - s[l-16] = '\0'; - t = sscanf(s, "%llx", (long long unsigned int *) &(v[0])); - s[l-16] = save; - return t; - } else { - v[0] = 0; - return sscanf(s, "%llx", (long long unsigned int *)&(v[1])); - } - return 1; -} - -void print_128(uint64_t *v) -{ - if (v[0] > 0) { - printf("%llx", (long long unsigned int) v[0]); - printf("%016llx", (long long unsigned int) v[1]); - } else { - printf("%llx", (long long unsigned int) v[1]); - } - printf("\n"); -} - - int main(int argc, char **argv) { - int hex, al, bl, w; - uint32_t a, b, c, top; - uint64_t a64, b64, c64; - uint64_t a128[2], b128[2], c128[2]; - char *format; + int hex, w; gf_t gf; + gf_general_t a, b, c; + char output[50]; - if (argc < 4) usage(NULL); - if (sscanf(argv[3], "%d", &w) == 0) usage("Bad w\n"); + if (argc < 4) usage(' '); - if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w"); + if (sscanf(argv[3], "%d", &w) == 0) usage('W'); + if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W'); hex = (strchr(argv[3], 'h') != NULL); - if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("\nBad Method\n"); + if (!hex && w == 128) usage('W'); - if (!hex && w == 128) usage(NULL); - - if (w <= 32) { - format = (hex) ? "%x" : "%u"; - if (sscanf(argv[1], format, &a) == 0) usage("Bad a\n"); - if (sscanf(argv[2], format, &b) == 0) usage("Bad b\n"); - - if (w < 32) { - top = (w == 31) ? 0x80000000 : (1 << w); - if (w != 32 && a >= top) usage("a is too large\n"); - if (w != 32 && b >= top) usage("b is too large\n"); - } - - c = gf.multiply.w32(&gf, a, b); - printf(format, c); - printf("\n"); - - } else if (w == 64) { - format = (hex) ? "%llx" : "%llu"; - if (sscanf(argv[1], format, &a64) == 0) usage("Bad a\n"); - if (sscanf(argv[2], format, &b64) == 0) usage("Bad b\n"); - c64 = gf.multiply.w64(&gf, a64, b64); - - printf(format, c64); - printf("\n"); - - } else if (w == 128) { - - if (read_128(argv[1], a128) == 0) usage("Bad a\n"); - if (read_128(argv[2], b128) == 0) usage("Bad b\n"); - gf.multiply.w128(&gf, a128, b128, c128); - - print_128(c128); + if (argc == 4) { + if (gf_init_easy(&gf, w) == 0) usage('M'); + } else { + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M'); } + + if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A'); + if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B'); + + gf_general_multiply(&gf, &a, &b, &c); + gf_general_val_to_s(&c, w, output, hex); + + printf("%s\n", output); exit(0); } diff --git a/gf_poly.c b/gf_poly.c index c057461..7134b2c 100644 --- a/gf_poly.c +++ b/gf_poly.c @@ -1,560 +1,268 @@ /* - * gf_poly.c - program to help find primitive polynomials in composite fields + gf_poly.c - program to help find irreducible polynomials in composite fields, + using the Ben-Or algorithm. + + James S. Plank + + Please see the following paper for a + description of the Ben-Or algorithm: + + author S. Gao and D. Panario + title Tests and Constructions of Irreducible Polynomials over Finite Fields + booktitle Foundations of Computational Mathematics + year 1997 + publisher Springer Verlag + pages 346-361 + + The basic technique is this. 
You have a polynomial f(x) whose coefficients are + in a base field GF(2^w). The polynomial is of degree n. You need to do the + following for all i from 1 to n/2: + + Construct x^(2^w)^i modulo f. That will be a polynomial of maximum degree n-1 + with coefficients in GF(2^w). You construct that polynomial by starting with x + and doubling it w times, each time taking the result modulo f. Then you + multiply that by itself i times, again each time taking the result modulo f. + + When you're done, you need to "subtract" x -- since addition = subtraction = + XOR, that means XOR x. + + Now, find the GCD of that last polynomial and f, using Euclid's algorithm. If + the GCD is not one, then f is reducible. If it is not reducible for each of + those i, then it is irreducible. + + In this code, I am using a gf_general_t to represent elements of GF(2^w). This + is so that I can use base fields that are GF(2^64) or GF(2^128). + + I have two main procedures. The first is x_to_q_to_i_minus_x, which calculates + x^(2^w)^i - x, putting the result into a gf_general_t * called retval. + + The second is gcd_one, which takes a polynomial of degree n and a second one + of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1. + + These can be made faster (e.g. calculate x^(2^w) once and store it). */ #include "gf_complete.h" #include "gf_method.h" +#include "gf_general.h" +#include "gf_int.h" #include #include #include -#define GF_POLY_COEF_MASK8 0xff -#define GF_POLY_COEF_MASK16 0xffff -#define GF_POLY_COEF_MASK32 0xffffffff -#define GF_POLY_COEF_MASK64 0xffffffffffffffff +char *BM = "Bad Method: "; -#define LLUI (long long unsigned int) - -struct gf_poly_coef_s; - -typedef struct gf_poly_coef_s { - uint64_t coef; - uint64_t power; - struct gf_poly_coef_s *next; -} gf_poly_coef_t; - -typedef struct gf_poly_s { - gf_poly_coef_t *leading_coef; - uint64_t num_coefs; - gf_t *coef_gf; - int w; -} gf_poly_t; - -static uint64_t gf_add(int w, uint64_t a, uint64_t b) +void usage(char *s) { - if (w == 8) { - return (a & GF_POLY_COEF_MASK8) ^ (b & GF_POLY_COEF_MASK8); - } else if (w == 16) { - return (a & GF_POLY_COEF_MASK16) ^ (b & GF_POLY_COEF_MASK16); - } else if (w == 32) { - return (a & GF_POLY_COEF_MASK32) ^ (b & GF_POLY_COEF_MASK32); - } else if (w == 64) { - return (a & GF_POLY_COEF_MASK64) ^ (b & GF_POLY_COEF_MASK64); - } -} - -static uint64_t gf_mult(int w, gf_t* gf, uint64_t a, uint64_t b) -{ - if (w <= 32) { - return gf->multiply.w32(gf, a, b); - } else if (w == 64) { - return gf->multiply.w64(gf, a, b); - } -} - -static uint64_t gf_divide(int w, gf_t* gf, uint64_t a, uint64_t b) -{ - if (w <= 32) { - return gf->divide.w32(gf, a, b); - } else if (w == 64) { - return gf->divide.w64(gf, a, b); - } -} - -static uint64_t gf_inverse(int w, gf_t* gf, uint64_t a) -{ - if (w <= 32) { - return gf->inverse.w32(gf, a); - } else if (w == 64) { - return gf->inverse.w64(gf, a); - } -} - -gf_poly_t* gf_poly_init(int w, gf_t *gf) -{ - gf_poly_t *gf_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t)); - - if (gf_poly == NULL || gf == NULL) { - return NULL; - } - - gf_poly->leading_coef = NULL; - gf_poly->num_coefs = 0; - gf_poly->coef_gf = gf; - gf_poly->w = w; - - return gf_poly; -} - -void gf_poly_print(gf_poly_t *gf_poly, char *message) -{ - gf_poly_coef_t *tmp; - - if (gf_poly == NULL) { - fprintf(stderr, "0 * x^0\n"); - return; - } - - tmp = gf_poly->leading_coef; - - while (tmp != NULL) { - printf("%llu * x^%llu", LLUI tmp->coef, LLUI tmp->power); - tmp = tmp->next; - if (tmp) { - printf(" + "); - } - } - - 
if (message != NULL) { - printf(": %s\n", message); - } -} - -gf_poly_t* gf_poly_copy(gf_poly_t *poly) -{ - gf_poly_t *new_poly = (gf_poly_t*)malloc(sizeof(gf_poly_t)); - gf_poly_coef_t *tmp = poly->leading_coef; - - if (new_poly == NULL) { - return NULL; - } - - new_poly->leading_coef = NULL; - new_poly->num_coefs = 0; - new_poly->coef_gf = poly->coef_gf; - new_poly->w = poly->w; - - while (tmp != NULL) { - gf_poly_add_coef(new_poly, tmp->coef, tmp->power); - - tmp = tmp->next; - } - - return new_poly; -} - -void gf_poly_clear(gf_poly_t* a) -{ - while (a->leading_coef != NULL) { - gf_poly_coef_t *tmp = a->leading_coef; - - a->leading_coef = tmp->next; - - free(tmp); - } -} - -void gf_poly_free(gf_poly_t **a) -{ - gf_poly_clear(*a); - free(*a); - *a = NULL; -} - -gf_poly_coef_t* gf_poly_create_node(uint64_t coef, uint64_t power) -{ - gf_poly_coef_t* node = (gf_poly_coef_t*)malloc(sizeof(gf_poly_coef_t)); - - if (node == NULL) { - return NULL; - } - - node->coef = coef; - node->power = power; - node->next = NULL; - - return node; -} - -int gf_poly_remove_node(gf_poly_t *gf_poly, uint64_t power) -{ - gf_poly_coef_t* iter = gf_poly->leading_coef; - - if (iter->power == power) { - gf_poly->leading_coef = iter->next; - free(iter); - return 0; - } - - while (iter->next != NULL) { - if (iter->next->power == power) { - gf_poly_coef_t* tmp = iter->next; - iter->next = iter->next->next; - free(tmp); - return 0; - } - iter = iter->next; - } - - return -1; -} - -int gf_poly_add_coef(gf_poly_t *gf_poly, uint64_t coef_val, uint64_t power) -{ - gf_poly_coef_t* node; - gf_poly_coef_t* iter = gf_poly->leading_coef; - - /* - * The new node has the highest power, or there are no terms - */ - if (gf_poly->leading_coef == NULL || gf_poly->leading_coef->power < power) { - node = gf_poly_create_node(coef_val, power); - node->next = gf_poly->leading_coef; - gf_poly->leading_coef = node; - return 0; - } - - /* - * The new node is of the same power, add the coefs - */ - if (gf_poly->leading_coef->power == power) { - gf_poly->leading_coef->coef = gf_add(gf_poly->w, gf_poly->leading_coef->coef, coef_val); - if (gf_poly->leading_coef->coef == 0) { - gf_poly_remove_node(gf_poly, power); - } - return 0; - } - - while (iter->next != NULL) { - if (iter->next->power == power) { - iter->next->coef = gf_add(gf_poly->w, iter->next->coef, coef_val); - - if (iter->next->coef == 0) { - gf_poly_remove_node(gf_poly, power); - } - - return 0; - } - if (iter->next->power < power) { - node = gf_poly_create_node(coef_val, power); - node->next = iter->next; - iter->next = node; - return 0; - } - iter = iter->next; - } - - /* - * The power passed in is lower than any in the existing poly - */ - node = gf_poly_create_node(coef_val, power); - iter->next = node; - - return 0; -} - -/* - * Compute a+b and store in a - */ -int gf_poly_add(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_coef_t* iter = b->leading_coef; - - while (iter != NULL) { - gf_poly_add_coef(a, iter->coef, iter->power); - iter = iter->next; - } - - return 0; -} - -/* - * Compute a*b and store in a - */ -int gf_poly_mult(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_coef_t* a_iter = a->leading_coef; - - /* - * Remove one node at a time from 'a', starting with - * highest power. Multiply the removed (coef,power) - * by every entry of 'b,' adding each product into 'a.' 
- */ - while (a_iter != NULL) { - gf_poly_coef_t* tmp = a_iter; - gf_poly_coef_t* b_iter = b->leading_coef; - - uint64_t a_power = a_iter->power; - uint64_t a_coef = a_iter->coef; - a_iter = a_iter->next; - gf_poly_remove_node(a, tmp->power); - - while (b_iter != NULL) { - uint64_t new_power = b_iter->power + a_power; - uint64_t new_coef = gf_mult(a->w, a->coef_gf, b_iter->coef, a_coef); - - gf_poly_add_coef(a, new_coef, new_power); - - b_iter = b_iter->next; - } - } - return 0; -} - -/* - * Compute a % b and store in a - */ -int gf_poly_reduce(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_t* c = gf_poly_init(a->w, a->coef_gf); - gf_poly_coef_t* a_iter = a->leading_coef; - gf_poly_coef_t* b_iter = b->leading_coef; - - /* - * Reduce until the degree of 'a' is less than - * the degree of 'b.' At that point 'a' will - * contain the remainder of a / b. - */ - while (a_iter && (a_iter->power >= b_iter->power)) { - - /* - * Get the degree and leading coef of the current - * 'b'. - */ - uint64_t reduce_power = a_iter->power - b_iter->power; - uint64_t reduce_coef = gf_divide(a->w, a->coef_gf, a_iter->coef, b_iter->coef); - - /* - * Create a poly that will get rid of leading power - * of 'b' when added: c*x^(n-m)*b(x), where c - * is the leading coef of 'a', n is the deg of 'a' - * and m is the degree of 'b'. - */ - gf_poly_add_coef(c, reduce_coef, reduce_power); - gf_poly_mult(c, b); - - /* - * Add the newly created poly, which will reduce - * a(x) by at least one term (leading term). - */ - gf_poly_add(a, c); - - gf_poly_clear(c); - - /* - * Grab the new leading term of 'a' - */ - a_iter = a->leading_coef; - } -} - -/* - * Get the GCD of a and b, return the result - */ -gf_poly_t* gf_poly_gcd(gf_poly_t* a, gf_poly_t* b) -{ - gf_poly_t *r1, *r2; - gf_poly_t* tmp_swp; - - if (a->leading_coef == NULL || b->leading_coef == NULL) { - return NULL; - } - - if (a->leading_coef->power > b->leading_coef->power) { - r1 = a; - r2 = b; - } else { - r1 = b; - r2 = a; - } - - while ( 1 ) { - if (r2->leading_coef == NULL) { - break; - } - if (r2->leading_coef->power == 0 && r2->leading_coef->coef <= 1) { - break; - } - - gf_poly_reduce(r1, r2); - tmp_swp = r1; - r1 = r2; - r2 = tmp_swp; - } - - return r1; -} - -/* - * The Ben-Or algorithm for determining irreducibility - */ -int gf_poly_is_irred(gf_poly_t* poly) -{ - gf_poly_t *gcd; - gf_poly_t *prod_of_irred; - uint64_t prod_of_irred_power = ((unsigned long long) 1) << poly->w; - int n = poly->leading_coef->power / 2; - int i; - int ret = 0; - gf_poly_t *a = gf_poly_copy(poly); - - prod_of_irred = gf_poly_init(a->w, a->coef_gf); - - - for (i = 1; i <= n; i++) { - gf_poly_add_coef(prod_of_irred, 1, prod_of_irred_power); - gf_poly_add_coef(prod_of_irred, 1, 1); - - gf_poly_reduce(prod_of_irred, a); - - gcd = gf_poly_gcd(a, prod_of_irred); - - /* - * It is irreducible if it is not the product of - * non-trivial factors (non-constant). Therefore, - * the GCD of the poly and prod_of_irred should be - * a constant (0 or 0-degree polynomial). - */ - if (gcd == NULL) { - ret = -1; - break; - } else if (gcd->leading_coef->power != 0) { - ret = -1; - break; - } else if (gcd->leading_coef->power == 0) { - ret = 0; - break; + fprintf(stderr, "usage: gf_poly w(base-field) method power:coef [ power:coef .. 
]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " use - for the default method.\n"); + fprintf(stderr, " use 0x in front of the coefficient if it's in hex\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " For example, to test whether x^2 + 2x + 1 is irreducible\n"); + fprintf(stderr, " in GF(2^16), the call is:\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " gf_poly 16 - 2:1 1:2 0:1\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " See the user's manual for more information.\n"); + if (s != NULL) { + fprintf(stderr, "\n"); + if (s == BM) { + fprintf(stderr, "%s", s); + gf_error(); } else { - ret = -1; - break; + fprintf(stderr, "%s\n", s); } - - // Need if to avoid a overflow error - if ((i + 1) <= n) { - prod_of_irred_power *= prod_of_irred_power; + } + exit(1); +} + +int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod) +{ + gf_general_t *a, *b, zero, factor, p; + int i, j, da, db; + char buf[30]; + + gf_general_set_zero(&zero, w); + + a = (gf_general_t *) malloc(sizeof(gf_general_t) * n+1); + b = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i); + for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i); + + da = n; + while (1) { + for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ; + if (db < 0) return 0; + if (db == 0) return 1; + for (j = da; j >= db; j--) { + if (!gf_general_is_zero(a+j, w)) { + gf_general_divide(gf, a+j, b+db, &factor); + for (i = 0; i <= db; i++) { + gf_general_multiply(gf, b+i, &factor, &p); + gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db)); + } + } + } + for (i = 0; i < n; i++) { + gf_general_add(gf, a+i, &zero, &p); + gf_general_add(gf, b+i, &zero, a+i); + gf_general_add(gf, &p, &zero, b+i); } - gf_poly_clear(prod_of_irred); } - gf_poly_free(&a); - - return ret; } -int is_suitible_s(int w, gf_t *gf, uint64_t s) +void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval) { - uint64_t num_elems = ((unsigned long long) 1) << w; - uint64_t i = 2; - uint64_t i_inv; + gf_general_t x; + gf_general_t *x_to_q; + gf_general_t *product; + gf_general_t p, zero, factor; + int j, k, lq; + char buf[20]; - for (; i < num_elems; i++) { - i_inv = gf_inverse(w, gf, i); - if ((i ^ i_inv) == s) { - fprintf(stderr, "Bailed on %llu ^ %llu = %llu\n", LLUI i, LLUI i_inv, LLUI s); - return -1; + gf_general_set_zero(&zero, w); + product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2); + x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w); + gf_general_set_one(x_to_q+1, w); + + for (lq = 0; lq < logq; lq++) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for (k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } } - if (i % 1000000000 == 0) fprintf(stderr, "Processed %llu\n", LLUI i); + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j); + } + for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w); + gf_general_set_one(retval, w); + + while (i > 0) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for 
(k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, retval+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } + } + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j); + i--; } - return 0; + gf_general_set_one(&x, w); + gf_general_add(gf, &x, retval+1, retval+1); + + free(product); + free(x_to_q); } -static void -usage(char *cmd) -{ - fprintf(stderr, "%s w S \n", cmd); - fprintf(stderr, "\t will build a trinomial x^2+S*x+1\n"); - fprintf(stderr, "OR\n"); - fprintf(stderr, "%s w G coef1,power1 ... \n", cmd); - fprintf(stderr, "\t will build a polynomial coef1^(power1) + ... + coefn^(powern)\n"); - fprintf(stderr, "Example: ./gf_poly 8 - - - G 1,2 2,1 1,0\n"); - fprintf(stderr, "\t will build a polynomial x^2+2*x+1 with coefs from GF(2^8)\n"); -} - -/* - * Find irred poly of form x^2+sx+1 - * a_n*x^n + a_(n-1)*x^(n-1) + ... - * - * Terms are specified as: a_i,i a_j,j, ... where - * i is the degree of the term and a_i is the coef - * - */ -int main(int argc, char **argv) +main(int argc, char **argv) { + int w, i, power, n, ap, success, j; gf_t gf; - int ret; - int w; - int i; - uint64_t irred_coef_s; - gf_poly_t *irred_poly; - char *term; + gf_general_t *poly, *prod; + char *string, *ptr; + char buf[100]; - bzero(&gf, sizeof(gf_t)); + if (argc < 4) usage(NULL); - if (argc < 4) { - usage(argv[0]); - return -1; - } - - w = atoi(argv[1]); - - ret = create_gf_from_argv(&gf, w, argc, argv, 3); + if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w."); + ap = create_gf_from_argv(&gf, w, argc, argv, 2); - if (ret <= 0) { - fprintf(stderr, "Could not create a GF\n"); - return -1; - } - - irred_poly = gf_poly_init(w, &gf); + if (ap == 0) usage(BM); - i = ret + 1; + if (ap == argc) usage("No powers/coefficients given."); - if (strlen(argv[i]) > 1) { - usage(argv[0]); - exit(1); - } - - if (argv[i][0] == 'S') { - i++; - irred_coef_s = (uint64_t)strtoull(argv[i], NULL, 10); - - /* - * If this is a trinomial of the form x^2+s*x+1, then - * we can do a quick pre-check to see if this may be - * an irreducible polynomial. 
- */
-    if (is_suitible_s(w, &gf, irred_coef_s) < 0) {
-      fprintf(stderr, "%llu is not a suitable coeffient!\n", LLUI irred_coef_s);
-      return -1;
-    } else {
-      fprintf(stderr, "%llu IS A suitable coeffient!\n", LLUI irred_coef_s);
+  n = -1;
+  for (i = ap; i < argc; i++) {
+    if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
 }
+    if (power < 0) usage("Can't have negative powers\n");
+    if (power > n) n = power;
+  }
+  poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+  for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+  prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);
-      gf_poly_add_coef(irred_poly, 1, 2);
-      gf_poly_add_coef(irred_poly, irred_coef_s, 1);
-      gf_poly_add_coef(irred_poly, 1, 0);
+  for (i = ap; i < argc; i++) {
+    sscanf(argv[i], "%d:", &power);
+    ptr = strchr(argv[i], ':');
+    ptr++;
+    if (strncmp(ptr, "0x", 2) == 0) {
+      success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
+    } else {
+      success = gf_general_s_to_val(poly+power, w, ptr, 0);
+    }
+    if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+  }
-  } else if (argv[i][0] == 'G') {
-    term = argv[++i];
-
-
-    while (term != NULL) {
-      uint64_t coef = strtoull(strtok(term, ","), NULL, 10);
-      uint64_t power = strtoull(strtok(NULL, ","), NULL, 10);
-
-      gf_poly_add_coef(irred_poly, coef, power);
-
-      if (i < argc) {
-        term = argv[++i];
+  printf("Poly:");
+  for (power = n; power >= 0; power--) {
+    if (!gf_general_is_zero(poly+power, w)) {
+      printf("%s", (power == n) ?
" " : " + "); + if (!gf_general_is_one(poly+power, w)) { + gf_general_val_to_s(poly+power, w, buf, 1); + if (n > 0) { + printf("(0x%s)", buf); + } else { + printf("0x%s", buf); + } + } + if (power == 0) { + if (gf_general_is_one(poly+power, w)) printf("1"); + } else if (power == 1) { + printf("x"); } else { - break; + printf("x^%d", power); } } - } else { - usage(argv[0]); - exit(1); + } + printf("\n"); + + if (!gf_general_is_one(poly+n, w)) { + printf("\n"); + printf("Can't do Ben-Or, because the polynomial is not monic.\n"); + exit(0); + } + + for (i = 1; i <= n/2; i++) { + x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod); + if (!gcd_one(&gf, w, n, poly, prod)) { + printf("Reducible.\n"); + exit(0); + } } - gf_poly_print(irred_poly, " specified via the command line\n"); - - ret = gf_poly_is_irred(irred_poly); - - if (ret < 0) { - gf_poly_print(irred_poly, " IS NOT irreducible\n"); - } else { - gf_poly_print(irred_poly, " IS irreducible\n"); - } - - return 0; + printf("Irreducible.\n"); + exit(0); } diff --git a/gf_time.c b/gf_time.c index 8313b05..55f3e11 100644 --- a/gf_time.c +++ b/gf_time.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include "gf_complete.h" #include "gf_method.h" @@ -43,10 +43,14 @@ void problem(char *s) exit(1); } +char *BM = "Bad Method: "; + void usage(char *s) { fprintf(stderr, "usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n"); fprintf(stderr, "\n"); + fprintf(stderr, "does unit testing in GF(2^w)\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n"); fprintf(stderr, "\n"); fprintf(stderr, "Tests may be any combination of:\n"); @@ -63,9 +67,12 @@ void usage(char *s) fprintf(stderr, "\n"); fprintf(stderr, "Use -1 for time(0) as a seed.\n"); fprintf(stderr, "\n"); - fprintf(stderr, "For method specification, type gf_methods\n"); - fprintf(stderr, "\n"); - if (s != NULL) fprintf(stderr, "%s\n", s); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } exit(1); } @@ -84,9 +91,15 @@ int main(int argc, char **argv) time_t t0; uint8_t *ra, *rb; gf_general_t a; + if (argc < 6) usage(NULL); - if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n"); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w[-pp]\n"); + } + + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); if (sscanf(argv[4], "%d", &size) == 0) usage("Bad size\n"); if (sscanf(argv[5], "%d", &iterations) == 0) usage("Bad iterations\n"); @@ -99,7 +112,7 @@ int main(int argc, char **argv) if ((w > 32 && w != 64 && w != 128) || w < 0) usage("Bad w"); if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n"); - if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage("Bad Method"); + if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM); strcpy(tests, ""); for (i = 0; i < argv[2][i] != '\0'; i++) { diff --git a/gf_unit.c b/gf_unit.c index 03911c4..fbc21f9 100644 --- a/gf_unit.c +++ b/gf_unit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "gf_complete.h" #include "gf_int.h" @@ -18,6 +19,8 @@ #include "gf_general.h" #define REGION_SIZE (16384) +#define RMASK (0x00000000ffffffffLL) +#define LMASK (0xffffffff00000000LL) void problem(char *s) { @@ -26,11 +29,14 @@ void problem(char *s) exit(1); } +char *BM = "Bad Method: "; + void usage(char *s) { fprintf(stderr, "usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n"); - fprintf(stderr, "\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Legal w are: 1 - 32, 64 
and 128\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); fprintf(stderr, "\n"); fprintf(stderr, "Tests may be any combination of:\n"); fprintf(stderr, " A: All\n"); @@ -40,16 +46,28 @@ void usage(char *s) fprintf(stderr, "\n"); fprintf(stderr, "Use -1 for time(0) as a seed.\n"); fprintf(stderr, "\n"); - fprintf(stderr, "For method specification, type gf_methods\n"); - fprintf(stderr, "\n"); - if (s != NULL) fprintf(stderr, "%s\n", s); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } exit(1); } +void SigHandler(int v) +{ + fprintf(stderr, "Problem: SegFault!\n"); + fflush(stdout); + exit(2); +} + int main(int argc, char **argv) { + signal(SIGSEGV, SigHandler); + int w, i, verbose, single, region, tested, top; - int start, end, xor; + int s_start, d_start, bytes, xor, alignment_test; gf_t gf, gf_def; time_t t0; gf_internal_t *h; @@ -61,15 +79,21 @@ int main(int argc, char **argv) char *ra, *rb, *rc, *rd, *target; int align; + if (argc < 4) usage(NULL); - if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n"); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w\n"); + } + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); if (t0 == -1) t0 = time(0); MOA_Seed(t0); if (w > 32 && w != 64 && w != 128) usage("Bad w"); - if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage("Bad Method"); + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage(BM); + printf("Size (bytes): %d\n", gf_size(&gf)); for (i = 0; i < strlen(argv[2]); i++) { if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n"); @@ -83,10 +107,18 @@ int main(int argc, char **argv) ai = (gf_general_t *) malloc(sizeof(gf_general_t)); bi = (gf_general_t *) malloc(sizeof(gf_general_t)); - ra = (char *) malloc(sizeof(char)*REGION_SIZE); - rb = (char *) malloc(sizeof(char)*REGION_SIZE); - rc = (char *) malloc(sizeof(char)*REGION_SIZE); - rd = (char *) malloc(sizeof(char)*REGION_SIZE); + //15 bytes extra to make sure it's 16byte aligned + ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rb = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rc = (char *) malloc(sizeof(char)*REGION_SIZE+15); + rd = (char *) malloc(sizeof(char)*REGION_SIZE+15); + + //this still assumes 8 byte aligned pointer from malloc + //(which is usual on 32-bit machines) + ra += (uint64_t)ra & 0xf; + rb += (uint64_t)rb & 0xf; + rc += (uint64_t)rc & 0xf; + rd += (uint64_t)rd & 0xf; if (w <= 32) { mask = 0; @@ -97,8 +129,9 @@ int main(int argc, char **argv) single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL); region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL); - if (!gf_init_easy(&gf_def, w)) problem("No default for this value of w"); - + if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + (h->mult_type != GF_MULT_COMPOSITE) ? 
h->prim_poly : 0, 0, 0, NULL, NULL))
+    problem("No default for this value of w");
 if (w == 4) {
   mult4 = gf_w4_get_mult_table(&gf);
   div4 = gf_w4_get_div_table(&gf);
@@ -129,21 +162,71 @@ int main(int argc, char **argv)
     if (w <= 10) {
       a->w32 = i % (1 << w);
       b->w32 = (i >> w);
-    } else if (i < 10) {
-      gf_general_set_zero(a, w);
-      gf_general_set_random(b, w, 1);
-    } else if (i < 20) {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_zero(b, w);
-    } else if (i < 30) {
-      gf_general_set_one(a, w);
-      gf_general_set_random(b, w, 1);
-    } else if (i < 40) {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_one(b, w);
+
+    //Allen: the following conditions were being run 10 times each. That didn't seem like nearly enough to
+    //me for these special cases, so I converted to doing this mod stuff to easily make the number of times
+    //run both larger and proportional to the total size of the run.
 } else {
-      gf_general_set_random(a, w, 1);
-      gf_general_set_random(b, w, 1);
+      switch (i % 32)
+      {
+        case 0:
+          gf_general_set_zero(a, w);
+          gf_general_set_random(b, w, 1);
+          break;
+        case 1:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_zero(b, w);
+          break;
+        case 2:
+          gf_general_set_one(a, w);
+          gf_general_set_random(b, w, 1);
+          break;
+        case 3:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_one(b, w);
+          break;
+        default:
+          gf_general_set_random(a, w, 1);
+          gf_general_set_random(b, w, 1);
+      }
+    }
+
+    //Allen: the following special cases for w=64 are based on the code below for w=128.
+    //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64
+    //involve splitting it in two. I think they're less likely to give errors than the 128-bit case
+    //though, because the 128 bit case is always split in two.
+    //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+    if (w == 64) {
+      switch (i % 32)
+      {
+        case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break;
+        case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break;
+        case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+        case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+      }
+    }
+
+    //Allen: for w=128, we have important special cases where one half or the other of the number is all
+    //zeros. The probability of hitting such a number randomly is 2^-64, so if we don't force these cases
+    //we'll probably never hit them. This could be implemented more efficiently by changing the set-random
+    //function for w=128, but I think this is easier to follow.
+ //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases + if (w == 128) { + switch (i % 32) + { + case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break; + case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break; + case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + } } tested = 0; @@ -195,10 +278,10 @@ int main(int argc, char **argv) gf_general_multiply(&gf_def, a, b, d); if (!gf_general_are_equal(c, d, w)) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); - gf_general_val_to_s(d, w, ds); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s\n", as, bs, cs); printf(" The default gf multiplier returned %s\n", ds); @@ -216,9 +299,9 @@ int main(int argc, char **argv) if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) || (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) || (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs); ; @@ -229,9 +312,9 @@ int main(int argc, char **argv) /* Dumb check to make sure that it's not returning numbers that are too big: */ if (w < 32 && (c->w32 & mask) != c->w32) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); printf("Error in single multiplication (all numbers in hex):\n\n"); printf(" gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs); exit(1); @@ -242,10 +325,10 @@ int main(int argc, char **argv) if (!gf_general_is_zero(a, w)) { gf_general_divide(&gf, c, a, d); if (!gf_general_are_equal(b, d, w)) { - gf_general_val_to_s(a, w, as); - gf_general_val_to_s(b, w, bs); - gf_general_val_to_s(c, w, cs); - gf_general_val_to_s(d, w, ds); + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); printf("Error in single multiplication/division (all numbers in hex):\n\n"); printf(" gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds); exit(1); @@ -257,40 +340,82 @@ int main(int argc, char **argv) if (region) { if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); } - for (i = 0; i < 1000; i++) { - if (i < 20) { - gf_general_set_zero(a, w); - } else if (i < 40) { - gf_general_set_one(a, w); - } else if (i < 60) { - gf_general_set_two(a, w); - } else { - 
gf_general_set_random(a, w, 1); + for (i = 0; i < 1024; i++) { + //Allen: changing to a switch thing as with the single ops to make things proportional + switch (i % 32) + { + case 0: + gf_general_set_zero(a, w); + break; + case 1: + gf_general_set_one(a, w); + break; + case 2: + gf_general_set_two(a, w); + break; + default: + gf_general_set_random(a, w, 1); } MOA_Fill_Random_Region(ra, REGION_SIZE); MOA_Fill_Random_Region(rb, REGION_SIZE); - xor = i%2; + xor = (i/32)%2; align = w/8; if (align == 0) align = 1; if (align > 16) align = 16; + + /* JSP - Cauchy test. When w < 32 & it doesn't equal 4, 8 or 16, the default is + equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing + three alignments here: + + 1. Anything goes -- no alignment guaranteed. + 2. Perfect alignment. Here src and dest must be aligned wrt each other, + and bytes must be a multiple of 16*w. + 3. Imperfect alignment. Here we'll have src and dest be aligned wrt each + other, but bytes is simply a multiple of w. That means some XOR's will + be aligned, and some won't. + */ + if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) { - start = MOA_Random_W(5, 1); - end = REGION_SIZE - MOA_Random_W(5, 1); + alignment_test = (i%3); + + s_start = MOA_Random_W(5, 1); + if (alignment_test == 0) { + d_start = MOA_Random_W(5, 1); + } else { + d_start = s_start; + } + + bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start; + bytes -= MOA_Random_W(5, 1); + if (alignment_test == 1) { + bytes -= (bytes % (w*16)); + } else { + bytes -= (bytes % w); + } + target = rb; - while ((end-start)%w != 0) end--; + + /* JSP - Otherwise, we're testing a non-cauchy test, and alignment + must be more strict. We have to make sure that the regions are + aligned wrt each other on 16-byte pointers. */ + } else { - start = MOA_Random_W(5, 1) * align; - end = REGION_SIZE - (MOA_Random_W(5, 1) * align); + s_start = MOA_Random_W(5, 1) * align; + d_start = s_start; + bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1); + bytes -= (bytes % align); + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { target = rb ; } else { - target = ((i%4)/2) ? rb : ra; + target = (i/64)%2 ? rb : ra; } } + memcpy(rc, ra, REGION_SIZE); memcpy(rd, target, REGION_SIZE); - gf_general_do_region_multiply(&gf, a, ra+start, target+start, end-start, xor); - gf_general_do_region_check(&gf, a, rc+start, rd+start, target+start, end-start, xor); + gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor); + gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor); } } } diff --git a/gf_w128.c b/gf_w128.c index 0a2a93f..1465be5 100644 --- a/gf_w128.c +++ b/gf_w128.c @@ -12,7 +12,7 @@ #define two_x(a) {\ a[0] <<= 1; \ - if (a[1] & (uint64_t) 1 << 63) a[0] ^= 1; \ + if (a[1] & 1ULL << 63) a[0] ^= 1; \ a[1] <<= 1; } #define a_get_b(a, i, b, j) {\ @@ -28,11 +28,18 @@ struct gf_w128_split_4_128_data { uint64_t tables[2][32][16]; }; +struct gf_w128_split_8_128_data { + uint64_t last_value[2]; + uint64_t tables[2][16][256]; +}; + typedef struct gf_group_tables_s { gf_val_128_t m_table; gf_val_128_t r_table; } gf_group_tables_t; +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } + static void gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, @@ -70,11 +77,120 @@ int xor) } } +static +void +gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + int i; + gf_val_128_t s128; + gf_val_128_t d128; + uint64_t c128[2]; + gf_region_data rd; +#ifdef INTEL_SSE4_PCLMUL + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + set_zero(c128, 0); + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. 
*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] = (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); + } + } +#endif +} + /* * Some w128 notes: * --Big Endian * --return values allocated beforehand */ + +#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0) + void gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { @@ -99,6 +215,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 set_zero(pl, 0); set_zero(pr, 0); + /* Allen: a*b for right half of a */ for (i = 0; i < GF_FIELD_WIDTH/2; i++) { if (a[1] & (one << i)) { pl[1] ^= bl[1]; @@ -112,6 +229,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 br[1] <<= 1; } + /* Allen: a*b for left half of a */ for (i = 0; i < GF_FIELD_WIDTH/2; i++) { if (a[0] & (one << i)) { pl[0] ^= bl[0]; @@ -125,10 +243,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 br[0] <<= 1; } - one = lbit; - ppl[0] = lbit; - ppl[1] = h->prim_poly >> 1; - ppr[0] = lbit; + /* Allen: do first half of reduction (based on left quarter of initial product) */ + one = lbit >> 1; + ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */ + ppl[1] = h->prim_poly >> 2; + ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2); ppr[1] = 0; while (one != 0) { if (pl[0] & one) { @@ -147,6 +266,7 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12 ppl[0] >>= 1; } + /* Allen: final half of reduction */ one = lbit; while (one != 0) { if (pl[1] & one) { @@ -162,12 +282,198 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t 
b128, gf_val_12 ppl[1] >>= 1; } + /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */ c128[0] = pr[0]; c128[1] = pr[1]; return; } +void +gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4_PCLMUL + + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0); + b = _mm_insert_epi64 (a, b128[1], 0); + a = _mm_insert_epi64 (a, a128[0], 1); + b = _mm_insert_epi64 (b, b128[0], 1); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + + /* we need to test algorithm 2 later*/ + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + + c128[0] = (uint64_t)_mm_extract_epi64(result1,1); + c128[1] = (uint64_t)_mm_extract_epi64(result1,0); +#endif +return; +} + +void +gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ + uint64_t topbit; /* this is used as a boolean value */ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + prod[0] = 0; + prod[1] = 0; + pmask = 0x8000000000000000ULL; + amask[0] = 0x8000000000000000ULL; + amask[1] = 0; + + while (amask[1] != 0 || amask[0] != 0) { + topbit = (prod[0] & pmask); + prod[0] <<= 1; + if (prod[1] & pmask) prod[0] ^= 1; + prod[1] <<= 1; + if (topbit) prod[1] ^= pp; + if ((a128[0] & amask[0]) || (a128[1] & amask[1])) { + prod[0] ^= b128[0]; + prod[1] ^= b128[1]; + } + amask[1] >>= 1; + if (amask[0] & 1) amask[1] ^= pmask; + amask[0] >>= 1; + } + c128[0] = prod [0]; + c128[1] = prod [1]; + return; +} + +void +gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + int i; + __m128i a, b, pp, one, prod, amask, l_middle_one, u_middle_one; + /*John: pmask is always the highest bit set, and the rest zeros. 
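(Editor's worked illustration of this loop, scaled down to GF(2^4) with
primitive polynomial x^4 + x + 1: to multiply a = 0110 by b = 0011, scan a
from its most significant bit; each step multiplies prod by x -- shift left,
XORing in the polynomial's low bits 0011 if a 1 falls off the top -- and then
XORs in b when the current bit of a is set:

    bit 3 of a = 0:  prod = 0000
    bit 2 of a = 1:  prod = 0000 ^ 0011 = 0011
    bit 1 of a = 1:  prod = 0110 ^ 0011 = 0101
    bit 0 of a = 0:  prod = 1010

and indeed (x^2 + x)(x + 1) = x^3 + x = 1010; no reduction fires here because
the top bit never shifts out. The 128-bit version is the same loop with the
product, masks, and reduction spread across two 64-bit words.)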
amask changes, it's a countdown.*/ + uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ + gf_internal_t *h; + + + h = (gf_internal_t *) gf->scratch; + pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + prod = _mm_setzero_si128(); + a = _mm_insert_epi64(prod, a128[1], 0x0); + a = _mm_insert_epi64(a, a128[0], 0x1); + b = _mm_insert_epi64(prod, b128[1], 0x0); + b = _mm_insert_epi64(b, b128[0], 0x1); + pmask = 0x80000000; + amask = _mm_insert_epi32(prod, 0x80000000, 0x3); + u_middle_one = _mm_insert_epi32(prod, 1, 0x2); + l_middle_one = _mm_insert_epi32(prod, 1 << 31, 0x1); + + for (i = 0; i < 64; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */ + if (middlebit) { + prod = _mm_xor_si128(prod, u_middle_one); + } + if (topbit) { + prod = _mm_xor_si128(prod, pp); + } + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/ + } + amask = _mm_insert_epi32(amask, 1 << 31, 0x1); + for (i = 64; i < 128; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); + if (middlebit) prod = _mm_xor_si128(prod, u_middle_one); + if (topbit) prod = _mm_xor_si128(prod, pp); + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); + } + c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); + c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); +#endif + return; +} + + +/* Ben: This slow function implements sse instrutions for bytwo_b because why not */ +void +gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + __m128i a, b, lmask, hmask, pp, c, middle_one; + gf_internal_t *h; + uint64_t topbit, middlebit; + + h = (gf_internal_t *) gf->scratch; + + c = _mm_setzero_si128(); + lmask = _mm_insert_epi64(c, 1ULL << 63, 0); + hmask = _mm_insert_epi64(c, 1ULL << 63, 1); + b = _mm_insert_epi64(c, a128[0], 1); + b = _mm_insert_epi64(b, a128[1], 0); + a = _mm_insert_epi64(c, b128[0], 1); + a = _mm_insert_epi64(a, b128[1], 0); + pp = _mm_insert_epi64(c, h->prim_poly, 0); + middle_one = _mm_insert_epi64(c, 1, 0x1); + + while (1) { + if (_mm_extract_epi32(a, 0x0) & 1) { + c = _mm_xor_si128(c, b); + } + middlebit = (_mm_extract_epi32(a, 0x2) & 1); + a = _mm_srli_epi64(a, 1); + if (middlebit) a = _mm_xor_si128(a, lmask); + if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){ + c128[0] = _mm_extract_epi64(c, 0x1); + c128[1] = _mm_extract_epi64(c, 0x0); + return; + } + topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1)); + middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0)); + b = _mm_slli_epi64(b, 1); + if (middlebit) b = _mm_xor_si128(b, middle_one); + if (topbit) b = _mm_xor_si128(b, pp); + } +#endif +} + void gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { @@ -177,7 +483,7 @@ gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_ h = (gf_internal_t *) gf->scratch; - bmask = (1L << 63); + bmask = (1ULL << 63); set_zero(c, 0); b[0] = a128[0]; b[1] = a128[1]; @@ -243,9 +549,9 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ ld->tables[0][i][k^j] = (v[0] ^ 
ld->tables[0][i][k]); ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); } - pp = (v[0] & (1L << 63)); + pp = (v[0] & (1ULL << 63)); v[0] <<= 1; - if (v[1] & (1L << 63)) v[0] ^= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; v[1] <<= 1; if (pp) v[1] ^= h->prim_poly; } @@ -254,6 +560,15 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ ld->last_value[0] = val[0]; ld->last_value[1] = val[1]; +/* + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]); + } + printf("\n"); + } + */ + i = 0; while (d64 < top) { v[0] = (xor) ? d64[0] : 0; v[1] = (xor) ? d64[1] : 0; @@ -280,6 +595,191 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_ } } +static +void +gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ +#ifdef INTEL_SSSE3 + gf_internal_t *h; + int i, m, j, k, tindex; + uint64_t pp, v[2], s, *s64, *d64, *top; + __m128i si, tables[32][16], p[16], v0, mask1; + struct gf_w128_split_4_128_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k]; + ld->tables[1-(j/8)][i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); + */ + } + } + + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 16; k++) { + v0 = _mm_load_si128((__m128i *) s64); + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 16; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + 
gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); +#endif +} + +static +void +gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_8_128_data *ld; + + /* Check on alignment. Ignore it otherwise. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_8_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 16; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < (1 << 8); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + s = s64[0]; + i = 8; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + void gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) { @@ -300,7 +800,7 @@ gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t va s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; - bmask = (1L << 63); + bmask = (1ULL << 63); while (d64 < top) { set_zero(c, 0); @@ -359,11 +859,7 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) uint64_t a128[2]; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - } else { - g_m = scratch->arg1; - } + g_m = scratch->arg1; prim_poly = scratch->prim_poly; @@ -385,10 +881,49 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) return; } +static +void gf_w128_group_m_sse_init(gf_t *gf, gf_val_128_t b128) +{ +#ifdef INTEL_SSE4 + int i, j; + int g_m; + uint64_t lbit, middlebit; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + + __m128i *table = (__m128i *)(gt->m_table), b, a, ubit, prim_poly; + prim_poly = _mm_insert_epi64(_mm_setzero_si128(), scratch->prim_poly, 0); + b = _mm_loadu_si128((__m128i *)(b128)); + + table[0] = _mm_setzero_si128(); + table[1] = table[0]; + table[1] = _mm_insert_epi64(table[1],b128[0],1); + table[1] = _mm_insert_epi64(table[1],b128[1],0); + lbit = 1; + lbit <<= 63; + ubit = _mm_set_epi32(0, 1, 0, 0); + for (i = 2; i < (1 << g_m); i <<= 1) { + a = table[(i >> 1)]; + middlebit = (_mm_extract_epi64(a, 0x0) & lbit); + a = _mm_slli_epi64(a, 1); + if (middlebit) a = _mm_xor_si128(a, ubit); + table[i] = a; + if 
(_mm_extract_epi64(table[i >> 1], 0x1) & lbit) table[i] = _mm_xor_si128(table[i], prim_poly); + for (j = 0; j < i; j++) { + table[i + j] = _mm_xor_si128(table[i], table[j]); + } + } + return; +#endif +} + void gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { - int i; + int i,j; /* index_r, index_m, total_m (if g_r > g_m) */ int i_r, i_m, t_m; int mask_m, mask_r; @@ -399,13 +934,8 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; mask_m = (1 << g_m) - 1; mask_r = (1 << g_r) - 1; @@ -413,7 +943,7 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) { gf_w128_group_m_init(gf, b128); } - + p_i[0] = 0; p_i[1] = 0; a[0] = a128[0]; @@ -458,11 +988,92 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_ i_r <<= g_m; } } - c128[0] = p_i[0]; c128[1] = p_i[1]; } +void +gf_w128_group_sse_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ +#ifdef INTEL_SSE4 + int i,j; + int i_r, i_m, t_m; + int mask_m, mask_r, mask_s; + int g_m, g_r; + uint32_t shiftbits; + uint64_t a[2], tbit = 1; + tbit <<= 63; + gf_internal_t *scratch; + gf_group_tables_t *gt; + __m128i p_i, *m_table, *r_table, zero; + + zero = _mm_setzero_si128(); + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + m_table = (__m128i *)(gt->m_table); + r_table = (__m128i *)(gt->r_table); + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + mask_s = mask_m << (32-g_m); /*sets g_m leftmost bits to 1*/ + if (b128[0] != _mm_extract_epi64(m_table[1], 1) || b128[1] != _mm_extract_epi64(m_table[1], 0)) { + gf_w128_group_m_sse_init(gf, b128); + } + + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (((uint64_t)_mm_extract_epi64(p_i,1)) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + c128[0] = _mm_extract_epi64(p_i, 1); + c128[1] = _mm_extract_epi64(p_i, 0); +#endif +} + static void gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) @@ -487,13 +1098,8 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - 
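/* Editor's note: the scalar and SSE GROUP routines share one scheme -- the
   product is accumulated g_m bits of a at a time: shift the 128-bit
   accumulator left by g_m, XOR in m_table[chunk] (precomputed multiples of
   b), and collect the g_m bits that fall off the top; once g_r such bits
   have gathered, fold them back in with a single XOR of r_table[i_r]. */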
if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; mask_m = (1 << g_m) - 1; mask_r = (1 << g_r) - 1; @@ -522,6 +1128,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, p_i[0] <<= g_m; p_i[0] ^= (p_i[1] >> (64-g_m)); p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; p_i[1] ^= gt->m_table[(2 * i_m) + 1]; t_m += g_m; @@ -533,7 +1140,6 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, i_r <<= g_m; } } - for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { i_m = (a[1] >> (i * g_m)) & mask_m; i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; @@ -564,9 +1170,162 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, } } +static +void +gf_w128_group_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4 + int i; + int i_r, i_m, t_m; + int mask_m, mask_r, mask_s; + int g_m, g_r; + uint32_t shiftbits; + uint64_t a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + gf_region_data rd; + uint64_t *a128, *c128, *top; + __m128i *m_table, *r_table, p_i, zero; + zero = _mm_setzero_si128(); + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + m_table = (__m128i *)(gt->m_table); + r_table = (__m128i *)(gt->r_table); + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + mask_s = mask_m << (32-g_m); + + if (val[0] != _mm_extract_epi64(m_table[1], 1) || val[1] != _mm_extract_epi64(m_table[1], 0)) { + gf_w128_group_m_sse_init(gf, val); + } + + a128 = (uint64_t *) src; + c128 = (uint64_t *) dest; + top = (uint64_t *) rd.d_top; + + if (xor){ + while (c128 < top) { + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + c128[0] ^= _mm_extract_epi64(p_i, 1); + c128[1] ^= _mm_extract_epi64(p_i, 0); + a128 += 2; + c128 += 2; + } + }else{ + while (c128 < top) { + p_i = zero; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + 
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r; + + shiftbits = _mm_extract_epi32(p_i, 1) & mask_s; + shiftbits >>= 32-g_m; + p_i = _mm_slli_epi64(p_i, g_m); + p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2)); + + p_i = _mm_xor_si128(p_i, m_table[i_m]); + t_m += g_m; + if (t_m == g_r) { + p_i = _mm_xor_si128(p_i, r_table[i_r]); + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + c128[0] = _mm_extract_epi64(p_i, 1); + c128[1] = _mm_extract_epi64(p_i, 0); + a128 += 2; + c128 += 2; + } + } +#endif +} /* a^-1 -> b */ -void + void gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t e_i[2], e_im1[2], e_ip1[2]; @@ -585,10 +1344,26 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) e_i[0] = a128[0]; e_i[1] = a128[1]; d_im1 = 128; + + //Allen: d_i starts at 63 here, and checks each bit of a, starting at the MSB, looking for the first nonzero bit. + //So d_i should be 0 if this half of a is all 0s; otherwise it is the position from the right of the first-from-the-left nonzero bit of this half of a. + //BUT if d_i is 0 at the end we won't know yet whether the rightmost bit of this half is 1 or not. + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ; + + //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet. + if (!((one << d_i) & e_i[0])) { - for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1] == 0); d_i--) ; + + //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a, + // but not bothering to test whether d_i hits zero, which is fine because we've already tested for a = 0. + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ; + } else { + + //Allen: if a 1 was found in the more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a. 
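+  //Example: if a has only bit 70 set, the first loop stops at d_i = 6 in the high half, and the "+= 64" below yields the true degree, 70.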
+ d_i += 64; } y_i[0] = 0; @@ -614,11 +1389,11 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i))); e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i)); } - d_ip1--; + d_ip1--; + if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; } while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--; while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--; } - gf->multiply.w128(gf, c_i, y_i, y_ip1); y_ip1[0] ^= y_im1[0]; y_ip1[1] ^= y_im1[1]; @@ -640,11 +1415,10 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) b = (uint64_t *) b128; b[0] = y_i[0]; b[1] = y_i[1]; - return; } -void + void gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) { uint64_t d[2]; @@ -653,7 +1427,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val return; } -void + void gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) { uint64_t one128[2]; @@ -663,21 +1437,209 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) return; } + +static + void +gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t c0, c1, d, tmp; + uint64_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w64(base_gf, a1); + c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w64(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w64(base_gf, a1); + a0inv = base_gf->inverse.w64(base_gf, a0); + + d = base_gf->multiply.w64(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w64(base_gf, tmp); + + d = base_gf->multiply.w64(base_gf, d, tmp); + + c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w64(base_gf, d, a1inv); + } + inv[0] = c1; + inv[1] = c0; +} + +static + void +gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = b[1]; + uint64_t b1 = b[0]; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t a1b1; + + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly); +} + +static + void +gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + unsigned long uls, uld; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = val[1]; + uint64_t b1 = val[0]; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + 
base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } else { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } +} + +static +void +gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int + xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; + gf_val_64_t val0 = val[1]; + gf_val_64_t val1 = val[0]; + uint64_t *l, *hi; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); + gf_w128_multiply_region_from_single(gf, src, dest, val, (rd.s_start-src), xor); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 + ), sub_reg_size, 1); + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, (src+bytes)-rd.s_top, xor); +} + + + static +int gf_w128_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt; + } else { + gf->multiply_region.w128 = gf_w128_composite_multiply_region; + } + + gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; + + gf->multiply.w128 = gf_w128_composite_multiply; + gf->divide.w128 = gf_w128_divide_from_inverse; + gf->inverse.w128 = gf_w128_composite_inverse; + + return 1; +} + +static +int gf_w128_cfm_init(gf_t *gf) +{ +#ifdef INTEL_SSE4_PCLMUL + gf->inverse.w128 = gf_w128_euclid; + gf->multiply.w128 = gf_w128_clm_multiply; + gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single; + return 1; +#endif + + return 0; +} + static int gf_w128_shift_init(gf_t *gf) { + gf_internal_t *h; + h = (gf_internal_t*) gf->scratch; gf->multiply.w128 = gf_w128_shift_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_multiply_region_from_single; return 1; } -static + static int gf_w128_bytwo_init(gf_t *gf) { - gf->multiply.w128 = gf_w128_bytwo_b_multiply; + gf_internal_t *h; + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + gf->multiply.w128 = gf_w128_bytwo_p_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/ + /* John: the sse function is slower.*/ + } else { + gf->multiply.w128 = gf_w128_bytwo_b_multiply; + /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply; +Ben: This sse function is also slower. 
*/ + } gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_multiply_region_from_single; gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region; return 1; } @@ -686,7 +1648,7 @@ int gf_w128_bytwo_init(gf_t *gf) * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64 * bits in all of these numbers. */ -static + static void gf_w128_group_r_init(gf_t *gf) { int i, j; @@ -696,11 +1658,7 @@ void gf_w128_group_r_init(gf_t *gf) gf_group_tables_t *gt; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_r = 8; - } else { - g_r = scratch->arg2; - } + g_r = scratch->arg2; pp = scratch->prim_poly; gt->r_table[0] = 0; @@ -715,20 +1673,76 @@ void gf_w128_group_r_init(gf_t *gf) return; } -static + static +void gf_w128_group_r_sse_init(gf_t *gf) +{ +#ifdef INTEL_SSE4 + int i, j; + int g_r; + uint64_t pp; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + __m128i zero = _mm_setzero_si128(); + __m128i *table = (__m128i *)(gt->r_table); + g_r = scratch->arg2; + pp = scratch->prim_poly; + table[0] = zero; + for (i = 1; i < (1 << g_r); i++) { + table[i] = zero; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0)); + } + } + } + return; +#endif +} + + static int gf_w128_split_init(gf_t *gf) { - struct gf_w128_split_4_128_data *sd; + struct gf_w128_split_4_128_data *sd4; + struct gf_w128_split_8_128_data *sd8; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - sd = (struct gf_w128_split_4_128_data *) h->private; - sd->last_value[0] = 0; - sd->last_value[1] = 0; - gf->multiply.w128 = gf_w128_bytwo_b_multiply; + gf->multiply.w128 = gf_w128_bytwo_p_multiply; +#ifdef INTEL_SSE4_PCLMUL + if (!(h->region_type & GF_REGION_NOSSE)){ + gf->multiply.w128 = gf_w128_clm_multiply; + } +#endif + gf->inverse.w128 = gf_w128_euclid; - gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + + if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { + sd8 = (struct gf_w128_split_8_128_data *) h->private; + sd8->last_value[0] = 0; + sd8->last_value[1] = 0; + gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region; + } else { + sd4 = (struct gf_w128_split_4_128_data *) h->private; + sd4->last_value[0] = 0; + sd4->last_value[1] = 0; + if((h->region_type & GF_REGION_ALTMAP)) + { + #ifdef INTEL_SSE4 + if(!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; + else + return 0; + #else + return 0; + #endif + } + else { + gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; + } + } return 1; } @@ -739,16 +1753,12 @@ int gf_w128_group_init(gf_t *gf) gf_internal_t *scratch; gf_group_tables_t *gt; int g_m, g_r, size_r; + long tmp; scratch = (gf_internal_t *) gf->scratch; gt = scratch->private; - if (scratch->mult_type == GF_MULT_DEFAULT) { - g_m = 4; - g_r = 8; - } else { - g_m = scratch->arg1; - g_r = scratch->arg2; - } + g_m = scratch->arg1; + g_r = scratch->arg2; size_r = (1 << g_r); gt->r_table = scratch->private + (2 * sizeof(uint64_t *)); @@ -756,11 +1766,30 @@ int gf_w128_group_init(gf_t *gf) gt->m_table[2] = 0; gt->m_table[3] = 0; - gf_w128_group_r_init(gf); - gf->multiply.w128 = gf_w128_group_multiply; gf->inverse.w128 = gf_w128_euclid; gf->multiply_region.w128 = gf_w128_group_multiply_region; + + #ifdef INTEL_SSE4 + if(!(scratch->region_type & GF_REGION_NOSSE)) + 
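+  /* The SSE tables are kept as __m128i's: r_table must be forced onto a 16-byte boundary below, and this path assumes g_m == 4 with g_r equal to 4 or 8. */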
{ + if ((g_m != 4) || ((g_r != 4) && (g_r != 8))) + return 0; + gt->r_table = (void *)(((uint64_t)gt->r_table + 15) & (~0xfULL)); /* aligns gt->r_table on a 16-byte boundary */ + gt->m_table = gt->r_table + 2*size_r; + gt->m_table[2] = 0; + gt->m_table[3] = 0; + gf->multiply.w128 = gf_w128_group_sse_multiply; + gf->multiply_region.w128 = gf_w128_group_sse_multiply_region; + gf_w128_group_r_sse_init(gf); + } + else + gf_w128_group_r_init(gf); + #else + if(scratch->region_type & GF_REGION_SSE) return 0; + else gf_w128_group_r_init(gf); + #endif + return 1; } @@ -773,88 +1802,175 @@ void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_12 memcpy(rv, s, 16); } +static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int i, blocks; + uint64_t *r64, tmp; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + + index -= (((uint64_t *) rd.d_start) - r64)/2; + r64 = (uint64_t *) rd.d_start; + + blocks = index/16; + r64 += (blocks*32); + index %= 16; + r8 = (uint8_t *) r64; + r8 += index; + rv[0] = 0; + rv[1] = 0; + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[1] |= (tmp << (i*8)); + r8 += 16; + } + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[0] |= (tmp << (i*8)); + r8 += 16; + } + return; +} + + static +void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + index -= (((uint64_t *) rd.d_start) - r64)/2; + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index); + rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index); + + return; +} + int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int size_m, size_r; int w = 128; + if (divide_type==GF_DIVIDE_MATRIX) return 0; + switch(mult_type) { + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; + case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; - case GF_MULT_SPLIT_TABLE: - if (region_type != 0) return -1; - if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { - return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; - } - return -1; - break; case GF_MULT_DEFAULT: - arg1 = 4; - arg2 = 8; + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; + } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64; + } + return 0; + break; case GF_MULT_GROUP: - - /* arg1 == mult size, arg2 == reduce size */ - /* 
Should prevent anything over arg1 > 16 || arg2 > 16 */ - if (region_type != 0) return -1; - if (arg1 <= 0 || arg2 <= 0 || arg1 > 16 || arg2 > 16) return -1; - if (GF_FIELD_WIDTH % arg1 != 0 || GF_FIELD_WIDTH % arg2 != 0) return -1; - /* - * Currently implementing code where g_m and g_r are the same or where g_r is larger, as - * these it is more efficient to have g_r as large as possible (but still not > 16) - */ - if (arg1 > arg2) return -1; - - /* size of each group, 128 bits */ + /* JSP We've already error checked the arguments. */ size_m = (1 << arg1) * 2 * sizeof(uint64_t); - /* The PP is only 8 bits and we are limiting g_r to 16, so only uint64_t */ - size_r = (1 << arg2) * sizeof(uint64_t); - + size_r = (1 << arg2) * 2 * sizeof(uint64_t); /* * two pointers prepend the table data for structure * because the tables are of dynamic size */ - return sizeof(gf_internal_t) + size_m + size_r + 2 * sizeof(uint64_t *); + return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *); + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) { + return sizeof(gf_internal_t) + 4; + } else { + return 0; + } + break; + default: - return -1; + return 0; } } int gf_w128_init(gf_t *gf) { - gf_internal_t *h; + gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } gf->multiply.w128 = NULL; gf->divide.w128 = NULL; gf->inverse.w128 = NULL; gf->multiply_region.w128 = NULL; - switch(h->mult_type) { + case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break; - case GF_MULT_DEFAULT: case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break; default: return 0; } - gf->extract_word.w128 = gf_w128_extract_word; + /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there + are multiple flags in h->region_type */ + if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w128 = gf_w128_split_extract_word; + } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { + gf->extract_word.w128 = gf_w128_composite_extract_word; + } else { + gf->extract_word.w128 = gf_w128_extract_word; + } if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_euclid; - } /* } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w128 = gf_w128_divide_from_inverse; - gf->inverse.w128 = gf_w128_matrix; - } */ + } if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { gf->divide.w128 = gf_w128_divide_from_inverse; diff --git a/gf_w16.c b/gf_w16.c index 
e8b48fd..6bc25a6 100644 --- a/gf_w16.c +++ b/gf_w16.c @@ -14,50 +14,47 @@ #define GF_BASE_FIELD_WIDTH (8) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_S_GF_8_2 (63) -struct gf_logtable_data { +struct gf_w16_logtable_data { uint16_t log_tbl[GF_FIELD_SIZE]; uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; uint16_t inv_tbl[GF_FIELD_SIZE]; uint16_t *d_antilog; }; -struct gf_zero_logtable_data { - int log_tbl[GF_FIELD_SIZE]; +struct gf_w16_zero_logtable_data { + int log_tbl[GF_FIELD_SIZE]; uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; uint16_t *antilog_tbl; uint16_t inv_tbl[GF_FIELD_SIZE]; }; -struct gf_lazytable_data { - int log_tbl[GF_FIELD_SIZE]; +struct gf_w16_lazytable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; uint16_t inv_tbl[GF_FIELD_SIZE]; uint16_t lazytable[GF_FIELD_SIZE]; }; -struct gf_w8_logtable_data { - uint8_t log_tbl[GF_BASE_FIELD_SIZE]; - uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_w8_single_table_data { - uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE]; -}; - struct gf_w16_bytwo_data { uint64_t prim_poly; uint64_t mask1; uint64_t mask2; }; +struct gf_w16_split_8_8_data { + uint16_t tables[3][256][256]; +}; + struct gf_w16_group_4_4_data { uint16_t reduce[16]; uint16_t shift[16]; }; +struct gf_w16_composite_data { + uint8_t *mult_table; +}; + #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ t2 = b & am2; \ @@ -72,6 +69,9 @@ struct gf_w16_group_4_4_data { #define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } +#define GF_FIRST_BIT (1 << 15) +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? 
(((p) << 1) ^ h->prim_poly) : (p) << 1) + static inline gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) @@ -120,6 +120,212 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } +static +void +gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to 
see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + static inline gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) @@ -146,6 +352,7 @@ gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -227,16 +434,146 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only include it for completeness. It does have the feature that it requires no extra memory. 
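  It builds the full double-width carryless product one bit at a time and then folds the high bits back down with the primitive polynomial.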
-*/ + */ static inline gf_val_32_t +gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2, where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 2 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back into the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
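+     After the final reduction everything above bit 15 is zero, so the low 32 bits hold the 16-bit product.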
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + +#endif + return rv; +} + + +static +inline + gf_val_32_t gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) { gf_val_32_t product, i, pp, a, b; gf_internal_t *h; - + a = a16; b = b16; h = (gf_internal_t *) gf->scratch; @@ -247,7 +584,7 @@ gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (1 << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; @@ -257,11 +594,37 @@ static int gf_w16_shift_init(gf_t *gf) { gf->multiply.w32 = gf_w16_shift_multiply; - gf->inverse.w32 = gf_w16_euclid; - gf->multiply_region.w32 = gf_w16_multiply_region_from_single; return 1; } +static +int gf_w16_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Determining how many reductions to do */ + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfe00 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_2; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_2; + } else if((0xf000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_3; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_3; + } else if ((0xe000 & h->prim_poly) == 0) { + gf->multiply.w32 = gf_w16_clm_multiply_4; + gf->multiply_region.w32 = gf_w16_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; +#endif + + return 0; +} + /* KMG: GF_MULT_LOGTABLE: */ static @@ -270,7 +633,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int { uint16_t *s16, *d16; int lv; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -279,7 +642,7 @@ gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; @@ -306,9 +669,9 @@ inline gf_val_32_t gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; } @@ -318,10 +681,10 @@ gf_val_32_t gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; if (a == 0 || b == 0) return 0; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; return (ltd->d_antilog[log_sum]); @@ -331,9 +694,9 @@ static gf_val_32_t gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) { - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } @@ -341,17 +704,20 @@ static int gf_w16_log_init(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; int i, b; + int check = 0; h = (gf_internal_t *) gf->scratch; ltd = h->private; - - ltd->log_tbl[0] = 0; + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) + ltd->log_tbl[i] = 0; ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (ltd->log_tbl[b] != 0) check = 1; ltd->log_tbl[b] = i; ltd->antilog_tbl[i] = b; ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; @@ -360,6 +726,24 @@ int gf_w16_log_init(gf_t *gf) b = b ^ h->prim_poly; } } + + /* If you can't construct the log table, there's a problem. This code is used for + some other implementations (e.g. in SPLIT), so if the log table doesn't work in + that instance, use CARRY_FREE / SHIFT instead. */ + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) { + +#ifdef INTEL_SSE4_PCLMUL + return gf_w16_cfm_init(gf); +#endif + return gf_w16_shift_init(gf); + } else { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + } + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ ltd->inv_tbl[1] = 1; for (i = 2; i < GF_FIELD_SIZE; i++) { @@ -377,8 +761,76 @@ int gf_w16_log_init(gf_t *gf) /* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. */ -static + +/* Ben: Does alternate mapping multiplication using a split table in the + lazy method without sse instructions*/ + +static void +gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, a, b, c, prod; + uint8_t *s8, *d8, *top; + gf_internal_t *h; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Constructs lazy multiplication table*/ + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + + + while (d8 < top) { + + /*Ben: Multiplies across 16 two byte quantities using alternate mapping + high bits are on the left, low bits are on the right. */ + + for (j=0;j<16;j++) { + + /*Ben: If the xor flag is set, the product should include what is in dest */ + prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; + + /*Ben: xors all 4 table lookups into the product variable*/ + + prod ^= ((table[0][*(s8+16)&0xf]) ^ + (table[1][(*(s8+16)&0xf0)>>4]) ^ + (table[2][*(s8)&0xf]) ^ + (table[3][(*(s8)&0xf0)>>4])); + + /*Ben: Stores product in the destination and moves on*/ + + *d8 = (uint8_t)(prod >> 8); + *(d8+16) = (uint8_t)(prod & 0x00ff); + s8++; + d8++; + } + s8+=16; + d8+=16; + } + gf_do_final_region_alignment(&rd); +} + +static + void gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t i, j, a, c, prod; @@ -391,14 +843,14 @@ gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - gf_do_initial_region_alignment(&rd); + gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - table[i][j] = gf_w16_log_multiply(gf, c, val); + table[i][j] = gf->multiply.w32(gf, c, val); } } @@ -423,7 +875,7 @@ static void gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - uint64_t j, a, c, prod, *s64, *d64, *top64; + uint64_t j, k, v, a, c, prod, *s64, *d64, *top64; gf_internal_t *h; uint64_t htable[256], ltable[256]; gf_region_data rd; @@ -436,9 +888,16 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 h = (gf_internal_t *) gf->scratch; - for (j = 0; j < 256; j++) { - ltable[j] = gf_w16_log_multiply(gf, j, val); - htable[j] = gf_w16_log_multiply(gf, (j<<8), val); + v = val; + ltable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); + v = GF_MULTBY_TWO(v); + } + htable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); + v = GF_MULTBY_TWO(v); } s64 = (uint64_t *) rd.s_start; @@ -472,8 +931,8 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 prod ^= ltable[a >> 56]; prod ^= ((xor) ? *d64 : 0); *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } */ @@ -489,10 +948,12 @@ gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_3 a <<= 8; } + //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. + prod ^= ((xor) ? 
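 /* fold the current destination word into the product when xor is set */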
*d64 : 0); *d64 = prod; - *s64++; - *d64++; + s64++; + d64++; } gf_do_final_region_alignment(&rd); } @@ -502,7 +963,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v { uint64_t j, a, c, pp; gf_internal_t *h; - struct gf_lazytable_data *ltd; + struct gf_w16_lazytable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -512,7 +973,7 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; - ltd = (struct gf_lazytable_data *) h->private; + ltd = (struct gf_w16_lazytable_data *) h->private; ltd->lazytable[0] = 0; @@ -530,9 +991,8 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } while (c != 1); */ - a = ltd->log_tbl[val]; for (c = 1; c < GF_FIELD_SIZE; c++) { - ltd->lazytable[c] = ltd->antilog_tbl[ltd->log_tbl[c]+a]; + ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); } gf_two_byte_region_table_multiply(&rd, ltd->lazytable); @@ -543,7 +1003,7 @@ static void gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t a, c, prod; uint8_t low[4][16]; @@ -561,7 +1021,7 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - prod = gf_w16_log_multiply(gf, c, val); + prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } @@ -676,7 +1136,7 @@ static void gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint64_t i, j, *s64, *d64, *top64;; uint64_t c, prod; uint8_t low[4][16]; @@ -694,7 +1154,7 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des for (j = 0; j < 16; j++) { for (i = 0; i < 4; i++) { c = (j << (i*4)); - prod = gf_w16_log_multiply(gf, c, val); + prod = gf->multiply.w32(gf, c, val); low[i][j] = (prod & 0xff); high[i][j] = (prod >> 8); } @@ -782,32 +1242,111 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des #endif } +uint32_t +gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t alow, blow; + struct gf_w16_split_8_8_data *d8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w16_split_8_8_data *) h->private; + + alow = a & 0xff; + blow = b & 0xff; + a >>= 8; + b >>= 8; + + return d8->tables[0][alow][blow] ^ + d8->tables[1][alow][b] ^ + d8->tables[1][a][blow] ^ + d8->tables[2][a][b]; +} + static int gf_w16_split_init(gf_t *gf) { gf_internal_t *h; - gf_w16_log_init(gf); + struct gf_w16_split_8_8_data *d8; + int i, j, exp, issse3; + uint32_t p, basep; h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; +issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; #endif - } else if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { - gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; - } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = 
gf_w16_split_4_16_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w16_split_8_8_data *) h->private; + basep = 1; + for (exp = 0; exp < 3; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + gf->multiply.w32 = gf_w16_split_8_8_multiply; + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + return 1; + + } + + /* We'll be using LOG for multiplication, unless the pp isn't primitive. + In that case, we'll be using SHIFT. */ + + gf_w16_log_init(gf); + + /* Defaults */ + + if (issse3) { + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; + } else { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + } + + + if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { + gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; + + } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { + if (issse3) { + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; } else { - gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + else if(h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; + else + gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; } } + return 1; } @@ -818,7 +1357,7 @@ int gf_w16_table_init(gf_t *gf) gf_w16_log_init(gf); h = (gf_internal_t *) gf->scratch; - gf->multiply_region.w32 = NULL; + gf->multiply_region.w32 = gf_w16_table_lazy_multiply_region; return 1; } @@ -830,7 +1369,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val uint16_t lv; int i; uint16_t *s16, *d16, *top16; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } @@ -839,7 +1378,7 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); gf_do_initial_region_alignment(&rd); - ltd = (struct gf_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; s16 = (uint16_t *) rd.s_start; d16 = (uint16_t *) rd.d_start; top16 = (uint16_t *) rd.d_top; @@ -858,18 +1397,20 @@ gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val } /* This isn't necessary. 
*/ + gf_do_final_region_alignment(&rd); } /* Here -- double-check Kevin */ + static inline gf_val_32_t gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; } @@ -879,10 +1420,10 @@ gf_val_32_t gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { int log_sum = 0; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; if (a == 0 || b == 0) return 0; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); return (ltd->antilog_tbl[log_sum]); @@ -892,9 +1433,9 @@ static gf_val_32_t gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) { - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data *ltd; - ltd = (struct gf_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (ltd->inv_tbl[a]); } @@ -1015,7 +1556,7 @@ static void gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; @@ -1079,7 +1620,7 @@ static void gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1105,7 +1646,7 @@ static void gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1134,7 +1675,7 @@ static void gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1352,20 +1893,30 @@ int gf_w16_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w16_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w16_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } - gf->inverse.w32 = gf_w16_euclid; + return 1; } @@ -1373,7 +1924,7 @@ static int gf_w16_log_zero_init(gf_t *gf) { gf_internal_t *h; - struct gf_zero_logtable_data *ltd; + struct gf_w16_zero_logtable_data 
*ltd; int i, b; h = (gf_internal_t *) gf->scratch; @@ -1423,30 +1974,30 @@ gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); + rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); return rv; } static gf_val_32_t -gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w8_single_table_data * std; - + gf_t *base_gf = h->base_gf; uint8_t b0 = b & 0x00ff; uint8_t b1 = (b & 0xff00) >> 8; uint8_t a0 = a & 0x00ff; uint8_t a1 = (a & 0xff00) >> 8; - uint8_t a1b1; + uint8_t a1b1, *mt; uint16_t rv; + struct gf_w16_composite_data *cd; - std = (struct gf_w8_single_table_data *) h->private; + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; - a1b1 = std->mult[a1][b1]; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); - rv = ((std->mult[a0][b0] ^ a1b1) | - ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); + rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); return rv; } @@ -1472,6 +2023,7 @@ gf_w16_composite_multiply_table(gf_t *gf, gf_val_32_t a, gf_val_32_t b) * * a / b = a * c */ + static gf_val_32_t gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) @@ -1486,7 +2038,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_8_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -1497,7 +2049,7 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_8_2); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -1511,62 +2063,6 @@ gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) return c; } -static -gf_val_32_t -gf_w16_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - uint16_t binv; - - binv = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, binv); -} - -static -void -gf_w16_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - struct gf_w8_single_table_data * std; - uint8_t b0 = val & 0x00ff; - uint8_t b1 = (val & 0xff00) >> 8; - uint16_t *s16, *d16, *top; - uint8_t a0, a1, a1b1; - struct gf_logtable_data *ltd; - gf_region_data rd; - - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } - - std = (struct gf_w8_single_table_data *) h->private; - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); - - s16 = rd.s_start; - d16 = rd.d_start; - top = rd.d_top; - - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = 
((*s16) & 0xff00) >> 8; - a1b1 = std->mult[a1][b1]; - - *d16 ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); - s16++; - d16++; - } - } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = std->mult[a1][b1]; - - *d16 = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_8_2]) << 8)); - s16++; - d16++; - } - } -} - static void gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) @@ -1577,9 +2073,13 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va uint8_t b0 = val & 0x00ff; uint8_t b1 = (val & 0xff00) >> 8; uint16_t *s16, *d16, *top; - uint8_t a0, a1, a1b1; + uint8_t a0, a1, a1b1, *mt; gf_region_data rd; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); @@ -1588,27 +2088,61 @@ gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t va d16 = rd.d_start; top = rd.d_top; - if (xor) { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); - s16++; - d16++; + if (mt == NULL) { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } } } else { - while (d16 < top) { - a0 = (*s16) & 0x00ff; - a1 = ((*s16) & 0xff00) >> 8; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_8_2)) << 8)); - s16++; - d16++; + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } } } } @@ -1645,7 +2179,7 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void 
*src, void *dest, gf_val_32_ base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_8_2, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1653,34 +2187,26 @@ gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_ static int gf_w16_composite_init(gf_t *gf) { - struct gf_w8_single_table_data * std; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch; - uint16_t a, b; + struct gf_w16_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w16_composite_data *) h->private; + cd->mult_table = gf_w8_get_mult_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w16_composite_multiply_region_alt; - } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_TABLE && - base_h->region_type == GF_REGION_DEFAULT) { - gf->multiply_region.w32 = gf_w16_composite_multiply_region_inline; } else { gf->multiply_region.w32 = gf_w16_composite_multiply_region; } - - if (h->arg2 == 0) { - std = (struct gf_w8_single_table_data *) h->private; - for (a = 0; a < 256; a++) { - for (b = 0; b < 256; b++) { - std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b); - } - } - gf->multiply.w32 = gf_w16_composite_multiply_table; - } else { - gf->multiply.w32 = gf_w16_composite_multiply_recursive; - } - gf->divide.w32 = gf_w16_composite_divide; + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w16_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w16_composite_multiply_inline; + } + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w16_composite_inverse; return 1; @@ -1815,79 +2341,50 @@ int gf_w16_group_init(gf_t *gf) int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss; - int sa; - - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - switch(mult_type) { case GF_MULT_TABLE: - region_type |= GF_REGION_LAZY; - if (arg1 != 0 || arg2 != 0 || region_type != GF_REGION_LAZY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_lazytable_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0 || (region_type | ss) != ss || - (region_type & ss) == ss) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); break; - case GF_MULT_DEFAULT: - case GF_MULT_LOG_TABLE: - if (arg2 != 0) return -1; - if (region_type != GF_REGION_DEFAULT) return -1; - if (arg1 == 1) { - return sizeof(gf_internal_t) + sizeof(struct gf_zero_logtable_data) + 64; - } else if (arg1 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else { - return -1; - } + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + break; + case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: - if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { - region_type |= GF_REGION_LAZY; - if 
(region_type != GF_REGION_LAZY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else if ((arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { - region_type &= (~GF_REGION_LAZY); /* Ignore GF_REGION_LAZY */ - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if ((region_type & ss) == 0) region_type |= GF_REGION_SSE; - if (region_type & GF_REGION_NOSSE) { - if (region_type != GF_REGION_NOSSE) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } else { - if ((region_type | ss | sa) != (ss|sa)) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; - } - } - return -1; - break; - case GF_MULT_GROUP: - if (arg1 == 4 && arg2 == 4) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; + } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } else if (mult_type == GF_MULT_DEFAULT || + (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; } - return -1; + return 0; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if (arg1 == 2 && arg2 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; - } else if (arg1 == 2 && arg2 == 1) { - return sizeof(gf_internal_t) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; + break; default: - return -1; + return 0; } + return 0; } int gf_w16_init(gf_t *gf) @@ -1895,7 +2392,27 @@ int gf_w16_init(gf_t *gf) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x1100b; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; + } else { + + /* Allen: use the following primitive polynomial to make + carryless multiply work more efficiently for GF(2^16). 
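(0x1002d is x^16 + x^5 + x^3 + x^2 + 1; the long run of zero coefficients just below the leading term keeps the number of PCLMUL fold-and-reduce steps at its minimum, per the (w-2)/z rule Ben notes in gf_w32.c below.)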
+ + h->prim_poly = 0x1002d; + + The following is the traditional primitive polynomial for GF(2^16) */ + + h->prim_poly = 0x1100b; + } + } + + if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); gf->multiply.w32 = NULL; gf->divide.w32 = NULL; @@ -1903,21 +2420,17 @@ int gf_w16_init(gf_t *gf) gf->multiply_region.w32 = NULL; switch(h->mult_type) { - case GF_MULT_LOG_TABLE: - if (h->arg1 == 1) { - if (gf_w16_log_zero_init(gf) == 0) return 0; - } else { - if (gf_w16_log_init(gf) == 0) return 0; - } - break; + case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { @@ -1928,23 +2441,28 @@ int gf_w16_init(gf_t *gf) gf->inverse.w32 = gf_w16_matrix; } - if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w16_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w16_inverse_from_divide; - } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w16_inverse_from_divide; + if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w32 = gf_w16_composite_extract_word; } else { gf->extract_word.w32 = gf_w16_split_extract_word; } + } else if (h->region_type == GF_REGION_CAUCHY) { + gf->multiply_region.w32 = gf_wgen_cauchy_region; + gf->extract_word.w32 = gf_wgen_extract_word; } else { gf->extract_word.w32 = gf_w16_extract_word; } + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w16_multiply_region_from_single; + } return 1; } @@ -1953,11 +2471,11 @@ int gf_w16_init(gf_t *gf) uint16_t *gf_w16_get_log_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; return (uint16_t *) ltd->log_tbl; } return NULL; @@ -1966,11 +2484,11 @@ uint16_t *gf_w16_get_log_table(gf_t *gf) uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) h->private; + ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) 
ltd->antilog_tbl; } return NULL; @@ -1979,11 +2497,11 @@ uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) uint16_t *gf_w16_get_div_alog_table(gf_t *gf) { gf_internal_t *h; - struct gf_logtable_data *ltd; + struct gf_w16_logtable_data *ltd; h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w16_log_multiply) { - ltd = (struct gf_logtable_data *) h->private; + ltd = (struct gf_w16_logtable_data *) h->private; return (uint16_t *) ltd->d_antilog; } return NULL; diff --git a/gf_w32.c b/gf_w32.c index b0ba8c5..cae188f 100644 --- a/gf_w32.c +++ b/gf_w32.c @@ -15,24 +15,14 @@ #define GF_BASE_FIELD_WIDTH (16) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) #define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_S_GF_16_2 (40188) -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); - - -struct gf_w16_logtable_data { - int log_tbl[GF_BASE_FIELD_SIZE]; - uint16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4]; - uint16_t *antilog_tbl; - uint16_t inv_tbl[GF_BASE_FIELD_SIZE]; - uint32_t log_s; -}; +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) struct gf_split_2_32_lazy_data { uint32_t tables[16][4]; uint32_t last_value; }; -struct gf_split_8_8_data { +struct gf_w32_split_8_8_data { uint32_t tables[7][256][256]; uint32_t region_tables[4][256]; uint32_t last_value; @@ -67,6 +57,11 @@ struct gf_w32_bytwo_data { uint64_t mask2; }; +struct gf_w32_composite_data { + uint16_t *log; + uint16_t *alog; +}; + #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? 
" " : " ", blah[15-ii]); printf("\n"); } @@ -121,6 +116,168 @@ xor) } } +static +void +gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + +#ifdef INTEL_SSE4_PCLMUL + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + +static +void +gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + +#ifdef INTEL_SSE4_PCLMUL + + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + +static +void +gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4_PCLMUL + int i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = 
gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +#endif +} + static inline uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) @@ -131,7 +288,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) uint32_t c_i; if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i = b; d_im1 = 32; for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; @@ -148,6 +305,7 @@ uint32_t gf_w32_euclid (gf_t *gf, uint32_t b) c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); d_ip1--; + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -237,6 +395,134 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) extra memory. */ + + + +static +inline +gf_val_32_t +gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 4 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
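After the two folds above, every coefficient of degree 32 or higher has been replaced through x^32 == prim_poly (mod the field polynomial), so the fully reduced product occupies only the low 32 bits of the register.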
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} +static +inline +gf_val_32_t +gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
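Four folds are needed here rather than two: the 0xfe000000 mask test in the init routines admits polynomials with as few as seven zero coefficients after the leading 1, so each fold clears fewer high-order bits.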
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + + static inline uint32_t @@ -244,7 +530,7 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) { uint64_t product, i, pp, a, b, one; gf_internal_t *h; - + a = a32; b = b32; h = (gf_internal_t *) gf->scratch; @@ -256,37 +542,63 @@ gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (one << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; } -static -int gf_w32_shift_init(gf_t *gf) + static +int gf_w32_cfm_init(gf_t *gf) { - gf->multiply.w32 = gf_w32_shift_multiply; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + gf->inverse.w32 = gf_w32_euclid; gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + + /*Ben: We also check to see if the prim poly will work for pclmul */ + /*Ben: Check to see how many reduction steps it will take*/ + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_2; + }else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_3; + }else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + gf->multiply_region.w32 = gf_w32_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; + #endif + + return 0; +} + + static +int gf_w32_shift_init(gf_t *gf) +{ + gf->inverse.w32 = gf_w32_euclid; + gf->multiply_region.w32 = gf_w32_multiply_region_from_single; + gf->multiply.w32 = gf_w32_shift_multiply; return 1; } static -void + void gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) { int i; uint32_t j; - int g_s; shift[0] = 0; - - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - } else { - g_s = h->arg1; - } - for (i = 1; i < (1 << g_s); i <<= 1) { + + for (i = 1; i < (1 << h->arg1); i <<= 1) { for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; if (val & GF_FIRST_BIT) { val <<= 1; @@ -297,7 +609,7 @@ gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) } } -static + static void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -333,10 +645,10 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf ind = a32 >> rs; a32 <<= leftover; p = gd->shift[ind]; - + bits_left = rs; rs = 32 - g_s; - + while (bits_left > 0) { bits_left -= g_s; ind = a32 >> rs; @@ -352,7 +664,7 @@ void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf gf_do_final_region_alignment(&rd); } -static + static void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint32_t *s32, *d32, *top; @@ -368,13 +680,8 @@ void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gf_w32_group_set_shift_tables(gd->shift, val, h); @@ -527,13 +834,8 @@ 
gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) struct gf_w32_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gf_w32_group_set_shift_tables(gd->shift, b, h); @@ -684,7 +986,7 @@ static void gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint32_t vrev; @@ -879,7 +1181,7 @@ static void gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -905,7 +1207,7 @@ static void gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -934,7 +1236,7 @@ static void gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint32_t itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1012,19 +1314,30 @@ int gf_w32_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w32_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w32_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } + gf->inverse.w32 = gf_w32_euclid; return 1; } @@ -1036,10 +1349,10 @@ gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32) { uint32_t product, i, j, mask, tb; gf_internal_t *h; - struct gf_split_8_8_data *d8; + struct gf_w32_split_8_8_data *d8; h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; + d8 = (struct gf_w32_split_8_8_data *) h->private; product = 0; mask = 0xff; @@ -1062,7 +1375,7 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t gf_internal_t *h; uint32_t *s32, *d32, *top, p, a, v; struct gf_split_8_32_lazy_data *d8; - struct gf_split_8_8_data *d88; + struct gf_w32_split_8_8_data *d88; uint32_t *t[4]; int i, j, k, change; uint32_t pp; @@ -1072,13 +1385,13 @@ gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; - if (h->arg1 == 32 || h->arg2 == 32) { + if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) { d8 = (struct gf_split_8_32_lazy_data *) h->private; for (i = 0; i < 4; i++) t[i] = d8->tables[i]; change = (val != 
d8->last_value); if (change) d8->last_value = val; } else { - d88 = (struct gf_split_8_8_data *) h->private; + d88 = (struct gf_w32_split_8_8_data *) h->private; for (i = 0; i < 4; i++) t[i] = d88->region_tables[i]; change = (val != d88->last_value); if (change) d88->last_value = val; @@ -1243,7 +1556,7 @@ static void gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, tindex; uint32_t pp, v, v2, s, *s32, *d32, *top; @@ -1380,7 +1693,7 @@ static void gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint32_t pp, v, s, *s32, *d32, *top, *realtop; @@ -1572,15 +1885,15 @@ static void gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint32_t pp, v, s, *s32, *d32, *top, tmp_table[16]; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16; + __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; __m128i tv1, tv2, tv3, tv0; uint8_t btable[16]; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1593,7 +1906,7 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint s32 = (uint32_t *) rd.s_start; d32 = (uint32_t *) rd.d_start; top = (uint32_t *) rd.d_top; - + v = val; for (i = 0; i < 8; i++) { tmp_table[0] = 0; @@ -1614,7 +1927,6 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint mask1 = _mm_set1_epi8(0xf); mask8 = _mm_set1_epi16(0xff); - mask16 = _mm_set1_epi32(0xffff); if (xor) { while (d32 != top) { @@ -1737,36 +2049,41 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint v1 = _mm_load_si128((__m128i *) s32); s32 += 4; v2 = _mm_load_si128((__m128i *) s32); s32 += 4; v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - + p0 = _mm_srli_epi16(v0, 8); p1 = _mm_srli_epi16(v1, 8); p2 = _mm_srli_epi16(v2, 8); p3 = _mm_srli_epi16(v3, 8); - + tv0 = _mm_and_si128(v0, mask8); tv1 = _mm_and_si128(v1, mask8); tv2 = _mm_and_si128(v2, mask8); tv3 = _mm_and_si128(v3, mask8); - + v0 = _mm_packus_epi16(p1, p0); v1 = _mm_packus_epi16(tv1, tv0); v2 = _mm_packus_epi16(p3, p2); v3 = _mm_packus_epi16(tv3, tv2); - + p0 = _mm_srli_epi16(v0, 8); p1 = _mm_srli_epi16(v1, 8); p2 = _mm_srli_epi16(v2, 8); p3 = _mm_srli_epi16(v3, 8); - + tv0 = _mm_and_si128(v0, mask8); tv1 = _mm_and_si128(v1, mask8); tv2 = _mm_and_si128(v2, mask8); tv3 = _mm_and_si128(v3, mask8); - + v0 = _mm_packus_epi16(p2, p0); v1 = _mm_packus_epi16(p3, p1); v2 = _mm_packus_epi16(tv2, tv0); v3 = _mm_packus_epi16(tv3, tv1); + + p0 = v0; + p1 = v1; + p2 = v2; + p3 = v3; si = _mm_and_si128(v0, mask1); p0 = _mm_shuffle_epi8(tables[6][0], si); @@ -1818,18 +2135,18 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); tv0 = _mm_unpackhi_epi8(p1, p3); tv1 = _mm_unpackhi_epi8(p0, p2); 
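/* These unpackhi/unpacklo pairs undo the earlier byte-splitting packs: the products were computed one byte lane per register so that _mm_shuffle_epi8 could perform sixteen 4-bit lookups at a time, and the interleaves that follow restore four registers of correctly ordered 32-bit words ahead of the aligned stores. */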
tv2 = _mm_unpacklo_epi8(p1, p3); tv3 = _mm_unpacklo_epi8(p0, p2); - + p0 = _mm_unpackhi_epi8(tv1, tv0); p1 = _mm_unpacklo_epi8(tv1, tv0); p2 = _mm_unpackhi_epi8(tv3, tv2); p3 = _mm_unpacklo_epi8(tv3, tv2); - + _mm_store_si128((__m128i *) d32, p0); _mm_store_si128((__m128i *) (d32+4), p1); _mm_store_si128((__m128i *) (d32+8), p2); @@ -1848,19 +2165,50 @@ int gf_w32_split_init(gf_t *gf) gf_internal_t *h; struct gf_split_2_32_lazy_data *ld2; struct gf_split_4_32_lazy_data *ld4; - struct gf_split_8_8_data *d8; + struct gf_w32_split_8_8_data *d8; struct gf_split_8_32_lazy_data *d32; struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; - int i, j, exp; + int i, j, exp, ispclmul, issse3; + + ispclmul = 0; +#ifdef INTEL_SSE4_PCLMUL + ispclmul = 1; +#endif + + issse3 = 0; +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif h = (gf_internal_t *) gf->scratch; /* Defaults */ - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; + gf->inverse.w32 = gf_w32_euclid; + /* JSP: First handle single multiplication: + If args == 8, then we're doing split 8 8. + Otherwise, if PCLMUL, we use that. + Otherwise, we use bytwo_p. + */ + + if (h->arg1 == 8 && h->arg2 == 8) { + gf->multiply.w32 = gf_w32_split_8_8_multiply; + } else if (ispclmul) { + if ((0xfffe0000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_2; + } else if ((0xffc00000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_3; + } else if ((0xfe000000 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w32_clm_multiply_4; + } + } else { + gf->multiply.w32 = gf_w32_bytwo_p_multiply; + } + + /* Easy cases: 16/32 and 2/32 */ + if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { d16 = (struct gf_split_16_32_lazy_data *) h->private; d16->last_value = 0; @@ -1868,15 +2216,51 @@ int gf_w32_split_init(gf_t *gf) return 1; } - if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8)) { + if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { + ld2 = (struct gf_split_2_32_lazy_data *) h->private; + ld2->last_value = 0; + #ifdef INTEL_SSSE3 + if (!(h->region_type & GF_REGION_NOSSE)) + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; + else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + #else + gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) return 0; + #endif + return 1; + } + + /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ + + if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || + (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ld4 = (struct gf_split_4_32_lazy_data *) h->private; + ld4->last_value = 0; + if ((h->region_type & GF_REGION_NOSSE) || !issse3) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (h->region_type & GF_REGION_ALTMAP) { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; + } else { + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; + } + return 1; + } + + /* 8/32 or Default + no SSE */ + + if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || + h->mult_type == GF_MULT_DEFAULT) { d32 = (struct gf_split_8_32_lazy_data *) h->private; d32->last_value = 0; gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; return 1; } + /* Finally, if args == 8, then we have to set up the tables here. 
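That block re-selects the 8/8 handlers and then fills the seven 256x256 product tables of gf_w32_split_8_8_data, one table for each byte-offset combination of the two four-byte operands.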
*/ + if (h->arg1 == 8 && h->arg2 == 8) { - d8 = (struct gf_split_8_8_data *) h->private; + d8 = (struct gf_w32_split_8_8_data *) h->private; d8->last_value = 0; gf->multiply.w32 = gf_w32_split_8_8_multiply; gf->multiply_region.w32 = gf_w32_split_8_32_lazy_multiply_region; @@ -1908,31 +2292,10 @@ int gf_w32_split_init(gf_t *gf) } return 1; } - if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { - ld2 = (struct gf_split_2_32_lazy_data *) h->private; - ld2->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - } - return 1; - } - if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) { - ld4 = (struct gf_split_4_32_lazy_data *) h->private; - ld4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; - } - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } - return 1; - } - return 1; + + /* If we get here, then the arguments were bad. */ + + return 0; } static @@ -1943,13 +2306,8 @@ int gf_w32_group_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; int g_r, g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 3; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w32_group_data *) h->private; gd->shift = (uint32_t *) (&(gd->memory)); @@ -1983,11 +2341,6 @@ int gf_w32_group_init(gf_t *gf) } else { gf->multiply.w32 = gf_w32_group_multiply; gf->multiply_region.w32 = gf_w32_group_multiply_region; - if (h->mult_type == GF_MULT_DEFAULT) { -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; -#endif - } } gf->divide.w32 = NULL; gf->inverse.w32 = gf_w32_euclid; @@ -1995,44 +2348,6 @@ int gf_w32_group_init(gf_t *gf) return 1; } -static -uint32_t -gf_w32_composite_multiply_logtable(gf_t *gf, uint32_t a, uint32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - struct gf_w16_logtable_data * ltd = (struct gf_w16_logtable_data *) h->private; - - uint32_t b0 = b & 0xffff; - uint32_t b1 = b >> 16; - uint32_t a0 = a & 0xffff; - uint32_t a1 = a >> 16; - uint32_t a1b1; - uint32_t la0, la1, lb0, lb1, l11; - uint32_t p; - - la0 = ltd->log_tbl[a0]; - la1 = ltd->log_tbl[a1]; - lb0 = ltd->log_tbl[b0]; - lb1 = ltd->log_tbl[b1]; - - if (a1 && b1) { - l11 = (la1 + lb1); - a1b1 = ltd->antilog_tbl[l11]; - l11 = ltd->log_tbl[a1b1]; - p = ltd->antilog_tbl[l11+ltd->log_s]; - } else { - a1b1 = 0; - p = 0; - } - - if (a0 && b1) p ^= ltd->antilog_tbl[la0+lb1]; - - if (a1 && b0) p ^= ltd->antilog_tbl[la1+lb0]; - p <<= 16; - p ^= a1b1; - if (a0 && b0) p ^= ltd->antilog_tbl[la0+lb0]; - return p; -} static uint32_t @@ -2040,19 +2355,48 @@ gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - uint16_t b0 = b & 0x0000ffff; - uint16_t b1 = (b & 0xffff0000) >> 16; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t a1b1; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = (b & 0xffff0000) >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = (a & 0xffff0000) >> 16; + uint32_t a1b1; uint32_t rv; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - rv = 
((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); + rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1); return rv; } +/* JSP: This could be made faster. Someday, when I'm bored. */ + +static +uint32_t +gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = b >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = a >> 16; + uint32_t a1b1, prod; + uint16_t *log, *alog; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + return prod; +} + /* * Composite field division trick (explained in 2007 tech report) * @@ -2075,6 +2419,7 @@ gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) * * a / b = a * c */ + static uint32_t gf_w32_composite_inverse(gf_t *gf, uint32_t a) @@ -2089,7 +2434,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_16_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -2100,7 +2445,7 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_16_2); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -2114,115 +2459,89 @@ gf_w32_composite_inverse(gf_t *gf, uint32_t a) return c; } -static -uint32_t -gf_w32_composite_divide(gf_t *gf, uint32_t a, uint32_t b) -{ - uint32_t binv; - - binv = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, binv); -} - -/* JSP: I'm not using this because I don't think it has value added. 
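(The removed inline variant below was also flagged as buggy where gf_w32_composite_init used to reference it, and the surviving region routine now reads the base field's log/antilog tables directly whenever they are available.)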
*/ -static -void -gf_w32_composite_multiply_region_inline(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - } - } -} - static void gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; + uint32_t b0 = val & 0x0000ffff; + uint32_t b1 = (val & 0xffff0000) >> 16; uint32_t *s32, *d32, *top; - uint16_t a0, a1, a1b1; + uint16_t a0, a1, a1b1, *log, *alog; + uint32_t prod; gf_region_data rd; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); - s32 = rd.s_start; d32 = rd.d_start; top = rd.d_top; - if (xor) { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - s32++; - d32++; + if (log == NULL) { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + 
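/* Composite product, spelled out: with a = a1*X + a0 and b = b1*X + b0 over the 16-bit base field, and X^2 reduced via X^2 = s*X + 1 where s is h->prim_poly, (a1*X + a0)*(b1*X + b0) = (a1b0 ^ a0b1 ^ s*a1b1)*X + (a0b0 ^ a1b1), which is exactly how the high and low halves of *d32 are assembled below. */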
+ *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } } } else { - while (d32 < top) { - a0 = *s32 & 0x0000ffff; - a1 = (*s32 & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); - *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - s32++; - d32++; + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + *d32 ^= prod; + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + + *d32 = prod; + s32++; + d32++; + } } } } @@ -2259,7 +2578,7 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -2267,143 +2586,92 @@ gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t v static int gf_w32_composite_init(gf_t *gf) { - struct gf_w16_logtable_data *ltd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_internal_t *base_h = (gf_internal_t *) base_gf->scratch; - uint32_t a, b; - uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly; - int i; + struct gf_w32_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w32_composite_data *) h->private; + cd->log = gf_w16_get_log_table(h->base_gf); + cd->alog = gf_w16_get_mult_alog_table(h->base_gf); if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; - } else if (h->arg2 == 0 && base_h->mult_type == GF_MULT_LOG_TABLE && - base_h->arg1 == 0) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; -/* It would be this, were that not buggy and I cared: - gf->multiply_region.w32 = gf_w32_composite_multiply_region_inline; */ } else { gf->multiply_region.w32 = gf_w32_composite_multiply_region; } - if (h->arg2 == 0) { - ltd = (struct gf_w16_logtable_data *) h->private; - - ltd->log_tbl[0] = 0; - - bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); - - ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]); - - b = 1; - for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) { - ltd->log_tbl[b] = (uint16_t)i; - ltd->antilog_tbl[i] = (uint16_t)b; - 
ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (uint16_t)b; - b <<= 1; - if (b & GF_BASE_FIELD_SIZE) { - b = b ^ prim_poly; - } - } - ltd->log_s = ltd->log_tbl[GF_S_GF_16_2]; - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_BASE_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]]; - } - gf->multiply.w32 = gf_w32_composite_multiply_logtable; - } else { + if (cd->log == NULL) { gf->multiply.w32 = gf_w32_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w32_composite_multiply_inline; } - - gf->divide.w32 = gf_w32_composite_divide; + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w32_composite_inverse; return 1; } + + int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss, sa; + int ss; + int issse3 = 0; ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif switch(mult_type) { case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } - return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data); + return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64; break; - case GF_MULT_DEFAULT: case GF_MULT_GROUP: - if (mult_type == GF_MULT_DEFAULT) { - arg1 = 3; - arg2 = 8; - } - if (arg1 <= 0 || arg2 <= 0) return -1; - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) + sizeof(uint32_t) * (1 << arg1) + sizeof(uint32_t) * (1 << arg2) + 64; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8){ - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64; } if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64; } - if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; - } if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type | ss) != ss) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } - if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } + if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || + (mult_type == GF_MULT_DEFAULT && !issse3)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } - return -1; + if ((arg1 == 4 && arg2 == 32) || + (arg2 
== 4 && arg1 == 32) || + mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; + } + return 0; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if (arg1 == 2 && arg2 == 0) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } else if (arg1 == 2 && arg2 == 1) { - return sizeof(gf_internal_t) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64; + break; default: - return -1; + return 0; } + return 0; } int gf_w32_init(gf_t *gf) @@ -2411,22 +2679,43 @@ int gf_w32_init(gf_t *gf) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x400007; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + + /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/ + + /* h->prim_poly = 0xc5; */ + + /* Allen: The following is the traditional primitive polynomial for GF(2^32) */ + + h->prim_poly = 0x400007; + } + } + + /* No leading one */ + + if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; + gf->multiply.w32 = NULL; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; gf->multiply_region.w32 = NULL; switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; - default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { diff --git a/gf_w4.c b/gf_w4.c index 1175e01..50f00da 100644 --- a/gf_w4.c +++ b/gf_w4.c @@ -100,7 +100,6 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) y_im1 = 0; while (e_i != 1) { - e_ip1 = e_im1; d_ip1 = d_im1; c_i = 0; @@ -108,6 +107,7 @@ gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -146,6 +146,110 @@ gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b) return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly); } + +static +inline +gf_val_32_t +gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint8_t product, i, pp; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +/* Ben: This function works, but it is 33% slower than the 
normal shift mult */ + +static +inline +gf_val_32_t +gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0); + b = _mm_insert_epi32 (a, b4, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only + have to do the reduction only once, because (w-2)/z == 1. Where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_epi64 shifts the result to the right by 4 bits. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); +#endif + return rv; +} + +static +void +gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + /* ------------------------------------------------------------ IMPLEMENTATION: LOG_TABLE: @@ -220,18 +324,28 @@ int gf_w4_log_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; ltd = h->private; - ltd->log_tbl[0] = 0; + for (i = 0; i < GF_FIELD_SIZE; i++) + ltd->log_tbl[i]=0; ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1); b = 1; - for (i = 0; i < GF_FIELD_SIZE-1; i++) { - ltd->log_tbl[b] = i; - ltd->antilog_tbl[i] = b; - ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } + i = 0; + do { + if (ltd->log_tbl[b] != 0 && i != 0) { + fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n"); + return 0; + } + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; + b <<= 1; + i++; + if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; + } while (b != 1); + + if (i != GF_FIELD_SIZE - 1) { + _gf_errno = GF_E_LOGPOLY; + return 0; } gf->inverse.w32 = gf_w4_inverse_from_divide; @@ -300,7 +414,7 @@ static void gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_region_data rd; uint8_t *base, *sptr, *dptr, *top; __m128i tl, loset, h4, r, va, th; @@ -351,37 +465,17 @@ int gf_w4_single_table_init(gf_t *gf) gf_internal_t *h; struct gf_single_table_data *std; int a, b, prod, loga, logb; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t 
antilog_tbl[GF_FIELD_SIZE*2]; - int sse; - sse = 0; -#ifdef INTEL_SSE4 - sse = 1; -#endif h = (gf_internal_t *) gf->scratch; std = (struct gf_single_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); std->mult[a][b] = prod; std->div[prod][b] = a; } @@ -390,11 +484,16 @@ int gf_w4_single_table_init(gf_t *gf) gf->inverse.w32 = NULL; gf->divide.w32 = gf_w4_single_table_divide; gf->multiply.w32 = gf_w4_single_table_multiply; - if ((h->region_type & GF_REGION_SSE) || (h->mult_type == GF_MULT_DEFAULT && sse)) { - gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; - } else { + #ifdef INTEL_SSSE3 + if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) + gf->multiply_region.w32 = gf_w4_single_table_multiply_region; + else + gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_single_table_multiply_region; - } + if (h->region_type & GF_REGION_SSE) return 0; + #endif + return 1; } @@ -458,32 +557,17 @@ int gf_w4_double_table_init(gf_t *gf) gf_internal_t *h; struct gf_double_table_data *std; int a, b, c, prod, loga, logb, ab; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE*2]; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_double_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); mult[a][b] = prod; std->div[prod][b] = a; } @@ -600,32 +684,17 @@ int gf_w4_quad_table_init(gf_t *gf) gf_internal_t *h; struct gf_quad_table_data *std; int prod, loga, logb, ab, val, a, b, c, d, va, vb, vc, vd; - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE*2]; uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; h = (gf_internal_t *) gf->scratch; std = (struct gf_quad_table_data *)h->private; - b = 1; - for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { - log_tbl[b] = a; - antilog_tbl[a] = b; - antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } - } - bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); for (a = 1; a < GF_FIELD_SIZE; a++) { - loga = log_tbl[a]; for (b = 1; b < GF_FIELD_SIZE; b++) { - logb = log_tbl[b]; - prod = antilog_tbl[loga+logb]; + prod = gf_w4_shift_multiply(gf, a, b); mult[a][b] = prod; std->div[prod][b] = a; } @@ -702,13 +771,18 @@ int gf_w4_table_init(gf_t *gf) { int rt; gf_internal_t *h; + int issse3 = 0; + +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif h = (gf_internal_t *) gf->scratch; rt = (h->region_type); - if (rt == 0 || rt == 
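/* The INTEL_SSSE3 gate above is deliberate: the table region code is built
   on _mm_shuffle_epi8 (PSHUFB), an SSSE3 instruction that treats a 16-byte
   register as a 16-entry lookup table, i.e. 16 GF(16) lookups per
   instruction.  A minimal model of the per-vector step, assuming tbl holds
   the products of val with the nibbles 0..15 (illustrative, not the exact
   library code):

     __m128i lookup32_nibbles(__m128i v, __m128i tbl)
     {
       __m128i mask = _mm_set1_epi8(0x0f);
       __m128i lo = _mm_and_si128(v, mask);
       __m128i hi = _mm_and_si128(_mm_srli_epi64(v, 4), mask);
       return _mm_or_si128(_mm_shuffle_epi8(tbl, lo),
                           _mm_slli_epi64(_mm_shuffle_epi8(tbl, hi), 4));
     }

   Each byte carries two GF(16) symbols, so both nibbles are translated and
   repacked; SSE2 alone has no byte shuffle, hence the SSSE3 requirement. */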
GF_REGION_CAUCHY) rt |= GF_REGION_SINGLE_TABLE; - if (rt & GF_REGION_SINGLE_TABLE) { - return gf_w4_single_table_init(gf); - } else if (rt & GF_REGION_DOUBLE_TABLE) { + + if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; + + if (rt & GF_REGION_DOUBLE_TABLE) { return gf_w4_double_table_init(gf); } else if (rt & GF_REGION_QUAD_TABLE) { if (rt & GF_REGION_LAZY) { @@ -717,7 +791,9 @@ int gf_w4_table_init(gf_t *gf) return gf_w4_quad_table_init(gf); } return gf_w4_double_table_init(gf); - } + } else { + return gf_w4_single_table_init(gf); + } return 0; } @@ -842,7 +918,7 @@ static void gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; @@ -895,7 +971,7 @@ static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -960,7 +1036,7 @@ static void gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -986,7 +1062,7 @@ static void gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1014,7 +1090,7 @@ static void gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1041,7 +1117,7 @@ static void gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1071,7 +1147,7 @@ static void gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1099,7 +1175,7 @@ static void gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1127,7 +1203,7 @@ static void gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1156,7 +1232,7 @@ static void gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1185,7 +1261,7 @@ static void gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1215,7 +1291,7 @@ static void gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1245,7 +1321,7 @@ static void gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1274,7 +1350,7 @@ static void gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct 
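/* Every _region_2 .. _region_7 specialization above unrolls the same
   scalar recurrence, shown here for reference (a sketch; pp is the
   polynomial with its leading term stripped, high_bit the top bit of the
   word):

     uint8_t bytwo_b_model(uint8_t a, uint8_t b, uint8_t pp, uint8_t high_bit)
     {
       uint8_t prod = 0;
       while (1) {
         if (a & 1) prod ^= b;
         a >>= 1;
         if (a == 0) return prod;
         b = (b & high_bit) ? (uint8_t) ((b << 1) ^ pp) : (uint8_t) (b << 1);
       }
     }

   For a small constant val the loop collapses into a fixed number of
   doublings and xors, which is exactly what the unrolled SSE2 bodies do on
   sixteen bytes (thirty-two nibbles) at a time. */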
gf_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1303,7 +1379,7 @@ static void gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_bytwo_data *btd; @@ -1853,114 +1929,107 @@ int gf_w4_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w4_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region; - } + if (h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w32 = gf_w4_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region; - } + if (h->region_type & GF_REGION_SSE) + return 0; + #endif } - gf->inverse.w32 = gf_w4_euclid; return 1; } -/* ------------------------------------------------------------ - JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ - -static -inline -gf_val_32_t -gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +static +int gf_w4_cfm_init(gf_t *gf) { - uint8_t product, i, pp; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (1 << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { - if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; +#ifdef INTEL_SSE4_PCLMUL + gf->multiply.w32 = gf_w4_clm_multiply; + return 1; +#endif + return 0; } static int gf_w4_shift_init(gf_t *gf) { gf->multiply.w32 = gf_w4_shift_multiply; - gf->inverse.w32 = gf_w4_euclid; return 1; } +/* JSP: I'm putting all error-checking into gf_error_check(), so you don't + have to do error checking in scratch_size or in init */ + int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int region_tbl_size; - int sss; int ss; + int issse3 = 0; - sss = (GF_REGION_SINGLE_TABLE | GF_REGION_SSE | GF_REGION_NOSSE); - ss = (GF_REGION_SSE | GF_REGION_NOSSE); +#ifdef INTEL_SSSE3 + issse3 = 1; +#endif switch(mult_type) { case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data); break; case GF_MULT_DEFAULT: case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) { + if (region_type == GF_REGION_CAUCHY) { return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - if (mult_type == GF_MULT_DEFAULT || region_type == 0) 
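/* The construction repeated in these init routines is the revised flag
   convention: pick the SSE region op unless GF_REGION_NOSSE is set, and if
   the binary was built without the needed instruction set, an explicit
   GF_REGION_SSE request fails cleanly.  Schematically:

     #ifdef INTEL_SSE2
       if (h->region_type & GF_REGION_NOSSE)
         gf->multiply_region.w32 = the_nosse_multiply_region;
       else
         gf->multiply_region.w32 = the_sse_multiply_region;
     #else
       gf->multiply_region.w32 = the_nosse_multiply_region;
       if (h->region_type & GF_REGION_SSE) return 0;
     #endif

   Per JSP's note above, argument validation otherwise moves into one place
   (gf_error_check) instead of being duplicated in every scratch_size. */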
region_type = GF_REGION_SINGLE_TABLE; - if (region_type & GF_REGION_SINGLE_TABLE) { - if ((region_type | sss) != sss) return -1; - if ((region_type & sss) == sss) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; - } else if (region_type & GF_REGION_DOUBLE_TABLE) { - if (region_type != GF_REGION_DOUBLE_TABLE) return -1; + + if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; + + if (region_type & GF_REGION_DOUBLE_TABLE) { return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; } else if (region_type & GF_REGION_QUAD_TABLE) { - if ((region_type | GF_REGION_LAZY) != (GF_REGION_QUAD_TABLE | GF_REGION_LAZY)) return -1; if ((region_type & GF_REGION_LAZY) == 0) { return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64; } else { return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64; } + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - return -1; break; + case GF_MULT_LOG_TABLE: - if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || (region_type != 0 && region_type != GF_REGION_CAUCHY)) return -1; return sizeof(gf_internal_t); break; default: - return -1; + return 0; } + return 0; } int @@ -1970,7 +2039,7 @@ gf_w4_init (gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->prim_poly == 0) h->prim_poly = 0x13; - + h->prim_poly |= 0x10; gf->multiply.w32 = NULL; gf->divide.w32 = NULL; gf->inverse.w32 = NULL; @@ -1978,13 +2047,13 @@ gf_w4_init (gf_t *gf) gf->extract_word.w32 = gf_w4_extract_word; switch(h->mult_type) { - case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: - if (gf_w4_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w4_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: - case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; + case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; default: return 0; } @@ -1996,17 +2065,22 @@ gf_w4_init (gf_t *gf) gf->inverse.w32 = gf_w4_matrix; } - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w4_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w4_inverse_from_divide; - } + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w4_inverse_from_divide; if (h->region_type == GF_REGION_CAUCHY) { gf->multiply_region.w32 = gf_wgen_cauchy_region; gf->extract_word.w32 = gf_wgen_extract_word; } + + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w4_multiply_region_from_single; + } + return 1; } diff --git a/gf_w64.c b/gf_w64.c index 95100f4..12ec5af 100644 --- a/gf_w64.c +++ b/gf_w64.c @@ -9,18 +9,12 @@ #include #define GF_FIELD_WIDTH (64) -#define GF_FIRST_BIT (1L << 63) +#define GF_FIRST_BIT (1ULL << 63) #define GF_BASE_FIELD_WIDTH (32) -#define GF_BASE_FIELD_SIZE (1L << 
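/* With the fallbacks above, a successful gf_w4_init() always leaves a full
   set of pointers: divide falls back to multiply-by-inverse, inverse to
   Euclid (or to divide), and multiply_region to the symbol-at-a-time loop.
   A caller can therefore rely on, for instance (initialization elided):

     gf_t gf;
     uint32_t c;
     c = gf.multiply.w32(&gf, 7, 10);
     c = gf.divide.w32(&gf, c, 10);

   Under the default polynomial 0x13 the first call yields 3 and the second
   recovers the original 7, with no NULL checks needed on either pointer. */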
GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) #define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -// 10000587 is a valid s for 2^16^2 -#define GF_S_GF_16_2_2 (1000587) - -// 1000012 is a valid s for 2^32 -#define GF_S_GF_32_2 (1000012) - struct gf_w64_group_data { uint64_t *reduce; uint64_t *shift; @@ -46,10 +40,6 @@ struct gf_split_8_8_data { uint64_t tables[15][256][256]; }; -typedef struct w64_composite_int_s { - uint64_t s; // 's' will be different depending on the base field -} w64_composite_int_t; - static inline gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a) @@ -79,6 +69,9 @@ xor) s64 = (gf_val_64_t *) src; d64 = (gf_val_64_t *) dest; + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + if (xor) { for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { d64[i] ^= gf->multiply.w64(gf, val, s64[i]); @@ -91,7 +84,186 @@ xor) } static -inline +void +gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + int i, size; + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i v, w; + __m128i m1, m2, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m2 = _mm_slli_si128(m1, 4); + m2 = _mm_or_si128(m1, m2); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + size = bytes/sizeof(gf_val_64_t); + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + 
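/* The _region_from_single_2 routine above (and the _4 variant that
   follows) processes two 64-bit symbols per 128-bit load: the clmul
   immediate selects which source half is multiplied, each half is reduced
   separately, and _mm_unpacklo_epi64 stitches the two reduced results back
   into one store.  In outline (reduce() stands for the mask-and-fold
   sequence spelled out above; it is not a real helper in this file):

     high = reduce(_mm_clmulepi64_si128(a, b, 1));
     low  = reduce(_mm_clmulepi64_si128(a, b, 0));
     out  = _mm_unpacklo_epi64(low, high);

   The m3/m4 masks are what isolate the 32-bit chunks being folded. */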
gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + int i, size; + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + size = bytes/sizeof(gf_val_64_t); + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static + inline gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) { gf_val_64_t e_i, e_im1, e_ip1; @@ -118,6 +290,7 @@ gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) c_i ^= (one << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); d_ip1--; + if (e_ip1 == 0) return 0; while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; } @@ -149,31 +322,41 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) h = (gf_internal_t *) gf->scratch; ppr = h->prim_poly; - ppl = 1; + /* Allen: set leading one of primitive polynomial */ + + ppl = 1; + a = a64; bl = 0; br = b64; one = 1; lbit = (one << 63); - pl = 0; - pr = 0; + pl = 0; /* Allen: left side of product */ + pr = 0; /* Allen: right side of product */ + /* Allen: unlike the corresponding functions for smaller word sizes, + * this loop carries out the initial carryless multiply by + * shifting b itself rather than simply looking at successively + * higher shifts of b */ + for (i = 0; i < 
GF_FIELD_WIDTH; i++) { if (a & (one << i)) { pl ^= bl; pr ^= br; } - /* printf("P: %016llx %016llx ", pl, pr); printf("B: %016llx %016llx\n", bl, br); */ + bl <<= 1; if (br & lbit) bl ^= 1; br <<= 1; } - one = lbit; - ppl = ((h->prim_poly >> 1) | lbit); - ppr = lbit; + /* Allen: the name of the variable "one" is no longer descriptive at this point */ + + one = lbit >> 1; + ppl = (h->prim_poly >> 2) | one; + ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); while (one != 0) { if (pl & one) { pl ^= ppl; @@ -190,12 +373,16 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) /* * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. */ + static inline gf_val_64_t -gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { -#ifdef INTEL_PCLMUL + gf_val_64_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + __m128i a, b; __m128i result; __m128i prim_poly; @@ -206,10 +393,17 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) b = _mm_insert_epi64 (a, b64, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); /* Do the initial multiply */ + result = _mm_clmulepi64_si128 (a, b, 0); + /* Mask off the high order 32 bits using subtraction of the polynomial. * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. */ + + /* Adam: We cant include the leading one in the 64 bit pclmul, + so we need to split up the high 8 bytes of the result into two + parts before we multiply them with the prim_poly.*/ + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); @@ -217,47 +411,64 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); - return ((gf_val_64_t)_mm_extract_epi64(result, 0)); + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); #endif + return rv; } - -#ifdef INTEL_PCLMUL + +static inline -__m128i -gf_w64_clm_multiply_single (__m128i v, __m128i b, __m128i pp_l, __m128i pp_h) +gf_val_64_t +gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { - __m128i r0, r1, c0, c1, w0, w1; + gf_val_64_t rv = 0; - r0 = _mm_clmulepi64_si128 (b, v, 0); - c0 = _mm_srli_si128 (r0, 12); - w0 = _mm_clmulepi64_si128 (pp_h, c0, 0); - r0 = _mm_xor_si128 (r0, w0); - c0 = _mm_srli_si128 (_mm_slli_si128 (r0, 4), 12); - w0 = _mm_clmulepi64_si128 (pp_l, c0, 0); - r0 = _mm_insert_epi64 (_mm_xor_si128 (r0, w0), 0, 1); +#ifdef INTEL_SSE4_PCLMUL - r1 = _mm_clmulepi64_si128 (b, v, 1); - c1 = _mm_srli_si128 (r1, 12); - w1 = _mm_clmulepi64_si128 (pp_h, c1, 0); - r1 = _mm_xor_si128 (r1, w1); - c1 = _mm_srli_si128 (_mm_slli_si128 (r1, 4), 12); - w1 = _mm_clmulepi64_si128 (pp_l, c1, 0); - r1 = _mm_slli_si128 (_mm_xor_si128 (r1, w1), 8); + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; - return (_mm_xor_si128 (r0, r1)); + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = 
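/* A compact model of the shift multiply above, all variables uint64_t:
   (bl:br) is b shifted left i places, and (pl:pr) accumulates the 128-bit
   carryless product:

     pl = 0; pr = 0; bl = 0; br = b;
     for (i = 0; i < 64; i++) {
       if (a & (1ULL << i)) { pl ^= bl; pr ^= br; }
       bl = (bl << 1) | (br >> 63);
       br <<= 1;
     }

   The corrected reduction constants follow from the product's true degree:
   the highest possible set bit is 2w-2 = 126, not 127, so the scan starts
   at one = lbit >> 1 (bit 126 overall), ppl carries the polynomial aligned
   to that bit, and ppr = prim_poly << (w-2) holds the coefficients that
   fall off the right end of ppl. */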
_mm_xor_si128 (result, w); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); +#endif + return rv; } -#endif -void + void gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_PCLMUL +#ifdef INTEL_SSE4_PCLMUL gf_internal_t *h; - int i, top; - uint8_t *s8, *d8; + int i, j, k; + uint8_t *s8, *d8, *dtop; + uint64_t *s64, *d64; gf_region_data rd; - __m128i v, b, xv, pp_l, pp_h, final; + __m128i v, b, m, prim_poly, c, fr, w, result; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -269,25 +480,67 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; - top = (uint8_t *) rd.d_top - (uint8_t *)rd.d_start; + dtop = (uint8_t *) rd.d_top; v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); - pp_l = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); - pp_h = _mm_slli_si128 (pp_l, 4); + m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); if (xor) { - for (i = 0; i < top; i += 16) { - b = _mm_load_si128((__m128i *) (s8 + i)); - final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h); - xv = _mm_load_si128((__m128i *) (d8 + i)); - final = _mm_xor_si128 (final, xv); - _mm_store_si128((__m128i *) (d8 + i), final); + while (d8 != dtop) { + s64 = (uint64_t *) s8; + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + result = _mm_load_si128((__m128i *) d8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; } } else { - for (i = 0; i < top; i += 16) { - b = _mm_load_si128((__m128i *) (s8 + i)); - final = gf_w64_clm_multiply_single (v, b, pp_l, pp_h); - _mm_store_si128((__m128i *) (d8 + i), final); + while (d8 < dtop) { + s64 = (uint64_t *) s8; + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 
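/* The reduction pattern shared by these clm routines is repeated 32-bit
   folds of the identity x^64 = pp(x) (mod the full polynomial).  Writing
   the 128-bit product as h1*x^96 + h0*x^64 + lo with 32-bit h1 and h0:

     h1 * x^96 = (h1 * pp) * x^32
     h0 * x^64 =  h0 * pp

   If pp fits in 33 bits, h1*pp has degree at most 63, so two folds leave
   nothing at degree 64 or above and the low 64 bits are the answer; that
   is gf_w64_clm_multiply_2.  If pp only fits in 49 bits, a fold can push
   bits back above degree 63, and two rounds of two folds are needed:
   gf_w64_clm_multiply_4. */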
(_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; } } gf_do_final_region_alignment(&rd); @@ -486,18 +739,36 @@ int gf_w64_shift_init(gf_t *gf) { gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; - gf->multiply.w64 = gf_w64_shift_multiply; gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + return 1; +} -#ifdef INTEL_PCLMUL - if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply; - if (h->region_type != GF_REGION_NOSSE) gf->multiply_region.w64 = gf_w64_clm_multiply_region; +static +int gf_w64_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + gf->inverse.w64 = gf_w64_euclid; + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; + +#ifdef INTEL_SSE4_PCLMUL + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + } else { + return 0; + } + return 1; #endif - return 1; + return 0; } static @@ -509,11 +780,7 @@ gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) uint64_t one = 1; int g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - } else { - g_s = h->arg1; - } + g_s = h->arg1; shift[0] = 0; for (i = 1; i < (1 << g_s); i <<= 1) { @@ -538,13 +805,8 @@ gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gf_w64_group_set_shift_tables(gd->shift, b, h); @@ -599,19 +861,18 @@ void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t v if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gd = (struct gf_w64_group_data *) h->private; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gf_w64_group_set_shift_tables(gd->shift, val, h); - for (i = 63; !(val & (1L << i)); i--) ; + for (i = 63; !(val & (1ULL << i)); i--) ; i += g_s; - if (i > 64) i = 64; /* i is the bit position of the first zero bit in any element of + + /* i is the bit position of the first zero bit in any element of gd->shift[] */ + + if (i > 64) i = 64; + fzb = i; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); @@ -770,13 +1031,8 @@ int gf_w64_group_init(gf_t *gf) gf_internal_t *h = (gf_internal_t *) gf->scratch; int g_r, g_s; - if (h->mult_type == GF_MULT_DEFAULT) { - g_s = 4; - g_r = 8; - } else { - g_s = h->arg1; - g_r = h->arg2; - } + g_s = h->arg1; + g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gd->shift = (uint64_t *) (&(gd->memory)); @@ -881,8 +1137,7 @@ gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) pp = h->prim_poly; prod = 0; - bmask = 0x80000000; - bmask <<= 32; + bmask = 0x8000000000000000ULL; while (1) { if (a & 1) prod ^= b; @@ -908,10 +1163,11 @@ gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) pp = h->prim_poly; prod = 0; - 
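/* GROUP multiplication in outline: gf_w64_group_set_shift_tables() fills
   shift[j] with the carryless product j(x)*b(x) for every g_s-bit j, and
   the multiply consumes a in g_s-bit slices.  A model of the main loop,
   assuming g_s divides 64 (one common arrangement; the real routine also
   folds overflow through the 2^g_r-entry reduce table as it goes, which is
   omitted here):

     p = 0;
     for (i = 64 - g_s; i >= 0; i -= g_s)
       p = (p << g_s) ^ shift[(a >> i) & ((1ULL << g_s) - 1)];

   The fzb ("first zero bit") computation in the region code above bounds
   how far the leftover bits in shift[] can reach, so the reduce step knows
   how many top bits to fold. */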
pmask = 0x80000000; - pmask <<= 32; - amask = 0x80000000; - amask <<= 32; + + /* changed from declare then shift to just declare.*/ + + pmask = 0x8000000000000000ULL; + amask = 0x8000000000000000ULL; while (amask != 0) { if (prod & pmask) { @@ -1052,7 +1308,7 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint64_t vrev, one64; @@ -1118,7 +1374,7 @@ static void gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint64_t one64, amask; uint8_t *d8, *s8, tb; @@ -1152,7 +1408,7 @@ static void gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint64_t one64, amask; uint8_t *d8, *s8, tb; @@ -1184,7 +1440,7 @@ static void gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 uint64_t itb, amask, one64; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1248,18 +1504,28 @@ int gf_w64_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w64 = gf_w64_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; - } else { - gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; - } + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } else { gf->multiply.w64 = gf_w64_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; - } else { + #ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; + else + gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; + #else gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } gf->inverse.w64 = gf_w64_euclid; return 1; @@ -1277,12 +1543,11 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) uint32_t a0 = a & 0x00000000ffffffff; uint32_t a1 = (a & 0xffffffff00000000) >> 32; uint32_t a1b1; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); } /* @@ -1307,6 +1572,7 @@ gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) * * a / b = a * c */ + static gf_val_64_t gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) @@ -1318,11 +1584,10 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) uint32_t c0, c1, d, tmp; uint64_t c; uint32_t a0inv, a1inv; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); - c0 = base_gf->multiply.w32(base_gf, a1inv, comp_int->s); + c0 = 
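/* The composite multiply above is plain degree-1 polynomial arithmetic
   over GF(2^32) with x^2 reduced via x^2 = s*x + 1, where s is now kept in
   h->prim_poly (previously the private w64_composite_int_t):

     (a1 x + a0)(b1 x + b0) = a1 b1 x^2 + (a1 b0 + a0 b1) x + a0 b0
                            = (a1 b0 + a0 b1 + s a1 b1) x + (a0 b0 + a1 b1)

   which is exactly the pair of 32-bit halves assembled above, with a1b1
   computed once and reused. */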
base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -1333,7 +1598,7 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv); - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ comp_int->s); + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); @@ -1347,17 +1612,6 @@ gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) return c; } -static -gf_val_64_t -gf_w64_composite_divide(gf_t *gf, gf_val_64_t a, gf_val_64_t b) -{ - gf_val_64_t binv; - - binv = gf_w64_composite_inverse(gf, b); - - return gf_w64_composite_multiply(gf, a, binv); -} - static void gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) @@ -1374,7 +1628,6 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va int num_syms = bytes / 8; int sym_divisible = bytes % 4; gf_region_data rd; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); @@ -1390,7 +1643,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } @@ -1401,7 +1654,7 @@ gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t va a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, comp_int->s)) << 32)); + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } @@ -1420,7 +1673,6 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_ uint8_t *dlow, *dhigh, *top; int sub_reg_size; gf_region_data rd; - w64_composite_int_t *comp_int = (w64_composite_int_t*)h->private; if (!xor) { memset(dest, 0, bytes); @@ -1440,7 +1692,7 @@ gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_ base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, comp_int->s, val1), sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } @@ -1458,29 +1710,18 @@ int gf_w64_composite_init(gf_t *gf) gf->multiply_region.w64 = gf_w64_composite_multiply_region; } - if (h->base_gf != NULL) { - gf_internal_t *base_h = (gf_internal_t *) h->base_gf->scratch; - w64_composite_int_t *comp_int = 
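/* The a0 == 0 branch above can be verified directly.  For a = a1 x the
   code proposes c = c1 x + c0 with c1 = a1^-1 and c0 = s * a1^-1; then

     a * c = a1 c1 x^2 + a1 c0 x
           = a1 c1 (s x + 1) + a1 c0 x
           = (s a1 c1 + a1 c0) x + a1 c1

   and with a1 c1 = 1 the x coefficient is s + s = 0 while the constant
   term is 1, so a * c = 1 as required.  The general branch solves the
   analogous two-variable system when both halves are nonzero. */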
(w64_composite_int_t*)h->private; - - if (base_h->mult_type == GF_MULT_COMPOSITE) { - comp_int->s = GF_S_GF_16_2_2; - } else { - comp_int->s = GF_S_GF_32_2; - } - } - gf->multiply.w64 = gf_w64_composite_multiply; - gf->divide.w64 = gf_w64_composite_divide; + gf->divide.w64 = NULL; gf->inverse.w64 = gf_w64_composite_inverse; return 1; } static -void + void gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 gf_internal_t *h; int i, m, j, k, tindex; uint64_t pp, v, s, *s64, *d64, *top; @@ -1494,7 +1735,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; - + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); gf_do_initial_region_alignment(&rd); @@ -1534,11 +1775,11 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des i = 0; for (k = 0; k < 8; k++) { v0 = _mm_load_si128((__m128i *) s64); + /* MM_PRINT8("v", v0); */ s64 += 2; si = _mm_and_si128(v0, mask1); - /* Happy now? */ for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } @@ -1551,6 +1792,7 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des i++; } for (i = 0; i < 8; i++) { + /* MM_PRINT8("v", p[i]); */ _mm_store_si128((__m128i *) d64, p[i]); d64 += 2; } @@ -1559,6 +1801,210 @@ gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des #endif } +static + void +gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ +#ifdef INTEL_SSE4 + gf_internal_t *h; + int i, m, j, k, tindex; + uint64_t pp, v, s, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1, t2; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
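/* The sixteen lazy tables being filled here satisfy
   tables[i][j] = val * (j << 4i) in GF(2^64): entries are combined
   xor-wise from v, and v is doubled four times (one nibble position)
   between consecutive tables.  Given the tables, one 64-bit product is
   sixteen lookups, as in this scalar sketch:

     p = 0;
     for (i = 0; i < 16; i++)
       p ^= ld->tables[i][(s >> (4*i)) & 0xf];

   The SSE variants evaluate sixteen such products at once by giving each
   byte position its own register slice. */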
((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + mask16 = _mm_set1_epi32(0xffff); + + while (d64 != top) { + + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + + for (k = 0; k < 8; k++) { + st[k] = _mm_load_si128((__m128i *) s64); + s64 += 2; + } + + for (k = 0; k < 4; k ++) { + st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); + st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); + t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = _mm_srli_si128(st[k], 8); + st[k+4] = _mm_slli_si128(st[k+4], 8); + st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = t1; + } + +/* + printf("After pack pass 1\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + + t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); + st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); + st[0] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); + st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); + st[1] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); + st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); + st[4] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); + st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); + st[5] = t1; + +/* + printf("After pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); + st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); + st[0] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); + st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); + st[2] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); + st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); + st[4] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); + st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); + st[6] = t1; + +/* + printf("After final pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + */ + i = 0; + for (k = 0; k < 8; k++) { + si = _mm_and_si128(st[k], mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + st[k] = _mm_srli_epi32(st[k], 4); + si = _mm_and_si128(st[k], mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + + t1 = _mm_unpacklo_epi8(p[0], p[1]); + p[1] = _mm_unpackhi_epi8(p[0], p[1]); + p[0] = t1; + t1 = _mm_unpacklo_epi8(p[2], p[3]); + p[3] = _mm_unpackhi_epi8(p[2], p[3]); + p[2] = t1; + t1 = _mm_unpacklo_epi8(p[4], p[5]); + p[5] = _mm_unpackhi_epi8(p[4], p[5]); + p[4] = t1; + t1 = _mm_unpacklo_epi8(p[6], p[7]); + p[7] = _mm_unpackhi_epi8(p[6], p[7]); + p[6] = t1; + +/* + printf("After unpack pass 1:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi16(p[0], p[2]); + p[2] = _mm_unpackhi_epi16(p[0], p[2]); + p[0] = t1; + t1 = 
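/* The pack and unpack passes here only rearrange bytes: in ALTMAP layout
   the 128-byte chunk is stored "sliced", with byte k of every 64-bit word
   gathered into register k:

     slice[k] = { byte k of w0, byte k of w1, ..., byte k of w15 }

   In that form each source nibble drives one _mm_shuffle_epi8 per output
   slice, exactly as in the altmap routine above.  The standard-layout
   routine must therefore transpose on load (the pack passes) and transpose
   back on store (the unpack passes), which is the whole cost difference
   between the two mappings. */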
_mm_unpacklo_epi16(p[1], p[3]); + p[3] = _mm_unpackhi_epi16(p[1], p[3]); + p[1] = t1; + t1 = _mm_unpacklo_epi16(p[4], p[6]); + p[6] = _mm_unpackhi_epi16(p[4], p[6]); + p[4] = t1; + t1 = _mm_unpacklo_epi16(p[5], p[7]); + p[7] = _mm_unpackhi_epi16(p[5], p[7]); + p[5] = t1; + +/* + printf("After unpack pass 2:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi32(p[0], p[4]); + p[4] = _mm_unpackhi_epi32(p[0], p[4]); + p[0] = t1; + t1 = _mm_unpacklo_epi32(p[1], p[5]); + p[5] = _mm_unpackhi_epi32(p[1], p[5]); + p[1] = t1; + t1 = _mm_unpacklo_epi32(p[2], p[6]); + p[6] = _mm_unpackhi_epi32(p[2], p[6]); + p[2] = t1; + t1 = _mm_unpacklo_epi32(p[3], p[7]); + p[7] = _mm_unpackhi_epi32(p[3], p[7]); + p[3] = t1; + + if (xor) { + for (i = 0; i < 8; i++) { + t1 = _mm_load_si128((__m128i *) d64); + _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); + d64 += 2; + } + } else { + for (i = 0; i < 8; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + + } + + gf_do_final_region_alignment(&rd); +#endif +} + #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); static @@ -1575,27 +2021,72 @@ int gf_w64_split_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; /* Defaults */ + gf->multiply_region.w64 = gf_w64_multiply_region_from_single; - gf->multiply.w64 = gf_w64_shift_multiply; + gf->multiply.w64 = gf_w64_bytwo_p_multiply; -#ifdef INTEL_PCLMUL - if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply; +#ifdef INTEL_SSE4_PCLMUL + if ((!(h->region_type & GF_REGION_NOSSE) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_2; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + gf->multiply.w64 = gf_w64_clm_multiply_4; + gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; + }else{ + return 0; + } + } #endif gf->inverse.w64 = gf_w64_euclid; + /* Allen: set region pointers for default mult type. Single pointers are + * taken care of above (explicitly for sse, implicitly for no sse). 
*/ + +#ifdef INTEL_SSE4 + if (h->mult_type == GF_MULT_DEFAULT) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + } +#else + if (h->mult_type == GF_MULT_DEFAULT) { + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; + } +#endif + if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { + + if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0; + if(h->region_type & GF_REGION_ALTMAP) + { + #ifdef INTEL_SSSE3 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; - } else { -/* gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; */ - } - } else { - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + #else + return 0; + #endif + } + else //no altmap + { + #ifdef INTEL_SSE4 + if(h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; + #else + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; + #endif } } if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { @@ -1611,7 +2102,9 @@ int gf_w64_split_init(gf_t *gf) if ((h->arg1 == 8 && h->arg2 == 8)) { d88 = (struct gf_split_8_8_data *) h->private; gf->multiply.w64 = gf_w64_split_8_8_multiply; + /* The performance of this guy sucks, so don't bother with a region op */ + basep = 1; for (exp = 0; exp < 15; exp++) { for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; @@ -1639,94 +2132,93 @@ int gf_w64_split_init(gf_t *gf) for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); } } - return -1; + return 1; } int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int ss, sa; + int issse4; - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - - if (divide_type == GF_DIVIDE_MATRIX) return -1; switch(mult_type) { case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_NOSSE && region_type != GF_REGION_SSE && region_type != GF_REGION_DEFAULT) return -1; + return sizeof(gf_internal_t); + break; + case GF_MULT_CARRY_FREE: return sizeof(gf_internal_t); break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | ss) != ss || (region_type & ss) == ss) return -1; - } return sizeof(gf_internal_t); break; + case GF_MULT_DEFAULT: + + /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, + * then fall through to split table scratch size code. 
*/ + +#ifdef INTEL_SSE4 + issse4 = 1; + arg1 = 64; + arg2 = 4; +#else + issse4 = 0; + arg1 = 64; + arg2 = 8; +#endif + case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; } if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { - region_type &= (~GF_REGION_LAZY); - if (region_type != GF_REGION_DEFAULT) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; } - if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)){ - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; - } + if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; } - return -1; - - case GF_MULT_DEFAULT: - arg1 = 4; - arg2 = 8; + return 0; case GF_MULT_GROUP: - if (arg1 <= 0 || arg2 <= 0) return -1; - if (region_type != GF_REGION_DEFAULT && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + sizeof(uint64_t) * (1 << arg1) + sizeof(uint64_t) * (1 << arg2) + 64; break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((arg1 == 2 && arg2 == 0) || (arg1 == 2 && arg2 == 1)) { - return sizeof(gf_internal_t) + sizeof(w64_composite_int_t) + 4; - } else { - return -1; - } + if (arg1 == 2) return sizeof(gf_internal_t) + 64; + return 0; break; default: - return -1; + return 0; } } int gf_w64_init(gf_t *gf) { - gf_internal_t *h; + gf_internal_t *h, *h_base, *h_base_base, *h_base_base_base; + int no_default_flag = 0; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x1b; /* Omitting the leftmost 1 as in w=32 */ + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + /* Omitting the leftmost 1 as in w=32 */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x1b; + } + if (no_default_flag == 1) { + fprintf(stderr,"Code contains no default irreducible polynomial for given base field\n"); + return 0; + } + } gf->multiply.w64 = NULL; gf->divide.w64 = NULL; @@ -1734,10 +2226,11 @@ int gf_w64_init(gf_t *gf) gf->multiply_region.w64 = NULL; switch(h->mult_type) { - case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: + case 
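/* Taken together with the scratch-size fall-through above, GF_MULT_DEFAULT
   for w=64 now resolves to: multiply() via PCLMUL (clm_2 or clm_4, chosen
   by how many leading zero coefficients the polynomial has) when
   INTEL_SSE4_PCLMUL is compiled in, otherwise bytwo_p; and
   multiply_region() via SPLIT 64,4 with PSHUFB tables under INTEL_SSE4,
   otherwise SPLIT 64,8 byte tables.  That is why DEFAULT simply falls
   through to the SPLIT_TABLE case here. */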
GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; @@ -1748,11 +2241,6 @@ int gf_w64_init(gf_t *gf) gf->inverse.w64 = gf_w64_euclid; } -/* else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w64 = gf_w64_divide_from_inverse; - gf->inverse.w64 = gf_w64_matrix; - } */ - if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { gf->divide.w64 = gf_w64_divide_from_inverse; } @@ -1760,6 +2248,8 @@ int gf_w64_init(gf_t *gf) gf->inverse.w64 = gf_w64_inverse_from_divide; } + if (h->region_type == GF_REGION_CAUCHY) return 0; + if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w64 = gf_w64_composite_extract_word; diff --git a/gf_w8.c b/gf_w8.c index 306f911..45c500f 100644 --- a/gf_w8.c +++ b/gf_w8.c @@ -15,7 +15,6 @@ #define GF_BASE_FIELD_WIDTH (4) #define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_S_GF_4_2 (4) struct gf_w8_logtable_data { uint8_t log_tbl[GF_FIELD_SIZE]; @@ -37,6 +36,10 @@ struct gf_w8_logzero_small_table_data { uint8_t *div_tbl; }; +struct gf_w8_composite_data { + uint8_t *mult_table; +}; + /* Don't change the order of these relative to gf_w8_half_table_data */ struct gf_w8_default_data { @@ -139,6 +142,7 @@ uint32_t gf_w8_euclid (gf_t *gf, uint32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; } @@ -164,6 +168,30 @@ gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index) return r8[index]; } +static +gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint8_t a, b; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r8 = (uint8_t *) start; + if (r8 + index < (uint8_t *) rd.d_start) return r8[index]; + if (r8 + index >= (uint8_t *) rd.d_top) return r8[index]; + index -= (((uint8_t *) rd.d_start) - r8); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 4)); +} + static inline uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) @@ -171,22 +199,372 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly); } -/* ------------------------------------------------------------ - IMPLEMENTATION: SHIFT: - - JSP: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ static inline -uint32_t +gf_val_32_t +gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. 
We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2, where + z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 1 byte. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + +static +inline +gf_val_32_t +gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result.
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + +#endif + return rv; +} + + +static +void +gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, 
_mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +static +void +gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + +#ifdef INTEL_SSE4_PCLMUL + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +#endif +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: SHIFT: + +JSP: The world's dumbest multiplication algorithm. I only +include it for completeness. It does have the feature that it requires no +extra memory. 
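For reference alongside the CLM kernels above and the SHIFT routine that follows, here is a minimal scalar sketch of both techniques for w = 8. It is a sketch, not the library's code: it assumes the full 9-bit AES polynomial 0x11b (x^8 + x^4 + x^3 + x + 1), picked only because 0x57 * 0x83 = 0xc1 is easy to check against FIPS-197. GF-Complete's default polynomial for w = 8 may differ, and the helper names are ours.

#include <stdint.h>
#include <stdio.h>

/* Carryless (polynomial) multiply: XOR of shifted copies, no reduction. */
static uint32_t clmul(uint32_t a, uint32_t b)
{
  int i;
  uint32_t r = 0;
  for (i = 0; i < 16; i++)
    if (a & (1u << i)) r ^= b << i;
  return r;
}

/* CLM flavor: one wide multiply, then fold the overflow (bits 8..14) back
   in by carrylessly multiplying it with the polynomial.  Two folds always
   suffice for w = 8, which is the guarantee Ben's comment describes. */
static uint8_t gf8_clm_mult(uint8_t a, uint8_t b, uint32_t poly)
{
  uint32_t r = clmul(a, b);   /* degree <= 14 */
  r ^= clmul(r >> 8, poly);   /* first fold: degree drops to <= 10 */
  r ^= clmul(r >> 8, poly);   /* second fold: degree drops below 8 */
  return (uint8_t) r;
}

/* SHIFT flavor: same schoolbook multiply, reduced bit by bit from the top.
   A product of two degree-7 polynomials has degree at most 14, so the
   reduction starts at bit 14 -- the bound this patch tightens in
   gf_w8_shift_multiply. */
static uint8_t gf8_shift_mult(uint8_t a, uint8_t b, uint16_t poly)
{
  int i;
  uint16_t prod = 0;
  for (i = 0; i < 8; i++)
    if (a & (1 << i)) prod ^= (uint16_t) (b << i);
  for (i = 14; i >= 8; i--)
    if (prod & (1 << i)) prod ^= (uint16_t) (poly << (i - 8));
  return (uint8_t) prod;
}

int main(void)
{
  printf("clm:   0x57 * 0x83 = 0x%02x\n", gf8_clm_mult(0x57, 0x83, 0x11b));
  printf("shift: 0x57 * 0x83 = 0x%02x\n", gf8_shift_mult(0x57, 0x83, 0x11b));
  return 0;  /* both lines print 0xc1 */
}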
+ */ + +static +inline + uint32_t gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) { uint16_t product, i, pp, a, b; gf_internal_t *h; - + a = a8; b = b8; h = (gf_internal_t *) gf->scratch; @@ -197,29 +575,55 @@ gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (1 << i)) product ^= (b << i); } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); } return product; } +static +int gf_w8_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + +#ifdef INTEL_SSE4_PCLMUL + if ((0xe0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_2; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_2; + }else if ((0xc0 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_3; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_3; + }else if ((0x80 & h->prim_poly) == 0){ + gf->multiply.w32 = gf_w8_clm_multiply_4; + gf->multiply_region.w32 = gf_w8_clm_multiply_region_from_single_4; + }else{ + return 0; + } + return 1; +#endif + + return 0; + +} + static int gf_w8_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w8_shift_multiply; - gf->inverse.w32 = gf_w8_euclid; +{ + gf->multiply.w32 = gf_w8_shift_multiply; /* The others will be set automatically */ return 1; } /* ------------------------------------------------------------ - IMPLEMENTATION: LOG_TABLE: +IMPLEMENTATION: LOG_TABLE: - JSP: Kevin wrote this, and I'm converting it to my structure. - */ +JSP: Kevin wrote this, and I'm converting it to my structure. +*/ static inline -uint32_t + uint32_t gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_table_data *ltd; @@ -230,7 +634,7 @@ gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_table_data *ltd; @@ -241,7 +645,7 @@ gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_small_table_data *std; @@ -253,7 +657,7 @@ gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logzero_small_table_data *std; @@ -264,7 +668,7 @@ gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) { struct gf_w8_logtable_data *ltd; @@ -275,7 +679,7 @@ gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) static inline -uint32_t + uint32_t gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) { int log_sum = 0; @@ -289,7 +693,7 @@ gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) } static -uint32_t + uint32_t gf_w8_log_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logtable_data *ltd; @@ -299,7 +703,7 @@ gf_w8_log_inverse (gf_t *gf, uint32_t a) } static -uint32_t + uint32_t gf_w8_logzero_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logzero_table_data *ltd; @@ -309,7 +713,7 @@ gf_w8_logzero_inverse (gf_t *gf, uint32_t a) } static -uint32_t + uint32_t gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) { struct gf_w8_logzero_small_table_data *std; @@ -319,7 +723,7 @@ gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) } static -void + void gf_w8_log_multiply_region(gf_t *gf, void *src, void 
*dest, uint32_t val, int bytes, int xor) { int i; @@ -348,7 +752,7 @@ gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int byt } static -void + void gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) { int i; @@ -390,7 +794,7 @@ gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int } } -static + static int gf_w8_log_init(gf_t *gf) { gf_internal_t *h; @@ -400,13 +804,14 @@ int gf_w8_log_init(gf_t *gf) uint8_t *alt; uint8_t *inv; int i, b; + int check = 0; h = (gf_internal_t *) gf->scratch; - if (h->arg1 == 0) { + if (h->mult_type == GF_MULT_LOG_TABLE) { ltd = h->private; alt = ltd->antilog_tbl; inv = ltd->inv_tbl; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { std = h->private; alt = std->antilog_tbl; std->div_tbl = (alt + 255); @@ -418,10 +823,19 @@ int gf_w8_log_init(gf_t *gf) ztd->div_tbl = (alt + 255); inv = ztd->inv_tbl; } - - if (h->arg1 == 0) { + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) + ltd->log_tbl[i] = 0; + else if (h->mult_type == GF_MULT_LOG_ZERO) + std->log_tbl[i] = 0; + else + ztd->log_tbl[i] = 0; + } + + if (h->mult_type == GF_MULT_LOG_TABLE) { ltd->log_tbl[0] = 0; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { std->log_tbl[0] = 510; } else { ztd->log_tbl[0] = 512; @@ -429,23 +843,31 @@ int gf_w8_log_init(gf_t *gf) b = 1; for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { - if (h->arg1 == 0) { - ltd->log_tbl[b] = i; - } else if (h->arg1 == 1) { - std->log_tbl[b] = i; - } else { - ztd->log_tbl[b] = i; - } - alt[i] = b; - alt[i+GF_MULT_GROUP_SIZE] = b; - b <<= 1; - if (b & GF_FIELD_SIZE) { - b = b ^ h->prim_poly; - } + if (h->mult_type == GF_MULT_LOG_TABLE) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + if (std->log_tbl[b] != 0) check = 1; + std->log_tbl[b] = i; + } else { + if (ztd->log_tbl[b] != 0) check = 1; + ztd->log_tbl[b] = i; + } + alt[i] = b; + alt[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + if (check) { + _gf_errno = GF_E_LOGPOLY; + return 0; } - if (h->arg1 == 1) bzero(alt+510, 255); - if (h->arg1 == 2) { + if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255); + + if (h->mult_type == GF_MULT_LOG_ZERO_EXT) { bzero(alt+512, 255); alt[512+512] = 0; } @@ -459,13 +881,13 @@ int gf_w8_log_init(gf_t *gf) if (i & (1 << 8)) i ^= h->prim_poly; b--; } while (i != 1); - - if (h->arg1 == 0) { + + if (h->mult_type == GF_MULT_LOG_TABLE) { gf->inverse.w32 = gf_w8_log_inverse; gf->divide.w32 = gf_w8_log_divide; gf->multiply.w32 = gf_w8_log_multiply; gf->multiply_region.w32 = gf_w8_log_multiply_region; - } else if (h->arg1 == 1) { + } else if (h->mult_type == GF_MULT_LOG_ZERO) { gf->inverse.w32 = gf_w8_logzero_small_inverse; gf->divide.w32 = gf_w8_logzero_small_divide; gf->multiply.w32 = gf_w8_logzero_small_multiply; @@ -480,13 +902,13 @@ int gf_w8_log_init(gf_t *gf) } /* ------------------------------------------------------------ - IMPLEMENTATION: FULL_TABLE: +IMPLEMENTATION: FULL_TABLE: - JSP: Kevin wrote this, and I'm converting it to my structure. +JSP: Kevin wrote this, and I'm converting it to my structure. 
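Before the FULL_TABLE code: the log/antilog construction above is worth seeing in isolation. The antilog table is stored twice over so that a sum of two logs never needs a mod 255, and a repeated log entry during the walk is exactly the "polynomial is not primitive" condition that init now reports as GF_E_LOGPOLY. A stripped-down sketch, assuming the primitive polynomial 0x11d; the names are illustrative, not the library's.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t log_tbl[256];
static uint8_t alog_tbl[512];  /* doubled: log_a + log_b <= 508, no mod 255 */

static int gf8_log_init(uint16_t prim_poly)
{
  int i;
  uint16_t b = 1;
  memset(log_tbl, 0, sizeof(log_tbl));       /* the patch adds the same zeroing */
  for (i = 0; i < 255; i++) {
    if (i > 0 && log_tbl[b] != 0) return 0;  /* revisited: x is not a generator */
    log_tbl[b] = (uint8_t) i;
    alog_tbl[i] = alog_tbl[i + 255] = (uint8_t) b;
    b <<= 1;                                 /* b = x^(i+1) ...                 */
    if (b & 0x100) b ^= prim_poly;           /* ... reduced mod prim_poly       */
  }
  return 1;
}

static uint8_t gf8_log_mult(uint8_t a, uint8_t b)
{
  if (a == 0 || b == 0) return 0;
  return alog_tbl[log_tbl[a] + log_tbl[b]];
}

int main(void)
{
  /* 0x11d is primitive, so the build succeeds.  0x11b (the AES polynomial)
     is irreducible but x only has order 51 in it, so the walk revisits an
     element early and the build is rejected -- the GF_E_LOGPOLY case. */
  printf("0x11d: %s\n", gf8_log_init(0x11d) ? "ok" : "rejected");
  printf("0x57 * 0x83 = 0x%02x\n", gf8_log_mult(0x57, 0x83));
  printf("0x11b: %s\n", gf8_log_init(0x11b) ? "ok" : "rejected");
  return 0;
}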
*/ static -gf_val_32_t + gf_val_32_t gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_single_table_data *ftd; @@ -496,7 +918,7 @@ gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_single_table_data *ftd; @@ -506,7 +928,7 @@ gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_default_data *ftd; @@ -516,7 +938,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_default_data *ftd; @@ -526,7 +948,7 @@ gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_data *ftd; @@ -536,7 +958,7 @@ gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_data *ftd; @@ -546,7 +968,7 @@ gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint16_t *base; @@ -570,7 +992,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t base[(b << 8)| c] = (vb | vc); } } - + } else { dtd = (struct gf_w8_double_table_data *) h->private; base = &(dtd->mult[val][0]); @@ -583,7 +1005,7 @@ gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_lazy_data *ftd; @@ -593,7 +1015,7 @@ gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -gf_val_32_t + gf_val_32_t gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_double_table_lazy_data *ftd; @@ -603,7 +1025,7 @@ gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -628,11 +1050,12 @@ gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } } } + static -void + void gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSSE3 uint8_t *s8, *d8, *bh, *bl, *sptr, *dptr, *top; __m128i tbl, loset, t1, r, va, mth, mtl; uint64_t altable[4]; @@ -654,7 +1077,7 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val sptr = rd.s_start; dptr = rd.d_start; - + mth = _mm_loadu_si128 ((__m128i *)(bh)); mtl = _mm_loadu_si128 ((__m128i *)(bl)); loset = _mm_set1_epi8 (0x0f); @@ -693,11 +1116,11 @@ gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val /* ------------------------------------------------------------ - IMPLEMENTATION: FULL_TABLE: +IMPLEMENTATION: FULL_TABLE: */ static -gf_val_32_t + gf_val_32_t gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { struct gf_w8_half_table_data *htd; @@ -707,7 +1130,7 @@ gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_split_multiply_region(gf_t 
*gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { unsigned long uls, uld; @@ -735,12 +1158,12 @@ gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, in } -static + static int gf_w8_split_init(gf_t *gf) { gf_internal_t *h; struct gf_w8_half_table_data *htd; - int a, b, c, d, pp; + int a, b, pp; h = (gf_internal_t *) gf->scratch; htd = (struct gf_w8_half_table_data *)h->private; @@ -748,34 +1171,34 @@ int gf_w8_split_init(gf_t *gf) bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); - - for (a = 1; a < GF_HALF_SIZE; a++) { - b = 1; - c = a; - d = (a << (GF_FIELD_WIDTH/2)); - do { - htd->low[b][a] = c; - htd->high[b][a] = d; - b <<= 1; - if (b & GF_FIELD_SIZE) b ^= pp; - c <<= 1; - if (c & GF_FIELD_SIZE) c ^= pp; - d <<= 1; - if (d & GF_FIELD_SIZE) d ^= pp; - } while (c != a); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_HALF_SIZE; b++) { + htd->low[a][b] = gf_w8_shift_multiply(gf,a,b); + htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4); + } } - gf->inverse.w32 = NULL; /* Will set from divide */ - gf->divide.w32 = NULL; /* Let the user figure it out. */ gf->multiply.w32 = gf_w8_split_multiply; - if (h->region_type == GF_REGION_NOSSE) { + + #ifdef INTEL_SSSE3 + if (h->region_type & GF_REGION_NOSSE) + gf->multiply_region.w32 = gf_w8_split_multiply_region; + else + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + #else gf->multiply_region.w32 = gf_w8_split_multiply_region; - } else { - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; - } + if(h->region_type & GF_REGION_SSE) + return 0; + #endif + return 1; } +/* JSP: This is disgusting, but it is what it is. If there is no SSE, + then the default is equivalent to single table. If there is SSE, then + we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. 
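split_init above now fills the two nibble tables directly from gf_w8_shift_multiply instead of walking logs, and selects the SSSE3 region routine only when the build flags allow it. The table layout itself is simple; a sketch, again assuming the 0x11b polynomial and our own helper names:

#include <stdint.h>
#include <stdio.h>

static uint8_t gf8_shift_mult(uint8_t a, uint8_t b, uint16_t poly)
{
  int i;
  uint16_t p = 0;
  for (i = 0; i < 8; i++)
    if (a & (1 << i)) p ^= (uint16_t) (b << i);
  for (i = 14; i >= 8; i--)
    if (p & (1 << i)) p ^= (uint16_t) (poly << (i - 8));
  return (uint8_t) p;
}

/* low[a][n]  = a * n        (n = low nibble of the other operand)
   high[a][n] = a * (n << 4) (n = high nibble)                     */
static uint8_t low_tbl[256][16], high_tbl[256][16];

static void gf8_split_init(void)
{
  int a, n;
  for (a = 0; a < 256; a++)
    for (n = 0; n < 16; n++) {
      low_tbl[a][n]  = gf8_shift_mult((uint8_t) a, (uint8_t) n, 0x11b);
      high_tbl[a][n] = gf8_shift_mult((uint8_t) a, (uint8_t) (n << 4), 0x11b);
    }
}

/* Multiplication distributes over XOR, so a*b is two lookups and an XOR.
   The SSSE3 region routine performs sixteen such lookups per byte shuffle. */
static uint8_t gf8_split_mult(uint8_t a, uint8_t b)
{
  return low_tbl[a][b & 0x0f] ^ high_tbl[a][b >> 4];
}

int main(void)
{
  gf8_split_init();
  printf("0x57 * 0x83 = 0x%02x\n", gf8_split_mult(0x57, 0x83));  /* 0xc1 */
  return 0;
}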
*/ + static int gf_w8_table_init(gf_t *gf) { @@ -784,19 +1207,24 @@ int gf_w8_table_init(gf_t *gf) struct gf_w8_double_table_data *dtd = NULL; struct gf_w8_double_table_lazy_data *ltd = NULL; struct gf_w8_default_data *dd = NULL; - int a, b, c, prod, scase; + int a, b, c, prod, scase, issse; h = (gf_internal_t *) gf->scratch; - if (h->mult_type == GF_MULT_DEFAULT) { + issse = 0; +#ifdef INTEL_SSSE3 + issse = 1; +#endif + + if (h->mult_type == GF_MULT_DEFAULT && issse) { dd = (struct gf_w8_default_data *)h->private; scase = 3; bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); - } else if (h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY) || - (h->region_type & GF_REGION_SINGLE_TABLE)) { + } else if (h->mult_type == GF_MULT_DEFAULT || + h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) { ftd = (struct gf_w8_single_table_data *)h->private; bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); @@ -815,93 +1243,98 @@ int gf_w8_table_init(gf_t *gf) fprintf(stderr, "Internal error in gf_w8_table_init\n"); exit(0); } - + for (a = 1; a < GF_FIELD_SIZE; a++) { - b = 1; - prod = a; - do { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w8_shift_multiply(gf,a,b); switch (scase) { - case 0: - ftd->multtable[a][b] = prod; - ftd->divtable[prod][b] = a; - break; - case 1: - dtd->div[prod][b] = a; - for (c = 0; c < GF_FIELD_SIZE; c++) { - dtd->mult[a][(c<<8)|b] |= prod; - dtd->mult[a][(b<<8)|c] |= (prod<<8); - } - break; - case 2: - ltd->div[prod][b] = a; - ltd->smult[a][b] = prod; - break; - case 3: - dd->multtable[a][b] = prod; - dd->divtable[prod][b] = a; - if ((b & 0xf) == b) dd->low[a][b] = prod; - if ((b & 0xf0) == b) dd->high[a][b>>4] = prod; - break; + case 0: + ftd->multtable[a][b] = prod; + ftd->divtable[prod][b] = a; + break; + case 1: + dtd->div[prod][b] = a; + for (c = 0; c < GF_FIELD_SIZE; c++) { + dtd->mult[a][(c<<8)|b] |= prod; + dtd->mult[a][(b<<8)|c] |= (prod<<8); + } + break; + case 2: + ltd->div[prod][b] = a; + ltd->smult[a][b] = prod; + break; + case 3: + dd->multtable[a][b] = prod; + dd->divtable[prod][b] = a; + if ((b & 0xf) == b) { dd->low[a][b] = prod; } + if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; } + break; } - b <<= 1; - if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; - prod <<= 1; - if (prod & GF_FIELD_SIZE) prod = prod ^ h->prim_poly; - - } while (b != 1); + } } gf->inverse.w32 = NULL; /* Will set from divide */ switch (scase) { - case 0: - gf->divide.w32 = gf_w8_table_divide; - gf->multiply.w32 = gf_w8_table_multiply; - gf->multiply_region.w32 = gf_w8_table_multiply_region; - break; - case 1: - gf->divide.w32 = gf_w8_double_table_divide; - gf->multiply.w32 = gf_w8_double_table_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 2: - gf->divide.w32 = gf_w8_double_table_lazy_divide; - gf->multiply.w32 = gf_w8_double_table_lazy_multiply; - gf->multiply_region.w32 = gf_w8_double_table_multiply_region; - break; - case 3: - gf->divide.w32 = gf_w8_default_divide; - gf->multiply.w32 = gf_w8_default_multiply; - gf->multiply_region.w32 = gf_w8_split_multiply_region; -#ifdef INTEL_SSE4 - gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; + case 0: + gf->divide.w32 = gf_w8_table_divide; + gf->multiply.w32 = 
gf_w8_table_multiply; + gf->multiply_region.w32 = gf_w8_table_multiply_region; + break; + case 1: + gf->divide.w32 = gf_w8_double_table_divide; + gf->multiply.w32 = gf_w8_double_table_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 2: + gf->divide.w32 = gf_w8_double_table_lazy_divide; + gf->multiply.w32 = gf_w8_double_table_lazy_multiply; + gf->multiply_region.w32 = gf_w8_double_table_multiply_region; + break; + case 3: +#ifdef INTEL_SSSE3 + gf->divide.w32 = gf_w8_default_divide; + gf->multiply.w32 = gf_w8_default_multiply; + gf->multiply_region.w32 = gf_w8_split_multiply_region_sse; #endif - break; + break; } return 1; } static -void + void gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint8_t val0 = val & 0x0f; uint8_t val1 = (val & 0xf0) >> 4; - int sub_reg_size = bytes / 2; + gf_region_data rd; + int sub_reg_size; - if (bytes % 2 != 0) gf_alignment_error("gf_w8_composite_multiply_region_alt", 1); + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } - base_gf->multiply_region.w32(base_gf, src, dest, val0, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w32(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w32(base_gf, GF_S_GF_4_2, val1), sub_reg_size, 1); + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + sub_reg_size = (rd.d_top - rd.d_start) / 2; + + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start+sub_reg_size, rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); } static gf_val_32_t -gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; @@ -912,8 +1345,35 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) uint8_t a1b1; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); - - return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); + + return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); +} + +static +gf_val_32_t +gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1, 
*mt; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); } /* @@ -938,6 +1398,7 @@ gf_w8_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) * * a / b = a * c */ + static gf_val_32_t gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) @@ -949,10 +1410,9 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) uint8_t c0, c1, c, d, tmp; uint8_t a0inv, a1inv; - if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; - c0 = base_gf->multiply.w32(base_gf, a1inv, GF_S_GF_4_2); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); @@ -963,49 +1423,36 @@ gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf; - tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ GF_S_GF_4_2) & 0xf; + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf; tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf; d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf; - + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; } c = c0 | (c1 << 4); - + return c; } -static -gf_val_32_t -gf_w8_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_val_32_t binv; - - binv = gf_w8_composite_inverse(gf, b); - - return gf_w8_composite_multiply(gf, a, binv); -} - static void gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { - unsigned long uls, uld; + gf_region_data rd; gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w4_single_table_data * std; uint8_t b0 = val & 0x0f; uint8_t b1 = (val & 0xf0) >> 4; - uint8_t *s8 = (uint8_t *) src; - uint8_t *d8 = (uint8_t *) dest; + uint8_t *s8; + uint8_t *d8; + uint8_t *mt; uint8_t a0, a1, a1b1; + struct gf_w8_composite_data *cd; - uls = ((unsigned long) src) & 0xf; - uld = ((unsigned long) dest) & 0xf; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1); + cd = (struct gf_w8_composite_data *) h->private; if (val == 0) { if (xor) return; @@ -1013,124 +1460,115 @@ gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val return; } - std = (struct gf_w4_single_table_data *) h->private; + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; - if (xor) { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); - + mt = cd->mult_table; + if (mt == NULL) { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, 
h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } } } else { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, GF_S_GF_4_2)) << 4)); - } - } - return; -} - -static -void -gf_w8_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w4_single_table_data * std; - uint8_t b0 = val & 0x0f; - uint8_t b1 = (val & 0xf0) >> 4; - uint8_t *s8 = (uint8_t *) src; - uint8_t *d8 = (uint8_t *) dest; - uint8_t a0, a1, a1b1; - - uls = ((unsigned long) src) & 0xf; - uld = ((unsigned long) dest) & 0xf; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w8_composite_multiply_region", 1); - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - std = (struct gf_w4_single_table_data *) h->private; - - if (xor) { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] ^= ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4)); - - } - } else { - for (i = 0;i < bytes; i++) { - a0 = s8[i] & 0x0f; - a1 = (s8[i] & 0xf0) >> 4; - a1b1 = std->mult[a1][b1]; - - d8[i] = ((std->mult[a0][b0] ^ a1b1) | ((std->mult[a1][b0] ^ std->mult[a0][b1] ^ std->mult[a1b1][GF_S_GF_4_2]) << 4)); + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } } } + gf_do_final_region_alignment(&rd); return; } static int gf_w8_composite_init(gf_t *gf) { - struct gf_w4_single_table_data * std; gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint8_t a, b; + struct gf_w8_composite_data *cd; - std = (struct gf_w4_single_table_data *) h->private; + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w8_composite_data *) h->private; + cd->mult_table = gf_w4_get_mult_table(h->base_gf); - for (a = 0; a < 16; a++) { - for (b = 0; b < 16; b++) { - std->mult[a][b] = base_gf->multiply.w32(base_gf, a, b); - } - } - if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w8_composite_multiply_region_alt; } else { - if (h->region_type & GF_REGION_SINGLE_TABLE) { - gf->multiply_region.w32 = gf_w8_composite_multiply_region_table; - } else { - gf->multiply_region.w32 = 
gf_w8_composite_multiply_region; - } + gf->multiply_region.w32 = gf_w8_composite_multiply_region; } - gf->multiply.w32 = gf_w8_composite_multiply; - gf->divide.w32 = gf_w8_composite_divide; + if (cd->mult_table == NULL) { + gf->multiply.w32 = gf_w8_composite_multiply_recursive; + } else { + gf->multiply.w32 = gf_w8_composite_multiply_inline; + } + gf->divide.w32 = NULL; gf->inverse.w32 = gf_w8_composite_inverse; - + return 1; } static inline -gf_val_32_t + gf_val_32_t gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, pmask, amask; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; - + prod = 0; pmask = 0x80; amask = 0x80; @@ -1149,12 +1587,12 @@ gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) static inline -gf_val_32_t + gf_val_32_t gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) { uint32_t prod, pp, bmask; gf_internal_t *h; - + h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; @@ -1174,13 +1612,13 @@ gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) } static -void + void gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { uint64_t *s64, *d64, t1, t2, ta, prod, amask; gf_region_data rd; struct gf_w8_bytwo_data *btd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1225,18 +1663,18 @@ gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t } #define BYTWO_P_ONESTEP {\ - SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ - t1 = _mm_and_si128(v, one); \ - t1 = _mm_sub_epi8(t1, one); \ - t1 = _mm_and_si128(t1, ta); \ - prod = _mm_xor_si128(prod, t1); \ - v = _mm_srli_epi64(v, 1); } + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } static -void + void gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint8_t vrev; @@ -1244,7 +1682,7 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; struct gf_w8_bytwo_data *btd; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1289,10 +1727,10 @@ gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } static -void + void gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1315,10 +1753,10 @@ gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *bt } static -void + void gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int i; uint8_t *d8, *s8, tb; __m128i pp, m1, m2, t1, t2, va, vb; @@ -1344,16 +1782,16 @@ gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) static -void + void gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { -#ifdef INTEL_SSE4 +#ifdef INTEL_SSE2 int itb; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; struct gf_w8_bytwo_data *btd; gf_region_data rd; - + if (val == 0) { gf_multby_zero(dest, bytes, 
xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } @@ -1399,7 +1837,7 @@ gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v } static -void + void gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) { int i; @@ -1419,349 +1857,349 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t d64 = (uint64_t *) rd.d_start; switch (val) { - case 2: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + /* + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + */ + case 8: + if (xor) { + while (d64 < (uint64_t *) 
rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + /* + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; - case 3: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - case 4: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, 
t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; - case 5: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - case 6: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } -/* - case 7: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta ^ prod; - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - */ - case 8: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= ta; - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - 
AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = ta; - d64++; - s64++; - } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; } - break; -/* - case 9: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; } - break; - case 10: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 11: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 12: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 13: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, 
t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 14: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; - case 15: - if (xor) { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 ^= (ta ^ prod); - d64++; - s64++; - } - } else { - while (d64 < (uint64_t *) rd.d_top) { - ta = *s64; - prod = ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - prod ^= ta; - AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); - *d64 = (ta ^ prod); - d64++; - s64++; - } - } - break; -*/ - default: if (xor) { while (d64 < (uint64_t *) rd.d_top) { prod = *d64 ; @@ -1798,7 +2236,7 @@ gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t gf_do_final_region_alignment(&rd); } -static + static int gf_w8_bytwo_init(gf_t *gf) { gf_internal_t *h; @@ -1825,48 +2263,54 @@ int gf_w8_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w8_bytwo_p_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; - } else { +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; - } + else + gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif } else { gf->multiply.w32 = gf_w8_bytwo_b_multiply; - if (h->region_type == GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; - } else { +#ifdef INTEL_SSE2 + if (h->region_type & GF_REGION_NOSSE) gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; - } + else + gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region; +#else + gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region; + if(h->region_type & GF_REGION_SSE) + return 0; +#endif } - gf->inverse.w32 = gf_w8_euclid; return 1; } /* ------------------------------------------------------------ General procedures. + You don't need to error check here or in init, because it's done + for you in gf_error_check().
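The BYTWO kernels above all reduce to one primitive: multiplying every byte of a machine word by two in GF(2^8) at once, which is what AB2 and SSE_AB2 accomplish with their two masks. A scalar sketch of that primitive, and of the bit-at-a-time multiply built from it, follows. It assumes the 0x11b polynomial (so the folded byte is 0x1b), and the multiply-by-0x1b replication trick is one common way to write this, not the library's exact formulation.

#include <stdint.h>
#include <stdio.h>

/* Multiply each of the eight bytes of w by two, in parallel.  Bytes whose
   top bit is set are about to overflow; "over" holds a 1 in the low bit of
   each such byte, and multiplying it by 0x1b replicates the reduction byte
   into exactly those lanes (the per-lane products cannot carry across). */
static uint64_t gf8_word_times_two(uint64_t w)
{
  uint64_t over = (w & 0x8080808080808080ULL) >> 7;
  return ((w << 1) & 0xfefefefefefefefeULL) ^ (over * 0x1b);
}

/* BYTWO_b in scalar form: walk the bits of b, doubling a at each step. */
static uint8_t gf8_bytwo_mult(uint8_t a, uint8_t b)
{
  uint8_t prod = 0;
  while (b) {
    if (b & 1) prod ^= a;
    b >>= 1;
    a = (uint8_t) ((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
  }
  return prod;
}

int main(void)
{
  printf("bytes*2: %016llx\n",
         (unsigned long long) gf8_word_times_two(0x8001578001578001ULL));
  printf("0x57 * 0x83 = 0x%02x\n", gf8_bytwo_mult(0x57, 0x83));  /* 0xc1 */
  return 0;
}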
*/ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int sse; - - sse = (GF_REGION_SSE | GF_REGION_NOSSE); - switch(mult_type) { case GF_MULT_DEFAULT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; +#ifdef INTEL_SSSE3 return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; +#endif + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type == GF_REGION_CAUCHY || region_type == (GF_REGION_CAUCHY | GF_REGION_SINGLE_TABLE)) { + if (region_type == GF_REGION_CAUCHY) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; } - if (region_type == 0) region_type = GF_REGION_SINGLE_TABLE; - if (region_type & GF_REGION_SINGLE_TABLE) { - if (region_type != GF_REGION_SINGLE_TABLE) return 0; + if (region_type == GF_REGION_DEFAULT) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; } if (region_type & GF_REGION_DOUBLE_TABLE) { @@ -1875,62 +2319,62 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1 } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64; } else { - return -1; + return 0; } } - return -1; + return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != GF_REGION_CAUCHY) { - if ((region_type | sse) != sse || (region_type & sse) == sse) return -1; - } return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data); break; case GF_MULT_SPLIT_TABLE: if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) { - if (region_type == GF_REGION_CAUCHY) { - return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; - } - if (region_type == 0) region_type = GF_REGION_SSE; - if ((region_type | sse) != sse) return -1; - if ((region_type & sse) == sse) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; } - return -1; break; case GF_MULT_LOG_TABLE: - if ((arg1 != 0 && arg1 != 1 && arg1 != 2) || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; - if (arg1 == 0) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; - if (arg1 == 1) return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + break; + case GF_MULT_LOG_ZERO_EXT: return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64; break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0) return -1; - if (region_type != 0 && region_type != GF_REGION_CAUCHY) return -1; return sizeof(gf_internal_t); break; case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1; - if (arg1 == 2 && arg2 == 4) { - return sizeof(gf_internal_t) + sizeof(struct gf_w4_single_table_data) + 64; - } else { - return -1; - } + return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64; default: - return -1; - } + return 0; + } + return 0; } int gf_w8_init(gf_t *gf) { - gf_internal_t 
*h; + gf_internal_t *h, *h_base; h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x11d; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. */ + } else { + h->prim_poly = 0x11d; + } + } + if (h->mult_type != GF_MULT_COMPOSITE) { + h->prim_poly |= 0x100; + } gf->multiply.w32 = NULL; gf->divide.w32 = NULL; @@ -1939,16 +2383,20 @@ int gf_w8_init(gf_t *gf) gf->extract_word.w32 = gf_w8_extract_word; switch(h->mult_type) { - case GF_MULT_DEFAULT: if (gf_w8_table_init(gf) == 0) return 0; break; - case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: - case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; - case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; - case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_ZERO: + case GF_MULT_LOG_ZERO_EXT: + case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; + case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; default: return 0; } + if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w32 = gf_w8_divide_from_inverse; gf->inverse.w32 = gf_w8_euclid; @@ -1957,11 +2405,15 @@ int gf_w8_init(gf_t *gf) gf->inverse.w32 = gf_w8_matrix; } - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + if (gf->divide.w32 == NULL) { gf->divide.w32 = gf_w8_divide_from_inverse; + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_euclid; } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w8_inverse_from_divide; + + if (gf->inverse.w32 == NULL) gf->inverse.w32 = gf_w8_inverse_from_divide; + + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { + gf->extract_word.w32 = gf_w8_composite_extract_word; } if (h->region_type == GF_REGION_CAUCHY) { @@ -1969,6 +2421,10 @@ int gf_w8_init(gf_t *gf) gf->extract_word.w32 = gf_wgen_extract_word; } + if (gf->multiply_region.w32 == NULL) { + gf->multiply_region.w32 = gf_w8_multiply_region_from_single; + } + return 1; } @@ -2001,7 +2457,7 @@ uint8_t *gf_w8_get_div_table(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (gf->multiply.w32 == gf_w8_default_multiply) { ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; - return (uint8_t *) std->divtable; + return (uint8_t *) ftd->divtable; } else if (gf->multiply.w32 == gf_w8_table_multiply) { std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; return (uint8_t *) std->divtable; diff --git a/gf_wgen.c b/gf_wgen.c index 7d5144b..ede115c 100644 --- a/gf_wgen.c +++ b/gf_wgen.c @@ -93,6 +93,7 @@ gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b) while (d_ip1 >= d_i) { c_i ^= (1 << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); + if 
(e_ip1 == 0) return 0; while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; }
@@ -223,7 +224,7 @@ gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) pp = h->prim_poly; prod = 0; - pmask = (1 << (h->w)-1); + pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/ amask = pmask; while (amask != 0) {
@@ -508,16 +509,11 @@ int gf_wgen_table_8_init(gf_t *gf) } for (a = 1; a < (1 << w); a++) { - b = 1; - p = a; - do { - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1); - b &= ((1 << w)-1); - p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1); - p &= ((1 << w)-1); - } while (b != 1); + for (b = 1; b < (1 << w); b++) { + p = gf_wgen_shift_multiply(gf, a, b); + std->mult[(a<<w)|b] = p; + std->div[(p<<w)|a] = b; + } } gf->multiply.w32 = gf_wgen_table_8_multiply;
@@ -572,18 +568,13 @@ int gf_wgen_table_16_init(gf_t *gf) std->div[a] = 0; std->div[a<<w] = 0; } for (a = 1; a < (1 << w); a++) { - b = 1; - p = a; - do { - std->mult[(a<<w)|b] = p; - std->div[(p<<w)|a] = b; - b = (b & (1 << (w-1))) ? (b << 1) ^ h->prim_poly : (b << 1); - b &= ((1 << w)-1); - p = (p & (1 << (w-1))) ? (p << 1) ^ h->prim_poly : (p << 1); - p &= ((1 << w)-1); - } while (b != 1); + for (b = 1; b < (1 << w); b++) { + p = gf_wgen_shift_multiply(gf, a, b); + std->mult[(a<<w)|b] = p; + std->div[(p<<w)|a] = b; + } } gf->multiply.w32 = gf_wgen_table_16_multiply;
@@ -599,6 +590,11 @@ int gf_wgen_table_init(gf_t *gf) h = (gf_internal_t *) gf->scratch; if (h->w <= 8) return gf_wgen_table_8_init(gf); if (h->w <= 14) return gf_wgen_table_16_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; } static
@@ -640,6 +636,7 @@ int gf_wgen_log_8_init(gf_t *gf) struct gf_wgen_log_w8_data *std; int w; uint32_t a, i; + int check = 0; h = (gf_internal_t *) gf->scratch; w = h->w;
@@ -649,17 +646,27 @@ std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - i = 0; + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); - + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + gf->multiply.w32 = gf_wgen_log_8_multiply; gf->divide.w32 = gf_wgen_log_8_divide; return 1;
@@ -704,6 +711,7 @@ int gf_wgen_log_16_init(gf_t *gf) struct gf_wgen_log_w16_data *std; int w; uint32_t a, i; + int check = 0; h = (gf_internal_t *) gf->scratch; w = h->w; std = (struct gf_wgen_log_w16_data *) h->private; std->log = &(std->base); std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - - i = 0; + + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf); + _gf_errno = GF_E_LOGPOLY; + return 0; + } gf->multiply.w32 = gf_wgen_log_16_multiply; gf->divide.w32 = gf_wgen_log_16_divide;
@@ -768,7 +787,8 @@ int gf_wgen_log_32_init(gf_t *gf) struct gf_wgen_log_w32_data *std; int w; uint32_t a, i; - + int check = 0; + h = (gf_internal_t *) gf->scratch; w = h->w; std = (struct gf_wgen_log_w32_data *) h->private;
@@ -777,17 +797,27 @@ std->anti = std->log + (1<<h->w); std->danti = std->anti + (1<<h->w)-1; - i = 0; + for (i = 0; i < (1 << w); i++) + std->log[i] = 0; + a = 1; - do { + for(i=0; i < (1<<w)-1; i++) { + if (std->log[a] != 0) check = 1; std->log[a] = i; std->anti[i] = a; std->danti[i] = a; - i++; - a = (a & (1 << (w-1))) ? (a << 1) ^ h->prim_poly : (a << 1); - a &= ((1 << w)-1); - } while (a != 1); - + a <<= 1; + if(a & (1<<w)) a ^= h->prim_poly; + //a &= ((1 << w)-1); + } + + if (check != 0) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + gf->multiply.w32 = gf_wgen_log_32_multiply; gf->divide.w32 = gf_wgen_log_32_divide; return 1;
@@ -802,15 +832,16 @@ int gf_wgen_log_init(gf_t *gf) if (h->w <= 8) return gf_wgen_log_8_init(gf); if (h->w <= 16) return gf_wgen_log_16_init(gf); if (h->w <= 32) return gf_wgen_log_32_init(gf); + + /* Returning zero to make the compiler happy, but this won't get + executed, because it is tested in _scratch_space. */ + + return 0; } int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2) { - if (w > 32 || w < 0) return -1; - - if ((region_type | GF_REGION_CAUCHY) != GF_REGION_CAUCHY) return -1; - switch(mult_type) { case GF_MULT_DEFAULT:
@@ -828,40 +859,37 @@ case GF_MULT_SHIFT: case GF_MULT_BYTWO_b: case GF_MULT_BYTWO_p: - if (arg1 != 0 || arg2 != 0) return -1; return sizeof(gf_internal_t); break; case GF_MULT_GROUP: - if (arg1 <= 0 || arg2 <= 0) return -1; return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) + sizeof(uint32_t) * (1 << arg1) + sizeof(uint32_t) * (1 << arg2) + 64; break; case GF_MULT_TABLE: - if (arg1 != 0 || arg2 != 0) return -1; if (w <= 8) { return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) + sizeof(uint8_t)*(1 << w)*(1<prim_poly = 00020000007; break; default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1); } + } else { + if (h->w == 32) { + h->prim_poly &= 0xffffffff; + } else { + h->prim_poly |= (1 << h->w); + if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0; + } } gf->multiply.w32 = NULL;
@@ -950,7 +985,7 @@ int gf_wgen_init(gf_t *gf) } else if (h->w <= 16) { if (gf_wgen_log_init(gf) == 0) return 0; } else { - if (gf_wgen_group_init(gf) == 0) return 0; + if (gf_wgen_bytwo_p_init(gf) == 0) return 0; } break; case GF_MULT_SHIFT: if (gf_wgen_shift_init(gf) == 0) return 0; break;
diff --git a/release-files.txt b/release-files.txt deleted file mode 100644 index ca25004..0000000 --- a/release-files.txt +++ /dev/null @@ -1,31 +0,0 @@ -License.txt -README.txt -GNUmakefile -gf.c -gf_add.c -gf_complete.h -gf_div.c -gf_example_1.c -gf_example_2.c -gf_example_3.c -gf_example_4.c -gf_general.c -gf_general.h -gf_int.h -gf_method.c -gf_method.h -gf_methods.c -gf_mult.c -gf_poly.c -gf_rand.c -gf_rand.h -gf_time.c -gf_unit.c -gf_w128.c -gf_w16.c -gf_w32.c -gf_w4.c -gf_w64.c -gf_w8.c -gf_wgen.c -whats_my_sse.c diff --git
a/tests.txt b/tests.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tmp-10-out.txt b/tmp-10-out.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tmp-time-test.sh b/tmp-time-test.sh deleted file mode 100644 index e30fca8..0000000 --- a/tmp-time-test.sh +++ /dev/null @@ -1,14 +0,0 @@ -if [ $# -lt 4 ]; then - echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2 - exit 1 -fi - -w=$1 -shift -i=1024 -while [ $i -le 134217728 ]; do - iter=`echo $i | awk '{ print (134217728/$1)*1 }'` - echo $i $iter $w $* `./gf_time $w G -1 $i $iter $* | head -n 3 | tail -n 2` - i=`echo $i | awk '{ print $1*2 }'` -done - diff --git a/tmp.c b/tmp.c deleted file mode 100644 index a6deaab..0000000 --- a/tmp.c +++ /dev/null @@ -1,1583 +0,0 @@ -/* - * gf_w32.c - * - * Routines for 32-bit Galois fields - */ - -#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } - -#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } - -#include "gf_int.h" -#include -#include - -#define GF_FIELD_WIDTH (32) -#define GF_FIRST_BIT (1 << 31) - -#define GF_BASE_FIELD_WIDTH (16) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_S_GF_16_2 (40188) -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); - - -struct gf_w16_logtable_data { - int log_tbl[GF_BASE_FIELD_SIZE]; - gf_val_16_t _antilog_tbl[GF_BASE_FIELD_SIZE * 4]; - gf_val_16_t *antilog_tbl; - gf_val_16_t inv_tbl[GF_BASE_FIELD_SIZE]; -}; - -struct gf_split_2_32_lazy_data { - gf_val_32_t last_value; - gf_val_32_t tables[16][4]; -}; - -struct gf_split_8_8_data { - gf_val_32_t tables[7][256][256]; -}; - -struct gf_split_4_32_lazy_data { - gf_val_32_t last_value; - gf_val_32_t tables[8][16]; -}; - -static -inline -gf_val_32_t gf_w32_inverse_from_divide (gf_t *gf, gf_val_32_t a) -{ - return gf->divide.w32(gf, 1, a); -} - -static -inline -gf_val_32_t gf_w32_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - b = gf->inverse.w32(gf, b); - return gf->multiply.w32(gf, a, b); -} - -static -void -gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int -xor) -{ - int i; - gf_val_32_t *s32; - gf_val_32_t *d32; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - - if (xor) { - for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) { - d32[i] ^= gf->multiply.w32(gf, val, s32[i]); - } - } else { - for (i = 0; i < bytes/sizeof(gf_val_32_t); i++) { - d32[i] = gf->multiply.w32(gf, val, s32[i]); - } - } -} - -static -inline -gf_val_32_t gf_w32_euclid (gf_t *gf, gf_val_32_t b) -{ - gf_val_32_t e_i, e_im1, e_ip1; - gf_val_32_t d_i, d_im1, d_ip1; - gf_val_32_t y_i, y_im1, y_ip1; - gf_val_32_t c_i; - - if (b == 0) return -1; - e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; - e_i = b; - d_im1 = 32; - for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; - y_i = 1; - y_im1 = 0; - - while (e_i != 1) { - - e_ip1 = e_im1; - d_ip1 = d_im1; - c_i = 0; - - while (d_ip1 >= d_i) { - c_i ^= (1 << (d_ip1 - d_i)); - e_ip1 ^= (e_i << (d_ip1 - d_i)); - d_ip1--; - while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; - } - - y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); - y_im1 = y_i; - y_i = 
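/* editor's annotation: rotate the extended-Euclid iterates forward for the next round */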
y_ip1; - - e_im1 = e_i; - d_im1 = d_i; - e_i = e_ip1; - d_i = d_ip1; - } - - return y_i; -} - -static -inline -gf_val_32_t gf_w32_matrix (gf_t *gf, gf_val_32_t b) -{ - return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly); -} - -/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only - include it for completeness. It does have the feature that it requires no - extra memory. -*/ - -static -inline -gf_val_32_t -gf_w32_shift_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - uint64_t product, i, pp, a, b, one; - gf_internal_t *h; - - a = a32; - b = b32; - h = (gf_internal_t *) gf->scratch; - one = 1; - pp = h->prim_poly | (one << 32); - - product = 0; - - for (i = 0; i < GF_FIELD_WIDTH; i++) { - if (a & (one << i)) product ^= (b << i); - } - for (i = (GF_FIELD_WIDTH*2-1); i >= GF_FIELD_WIDTH; i--) { - if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); - } - return product; -} - -static -int gf_w32_shift_init(gf_t *gf) -{ - gf->multiply.w32 = gf_w32_shift_multiply; - gf->inverse.w32 = gf_w32_euclid; - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - return 1; -} - -static -inline -gf_val_32_t -gf_w32_split_8_8_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) -{ - uint32_t product, i, j, mask, tb; - gf_internal_t *h; - struct gf_split_8_8_data *d8; - - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; - product = 0; - mask = 0xff; - - for (i = 0; i < 4; i++) { - tb = b32; - for (j = 0; j < 4; j++) { - product ^= d8->tables[i+j][a32&mask][tb&mask]; - tb >>= 8; - } - a32 >>= 8; - } - return product; -} - -static -inline -void -gf_w32_split_8_8_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - uint32_t product, mask, tb, tv, tp; - gf_internal_t *h; - struct gf_split_8_8_data *d8; - uint32_t *p00, *p01, *p02, *p03; - uint32_t *p10, *p11, *p12, *p13; - uint32_t *p20, *p21, *p22, *p23; - uint32_t *p30, *p31, *p32, *p33; - uint32_t *s32, *d32, *top; - unsigned long uls, uld; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_8_8_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_8_8_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - tv = val; - h = (gf_internal_t *) gf->scratch; - d8 = (struct gf_split_8_8_data *) h->private; - mask = 0xff; - - p00 = &(d8->tables[0][val&mask][0]); - p01 = &(d8->tables[1][val&mask][0]); - p02 = &(d8->tables[2][val&mask][0]); - p03 = &(d8->tables[3][val&mask][0]); - val >>= 8; - p10 = &(d8->tables[1][val&mask][0]); - p11 = &(d8->tables[2][val&mask][0]); - p12 = &(d8->tables[3][val&mask][0]); - p13 = &(d8->tables[4][val&mask][0]); - val >>= 8; - p20 = &(d8->tables[2][val&mask][0]); - p21 = &(d8->tables[3][val&mask][0]); - p22 = &(d8->tables[4][val&mask][0]); - p23 = &(d8->tables[5][val&mask][0]); - val >>= 8; - p30 = &(d8->tables[3][val&mask][0]); - p31 = &(d8->tables[4][val&mask][0]); - p32 = &(d8->tables[5][val&mask][0]); - p33 = &(d8->tables[6][val&mask][0]); - - s32 = (uint32_t *) src; - d32 = (uint32_t *) dest; - top = (d32 + (bytes/4)); - - while (d32 < top) { - tb = *s32; - tp = *d32; - product = (xor) ? 
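/* xor mode folds the table products into the existing dest word */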
(*d32) : 0; - product ^= p00[tb&mask]; - product ^= p10[tb&mask]; - product ^= p20[tb&mask]; - product ^= p30[tb&mask]; - - tb >>= 8; - product ^= p01[tb&mask]; - product ^= p11[tb&mask]; - product ^= p21[tb&mask]; - product ^= p31[tb&mask]; - - tb >>= 8; - product ^= p02[tb&mask]; - product ^= p12[tb&mask]; - product ^= p22[tb&mask]; - product ^= p32[tb&mask]; - - tb >>= 8; - product ^= p03[tb&mask]; - product ^= p13[tb&mask]; - product ^= p23[tb&mask]; - product ^= p33[tb&mask]; - *d32 = product; - s32++; - d32++; - } -} - -static -void -gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld; - int i; - gf_val_32_t pp, v, v2, s, *s32, *d32, *top; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_2_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_2_32_lazy_data *) h->private; - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - ld->tables[i][0] = 0; - ld->tables[i][1] = v; - ld->tables[i][2] = v2; - ld->tables[i][3] = (v2 ^ v); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - } - ld->last_value = val; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - while (d32 != top) { - v = (xor) ? *d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&3]; - s >>= 2; - i++; - } - *d32 = v; - d32++; - s32++; - } -} - -static -void -gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, tindex; - gf_val_32_t pp, v, v2, s, *s32, *d32, *top; - __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_2_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - top = (gf_val_32_t *) (uld - (uld & 0xf)); - uld &= 0xf; - - v = val; - for (i = 0; i < 16; i++) { - v2 = (v << 1); - if (v & GF_FIRST_BIT) v2 ^= pp; - tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0); - v = (v2 << 1); - if (v2 & GF_FIRST_BIT) v ^= pp; - } - - shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); - mask1 = _mm_set1_epi8(0x3); - mask2 = _mm_set1_epi8(0xc); - - while (d32 != top) { - pi = (xor) ? 
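/* seed the SSE accumulator from dest when xor-ing, from zero otherwise */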
_mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128(); - vi = _mm_load_si128((__m128i *) s32); - - tindex = 0; - for (i = 0; i < 4; i++) { - si = _mm_shuffle_epi8(vi, shuffler); - - xi = _mm_and_si128(si, mask1); - xi = _mm_slli_epi16(xi, 2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - xi = _mm_and_si128(si, mask2); - xi = _mm_xor_si128(xi, adder); - pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); - si = _mm_srli_epi16(si, 2); - tindex++; - - vi = _mm_srli_epi32(vi, 8); - } - _mm_store_si128((__m128i *) d32, pi); - d32 += 4; - s32 += 4; - } - - while (uld > 0) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - uld -= 4; - } - - -#endif -} - -static -void -gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h; - struct gf_split_4_32_lazy_data *ld; - int i, j, k; - gf_val_32_t pp, v, s, *s32, *d32, *top; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0x7) != (uld & 0x7))) gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - if (ld->last_value != val) { - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - } - } - ld->last_value = val; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - while (d32 != top) { - v = (xor) ? 
*d32 : 0; - s = *s32; - i = 0; - while (s != 0) { - v ^= ld->tables[i][s&0xf]; - s >>= 4; - i++; - } - *d32 = v; - d32++; - s32++; - } -} - -static -void -gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, k, tindex; - gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop; - __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; - struct gf_split_4_32_lazy_data *ld; - uint8_t btable[16]; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - realtop = top; - - /* You need the size of this region to be a multiple of 64 bytes */ - bytes = (top - d32); - bytes -= (bytes & 0xf); - top = (d32 + bytes); - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? 
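/* multiply v by x: shift left, reducing by the primitive polynomial pp on overflow */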
((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - - if (xor) { - while (d32 != top) { - p0 = _mm_load_si128 ((__m128i *) d32); - p1 = _mm_load_si128 ((__m128i *) (d32+4)); - p2 = _mm_load_si128 ((__m128i *) (d32+8)); - p3 = _mm_load_si128 ((__m128i *) (d32+12)); - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - - si = _mm_and_si128(v0, mask1); - p0 = 
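/* each _mm_shuffle_epi8 below performs 16 parallel 4-bit table lookups */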
_mm_shuffle_epi8(tables[0][0], si); - p1 = _mm_shuffle_epi8(tables[0][1], si); - p2 = _mm_shuffle_epi8(tables[0][2], si); - p3 = _mm_shuffle_epi8(tables[0][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - - while (d32 < realtop) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - } - -#endif -} - - -static -void -gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ -#ifdef INTEL_SSE4 - unsigned long uls, uld; - gf_internal_t *h; - int i, m, j, k, tindex; - gf_val_32_t pp, v, s, *s32, *d32, *top, *realtop; - __m128i vi, si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8, mask16; - __m128i tv1, tv2, tv3, tv0; - struct gf_split_4_32_lazy_data *ld; - uint8_t btable[16]; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if (uls %4 != 0 || ((uls & 0xf) != (uld & 0xf))) gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region", 4); - if (bytes % 4 != 0) { - gf_alignment_error("gf_w32_split_4_32_lazy_sse_multiply_region: buffer size not divisible by symbol size = 4 bytes", 4); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - h = (gf_internal_t *) gf->scratch; - pp = 
h->prim_poly; - - uls &= 0xf; - - s32 = (gf_val_32_t *) src; - d32 = (gf_val_32_t *) dest; - top = d32 + (bytes/4); - - if (uls != 0) { - while (uls != 16) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - if (d32 == top) return; - uls += 4; - } - } - - uld = (unsigned long) top; - realtop = top; - - /* You need the size of this region to be a multiple of 64 bytes */ - bytes = (top - d32); - bytes -= (bytes & 0xf); - top = (d32 + bytes); - - ld = (struct gf_split_4_32_lazy_data *) h->private; - - v = val; - for (i = 0; i < 8; i++) { - ld->tables[i][0] = 0; - for (j = 1; j < 16; j <<= 1) { - for (k = 0; k < j; k++) { - ld->tables[i][k^j] = (v ^ ld->tables[i][k]); - } - v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); - } - for (j = 0; j < 4; j++) { - for (k = 0; k < 16; k++) { - btable[k] = (uint8_t) ld->tables[i][k]; - ld->tables[i][k] >>= 8; - } - tables[i][j] = _mm_loadu_si128((__m128i *) btable); - } - } - - mask1 = _mm_set1_epi8(0xf); - mask8 = _mm_set1_epi16(0xff); - mask16 = _mm_set1_epi32(0xffff); - - if (xor) { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - -/* printf("Val = %x\n", val); - MM_PRINT8("Old V0", v0); - MM_PRINT8("Old V1", v1); - MM_PRINT8("Old V2", v2); - MM_PRINT8("Old V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - -/* MM_PRINT8("Middle V0", v0); - MM_PRINT8("Middle V1", v1); - MM_PRINT8("Middle V2", v2); - MM_PRINT8("Middle V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - -/* MM_PRINT8("New V0", v0); - MM_PRINT8("New V1", v1); - MM_PRINT8("New V2", v2); - MM_PRINT8("New V3", v3); - printf("\n"); */ - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, 
_mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - -/* MM_PRINT8("Old P0", p0); - MM_PRINT8("Old P1", p1); - MM_PRINT8("Old P2", p2); - MM_PRINT8("Old P3", p3); - printf("\n"); */ - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - -/* MM_PRINT8("Middle P0", tv0); - MM_PRINT8("Middle P1", tv1); - MM_PRINT8("Middle P2", tv2); - MM_PRINT8("Middle P3", tv3); - printf("\n"); */ - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - -/* MM_PRINT8("New P0", p0); - MM_PRINT8("New P1", p1); - MM_PRINT8("New P2", p2); - MM_PRINT8("New P3", p3); - printf("\n"); - exit(1); */ - - v0 = _mm_load_si128 ((__m128i *) d32); - v1 = _mm_load_si128 ((__m128i *) (d32+4)); - v2 = _mm_load_si128 ((__m128i *) (d32+8)); - v3 = _mm_load_si128 ((__m128i *) (d32+12)); - - p0 = _mm_xor_si128(p0, v0); - p1 = _mm_xor_si128(p1, v1); - p2 = _mm_xor_si128(p2, v2); - p3 = _mm_xor_si128(p3, v3); - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) (d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } else { - while (d32 != top) { - v0 = _mm_load_si128((__m128i *) s32); s32 += 4; - v1 = _mm_load_si128((__m128i *) s32); s32 += 4; - v2 = _mm_load_si128((__m128i *) s32); s32 += 4; - v3 = _mm_load_si128((__m128i *) s32); s32 += 4; - -/* printf("Val = %x\n", val); - MM_PRINT8("Old V0", v0); - MM_PRINT8("Old V1", v1); - MM_PRINT8("Old V2", v2); - MM_PRINT8("Old V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p1, p0); - v1 = _mm_packus_epi16(tv1, tv0); - v2 = _mm_packus_epi16(p3, p2); - v3 = _mm_packus_epi16(tv3, tv2); - -/* MM_PRINT8("Middle V0", v0); - MM_PRINT8("Middle V1", v1); - MM_PRINT8("Middle V2", v2); - MM_PRINT8("Middle V3", v3); - printf("\n"); */ - - p0 = _mm_srli_epi16(v0, 8); - p1 = _mm_srli_epi16(v1, 8); - 
p2 = _mm_srli_epi16(v2, 8); - p3 = _mm_srli_epi16(v3, 8); - - tv0 = _mm_and_si128(v0, mask8); - tv1 = _mm_and_si128(v1, mask8); - tv2 = _mm_and_si128(v2, mask8); - tv3 = _mm_and_si128(v3, mask8); - - v0 = _mm_packus_epi16(p2, p0); - v1 = _mm_packus_epi16(p3, p1); - v2 = _mm_packus_epi16(tv2, tv0); - v3 = _mm_packus_epi16(tv3, tv1); - -/* MM_PRINT8("New V0", v0); - MM_PRINT8("New V1", v1); - MM_PRINT8("New V2", v2); - MM_PRINT8("New V3", v3); - printf("\n"); */ - - si = _mm_and_si128(v0, mask1); - p0 = _mm_shuffle_epi8(tables[6][0], si); - p1 = _mm_shuffle_epi8(tables[6][1], si); - p2 = _mm_shuffle_epi8(tables[6][2], si); - p3 = _mm_shuffle_epi8(tables[6][3], si); - - v0 = _mm_srli_epi32(v0, 4); - si = _mm_and_si128(v0, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); - - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); - - v1 = _mm_srli_epi32(v1, 4); - si = _mm_and_si128(v1, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); - - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); - - v2 = _mm_srli_epi32(v2, 4); - si = _mm_and_si128(v2, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); - - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); - - v3 = _mm_srli_epi32(v3, 4); - si = _mm_and_si128(v3, mask1); - p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); - p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); - p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); - p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); - -/* MM_PRINT8("Old P0", p0); - MM_PRINT8("Old P1", p1); - MM_PRINT8("Old P2", p2); - MM_PRINT8("Old P3", p3); - printf("\n"); */ - - tv0 = _mm_unpackhi_epi8(p1, p3); - tv1 = _mm_unpackhi_epi8(p0, p2); - tv2 = _mm_unpacklo_epi8(p1, p3); - tv3 = _mm_unpacklo_epi8(p0, p2); - -/* MM_PRINT8("Middle P0", tv0); - MM_PRINT8("Middle P1", tv1); - MM_PRINT8("Middle P2", tv2); - MM_PRINT8("Middle P3", tv3); - printf("\n"); */ - - p0 = _mm_unpackhi_epi8(tv1, tv0); - p1 = _mm_unpacklo_epi8(tv1, tv0); - p2 = _mm_unpackhi_epi8(tv3, tv2); - p3 = _mm_unpacklo_epi8(tv3, tv2); - -/* MM_PRINT8("New P0", p0); - MM_PRINT8("New P1", p1); - MM_PRINT8("New P2", p2); - MM_PRINT8("New P3", p3); - printf("\n"); - exit(1); */ - - _mm_store_si128((__m128i *) d32, p0); - _mm_store_si128((__m128i *) 
(d32+4), p1); - _mm_store_si128((__m128i *) (d32+8), p2); - _mm_store_si128((__m128i *) (d32+12), p3); - d32 += 16; - } - } - - while (d32 < realtop) { - if (xor) { - *d32 ^= gf->multiply.w32(gf, *s32, val); - } else { - *d32 = gf->multiply.w32(gf, *s32, val); - } - *s32++; - *d32++; - } - - -#endif -} - -static -int gf_w32_split_init(gf_t *gf) -{ - gf_internal_t *h; - struct gf_split_2_32_lazy_data *ld2; - struct gf_split_4_32_lazy_data *ld4; - struct gf_split_8_8_data *d8; - uint32_t p, basep; - int i, j, exp; - - h = (gf_internal_t *) gf->scratch; - - /* Defaults */ - gf->multiply_region.w32 = gf_w32_multiply_region_from_single; - gf->multiply.w32 = gf_w32_shift_multiply; - gf->inverse.w32 = gf_w32_euclid; - - if (h->arg1 == 8 && h->arg2 == 8) { - gf->multiply.w32 = gf_w32_split_8_8_multiply; - gf->multiply_region.w32 = gf_w32_split_8_8_multiply_region; - d8 = (struct gf_split_8_8_data *) h->private; - basep = 1; - for (exp = 0; exp < 7; exp++) { - for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; - for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; - d8->tables[exp][1][1] = basep; - for (i = 2; i < 256; i++) { - if (i&1) { - p = d8->tables[exp][i^1][1]; - d8->tables[exp][i][1] = p ^ basep; - } else { - p = d8->tables[exp][i>>1][1]; - d8->tables[exp][i][1] = GF_MULTBY_TWO(p); - } - } - for (i = 1; i < 256; i++) { - p = d8->tables[exp][i][1]; - for (j = 1; j < 256; j++) { - if (j&1) { - d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; - } else { - d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); - } - } - } - for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); - } - } - if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { - ld2 = (struct gf_split_2_32_lazy_data *) h->private; - ld2->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - } - } - if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4)) { - ld4 = (struct gf_split_4_32_lazy_data *) h->private; - ld4->last_value = 0; - if (h->region_type & GF_REGION_SSE) { - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_multiply_region; - } - } else { - gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; - } - } - return 1; -} - -static -gf_val_32_t -gf_w32_composite_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t b0 = b & 0x0000ffff; - uint16_t b1 = (b & 0xffff0000) >> 16; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t a1b1; - - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - return ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); -} - -/* - * Composite field division trick (explained in 2007 tech report) - * - * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 - * - * let c = b^-1 - * - * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) - * - * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 - * - * let d = b1c1 and d+1 = b0c0 - * - * solve s*b1c1+b1c0+b0c1 = 0 - * - * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 - * - * c0 = (d+1)b0^-1 - * c1 = d*b1^-1 - * - * a / b = a * c - */ -static -gf_val_32_t 
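/* implements the closed-form inverse derived in the comment above */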
-gf_w32_composite_inverse(gf_t *gf, gf_val_32_t a) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - uint16_t a0 = a & 0x0000ffff; - uint16_t a1 = (a & 0xffff0000) >> 16; - uint16_t c0, c1, d, tmp; - uint32_t c; - uint16_t a0inv, a1inv; - - if (a0 == 0) { - a1inv = base_gf->inverse.w16(base_gf, a1); - c0 = base_gf->multiply.w16(base_gf, a1inv, GF_S_GF_16_2); - c1 = a1inv; - } else if (a1 == 0) { - c0 = base_gf->inverse.w16(base_gf, a0); - c1 = 0; - } else { - a1inv = base_gf->inverse.w16(base_gf, a1); - a0inv = base_gf->inverse.w16(base_gf, a0); - - d = base_gf->multiply.w16(base_gf, a1, a0inv); - - tmp = (base_gf->multiply.w16(base_gf, a1, a0inv) ^ base_gf->multiply.w16(base_gf, a0, a1inv) ^ GF_S_GF_16_2); - tmp = base_gf->inverse.w16(base_gf, tmp); - - d = base_gf->multiply.w16(base_gf, d, tmp); - - c0 = base_gf->multiply.w16(base_gf, (d^1), a0inv); - c1 = base_gf->multiply.w16(base_gf, d, a1inv); - } - - c = c0 | (c1 << 16); - - return c; -} - -static -gf_val_32_t -gf_w32_composite_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) -{ - gf_val_32_t binv; - - binv = gf_w32_composite_inverse(gf, b); - - return gf_w32_composite_multiply(gf, a, binv); -} - -static -void -gf_w32_composite_multiply_region_table(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; - - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] ^= ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b1]]; - - d32[i] = ((ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b0]] ^ a1b1) | - ((ltd->antilog_tbl[ltd->log_tbl[a1] + ltd->log_tbl[b0]] ^ ltd->antilog_tbl[ltd->log_tbl[a0] + ltd->log_tbl[b1]] ^ - ltd->antilog_tbl[ltd->log_tbl[a1b1] + ltd->log_tbl[GF_S_GF_16_2]]) << 16)); - } - } -} - -static -void -gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - unsigned long uls, uld; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - int i=0; - struct gf_w16_logtable_data * ltd; - uint16_t b0 = val & 0x0000ffff; - uint16_t b1 = (val & 0xffff0000) >> 16; - uint32_t *s32 = (uint32_t *) src; - uint32_t *d32 = (uint32_t *) dest; - uint16_t a0, a1, a1b1; - int num_syms = bytes >> 2; - int sym_divisible = bytes % 4; 
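/* Editor's annotation (not part of the patch): the combines below apply the
   composite rule from gf_w32_composite_multiply above. With a = a1 x + a0 and
   b = b1 x + b0 over GF(2^16), and p(x) = x^2 + s x + 1, x^2 reduces to s x + 1,
   so a b = (a1 b0 ^ a0 b1 ^ s a1 b1) x + (a0 b0 ^ a1 b1), where s = GF_S_GF_16_2. */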
- - uls = (unsigned long) src; - uld = (unsigned long) dest; - if ((uls & 0x7) != (uld & 0x7)) gf_alignment_error("gf_w32_buf_const_log", 2); - if (sym_divisible) { - gf_alignment_error("gf_w32_buf_const_log: buffer size not divisible by symbol size = 2 bytes", 2); - } - - if (val == 0) { - if (xor) return; - bzero(dest, bytes); - return; - } - - ltd = (struct gf_w16_logtable_data *) h->private; - - if (xor) { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - d32[i] ^= ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - - } - } else { - for (i = 0;i < num_syms; i++) { - a0 = s32[i] & 0x0000ffff; - a1 = (s32[i] & 0xffff0000) >> 16; - a1b1 = base_gf->multiply.w16(base_gf, a1, b1); - - d32[i] = ((base_gf->multiply.w16(base_gf, a0, b0) ^ a1b1) | - ((base_gf->multiply.w16(base_gf, a1, b0) ^ base_gf->multiply.w16(base_gf, a0, b1) ^ base_gf->multiply.w16(base_gf, a1b1, GF_S_GF_16_2)) << 16)); - } - } -} - - - -static -void -gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) -{ - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_val_16_t val0 = val & 0x0000ffff; - gf_val_16_t val1 = (val & 0xffff0000) >> 16; - int sub_reg_size = bytes / 2; - - if (bytes % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1); - if (sub_reg_size % 2 != 0) gf_alignment_error("gf_w32_composite_multiply_region_alt", 1); - - if (!xor) { - memset(dest, 0, bytes); - } - - base_gf->multiply_region.w16(base_gf, src, dest, val0, sub_reg_size, xor); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest, val1, sub_reg_size, 1); - base_gf->multiply_region.w16(base_gf, src, dest+sub_reg_size, val1, sub_reg_size, xor); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, val0, sub_reg_size, 1); - base_gf->multiply_region.w16(base_gf, src+sub_reg_size, dest+sub_reg_size, base_gf->multiply.w16(base_gf, GF_S_GF_16_2, val1), sub_reg_size, 1); -} - -static -int gf_w32_composite_init(gf_t *gf) -{ - struct gf_w16_logtable_data *ltd; - gf_internal_t *h = (gf_internal_t *) gf->scratch; - gf_t *base_gf = h->base_gf; - gf_val_32_t a, b; - uint64_t prim_poly = ((gf_internal_t *) base_gf->scratch)->prim_poly; - int i; - - ltd = (struct gf_w16_logtable_data *) h->private; - - ltd->log_tbl[0] = 0; - - bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); - - ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_BASE_FIELD_SIZE * 2]); - - b = 1; - for (i = 0; i < GF_BASE_FIELD_GROUP_SIZE; i++) { - ltd->log_tbl[b] = (gf_val_16_t)i; - ltd->antilog_tbl[i] = (gf_val_16_t)b; - ltd->antilog_tbl[i+GF_BASE_FIELD_GROUP_SIZE] = (gf_val_16_t)b; - b <<= 1; - if (b & GF_BASE_FIELD_SIZE) { - b = b ^ prim_poly; - } - } - ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ - ltd->inv_tbl[1] = 1; - for (i = 2; i < GF_BASE_FIELD_SIZE; i++) { - ltd->inv_tbl[i] = ltd->antilog_tbl[GF_BASE_FIELD_GROUP_SIZE-ltd->log_tbl[i]]; - } - - if (h->region_type & GF_REGION_ALTMAP) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_alt; - } else { - if (h->region_type & GF_REGION_SINGLE_TABLE) { - gf->multiply_region.w32 = gf_w32_composite_multiply_region_table; - } else { - gf->multiply_region.w32 = gf_w32_composite_multiply_region; - } - } - - gf->multiply.w32 = 
gf_w32_composite_multiply; - gf->divide.w32 = gf_w32_composite_divide; - gf->inverse.w32 = gf_w32_composite_inverse; - - return 1; -} - -int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) -{ - int ss, sa; - - ss = (GF_REGION_SSE | GF_REGION_NOSSE); - sa = (GF_REGION_STDMAP | GF_REGION_ALTMAP); - - switch(mult_type) - { - case GF_MULT_SPLIT_TABLE: - if (arg1 == 8 && arg2 == 8){ - if (region_type != GF_REGION_DEFAULT) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; - } - if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type | ss) != ss) return -1; - return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; - } - if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32)) { - region_type &= (~GF_REGION_LAZY); - if ((region_type & ss) == ss) return -1; - if ((region_type & sa) == sa) return -1; - if (region_type & (~(ss|sa))) return -1; - if (region_type & GF_REGION_SSE) { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } else if (region_type & GF_REGION_ALTMAP) { - return -1; - } else { - return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; - } - } - return -1; - case GF_MULT_DEFAULT: - case GF_MULT_SHIFT: - if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1; - return sizeof(gf_internal_t); - break; - case GF_MULT_COMPOSITE: - if (region_type & ~(GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1; - if ((region_type & (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) == (GF_REGION_SINGLE_TABLE | GF_REGION_ALTMAP)) return -1; - if (arg1 == 2 && arg2 == 16 || arg2 == 2 && arg1 == 16) { - return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; - } else { - return -1; - } - default: - return -1; - } -} - -int gf_w32_init(gf_t *gf) -{ - gf_internal_t *h; - - h = (gf_internal_t *) gf->scratch; - if (h->prim_poly == 0) h->prim_poly = 0x400007; - - gf->multiply.w32 = NULL; - gf->divide.w32 = NULL; - gf->inverse.w32 = NULL; - gf->multiply_region.w32 = NULL; - - switch(h->mult_type) { - case GF_MULT_DEFAULT: - case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; - case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; - case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; - default: return 0; - } - if (h->divide_type == GF_DIVIDE_EUCLID) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_euclid; - } else if (h->divide_type == GF_DIVIDE_MATRIX) { - gf->divide.w32 = gf_w32_divide_from_inverse; - gf->inverse.w32 = gf_w32_matrix; - } - - if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { - gf->divide.w32 = gf_w32_divide_from_inverse; - } - if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { - gf->inverse.w32 = gf_w32_inverse_from_divide; - } - return 1; -} diff --git a/tmp.sh b/tmp.sh deleted file mode 100644 index 6bd92b2..0000000 --- a/tmp.sh +++ /dev/null @@ -1,15 +0,0 @@ -for i in 5 10 ; do - sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh - sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt - sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt - sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt - sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt - sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt - sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt - sh tmp2.sh 16 
diff --git a/tmp.sh b/tmp.sh
deleted file mode 100644
index 6bd92b2..0000000
--- a/tmp.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-for i in 5 10 ; do
-  sed 's/1 }/'$i' }/' tmp-time-test.sh > tmp2.sh
-  sh tmp2.sh 4 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 4 TABLE SINGLE,SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 8 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 TABLE - - >> tmp-$i-out.txt
-  sh tmp2.sh 8 SPLIT 8 4 SSE - >> tmp-$i-out.txt
-  sh tmp2.sh 16 LOG - - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 16 SPLIT 16 4 SSE,ALTMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 8 8 - - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,STDMAP - >> tmp-$i-out.txt
-  sh tmp2.sh 32 SPLIT 32 4 SSE,ALTMAP - >> tmp-$i-out.txt
-done
diff --git a/tmp.txt b/tmp.txt
deleted file mode 100644
index 468cf49..0000000
--- a/tmp.txt
+++ /dev/null
@@ -1,162 +0,0 @@
-Tables[0] = 0000000000000000 3b60e7ccf8f4454e 76c1cf99f1e88a9c 4da12855091ccfd2 ed839f33e3d11538 d6e378ff1b255076 9b4250aa12399fa4 a022b766eacddaea db073e67c7a22a6b e067d9ab3f566f25 adc6f1fe364aa0f7 96a61632cebee5b9 3684a15424733f53 0de44698dc877a1d 40456ecdd59bb5cf 7b2589012d6ff081
-Tij 81 cf 1d 53 b9 f7 25 6b ea a4 76 38 d2 9c 4e 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tij 7b 40 0d 36 96 ad e0 db a0 9b d6 ed 4d 76 3b 00
-Tables[1] = 0000000000000000 b60e7ccf8f4454cd 6c1cf99f1e88a981 da12855091ccfd4c d839f33e3d115302 6e378ff1b25507cf b4250aa12399fa83 022b766eacddae4e b073e67c7a22a61f 067d9ab3f566f2d2 dc6f1fe364aa0f9e 6a61632cebee5b53 684a15424733f51d de44698dc877a1d0 0456ecdd59bb5c9c b2589012d6ff0851
-Tij 51 9c d0 1d 53 9e d2 1f 4e 83 cf 02 4c 81 cd 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tij b2 04 de 68 6a dc 06 b0 02 b4 6e d8 da 6c b6 00
-Tables[2] = 0000000000000000 60e7ccf8f4454c25 c1cf99f1e88a984a a12855091ccfd46f 839f33e3d115308f e378ff1b25507caa 4250aa12399fa8c5 22b766eacddae4e0 073e67c7a22a6105 67d9ab3f566f2d20 c6f1fe364aa0f94f a61632cebee5b56a 84a15424733f518a e44698dc877a1daf 456ecdd59bb5c9c0 2589012d6ff085e5
-Tij e5 c0 af 8a 6a 4f 20 05 e0 c5 aa 8f 6f 4a 25 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tij 25 45 e4 84 a6 c6 67 07 22 42 e3 83 a1 c1 60 00
-Tables[3] = 0000000000000000 0e7ccf8f4454c20a 1cf99f1e88a98414 12855091ccfd461e 39f33e3d11530828 378ff1b25507ca22 250aa12399fa8c3c 2b766eacddae4e36 73e67c7a22a61050 7d9ab3f566f2d25a 6f1fe364aa0f9444 61632cebee5b564e 4a15424733f51878 44698dc877a1da72 56ecdd59bb5c9c6c 589012d6ff085e66
-Tij 66 6c 72 78 4e 44 5a 50 36 3c 22 28 1e 14 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tij 58 56 44 4a 61 6f 7d 73 2b 25 37 39 12 1c 0e 00
-Tables[4] = 0000000000000000 e7ccf8f4454c20a0 cf99f1e88a98415b 2855091ccfd461fb 9f33e3d1153082ad 78ff1b25507ca20d 50aa12399fa8c3f6 b766eacddae4e356 3e67c7a22a610541 d9ab3f566f2d25e1 f1fe364aa0f9441a 1632cebee5b564ba a15424733f5187ec 4698dc877a1da74c 6ecdd59bb5c9c6b7 89012d6ff085e617
-Tij 17 b7 4c ec ba 1a e1 41 56 f6 0d ad fb 5b a0 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tij 89 6e 46 a1 16 f1 d9 3e b7 50 78 9f 28 cf e7 00
-Tables[5] = 0000000000000000 7ccf8f4454c20a82 f99f1e88a9841504 855091ccfd461f86 f33e3d1153082a13 8ff1b25507ca2091 0aa12399fa8c3f17 766eacddae4e3595 e67c7a22a610543d 9ab3f566f2d25ebf 1fe364aa0f944139 632cebee5b564bbb 15424733f5187e2e 698dc877a1da74ac ecdd59bb5c9c6b2a 9012d6ff085e61a8
-Tij a8 2a ac 2e bb 39 bf 3d 95 17 91 13 86 04 82 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tij 90 ec 69 15 63 1f 9a e6 76 0a 8f f3 85 f9 7c 00
-Tables[6] = 0000000000000000 ccf8f4454c20a861 99f1e88a984150d9 55091ccfd461f8b8 33e3d1153082a1a9 ff1b25507ca209c8 aa12399fa8c3f170 66eacddae4e35911 67c7a22a61054352 ab3f566f2d25eb33 fe364aa0f944138b 32cebee5b564bbea 5424733f5187e2fb 98dc877a1da74a9a cdd59bb5c9c6b222 012d6ff085e61a43
-Tij 43 22 9a fb ea 8b 33 52 11 70 c8 a9 b8 d9 61 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tij 01 cd 98 54 32 fe ab 67 66 aa ff 33 55 99 cc 00
-Tables[7] = 0000000000000000 cf8f4454c20a86a4 9f1e88a984150d53 5091ccfd461f8bf7 3e3d1153082a1abd f1b25507ca209c19 a12399fa8c3f17ee 6eacddae4e35914a 7c7a22a61054357a b3f566f2d25eb3de e364aa0f94413829 2cebee5b564bbe8d 424733f5187e2fc7 8dc877a1da74a963 dd59bb5c9c6b2294 12d6ff085e61a430
-Tij 30 94 63 c7 8d 29 de 7a 4a ee 19 bd f7 53 a4 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tij 12 dd 8d 42 2c e3 b3 7c 6e a1 f1 3e 50 9f cf 00
-Tables[8] = 0000000000000000 f8f4454c20a86af4 f1e88a984150d5f3 091ccfd461f8bf07 e3d1153082a1abfd 1b25507ca209c109 12399fa8c3f17e0e eacddae4e35914fa c7a22a61054357e1 3f566f2d25eb3d15 364aa0f944138212 cebee5b564bbe8e6 24733f5187e2fc1c dc877a1da74a96e8 d59bb5c9c6b229ef 2d6ff085e61a431b
-Tij 1b ef e8 1c e6 12 15 e1 fa 0e 09 fd 07 f3 f4 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tij 2d d5 dc 24 ce 36 3f c7 ea 12 1b e3 09 f1 f8 00
-Tables[9] = 0000000000000000 8f4454c20a86afd9 1e88a984150d5fa9 91ccfd461f8bf070 3d1153082a1abf52 b25507ca209c108b 2399fa8c3f17e0fb acddae4e35914f22 7a22a61054357ea4 f566f2d25eb3d17d 64aa0f944138210d ebee5b564bbe8ed4 4733f5187e2fc1f6 c877a1da74a96e2f 59bb5c9c6b229e5f d6ff085e61a43186
-Tij 86 5f 2f f6 d4 0d 7d a4 22 fb 8b 52 70 a9 d9 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tij d6 59 c8 47 eb 64 f5 7a ac 23 b2 3d 91 1e 8f 00
-Tables[10] = 0000000000000000 f4454c20a86afd48 e88a984150d5fa8b 1ccfd461f8bf07c3 d1153082a1abf50d 25507ca209c10845 399fa8c3f17e0f86 cddae4e35914f2ce a22a61054357ea01 566f2d25eb3d1749 4aa0f9441382108a bee5b564bbe8edc2 733f5187e2fc1f0c 877a1da74a96e244 9bb5c9c6b229e587 6ff085e61a4318cf
-Tij cf 87 44 0c c2 8a 49 01 ce 86 45 0d c3 8b 48 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tij 6f 9b 87 73 be 4a 56 a2 cd 39 25 d1 1c e8 f4 00
-Tables[11] = 0000000000000000 4454c20a86afd419 88a984150d5fa832 ccfd461f8bf07c2b 1153082a1abf507f 5507ca209c108466 99fa8c3f17e0f84d ddae4e35914f2c54 22a61054357ea0fe 66f2d25eb3d174e7 aa0f9441382108cc ee5b564bbe8edcd5 33f5187e2fc1f081 77a1da74a96e2498 bb5c9c6b229e58b3 ff085e61a4318caa
-Tij aa b3 98 81 d5 cc e7 fe 54 4d 66 7f 2b 32 19 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tij ff bb 77 33 ee aa 66 22 dd 99 55 11 cc 88 44 00
-Tables[12] = 0000000000000000 454c20a86afd41fc 8a984150d5fa83f8 cfd461f8bf07c204 153082a1abf507eb 507ca209c1084617 9fa8c3f17e0f8413 dae4e35914f2c5ef 2a61054357ea0fd6 6f2d25eb3d174e2a a0f9441382108c2e e5b564bbe8edcdd2 3f5187e2fc1f083d 7a1da74a96e249c1 b5c9c6b229e58bc5 f085e61a4318ca39
-Tij 39 c5 c1 3d d2 2e 2a d6 ef 13 17 eb 04 f8 fc 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tij f0 b5 7a 3f e5 a0 6f 2a da 9f 50 15 cf 8a 45 00
-Tables[13] = 0000000000000000 54c20a86afd41fac a984150d5fa83f58 fd461f8bf07c20f4 53082a1abf507eab 07ca209c10846107 fa8c3f17e0f841f3 ae4e35914f2c5e5f a61054357ea0fd56 f2d25eb3d174e2fa 0f9441382108c20e 5b564bbe8edcdda2 f5187e2fc1f083fd a1da74a96e249c51 5c9c6b229e58bca5 085e61a4318ca309
-Tij 09 a5 51 fd a2 0e fa 56 5f f3 07 ab f4 58 ac 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Tij 08 5c a1 f5 5b 0f f2 a6 ae fa 07 53 fd a9 54 00
-Tables[14] = 0000000000000000 4c20a86afd41fab7 984150d5fa83f56e d461f8bf07c20fd9 3082a1abf507eac7 7ca209c108461070 a8c3f17e0f841fa9 e4e35914f2c5e51e 61054357ea0fd58e 2d25eb3d174e2f39 f9441382108c20e0 b564bbe8edcdda57 5187e2fc1f083f49 1da74a96e249c5fe c9c6b229e58bca27 85e61a4318ca3090
-Tij 90 27 fe 49 57 e0 39 8e 1e a9 70 c7 d9 6e b7 00
-Tij 30 ca c5 3f da 20 2f d5 e5 1f 10 ea 0f f5 fa 00
-Tij ca 8b 49 08 cd 8c 4e 0f c5 84 46 07 c2 83 41 00
-Tij 18 e5 e2 1f ed 10 17 ea f2 0f 08 f5 07 fa fd 00
-Tij 43 29 96 fc e8 82 3d 57 14 7e c1 ab bf d5 6a 00
-Tij 1a b2 4a e2 bb 13 eb 43 59 f1 09 a1 f8 50 a8 00
-Tij e6 c6 a7 87 64 44 25 05 e3 c3 a2 82 61 41 20 00
-Tij 85 c9 1d 51 b5 f9 2d 61 e4 a8 7c 30 d4 98 4c 00
-Tables[15] = 0000000000000000 c20a86afd41fab1c 84150d5fa83f5623 461f8bf07c20fd3f 082a1abf507eac5d ca209c1084610741 8c3f17e0f841fa7e 4e35914f2c5e5162 1054357ea0fd58ba d25eb3d174e2f3a6 9441382108c20e99 564bbe8edcdda585 187e2fc1f083f4e7 da74a96e249c5ffb 9c6b229e58bca2c4 5e61a4318ca309d8
-Tij d8 c4 fb e7 85 99 a6 ba 62 7e 41 5d 3f 23 1c 00
-Tij 09 a2 5f f4 a5 0e f3 58 51 fa 07 ac fd 56 ab 00
-Tij a3 bc 9c 83 dd c2 e2 fd 5e 41 61 7e 20 3f 1f 00
-Tij 8c 58 24 f0 dc 08 74 a0 2c f8 84 50 7c a8 d4 00
-Tij 31 9e 6e c1 8e 21 d1 7e 4f e0 10 bf f0 5f af 00
-Tij a4 22 a9 2f be 38 b3 35 91 17 9c 1a 8b 0d 86 00
-Tij 61 6b 74 7e 4b 41 5e 54 35 3f 20 2a 1f 15 0a 00
-Tij 5e 9c da 18 56 94 d2 10 4e 8c ca 08 46 84 c2 00
-Val= 3b60e7ccf8f4454e
-v0 28 4f 14 e3 1b f7 ee 76 b9 31 47 0a ba 8b 70 fc
-v0 12 56 28 59 66 cd d2 d2 1c 91 30 26 a8 95 0a a9
-v0 ee 5d 14 e3 fb c8 45 23 a9 fd 8c f1 ff c9 2c 93
-v0 65 ce 82 f2 dc ec 6b e2 53 a3 9c fb 07 70 e7 ad
-v0 1b 87 3d 7b 4d 15 1d c2 d2 45 f3 03 4b e4 f4 9b
-v0 3b 01 2b c5 c5 d2 9d a9 68 7c a2 61 c9 5b 49 90
-v0 5d 13 7d ef eb f1 52 da a0 29 89 ef 08 f2 51 3b
-v0 17 05 b3 80 77 3a f2 5e 82 7a c9 39 84 df 8e bf
-
-p0 11 fc 47 f4 6c 01 44 ba ba 62 e7 3f ba fb ba 85
-p0 a6 fc 67 16 5f c3 95 fc 58 51 f4 fd 58 5f 58 a5
-p0 12 fc 1f b3 50 1e 3f 9a fd 5e 83 20 fd 9c fd dd
-p0 d9 fc 1e ee 22 42 10 7f a0 2c f0 7c a0 24 a0 dc
-p0 a2 fc 4c 30 41 ce ad eb 7e 4f c1 f0 7e 6e 7e 8e
-p0 8b fc 7c 7b 9f b5 38 67 35 91 2f 8b 35 a9 35 be
-p0 07 fc 89 1a 3b 21 fd db 54 35 7e 1f 54 74 54 4b
-p0 cf fc 94 5e 40 78 c2 31 10 4e 18 46 10 da 10 56
diff --git a/tmp2.sh b/tmp2.sh
deleted file mode 100644
index d98248f..0000000
--- a/tmp2.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-if [ $# -lt 4 ]; then
-  echo 'usage: sh tmp-test.sh w gf_specs (e.g. LOG - -)' >&2
-  exit 1
-fi
-
-w=$1
-shift
-i=1024
-while [ $i -le 1073741824 ]; do
-  iter=`echo $i | awk '{ print (1073741824/$1)*10 }'`
-  echo $i $iter $w $* `gf_time $w R -1 $i $iter $*`
-  i=`echo $i | awk '{ print $1*2 }'`
-done
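
The two scripts removed above were scratch drivers for gf_time: tmp2.sh doubles the buffer size from 1K to 1G and scales the iteration count so each point moves roughly 10 GB ((1073741824/size)*10 iterations), passing the remaining arguments through as the method specification. For a single such data point without the scripts, here is a hedged standalone sketch: it times multiply_region.w32 directly, assumes the Revision 1.0 gf_init_easy API, and assumes malloc returns buffers aligned well enough for the SSE region paths. It is an illustration, not a replacement for gf_time.

/* Sketch only: time one w=32 region multiply at a single buffer size,
   mirroring tmp2.sh's (1073741824/size)*10 iteration rule. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "gf_complete.h"

int main()
{
  gf_t gf;
  int size = 1024 * 1024;                   /* one point of the old sweep */
  int i, iters = (1073741824 / size) * 10;  /* ~10 GB total, as in tmp2.sh */
  char *src = malloc(size), *dest = malloc(size);
  clock_t t0;

  if (!gf_init_easy(&gf, 32)) return 1;
  memset(src, 0x5a, size);                  /* arbitrary source data */

  t0 = clock();
  for (i = 0; i < iters; i++) {
    /* dest = src * constant in GF(2^32); final 0 = overwrite, no xor */
    gf.multiply_region.w32(&gf, src, dest, 0x13245768, size, 0);
  }
  printf("%.2f MB/s\n",
         (double) size * iters / 1048576.0 /
         ((double) (clock() - t0) / CLOCKS_PER_SEC));
  free(src); free(dest);
  return 0;
}

The argument order visible in the deleted tmp2.sh invocation of gf_time (word size, test selector, seed, buffer size, iterations, method specification) remains the way to reproduce the full sweeps.
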