/*
* gf.c
*
* Generic routines for Galois fields
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
int gf_scratch_size(int w,
                    int mult_type,
                    int region_type,
                    int divide_type,
                    int arg1,
                    int arg2)
{
  switch(w) {
    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  }
}
int gf_dummy_init(gf_t *gf)
{
  return 0;
}

int gf_init_easy(gf_t *gf, int w, int mult_type)
{
  return gf_init_hard(gf, w, mult_type, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
}
int gf_init_hard(gf_t *gf, int w, int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1, int arg2,
                 gf_t *base_gf,
                 void *scratch_memory)
{
  int sz;
  gf_internal_t *h;

  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  if (sz <= 0) return 0;

  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    h->free_me = 1;
  } else {
    h = scratch_memory;
    h->free_me = 0;
  }
  gf->scratch = (void *) h;
  h->mult_type = mult_type;
  h->region_type = region_type;
  h->divide_type = divide_type;
  h->w = w;
  h->prim_poly = prim_poly;
  h->arg1 = arg1;
  h->arg2 = arg2;
  h->base_gf = base_gf;
  h->private = (void *) gf->scratch;
  h->private += (sizeof(gf_internal_t));
  gf->extract_word.w32 = NULL;

  /* printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type); */

  switch(w) {
    case 4: return gf_w4_init(gf);
    case 8: return gf_w8_init(gf);
    case 16: return gf_w16_init(gf);
    case 32: return gf_w32_init(gf);
    case 64: return gf_w64_init(gf);
    case 128: return gf_w128_init(gf);
    default: return gf_wgen_init(gf);
  }
}
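
/*
 * Usage sketch (not part of this file): the typical calling sequence, assuming
 * the declarations in gf_complete.h (gf_t, GF_MULT_DEFAULT, the multiply.w32
 * function pointer) and that a zero return from gf_init_easy/gf_init_hard
 * signals failure, as in gf_init_hard() above.
 *
 *   gf_t gf;
 *   uint32_t c;
 *
 *   if (!gf_init_easy(&gf, 8, GF_MULT_DEFAULT)) exit(1);
 *   c = gf.multiply.w32(&gf, 5, 7);      // product of 5 and 7 in GF(2^8)
 *   gf_free(&gf, 0);
 *
 * To manage the memory yourself, size it with gf_scratch_size() and hand it
 * to gf_init_hard() as scratch_memory (sketch; error handling omitted):
 *
 *   int sz = gf_scratch_size(16, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
 *   void *scratch = malloc(sz);
 *   gf_init_hard(&gf, 16, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
 *                0, 0, 0, NULL, scratch);
 */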
int gf_free(gf_t *gf, int recursive)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (recursive && h->base_gf != NULL) {
    gf_free(h->base_gf, 1);
    free(h->base_gf);
  }
  if (h->free_me) free(h);
  return 0;
}
void gf_alignment_error(char *s, int a)
{
  fprintf(stderr, "Alignment error in %s:\n", s);
  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
  exit(1);
}
/* Lifted this code from Jens Gregor -- thanks, Jens */

int gf_is_sse2()
{
  unsigned int cpeinfo;
  unsigned int cpsse;

  /* cpuid with eax = 1; SSE2 support is bit 26 of edx.  The clobber list
     tells the compiler that cpuid overwrites eax/ebx/ecx/edx. */
  asm ( "mov $0x1, %%eax\n\t"
        "cpuid\n\t"
        "mov %%edx, %0\n\t"
        "mov %%ecx, %1\n"
        : "=m" (cpeinfo), "=m" (cpsse)
        :
        : "%eax", "%ebx", "%ecx", "%edx");
  if ((cpeinfo >> 26) & 0x1) return 1;
  return 0;
}
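
/*
 * Alternative sketch (not used by this file): on GCC and Clang the same test
 * can be written without inline assembly, assuming __builtin_cpu_supports()
 * is available on the target:
 *
 *   int gf_is_sse2_alt(void)
 *   {
 *     return __builtin_cpu_supports("sse2") ? 1 : 0;
 *   }
 */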
static
void gf_invert_binary_matrix(int *mat, int *inv, int rows) {
  int cols, i, j;
  int tmp;

  cols = rows;

  for (i = 0; i < rows; i++) inv[i] = (1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {

    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & (1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        exit(1);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj */

    for (j = i+1; j != rows; j++) {
      if ((mat[j] & (1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }

  /* Now the matrix is upper triangular.  Start at the top and multiply down */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & (1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}
uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  mask = (w == 32) ? 0xffffffff : (1 << w) - 1;
  for (i = 0; i < w; i++) {
    mat[i] = y;
    if (y & (1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
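
/*
 * How the bit-matrix inverse above works, with a small worked example.
 * Row i of mat[] is the bit pattern of y * x^i mod pp, so mat[] is the GF(2)
 * matrix of "multiply by y".  Inverting it gives the matrix of "multiply by
 * y^-1", whose row 0 is 1 * y^-1, i.e. the inverse itself.
 *
 * Example (w = 4, pp = 0x13, the usual primitive polynomial x^4 + x + 1):
 *
 *   gf_bitmatrix_inverse(2, 4, 0x13) == 9
 *
 * because (x) * (x^3 + 1) = x^4 + x = (x + 1) + x = 1 mod (x^4 + x + 1).
 */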
/*
void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t p, ta, shift, tb;
  uint64_t *s64, *d64;

  s64 = rd->s_start;
  d64 = rd->d_start;

  while (s64 < (uint64_t *) rd->s_top) {
    p = (rd->xor) ? *d64 : 0;
    ta = *s64;
    shift = 0;
    while (ta != 0) {
      tb = base[ta&0xffff];
      p ^= (tb << shift);
      ta >>= 16;
      shift += 16;
    }
    *d64 = p;
    d64++;
    s64++;
  }
}
*/
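
/*
 * The live version below is the same table lookup as the reference version
 * above, but unrolled: each 64-bit source word is consumed as four 16-bit
 * chunks starting from the most significant end, and the partial product is
 * shifted left 16 bits between lookups.  As above, base[] is a 64K-entry
 * table mapping a 16-bit chunk to its 16-bit product with the region's
 * multiplier.
 */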
void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t a, prod;
  int xor;
  uint64_t *s64, *d64, *top;

  s64 = rd->s_start;
  d64 = rd->d_start;
  top = rd->d_top;
  xor = rd->xor;

  if (xor) {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
      s64++;
      d64++;
    }
  } else {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
      s64++;
      d64++;
    }
  }
}
static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  while (src < s_top) {
    switch (h->w) {
      case 8:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                          rd->gf->multiply.w32(rd->gf, rd->val, *s8);
        break;
      case 4:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        a = *s8;
        p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
        p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
        if (rd->xor) p ^= *d8;
        *d8 = p;
        break;
      case 16:
        s16 = (uint16_t *) src;
        d16 = (uint16_t *) dest;
        *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s16);
        break;
      case 32:
        s32 = (uint32_t *) src;
        d32 = (uint32_t *) dest;
        *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s32);
        break;
      case 64:
        s64 = (uint64_t *) src;
        d64 = (uint64_t *) dest;
        *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                           rd->gf->multiply.w64(rd->gf, rd->val, *s64);
        break;
      default:
        fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
        exit(1);
    }
    src += wb;
    dest += wb;
  }
}
/* Sets up the three regions for a region multiply: an unaligned head, an
   aligned middle, and an unaligned tail.  If align > 16, s_start/d_start are
   advanced to a 16-byte boundary, but the size of the aligned middle is made
   a multiple of align.  If align == -1, this is Cauchy: bytes must be a
   multiple of w, and no alignment regions are set up. */
void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
                        void *src,
                        void *dest,
                        int bytes,
                        uint32_t val,
                        int xor,
                        int align)
{
  gf_internal_t *h;
  int wb;
  uint32_t a;
  unsigned long uls, uld;

  h = gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  rd->gf = gf;
  rd->src = src;
  rd->dest = dest;
  rd->bytes = bytes;
  rd->val = val;
  rd->xor = xor;
  rd->align = align;

  uls = (unsigned long) src;
  uld = (unsigned long) dest;

  a = (align <= 16) ? align : 16;
  if (align == -1) { /* This is Cauchy.  Error check bytes, then set up the pointers
                        so that there are no alignment regions. */
    if (bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
      exit(1);
    }
    rd->s_start = src;
    rd->d_start = dest;
    rd->s_top = src + bytes;
    rd->d_top = dest + bytes;
    return;
  }
  if (uls % a != uld % a) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (uls % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (bytes % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
    exit(1);
  }
  uls %= a;
  if (uls != 0) uls = (a-uls);
  rd->s_start = rd->src + uls;
  rd->d_start = rd->dest + uls;
  bytes -= uls;
  bytes -= (bytes % align);
  rd->s_top = rd->s_start + bytes;
  rd->d_top = rd->d_start + bytes;
}
void gf_do_initial_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
}

void gf_do_final_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, rd->src+rd->bytes);
}
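
/*
 * Sketch (illustrative, not part of the library) of the calling pattern these
 * helpers support: split the buffer into an unaligned head, an aligned middle,
 * and an unaligned tail; handle head and tail one word at a time and run the
 * fast kernel over the middle.  To stay self-contained the "fast kernel" here
 * is just a scalar w=32 loop over multiply.w32; real implementations put a
 * SIMD or table-driven loop there.
 *
 *   static void example_multiply_region_w32(gf_t *gf, void *src, void *dest,
 *                                           uint32_t val, int bytes, int xor)
 *   {
 *     gf_region_data rd;
 *     uint32_t *s32, *d32;
 *
 *     gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
 *     gf_do_initial_region_alignment(&rd);
 *     s32 = (uint32_t *) rd.s_start;
 *     d32 = (uint32_t *) rd.d_start;
 *     while (d32 < (uint32_t *) rd.d_top) {
 *       if (xor) *d32 ^= gf->multiply.w32(gf, val, *s32);
 *       else     *d32  = gf->multiply.w32(gf, val, *s32);
 *       s32++;
 *       d32++;
 *     }
 *     gf_do_final_region_alignment(&rd);
 *   }
 */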
void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  bzero(dest, bytes);
  return;
}
void gf_multby_one(gf_t *gf, void *src, void *dest, int bytes, int xor)
{
#ifdef INTEL_SSE4
  __m128i ms, md;
#endif
  uint8_t *s8, *d8;
  uint64_t *s64, *d64, *dtop64;
  int abytes;
  gf_region_data rd;

  if (!xor) {
    memcpy(dest, src, bytes);
    return;
  }

#ifdef INTEL_SSE4
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  abytes = bytes & 0xfffffff0;
  while (d8 < (uint8_t *) dest + abytes) {
    ms = _mm_loadu_si128 ((__m128i *)(s8));
    md = _mm_loadu_si128 ((__m128i *)(d8));
    md = _mm_xor_si128(md, ms);
    _mm_storeu_si128((__m128i *)(d8), md);
    s8 += 16;
    d8 += 16;
  }
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
#endif
  /* If you don't have SSE, you'd better be aligned..... */

  gf_set_region_data(&rd, gf, src, dest, bytes, 1, xor, 8);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  while (d8 != rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  dtop64 = (uint64_t *) rd.d_top;
  while (d64 < dtop64) {
    *d64 ^= *s64;
    d64++;
    s64++;
  }

  s8 = (uint8_t *) s64;
  d8 = (uint8_t *) d64;
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
}