/*
 * gf.c
 *
 * Generic routines for Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>

int gf_scratch_size(int w,
                    int mult_type,
                    int region_type,
                    int divide_type,
                    int arg1,
                    int arg2)
{
  switch(w) {
    case 4:   return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8:   return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 16:  return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 32:  return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 64:  return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    default:  return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  }
}

int gf_dummy_init(gf_t *gf)
{
  return 0;
}

int gf_init_easy(gf_t *gf, int w, int mult_type)
{
  return gf_init_hard(gf, w, mult_type, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
                      0, 0, 0, NULL, NULL);
}

int gf_init_hard(gf_t *gf, int w, int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1, int arg2,
                 gf_t *base_gf,
                 void *scratch_memory)
{
  int sz;
  gf_internal_t *h;

  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  if (sz <= 0) return 0;

  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    if (h == NULL) return 0;         /* fail cleanly if malloc fails */
    h->free_me = 1;
  } else {
    h = scratch_memory;
    h->free_me = 0;
  }
  gf->scratch = (void *) h;
  h->mult_type = mult_type;
  h->region_type = region_type;
  h->divide_type = divide_type;
  h->w = w;
  h->prim_poly = prim_poly;
  h->arg1 = arg1;
  h->arg2 = arg2;
  h->base_gf = base_gf;

  /* The implementation's private data sits just past this header in the
     scratch block. */
  h->private = (void *) gf->scratch;
  h->private += (sizeof(gf_internal_t));

  gf->extract_word.w32 = NULL;

  switch(w) {
    case 4:   return gf_w4_init(gf);
    case 8:   return gf_w8_init(gf);
    case 16:  return gf_w16_init(gf);
    case 32:  return gf_w32_init(gf);
    case 64:  return gf_w64_init(gf);
    case 128: return gf_w128_init(gf);
    default:  return gf_wgen_init(gf);
  }
}

int gf_free(gf_t *gf, int recursive)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (recursive && h->base_gf != NULL) {
    gf_free(h->base_gf, 1);
    free(h->base_gf);
  }
  if (h->free_me) free(h);
  return 0;                          /* declared int, so return a status */
}
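/*
 * A minimal usage sketch, not part of the library: build a field with
 * gf_init_easy(), multiply through the function pointers that the init
 * routine installs, then release the scratch space.  GF_USAGE_EXAMPLE is a
 * hypothetical guard so the sketch is never compiled in; GF_MULT_DEFAULT
 * is assumed to be the default technique defined alongside
 * GF_REGION_DEFAULT and GF_DIVIDE_DEFAULT.
 */
#ifdef GF_USAGE_EXAMPLE
static void gf_usage_example(void)
{
  gf_t gf;
  uint32_t p;

  if (gf_init_easy(&gf, 8, GF_MULT_DEFAULT) == 0) {
    fprintf(stderr, "gf_init_easy(w=8) failed\n");
    return;
  }
  p = gf.multiply.w32(&gf, 3, 7);    /* 3 * 7 in GF(2^8) */
  printf("3 * 7 = %u\n", (unsigned) p);
  gf_free(&gf, 0);                   /* no base_gf, so non-recursive */
}
#endif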
void gf_alignment_error(char *s, int a)
{
  fprintf(stderr, "Alignment error in %s:\n", s);
  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
  exit(1);
}

/* Lifted this code from Jens Gregor -- thanks, Jens */

int gf_is_sse2()
{
  unsigned int cpeinfo;
  unsigned int cpsse;

  /* cpuid clobbers eax, ebx, ecx and edx, so tell the compiler. */
  asm ( "mov $0x1, %%eax\n\t"
        "cpuid\n\t"
        "mov %%edx, %0\n\t"
        "mov %%ecx, %1\n"
        : "=m" (cpeinfo), "=m" (cpsse)
        :
        : "%eax", "%ebx", "%ecx", "%edx");
  if ((cpeinfo >> 26) & 0x1) return 1;
  return 0;
}

static void gf_invert_binary_matrix(int *mat, int *inv, int rows)
{
  int cols, i, j;
  int tmp;

  cols = rows;

  for (i = 0; i < rows; i++) inv[i] = (1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {

    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible. */

    if ((mat[i] & (1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        exit(1);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j > i, add A_ji * A_i to A_j */

    for (j = i+1; j != rows; j++) {
      if ((mat[j] & (1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }

  /* Now the matrix is upper triangular.  Start at the top and multiply down. */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & (1 << i)) {
        /* mat[j] ^= mat[i]; -- unnecessary, since mat is discarded */
        inv[j] ^= inv[i];
      }
    }
  }
}

uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  mask = (w == 32) ? 0xffffffff : (1 << w) - 1;

  /* Build the w x w bit matrix for multiplication by y: row i is y * x^i,
     reduced modulo the primitive polynomial pp.  Inverting that matrix
     gives the matrix for multiplication by 1/y, whose row 0 is 1/y itself. */

  for (i = 0; i < w; i++) {
    mat[i] = y;

    if (y & (1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
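/*
 * A worked check, behind the same hypothetical GF_USAGE_EXAMPLE guard: in
 * GF(2^4) with primitive polynomial x^4 + x + 1 (0x13), the inverse of
 * 2 (= x) is 9 (= x^3 + 1), since x * (x^3 + 1) = x^4 + x = (x + 1) + x = 1.
 */
#ifdef GF_USAGE_EXAMPLE
static void gf_bitmatrix_inverse_example(void)
{
  uint32_t inv = gf_bitmatrix_inverse(2, 4, 0x13);
  printf("1/2 in GF(2^4) = %u (expect 9)\n", (unsigned) inv);
}
#endif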
/* A straightforward version of the two-byte table multiply, kept as a
   reference for the unrolled version below:

void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t p, ta, shift, tb;
  uint64_t *s64, *d64;

  s64 = rd->s_start;
  d64 = rd->d_start;

  while (s64 < (uint64_t *) rd->s_top) {
    p = (rd->xor) ? *d64 : 0;
    ta = *s64;
    shift = 0;
    while (ta != 0) {
      tb = base[ta & 0xffff];
      p ^= (tb << shift);
      ta >>= 16;
      shift += 16;
    }
    *d64 = p;
    d64++;
    s64++;
  }
}
*/

/* The live version unrolls the four 16-bit lookups per 64-bit word. */

void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t a, prod;
  int xor;
  uint64_t *s64, *d64, *top;

  s64 = rd->s_start;
  d64 = rd->d_start;
  top = rd->d_top;
  xor = rd->xor;

  if (xor) {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
      s64++;
      d64++;
    }
  } else {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
      s64++;
      d64++;
    }
  }
}

static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  while (src < s_top) {
    switch (h->w) {
      case 8:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                          rd->gf->multiply.w32(rd->gf, rd->val, *s8);
        break;
      case 4:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        a = *s8;
        p = rd->gf->multiply.w32(rd->gf, rd->val, a & 0xf);
        p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
        if (rd->xor) p ^= *d8;
        *d8 = p;
        break;
      case 16:
        s16 = (uint16_t *) src;
        d16 = (uint16_t *) dest;
        *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s16);
        break;
      case 32:
        s32 = (uint32_t *) src;
        d32 = (uint32_t *) dest;
        *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s32);
        break;
      case 64:
        s64 = (uint64_t *) src;
        d64 = (uint64_t *) dest;
        *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                           rd->gf->multiply.w64(rd->gf, rd->val, *s64);
        break;
      default:
        fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
        exit(1);
    }
    src += wb;
    dest += wb;
  }
}

/* If align > 16, we align s_start and d_start to 16 bytes, and the aligned
   region is made a multiple of align bytes.  If align == -1, this is Cauchy
   mode: there is no aligned region, and bytes must be a multiple of w. */

void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
                        void *src,
                        void *dest,
                        int bytes,
                        uint32_t val,
                        int xor,
                        int align)
{
  gf_internal_t *h;
  int wb;
  uint32_t a;
  unsigned long uls, uld;

  h = gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  rd->gf = gf;
  rd->src = src;
  rd->dest = dest;
  rd->bytes = bytes;
  rd->val = val;
  rd->xor = xor;
  rd->align = align;

  uls = (unsigned long) src;
  uld = (unsigned long) dest;

  a = (align <= 16) ? align : 16;

  if (align == -1) { /* Cauchy mode.  Error check bytes, then set up the
                        pointers so that there are no alignment regions. */
    if (bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
      exit(1);
    }

    rd->s_start = src;
    rd->d_start = dest;
    rd->s_top = src + bytes;
    rd->d_top = dest + bytes;
    return;
  }

  if (uls % a != uld % a) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (uls % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (bytes % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
    exit(1);
  }

  uls %= a;
  if (uls != 0) uls = (align-uls);
  rd->s_start = rd->src + uls;
  rd->d_start = rd->dest + uls;
  bytes -= uls;
  bytes -= (bytes % align);
  rd->s_top = rd->s_start + bytes;
  rd->d_top = rd->d_start + bytes;
}

void gf_do_initial_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
}

void gf_do_final_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, rd->src+rd->bytes);
}
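/*
 * A sketch of the pattern the w-specific region multipliers follow, behind
 * the hypothetical GF_USAGE_EXAMPLE guard: set up the region data, let the
 * slow path handle the unaligned leading bytes, run a fast word-at-a-time
 * loop over the aligned middle, then let the slow path handle the trailing
 * bytes.  The middle loop body is a stand-in, not a real multiplier.
 */
#ifdef GF_USAGE_EXAMPLE
static void gf_region_pattern_sketch(gf_t *gf, void *src, void *dest,
                                     uint32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint64_t *s64, *d64;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);       /* bytes before rd.s_start */

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  while (d64 < (uint64_t *) rd.d_top) {      /* aligned middle region */
    /* ... fast multiply of *s64 by val into *d64 goes here ... */
    s64++;
    d64++;
  }

  gf_do_final_region_alignment(&rd);         /* bytes after rd.s_top */
}
#endif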
void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  bzero(dest, bytes);
  return;
}

void gf_multby_one(gf_t *gf, void *src, void *dest, int bytes, int xor)
{
#ifdef INTEL_SSE4
  __m128i ms, md;
#endif
  uint8_t *s8, *d8;
  uint64_t *s64, *d64, *dtop64;
  int abytes;
  gf_region_data rd;

  if (!xor) {
    memcpy(dest, src, bytes);
    return;
  }

#ifdef INTEL_SSE4
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  abytes = bytes & 0xfffffff0;

  while (d8 < (uint8_t *) dest + abytes) {
    ms = _mm_loadu_si128 ((__m128i *)(s8));
    md = _mm_loadu_si128 ((__m128i *)(d8));
    md = _mm_xor_si128(md, ms);
    _mm_storeu_si128((__m128i *)(d8), md);
    s8 += 16;
    d8 += 16;
  }
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
#endif

  /* If you don't have SSE, you'd better be aligned..... */

  gf_set_region_data(&rd, gf, src, dest, bytes, 1, xor, 8);

  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  while (d8 != rd.d_start) {        /* unaligned leading bytes */
    *d8 ^= *s8;
    d8++;
    s8++;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  dtop64 = (uint64_t *) rd.d_top;
  while (d64 < dtop64) {            /* aligned middle, a word at a time */
    *d64 ^= *s64;
    d64++;
    s64++;
  }

  s8 = (uint8_t *) s64;             /* resume where the 64-bit loop stopped */
  d8 = (uint8_t *) d64;
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
}
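/*
 * Usage sketch, behind the hypothetical GF_USAGE_EXAMPLE guard: with
 * xor = 0, gf_multby_one is a plain copy; with xor = 1 it is XOR
 * accumulation, e.g. RAID-style parity.  The buffer layout is illustrative.
 */
#ifdef GF_USAGE_EXAMPLE
static void gf_parity_sketch(gf_t *gf, uint8_t **data, uint8_t *parity,
                             int ndisks, int bytes)
{
  int i;

  gf_multby_one(gf, data[0], parity, bytes, 0);     /* parity  = data[0] */
  for (i = 1; i < ndisks; i++) {
    gf_multby_one(gf, data[i], parity, bytes, 1);   /* parity ^= data[i] */
  }
}
#endif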