/*
* gf.c
*
* Generic routines for Galois fields
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
int gf_scratch_size(int w,
                    int mult_type,
                    int region_type,
                    int divide_type,
                    int arg1,
                    int arg2)
{
  switch(w) {
    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  }
}
int gf_dummy_init(gf_t *gf)
{
  return 0;
}

int gf_init_easy(gf_t *gf, int w, int mult_type)
{
  return gf_init_hard(gf, w, mult_type, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL);
}
int gf_init_hard(gf_t *gf, int w, int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1, int arg2,
                 gf_t *base_gf,
                 void *scratch_memory)
{
  int sz;
  gf_internal_t *h;

  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  if (sz <= 0) return 0;

  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    h->free_me = 1;
  } else {
    h = scratch_memory;
    h->free_me = 0;
  }
  gf->scratch = (void *) h;
  h->mult_type = mult_type;
  h->region_type = region_type;
  h->divide_type = divide_type;
  h->w = w;
  h->prim_poly = prim_poly;
  h->arg1 = arg1;
  h->arg2 = arg2;
  h->base_gf = base_gf;
  h->private = (void *) gf->scratch;
  h->private += (sizeof(gf_internal_t));
  gf->extract_word.w32 = NULL;

  /* printf("Created w=%d, with mult_type=%d and region_type=%d\n", w, mult_type, region_type); */

  switch(w) {
    case 4: return gf_w4_init(gf);
    case 8: return gf_w8_init(gf);
    case 16: return gf_w16_init(gf);
    case 32: return gf_w32_init(gf);
    case 64: return gf_w64_init(gf);
    case 128: return gf_w128_init(gf);
    default: return gf_wgen_init(gf);
  }
}
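
/*
 * Usage sketch (not part of this file): the typical calling sequence, assuming
 * the declarations in gf_complete.h (gf_t, GF_MULT_DEFAULT, the multiply.w32
 * function pointer) and that a zero return from gf_init_easy/gf_init_hard
 * signals failure, as in gf_init_hard() above.
 *
 *   gf_t gf;
 *   uint32_t c;
 *
 *   if (!gf_init_easy(&gf, 8, GF_MULT_DEFAULT)) exit(1);
 *   c = gf.multiply.w32(&gf, 5, 7);      // product of 5 and 7 in GF(2^8)
 *   gf_free(&gf, 0);
 *
 * To manage the memory yourself, size it with gf_scratch_size() and hand it
 * to gf_init_hard() as scratch_memory (sketch; error handling omitted):
 *
 *   int sz = gf_scratch_size(16, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 0, 0);
 *   void *scratch = malloc(sz);
 *   gf_init_hard(&gf, 16, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
 *                0, 0, 0, NULL, scratch);
 */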
int gf_free(gf_t *gf, int recursive)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (recursive && h->base_gf != NULL) {
    gf_free(h->base_gf, 1);
    free(h->base_gf);
  }
  if (h->free_me) free(h);
  return 0;
}
void gf_alignment_error(char *s, int a)
{
  fprintf(stderr, "Alignment error in %s:\n", s);
  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
  exit(1);
}
/* Lifted this code from Jens Gregor -- thanks, Jens */

int gf_is_sse2()
{
  unsigned int cpeinfo;
  unsigned int cpsse;

  /* cpuid with eax = 1; SSE2 support is bit 26 of edx.  The clobber list
     tells the compiler that cpuid overwrites eax/ebx/ecx/edx. */
  asm ( "mov $0x1, %%eax\n\t"
        "cpuid\n\t"
        "mov %%edx, %0\n\t"
        "mov %%ecx, %1\n"
        : "=m" (cpeinfo), "=m" (cpsse)
        :
        : "%eax", "%ebx", "%ecx", "%edx");
  if ((cpeinfo >> 26) & 0x1) return 1;
  return 0;
}
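
/*
 * Alternative sketch (not used by this file): on GCC and Clang the same test
 * can be written without inline assembly, assuming __builtin_cpu_supports()
 * is available on the target:
 *
 *   int gf_is_sse2_alt(void)
 *   {
 *     return __builtin_cpu_supports("sse2") ? 1 : 0;
 *   }
 */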
static
void gf_invert_binary_matrix(int *mat, int *inv, int rows) {
  int cols, i, j;
  int tmp;

  cols = rows;

  for (i = 0; i < rows; i++) inv[i] = (1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {

    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & (1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        exit(1);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj */

    for (j = i+1; j != rows; j++) {
      if ((mat[j] & (1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }

  /* Now the matrix is upper triangular.  Start at the top and multiply down */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & (1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}
uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  mask = (w == 32) ? 0xffffffff : (1 << w) - 1;
  for (i = 0; i < w; i++) {
    mat[i] = y;
    if (y & (1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
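
/*
 * How the bit-matrix inverse above works, with a small worked example.
 * Row i of mat[] is the bit pattern of y * x^i mod pp, so mat[] is the GF(2)
 * matrix of "multiply by y".  Inverting it gives the matrix of "multiply by
 * y^-1", whose row 0 is 1 * y^-1, i.e. the inverse itself.
 *
 * Example (w = 4, pp = 0x13, the usual primitive polynomial x^4 + x + 1):
 *
 *   gf_bitmatrix_inverse(2, 4, 0x13) == 9
 *
 * because (x) * (x^3 + 1) = x^4 + x = (x + 1) + x = 1 mod (x^4 + x + 1).
 */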
/*
void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t p, ta, shift, tb;
  uint64_t *s64, *d64;

  s64 = rd->s_start;
  d64 = rd->d_start;

  while (s64 < (uint64_t *) rd->s_top) {
    p = (rd->xor) ? *d64 : 0;
    ta = *s64;
    shift = 0;
    while (ta != 0) {
      tb = base[ta&0xffff];
      p ^= (tb << shift);
      ta >>= 16;
      shift += 16;
    }
    *d64 = p;
    d64++;
    s64++;
  }
}
*/
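
/*
 * The live version below is the same table lookup as the reference version
 * above, but unrolled: each 64-bit source word is consumed as four 16-bit
 * chunks starting from the most significant end, and the partial product is
 * shifted left 16 bits between lookups.  As above, base[] is a 64K-entry
 * table mapping a 16-bit chunk to its 16-bit product with the region's
 * multiplier.
 */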
void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t a, prod;
  int xor;
  uint64_t *s64, *d64, *top;

  s64 = rd->s_start;
  d64 = rd->d_start;
  top = rd->d_top;
  xor = rd->xor;

  if (xor) {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
      s64++;
      d64++;
    }
  } else {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
      s64++;
      d64++;
    }
  }
}
static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  while (src < s_top) {
    switch (h->w) {
      case 8:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                          rd->gf->multiply.w32(rd->gf, rd->val, *s8);
        break;
      case 4:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        a = *s8;
        p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
        p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
        if (rd->xor) p ^= *d8;
        *d8 = p;
        break;
      case 16:
        s16 = (uint16_t *) src;
        d16 = (uint16_t *) dest;
        *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s16);
        break;
      case 32:
        s32 = (uint32_t *) src;
        d32 = (uint32_t *) dest;
        *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s32);
        break;
      case 64:
        s64 = (uint64_t *) src;
        d64 = (uint64_t *) dest;
        *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                           rd->gf->multiply.w64(rd->gf, rd->val, *s64);
        break;
      default:
        fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
        exit(1);
    }
    src += wb;
    dest += wb;
  }
}
/* Sets up the three regions for a region multiply: an unaligned head, an
   aligned middle, and an unaligned tail.  If align > 16, s_start/d_start are
   advanced to a 16-byte boundary, but the size of the aligned middle is made
   a multiple of align.  If align == -1, this is Cauchy: bytes must be a
   multiple of w, and no alignment regions are set up. */
void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
                        void *src,
                        void *dest,
                        int bytes,
                        uint32_t val,
                        int xor,
                        int align)
{
  gf_internal_t *h;
  int wb;
  uint32_t a;
  unsigned long uls, uld;

  h = gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  rd->gf = gf;
  rd->src = src;
  rd->dest = dest;
  rd->bytes = bytes;
  rd->val = val;
  rd->xor = xor;
  rd->align = align;

  uls = (unsigned long) src;
  uld = (unsigned long) dest;

  a = (align <= 16) ? align : 16;
  if (align == -1) { /* This is Cauchy.  Error check bytes, then set up the pointers
                        so that there are no alignment regions. */
    if (bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
      exit(1);
    }
    rd->s_start = src;
    rd->d_start = dest;
    rd->s_top = src + bytes;
    rd->d_top = dest + bytes;
    return;
  }
  if (uls % a != uld % a) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (uls % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    exit(1);
  }

  if (bytes % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
    exit(1);
  }
  uls %= a;
  if (uls != 0) uls = (a-uls);
  rd->s_start = rd->src + uls;
  rd->d_start = rd->dest + uls;
  bytes -= uls;
  bytes -= (bytes % align);
  rd->s_top = rd->s_start + bytes;
  rd->d_top = rd->d_start + bytes;
}
void gf_do_initial_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
}

void gf_do_final_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, rd->src+rd->bytes);
}
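
/*
 * Sketch (illustrative, not part of the library) of the calling pattern these
 * helpers support: split the buffer into an unaligned head, an aligned middle,
 * and an unaligned tail; handle head and tail one word at a time and run the
 * fast kernel over the middle.  To stay self-contained the "fast kernel" here
 * is just a scalar w=32 loop over multiply.w32; real implementations put a
 * SIMD or table-driven loop there.
 *
 *   static void example_multiply_region_w32(gf_t *gf, void *src, void *dest,
 *                                           uint32_t val, int bytes, int xor)
 *   {
 *     gf_region_data rd;
 *     uint32_t *s32, *d32;
 *
 *     gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
 *     gf_do_initial_region_alignment(&rd);
 *     s32 = (uint32_t *) rd.s_start;
 *     d32 = (uint32_t *) rd.d_start;
 *     while (d32 < (uint32_t *) rd.d_top) {
 *       if (xor) *d32 ^= gf->multiply.w32(gf, val, *s32);
 *       else     *d32  = gf->multiply.w32(gf, val, *s32);
 *       s32++;
 *       d32++;
 *     }
 *     gf_do_final_region_alignment(&rd);
 *   }
 */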
void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  bzero(dest, bytes);
  return;
}
void gf_multby_one(gf_t *gf, void *src, void *dest, int bytes, int xor)
{
#ifdef INTEL_SSE4
  __m128i ms, md;
#endif
  uint8_t *s8, *d8;
  uint64_t *s64, *d64, *dtop64;
  int abytes;
  gf_region_data rd;

  if (!xor) {
    memcpy(dest, src, bytes);
    return;
  }

#ifdef INTEL_SSE4
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  abytes = bytes & 0xfffffff0;
  while (d8 < (uint8_t *) dest + abytes) {
    ms = _mm_loadu_si128 ((__m128i *)(s8));
    md = _mm_loadu_si128 ((__m128i *)(d8));
    md = _mm_xor_si128(md, ms);
    _mm_storeu_si128((__m128i *)(d8), md);
    s8 += 16;
    d8 += 16;
  }
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
#endif
  /* If you don't have SSE, you'd better be aligned..... */

  gf_set_region_data(&rd, gf, src, dest, bytes, 1, xor, 8);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  while (d8 != rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  dtop64 = (uint64_t *) rd.d_top;
  while (d64 < dtop64) {
    *d64 ^= *s64;
    d64++;
    s64++;
  }

  s8 = (uint8_t *) s64;
  d8 = (uint8_t *) d64;
  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
}