#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <sys/time.h>

#if !defined(__i386__) && !defined(__x86_64__)
#undef SSE2
#endif

/*
 * Multiply two elements of GF(2^8).
 *
 * Elements are bytes interpreted as polynomials over GF(2); the product is
 * reduced with the constant 0x1d, i.e. modulo x^8 + x^4 + x^3 + x^2 + 1.
 * Classic shift-and-add: for every set bit of b, add (XOR) the current
 * multiple of a, then advance a to a*x.
 *
 * Returns a*b in the field; gfmul(x, 0) == gfmul(0, x) == 0.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
  uint8_t v = 0;

  while ( b ) {
    if ( b & 1 )
      v ^= a;
    /*
     * a *= x.  If the top bit is about to be shifted out, fold it back in
     * with the reduction constant.  An explicit test is used instead of
     * sign-extending through int8_t, which relies on implementation-defined
     * conversion and right-shift behaviour.
     */
    a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
    b >>= 1;
  }
  return v;
}

/*
 * Raise a to the power b in GF(2^8) by square-and-multiply.
 *
 * The exponent is first reduced modulo 255 into the range 0..254, so
 * negative exponents are accepted.  All multiplications go through gfmul().
 */
static uint8_t gfpow(uint8_t a, int b)
{
  uint8_t result = 1;
  int exponent = b % 255;

  /* C's % may yield a negative remainder; bring it into 0..254 */
  if ( exponent < 0 )
    exponent += 255;

  for ( ; exponent != 0 ; exponent >>= 1 ) {
    if ( exponent & 1 )
      result = gfmul(result, a);
    a = gfmul(a, a);
  }

  return result;
}

#ifdef SSE2
/*
 * Execute the x86 CPUID instruction with EAX = 1 and return the value it
 * leaves in EDX.  The EBX and ECX results are captured only so the compiler
 * knows those registers are overwritten; they are otherwise unused.
 * The caller in this file tests bits 23..26 of the result before running
 * the SSE2 kernels.
 */
static inline int cpuid_features(void)
{
	uint32_t eax = 1;	/* CPUID leaf selector; also overwritten by CPUID ("+a") */
	uint32_t ebx, ecx, edx;

	asm volatile("cpuid" :
		     "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));

	return edx;
}
#endif

/*
 * Return a timestamp used to time the benchmark kernels.
 *
 * On x86 this is the raw counter read by the RDTSC instruction.  On every
 * other architecture it falls back to gettimeofday(), scaled to nanoseconds
 * (the underlying resolution is still microseconds).  Only differences
 * between two return values are meaningful.
 *
 * The fallback needs <sys/time.h> for struct timeval / gettimeofday().
 * __asm__ / __volatile__ are used so the function also builds in strict
 * ISO C modes where the plain "asm" keyword is not available.
 */
static inline uint64_t rdtsc(void)
{
#if defined(__i386__)
  uint64_t tsc;

  /* "=A" binds the 64-bit result to the EDX:EAX register pair */
  __asm__ __volatile__("rdtsc" : "=A" (tsc));
  return tsc;
#elif defined(__x86_64__)
  uint64_t lo, hi;

  /* RDTSC returns the low half in RAX and the high half in RDX */
  __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
  return (hi << 32) | lo;
#else
  struct timeval tv;

  gettimeofday(&tv, NULL);
  return (uint64_t)tv.tv_sec * 1000000000ULL
       + (uint64_t)tv.tv_usec * 1000ULL;
#endif
}

/*
 * Native word used by the all-integer kernels.  Building with -DBIT64
 * selects 64-bit words, otherwise 32-bit words are used.
 *   unative_t  - the word type
 *   NBYTES(x)  - the byte value x replicated into every byte of a word
 *   NSIZE      - number of bytes in a word
 */
#ifdef BIT64
typedef uint64_t unative_t;
#define NBYTES(x) ((unative_t)(x) * 0x0101010101010101)
#define NSIZE 8
#else
typedef uint32_t unative_t;
#define NBYTES(x) ((unative_t)(x) * 0x01010101)
#define NSIZE 4
#endif

/* Number of bytes in each data block (and in each of the P and Q outputs) */
#define DWID 512		/* Data width */

/* One block of DWID bytes, aligned to its own size */
typedef uint8_t datum_t[DWID] __attribute__((aligned(DWID)));
/* Input blocks; main() uses the first n of them */
datum_t data[256];
/* Outputs written by the kernels in main(): p (XOR of the blocks) and q */
datum_t p, q;

int main(int argc, char *argv[])
{
  uint64_t before, after;
  int count, nc;
  int z, i, j, d;
  uint8_t hbtable[256];
  uint8_t polytable[256][256];
  int n = (argc > 1) ? atoi(argv[1]) : 16;
  uint8_t db, os, op;
  uint16_t ns;
#ifdef SSE2
  uint32_t flags;
  static struct {
    uint64_t x1d[2];
    uint64_t xfe[2];
  } __attribute__((aligned(32))) xmmc = {
    { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
    { 0xfefefefefefefefeULL, 0xfefefefefefefefeULL },
  };
#endif

  count = (argc > 2) ? atoi(argv[2]) : 10000;

  for ( z = 0 ; z < 256 ; z++ ) {
    hbtable[z] = gfmul(0x1d, z);
  }

  os = 1;
  for ( i = 0 ; i < n ; i++ ) {
    for ( z = 0 ; z < 256 ; z++ ) {
      polytable[i][z] = gfmul(os, z);
    }
    os = gfmul(2, os);
  }

  /* srand(time() ^ getpid()); */
  for ( i = 0 ; i < n ; i++ ) {
    for ( j = 0 ; j < DWID ; j++ ) {
      data[i][j] = rand() >> 7;
    }
  }

#if 0
  /* Compute Reed-Solomon syndrome the old-fashioned way */
  /* Eventually use this as the "gold standard" */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    os = 0;
    for ( i = 0 ; i < n ; i++ ) {
      os ^= gfmul(gfpow(2,i), data[i][0]);
    }
    asm volatile(" " : "+r" (os)); /* Don't optimize out */
  }
  after = rdtsc();
  printf("Reference: %02x, time = %llu\n", os, after-before);
#endif

  /* Compute Reed-Solomon syndrome the table way */
 
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    for ( d = 0 ; d < DWID ; d++ ) {
      os = op = 0;
      for ( i = 0 ; i < n ; i++ ) {
	db = data[i][d];
	op ^= db;
	os ^= polytable[i][db];
      }
      p[d] = op;
      q[d] = os;
    }
  }
  after = rdtsc();
  printf("Tables:   %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* Mixed integer and tables */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    for ( d = 0 ; d < DWID ; d++ ) {
      ns = 0;
      /* Do partial byte */
      for ( z = n-1 ; (z & 7) != 7 ; z-- ) {
	ns ^= data[z][d] << (z & 7);
      }
      for ( z -= 7 ; z >= 0 ; z -= 8 ) {
	ns = hbtable[ns >> 8] ^ ns;
	ns <<= 1; db = data[z+7][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+6][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+5][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+4][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+3][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+2][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+1][d]; op ^= db; ns ^= db;
	ns <<= 1; db = data[z+0][d]; op ^= db; ns ^= db;
      }
      os = hbtable[ns >> 8] ^ ns;
      /* Add p */
      q[d] = os;
    }
  }
  after = rdtsc();
  printf("Mixed:    %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - single byte */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    for ( d = 0 ; d < DWID ; d++ ) {
      os = op = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	db = data[z][d];
	op ^= db;
	os = ((os << 1) ^ (((int8_t)os >> 7) & 0x1d)) ^ db;
      }
      p[d] = op;
      q[d] = os;
    }
  }
  after = rdtsc();
  printf("Byteint:  %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - 32/64 bits - unroll factor 1 */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    unative_t wd0, ws0, wp0, w10, w20;
    for ( d = 0 ; d < DWID ; d += NSIZE*1 ) {
      ws0 = wp0 = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	wd0 = *(unative_t *)&data[z][d+NSIZE*0];
	wp0 ^= wd0;
	w20 = ws0 & NBYTES(0x80);
	w10 = (ws0 << 1) & NBYTES(0xfe);
	w20 = (w20 << 1) - (w20 >> 7);
	w20 &= NBYTES(0x1d);
	w10 ^= w20;
	ws0 = w10 ^ wd0;
      }
      *(unative_t *)&p[d+NSIZE*0] = wp0;
      *(unative_t *)&q[d+NSIZE*0] = ws0;
    }
  }
  after = rdtsc();
  printf("Int%dx%-3s %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 NSIZE*8, "1:", *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - 32/64 bits - unroll factor 2 */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    unative_t wd0, ws0, wp0, w10, w20;
    unative_t wd1, ws1, wp1, w11, w21;
    for ( d = 0 ; d < DWID ; d += NSIZE*2 ) {
      ws0 = wp0 = 0;
      ws1 = wp1 = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	wd0 = *(unative_t *)&data[z][d+NSIZE*0];
	wd1 = *(unative_t *)&data[z][d+NSIZE*1];
	wp0 ^= wd0;
	wp1 ^= wd1;
	w20 = ws0 & NBYTES(0x80);
	w21 = ws1 & NBYTES(0x80);
	w10 = (ws0 << 1) & NBYTES(0xfe);
	w11 = (ws1 << 1) & NBYTES(0xfe);
	w20 = (w20 << 1) - (w20 >> 7);
	w21 = (w21 << 1) - (w21 >> 7);
	w20 &= NBYTES(0x1d);
	w21 &= NBYTES(0x1d);
	w10 ^= w20;
	w11 ^= w21;
	ws0 = w10 ^ wd0;
	ws1 = w11 ^ wd1;
      }
      *(unative_t *)&p[d+NSIZE*0] = wp0;
      *(unative_t *)&p[d+NSIZE*1] = wp1;
      *(unative_t *)&q[d+NSIZE*0] = ws0;
      *(unative_t *)&q[d+NSIZE*1] = ws1;
    }
  }
  after = rdtsc();
  printf("Int%dx%-3s %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 NSIZE*8, "2:", *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - 32/64 bits - unroll factor 4 */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    unative_t wd0, ws0, wp0, w10, w20;
    unative_t wd1, ws1, wp1, w11, w21;
    unative_t wd2, ws2, wp2, w12, w22;
    unative_t wd3, ws3, wp3, w13, w23;
    for ( d = 0 ; d < DWID ; d += NSIZE*4 ) {
      ws0 = wp0 = 0;
      ws1 = wp1 = 0;
      ws2 = wp2 = 0;
      ws3 = wp3 = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	wd0 = *(unative_t *)&data[z][d+NSIZE*0];
	wd1 = *(unative_t *)&data[z][d+NSIZE*1];
	wd2 = *(unative_t *)&data[z][d+NSIZE*2];
	wd3 = *(unative_t *)&data[z][d+NSIZE*3];
	wp0 ^= wd0;
	wp1 ^= wd1;
	wp2 ^= wd2;
	wp3 ^= wd3;
	w20 = ws0 & NBYTES(0x80);
	w21 = ws1 & NBYTES(0x80);
	w22 = ws2 & NBYTES(0x80);
	w23 = ws3 & NBYTES(0x80);
	w10 = (ws0 << 1) & NBYTES(0xfe);
	w11 = (ws1 << 1) & NBYTES(0xfe);
	w12 = (ws2 << 1) & NBYTES(0xfe);
	w13 = (ws3 << 1) & NBYTES(0xfe);
	w20 = (w20 << 1) - (w20 >> 7);
	w21 = (w21 << 1) - (w21 >> 7);
	w22 = (w22 << 1) - (w22 >> 7);
	w23 = (w23 << 1) - (w23 >> 7);
	w20 &= NBYTES(0x1d);
	w21 &= NBYTES(0x1d);
	w22 &= NBYTES(0x1d);
	w23 &= NBYTES(0x1d);
	w10 ^= w20;
	w11 ^= w21;
	w12 ^= w22;
	w13 ^= w23;
	ws0 = w10 ^ wd0;
	ws1 = w11 ^ wd1;
	ws2 = w12 ^ wd2;
	ws3 = w13 ^ wd3;
      }
      *(unative_t *)&p[d+NSIZE*0] = wp0;
      *(unative_t *)&p[d+NSIZE*1] = wp1;
      *(unative_t *)&p[d+NSIZE*2] = wp2;
      *(unative_t *)&p[d+NSIZE*3] = wp3;
      *(unative_t *)&q[d+NSIZE*0] = ws0;
      *(unative_t *)&q[d+NSIZE*1] = ws1;
      *(unative_t *)&q[d+NSIZE*2] = ws2;
      *(unative_t *)&q[d+NSIZE*3] = ws3;
    }
  }
  after = rdtsc();
  printf("Int%dx%-3s %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 NSIZE*8, "4:", *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - 32/64 bits - unroll factor 8 */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    unative_t wd0, ws0, wp0, w10, w20;
    unative_t wd1, ws1, wp1, w11, w21;
    unative_t wd2, ws2, wp2, w12, w22;
    unative_t wd3, ws3, wp3, w13, w23;
    unative_t wd4, ws4, wp4, w14, w24;
    unative_t wd5, ws5, wp5, w15, w25;
    unative_t wd6, ws6, wp6, w16, w26;
    unative_t wd7, ws7, wp7, w17, w27;
    for ( d = 0 ; d < DWID ; d += NSIZE*8 ) {
      ws0 = wp0 = 0;
      ws1 = wp1 = 0;
      ws2 = wp2 = 0;
      ws3 = wp3 = 0;
      ws4 = wp4 = 0;
      ws5 = wp5 = 0;
      ws6 = wp6 = 0;
      ws7 = wp7 = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	wd0 = *(unative_t *)&data[z][d+NSIZE*0];
	wd1 = *(unative_t *)&data[z][d+NSIZE*1];
	wd2 = *(unative_t *)&data[z][d+NSIZE*2];
	wd3 = *(unative_t *)&data[z][d+NSIZE*3];
	wd4 = *(unative_t *)&data[z][d+NSIZE*4];
	wd5 = *(unative_t *)&data[z][d+NSIZE*5];
	wd6 = *(unative_t *)&data[z][d+NSIZE*6];
	wd7 = *(unative_t *)&data[z][d+NSIZE*7];
	wp0 ^= wd0;
	wp1 ^= wd1;
	wp2 ^= wd2;
	wp3 ^= wd3;
	wp4 ^= wd4;
	wp5 ^= wd5;
	wp6 ^= wd6;
	wp7 ^= wd7;
	w20 = ws0 & NBYTES(0x80);
	w21 = ws1 & NBYTES(0x80);
	w22 = ws2 & NBYTES(0x80);
	w23 = ws3 & NBYTES(0x80);
	w24 = ws4 & NBYTES(0x80);
	w25 = ws5 & NBYTES(0x80);
	w26 = ws6 & NBYTES(0x80);
	w27 = ws7 & NBYTES(0x80);
	w10 = (ws0 << 1) & NBYTES(0xfe);
	w11 = (ws1 << 1) & NBYTES(0xfe);
	w12 = (ws2 << 1) & NBYTES(0xfe);
	w13 = (ws3 << 1) & NBYTES(0xfe);
	w14 = (ws4 << 1) & NBYTES(0xfe);
	w15 = (ws5 << 1) & NBYTES(0xfe);
	w16 = (ws6 << 1) & NBYTES(0xfe);
	w17 = (ws7 << 1) & NBYTES(0xfe);
	w20 = (w20 << 1) - (w20 >> 7);
	w21 = (w21 << 1) - (w21 >> 7);
	w22 = (w22 << 1) - (w22 >> 7);
	w23 = (w23 << 1) - (w23 >> 7);
	w24 = (w24 << 1) - (w24 >> 7);
	w25 = (w25 << 1) - (w25 >> 7);
	w26 = (w26 << 1) - (w26 >> 7);
	w27 = (w27 << 1) - (w27 >> 7);
	w20 &= NBYTES(0x1d);
	w21 &= NBYTES(0x1d);
	w22 &= NBYTES(0x1d);
	w23 &= NBYTES(0x1d);
	w24 &= NBYTES(0x1d);
	w25 &= NBYTES(0x1d);
	w26 &= NBYTES(0x1d);
	w27 &= NBYTES(0x1d);
	w10 ^= w20;
	w11 ^= w21;
	w12 ^= w22;
	w13 ^= w23;
	w14 ^= w24;
	w15 ^= w25;
	w16 ^= w26;
	w17 ^= w27;
	ws0 = w10 ^ wd0;
	ws1 = w11 ^ wd1;
	ws2 = w12 ^ wd2;
	ws3 = w13 ^ wd3;
	ws4 = w14 ^ wd4;
	ws5 = w15 ^ wd5;
	ws6 = w16 ^ wd6;
	ws7 = w17 ^ wd7;
      }
      *(unative_t *)&p[d+NSIZE*0] = wp0;
      *(unative_t *)&p[d+NSIZE*1] = wp1;
      *(unative_t *)&p[d+NSIZE*2] = wp2;
      *(unative_t *)&p[d+NSIZE*3] = wp3;
      *(unative_t *)&p[d+NSIZE*4] = wp4;
      *(unative_t *)&p[d+NSIZE*5] = wp5;
      *(unative_t *)&p[d+NSIZE*6] = wp6;
      *(unative_t *)&p[d+NSIZE*7] = wp7;
      *(unative_t *)&q[d+NSIZE*0] = ws0;
      *(unative_t *)&q[d+NSIZE*1] = ws1;
      *(unative_t *)&q[d+NSIZE*2] = ws2;
      *(unative_t *)&q[d+NSIZE*3] = ws3;
      *(unative_t *)&q[d+NSIZE*4] = ws4;
      *(unative_t *)&q[d+NSIZE*5] = ws5;
      *(unative_t *)&q[d+NSIZE*6] = ws6;
      *(unative_t *)&q[d+NSIZE*7] = ws7;
    }
  }
  after = rdtsc();
  printf("Int%dx%-3s %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 NSIZE*8, "8:", *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* All integer - 32/64 bits - unroll factor 16 */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    unative_t wd0, ws0, wp0, w10, w20;
    unative_t wd1, ws1, wp1, w11, w21;
    unative_t wd2, ws2, wp2, w12, w22;
    unative_t wd3, ws3, wp3, w13, w23;
    unative_t wd4, ws4, wp4, w14, w24;
    unative_t wd5, ws5, wp5, w15, w25;
    unative_t wd6, ws6, wp6, w16, w26;
    unative_t wd7, ws7, wp7, w17, w27;
    unative_t wd8, ws8, wp8, w18, w28;
    unative_t wd9, ws9, wp9, w19, w29;
    unative_t wd10, ws10, wp10, w110, w210;
    unative_t wd11, ws11, wp11, w111, w211;
    unative_t wd12, ws12, wp12, w112, w212;
    unative_t wd13, ws13, wp13, w113, w213;
    unative_t wd14, ws14, wp14, w114, w214;
    unative_t wd15, ws15, wp15, w115, w215;
    for ( d = 0 ; d < DWID ; d += NSIZE*16 ) {
      ws0 = wp0 = 0;
      ws1 = wp1 = 0;
      ws2 = wp2 = 0;
      ws3 = wp3 = 0;
      ws4 = wp4 = 0;
      ws5 = wp5 = 0;
      ws6 = wp6 = 0;
      ws7 = wp7 = 0;
      ws8 = wp8 = 0;
      ws9 = wp9 = 0;
      ws10 = wp10 = 0;
      ws11 = wp11 = 0;
      ws12 = wp12 = 0;
      ws13 = wp13 = 0;
      ws14 = wp14 = 0;
      ws15 = wp15 = 0;
      for ( z = n-1 ; z >= 0 ; z-- ) {
	wd0 = *(unative_t *)&data[z][d+NSIZE*0];
	wd1 = *(unative_t *)&data[z][d+NSIZE*1];
	wd2 = *(unative_t *)&data[z][d+NSIZE*2];
	wd3 = *(unative_t *)&data[z][d+NSIZE*3];
	wd4 = *(unative_t *)&data[z][d+NSIZE*4];
	wd5 = *(unative_t *)&data[z][d+NSIZE*5];
	wd6 = *(unative_t *)&data[z][d+NSIZE*6];
	wd7 = *(unative_t *)&data[z][d+NSIZE*7];
	wd8 = *(unative_t *)&data[z][d+NSIZE*8];
	wd9 = *(unative_t *)&data[z][d+NSIZE*9];
	wd10 = *(unative_t *)&data[z][d+NSIZE*10];
	wd11 = *(unative_t *)&data[z][d+NSIZE*11];
	wd12 = *(unative_t *)&data[z][d+NSIZE*12];
	wd13 = *(unative_t *)&data[z][d+NSIZE*13];
	wd14 = *(unative_t *)&data[z][d+NSIZE*14];
	wd15 = *(unative_t *)&data[z][d+NSIZE*15];
	wp0 ^= wd0;
	wp1 ^= wd1;
	wp2 ^= wd2;
	wp3 ^= wd3;
	wp4 ^= wd4;
	wp5 ^= wd5;
	wp6 ^= wd6;
	wp7 ^= wd7;
	wp8 ^= wd8;
	wp9 ^= wd9;
	wp10 ^= wd10;
	wp11 ^= wd11;
	wp12 ^= wd12;
	wp13 ^= wd13;
	wp14 ^= wd14;
	wp15 ^= wd15;
	w20 = ws0 & NBYTES(0x80);
	w21 = ws1 & NBYTES(0x80);
	w22 = ws2 & NBYTES(0x80);
	w23 = ws3 & NBYTES(0x80);
	w24 = ws4 & NBYTES(0x80);
	w25 = ws5 & NBYTES(0x80);
	w26 = ws6 & NBYTES(0x80);
	w27 = ws7 & NBYTES(0x80);
	w28 = ws8 & NBYTES(0x80);
	w29 = ws9 & NBYTES(0x80);
	w210 = ws10 & NBYTES(0x80);
	w211 = ws11 & NBYTES(0x80);
	w212 = ws12 & NBYTES(0x80);
	w213 = ws13 & NBYTES(0x80);
	w214 = ws14 & NBYTES(0x80);
	w215 = ws15 & NBYTES(0x80);
	w10 = (ws0 << 1) & NBYTES(0xfe);
	w11 = (ws1 << 1) & NBYTES(0xfe);
	w12 = (ws2 << 1) & NBYTES(0xfe);
	w13 = (ws3 << 1) & NBYTES(0xfe);
	w14 = (ws4 << 1) & NBYTES(0xfe);
	w15 = (ws5 << 1) & NBYTES(0xfe);
	w16 = (ws6 << 1) & NBYTES(0xfe);
	w17 = (ws7 << 1) & NBYTES(0xfe);
	w18 = (ws8 << 1) & NBYTES(0xfe);
	w19 = (ws9 << 1) & NBYTES(0xfe);
	w110 = (ws10 << 1) & NBYTES(0xfe);
	w111 = (ws11 << 1) & NBYTES(0xfe);
	w112 = (ws12 << 1) & NBYTES(0xfe);
	w113 = (ws13 << 1) & NBYTES(0xfe);
	w114 = (ws14 << 1) & NBYTES(0xfe);
	w115 = (ws15 << 1) & NBYTES(0xfe);
	w20 = (w20 << 1) - (w20 >> 7);
	w21 = (w21 << 1) - (w21 >> 7);
	w22 = (w22 << 1) - (w22 >> 7);
	w23 = (w23 << 1) - (w23 >> 7);
	w24 = (w24 << 1) - (w24 >> 7);
	w25 = (w25 << 1) - (w25 >> 7);
	w26 = (w26 << 1) - (w26 >> 7);
	w27 = (w27 << 1) - (w27 >> 7);
	w28 = (w28 << 1) - (w28 >> 7);
	w29 = (w29 << 1) - (w29 >> 7);
	w210 = (w210 << 1) - (w210 >> 7);
	w211 = (w211 << 1) - (w211 >> 7);
	w212 = (w212 << 1) - (w212 >> 7);
	w213 = (w213 << 1) - (w213 >> 7);
	w214 = (w214 << 1) - (w214 >> 7);
	w215 = (w215 << 1) - (w215 >> 7);
	w20 &= NBYTES(0x1d);
	w21 &= NBYTES(0x1d);
	w22 &= NBYTES(0x1d);
	w23 &= NBYTES(0x1d);
	w24 &= NBYTES(0x1d);
	w25 &= NBYTES(0x1d);
	w26 &= NBYTES(0x1d);
	w27 &= NBYTES(0x1d);
	w28 &= NBYTES(0x1d);
	w29 &= NBYTES(0x1d);
	w210 &= NBYTES(0x1d);
	w211 &= NBYTES(0x1d);
	w212 &= NBYTES(0x1d);
	w213 &= NBYTES(0x1d);
	w214 &= NBYTES(0x1d);
	w215 &= NBYTES(0x1d);
	w10 ^= w20;
	w11 ^= w21;
	w12 ^= w22;
	w13 ^= w23;
	w14 ^= w24;
	w15 ^= w25;
	w16 ^= w26;
	w17 ^= w27;
	w18 ^= w28;
	w19 ^= w29;
	w110 ^= w210;
	w111 ^= w211;
	w112 ^= w212;
	w113 ^= w213;
	w114 ^= w214;
	w115 ^= w215;
	ws0 = w10 ^ wd0;
	ws1 = w11 ^ wd1;
	ws2 = w12 ^ wd2;
	ws3 = w13 ^ wd3;
	ws4 = w14 ^ wd4;
	ws5 = w15 ^ wd5;
	ws6 = w16 ^ wd6;
	ws7 = w17 ^ wd7;
	ws8 = w18 ^ wd8;
	ws9 = w19 ^ wd9;
	ws10 = w110 ^ wd10;
	ws11 = w111 ^ wd11;
	ws12 = w112 ^ wd12;
	ws13 = w113 ^ wd13;
	ws14 = w114 ^ wd14;
	ws15 = w115 ^ wd15;
      }
      *(unative_t *)&p[d+NSIZE*0] = wp0;
      *(unative_t *)&p[d+NSIZE*1] = wp1;
      *(unative_t *)&p[d+NSIZE*2] = wp2;
      *(unative_t *)&p[d+NSIZE*3] = wp3;
      *(unative_t *)&p[d+NSIZE*4] = wp4;
      *(unative_t *)&p[d+NSIZE*5] = wp5;
      *(unative_t *)&p[d+NSIZE*6] = wp6;
      *(unative_t *)&p[d+NSIZE*7] = wp7;
      *(unative_t *)&p[d+NSIZE*8] = wp8;
      *(unative_t *)&p[d+NSIZE*9] = wp9;
      *(unative_t *)&p[d+NSIZE*10] = wp10;
      *(unative_t *)&p[d+NSIZE*11] = wp11;
      *(unative_t *)&p[d+NSIZE*12] = wp12;
      *(unative_t *)&p[d+NSIZE*13] = wp13;
      *(unative_t *)&p[d+NSIZE*14] = wp14;
      *(unative_t *)&p[d+NSIZE*15] = wp15;
      *(unative_t *)&q[d+NSIZE*0] = ws0;
      *(unative_t *)&q[d+NSIZE*1] = ws1;
      *(unative_t *)&q[d+NSIZE*2] = ws2;
      *(unative_t *)&q[d+NSIZE*3] = ws3;
      *(unative_t *)&q[d+NSIZE*4] = ws4;
      *(unative_t *)&q[d+NSIZE*5] = ws5;
      *(unative_t *)&q[d+NSIZE*6] = ws6;
      *(unative_t *)&q[d+NSIZE*7] = ws7;
      *(unative_t *)&q[d+NSIZE*8] = ws8;
      *(unative_t *)&q[d+NSIZE*9] = ws9;
      *(unative_t *)&q[d+NSIZE*10] = ws10;
      *(unative_t *)&q[d+NSIZE*11] = ws11;
      *(unative_t *)&q[d+NSIZE*12] = ws12;
      *(unative_t *)&q[d+NSIZE*13] = ws13;
      *(unative_t *)&q[d+NSIZE*14] = ws14;
      *(unative_t *)&q[d+NSIZE*15] = ws15;
    }
  }
  after = rdtsc();
  printf("Int%dx%-3s %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 NSIZE*8, "16:", *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

#if defined(__i386__) || defined(__x86_64__)

  /* Compute using MMX 8 bytes wide */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    asm volatile("movq %0,%%mm0" :: "m" (0x1d1d1d1d1d1d1d1dULL));
    asm volatile("movq %0,%%mm1" :: "m" (0xfefefefefefefefeULL));
    for ( d = 0 ; d < DWID ; d += 8 ) {
      /* Loop unrolled by two */
      asm volatile("pxor %mm2,%mm2"); /* P[0] */
      asm volatile("pxor %mm4,%mm4"); /* Q[0] */
      asm volatile("pxor %mm5,%mm5"); /* MBZ - used by Q[0] compute */
      for ( i = n-1 ; i >= 0 ; i-- ) {
	asm volatile("movq %0,%%mm6" :: "m" (data[i][d]));
	asm volatile("pcmpgtb %mm4,%mm5");
	asm volatile("psllw $1,%mm4");
	asm volatile("pand %mm0,%mm5");
	asm volatile("pand %mm1,%mm4");
	asm volatile("pxor %mm5,%mm4");
	asm volatile("pxor %mm5,%mm5");
	asm volatile("pxor %mm6,%mm2");
	asm volatile("pxor %mm6,%mm4");
      }
      asm volatile("movq %%mm2,%0" : "=m" (p[d]));
      asm volatile("movq %%mm4,%0" : "=m" (q[d]));
    }
  }
  after = rdtsc();
  asm volatile("emms");
  printf("MMX(1):   %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

  /* Compute using MMX 16 bytes wide */
  before = rdtsc();
  for ( nc = count ; nc ; nc-- ) {
    asm volatile("movq %0,%%mm0" :: "m" (0x1d1d1d1d1d1d1d1dULL));
    asm volatile("movq %0,%%mm1" :: "m" (0xfefefefefefefefeULL));
    for ( d = 0 ; d < DWID ; d += 16 ) {
      /* Loop unrolled by two */
      asm volatile("pxor %mm2,%mm2"); /* P[0] */
      asm volatile("pxor %mm3,%mm3"); /* P[1] */
      asm volatile("pxor %mm4,%mm4"); /* Q[0] */
      asm volatile("pxor %mm5,%mm5"); /* MBZ - used by Q[0] compute */
      asm volatile("pxor %mm6,%mm6"); /* Q[1] */
      asm volatile("pxor %mm7,%mm7"); /* MBZ - used by Q[1] compute */
      for ( i = n-1 ; i >= 0 ; i-- ) {
	asm volatile("pcmpgtb %mm4,%mm5");
	asm volatile("pcmpgtb %mm6,%mm7");
	asm volatile("psllw $1,%mm4");
	asm volatile("psllw $1,%mm6");
	asm volatile("pand %mm0,%mm5");
	asm volatile("pand %mm0,%mm7");
	asm volatile("pand %mm1,%mm4");
	asm volatile("pand %mm1,%mm6");
	asm volatile("pxor %mm5,%mm4");
	asm volatile("pxor %mm7,%mm6");
	asm volatile("movq %0,%%mm5" :: "m" (data[i][d]));
	asm volatile("movq %0,%%mm7" :: "m" (data[i][d+8]));
	asm volatile("pxor %mm5,%mm2");
	asm volatile("pxor %mm7,%mm3");
	asm volatile("pxor %mm5,%mm4");
	asm volatile("pxor %mm7,%mm6");
	asm volatile("pxor %mm5,%mm5");
	asm volatile("pxor %mm7,%mm7");
      }
      asm volatile("movq %%mm2,%0" : "=m" (p[d]));
      asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
      asm volatile("movq %%mm4,%0" : "=m" (q[d]));
      asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
    }
  }
  after = rdtsc();
  asm volatile("emms");
  printf("MMX(2):   %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	 *(uint64_t *)q, after-before,
	 ((double)(after-before))/((double)((long long)DWID*n*count)));

#ifdef SSE2
  flags = cpuid_features();
  if ( (flags & (0xf << 23)) != (0xf << 23) ) {
    printf("No SSE-2 detected\n");
  } else {
    
#ifdef NT
# define MOVDQA "movntdq "
#else
# define MOVDQA "movdqa "
#endif
    asm volatile("mfence");
    
    /* Compute using SSE2 16 bytes wide */
    before = rdtsc();
    for ( nc = count ; nc ; nc-- ) {
      asm volatile("movdqa %0,%%xmm0" :: "m" (xmmc.x1d[0]));
      asm volatile("movdqa %0,%%xmm1" :: "m" (xmmc.xfe[0]));
      for ( d = 0 ; d < DWID ; d += 16 ) {
	asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
	asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
	asm volatile("pxor %xmm5,%xmm5"); /* MBZ - used by Q[0] compute */
	for ( i = n-1 ; i >= 0 ; i-- ) {
	  asm volatile("movq %0,%%xmm6" :: "m" (data[i][d]));
	  asm volatile("pcmpgtb %xmm4,%xmm5");
	  asm volatile("psllw $1,%xmm4");
	  asm volatile("pand %xmm0,%xmm5");
	  asm volatile("pand %xmm1,%xmm4");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm5,%xmm5");
	  asm volatile("pxor %xmm6,%xmm2");
	  asm volatile("pxor %xmm6,%xmm4");
	}
	asm volatile(MOVDQA "%%xmm2,%0" : "=m" (p[d]));
	asm volatile(MOVDQA "%%xmm4,%0" : "=m" (q[d]));
      }
      asm volatile("sfence");
    }
    after = rdtsc();
    printf("XMM(1):   %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	   *(uint64_t *)q, after-before,
	   ((double)(after-before))/((double)((long long)DWID*n*count)));

    /* Compute using SSE2 32 bytes wide */
    before = rdtsc();
    for ( nc = count ; nc ; nc-- ) {
      asm volatile("movdqa %0,%%xmm0" :: "m" (xmmc.x1d[0]));
      asm volatile("movdqa %0,%%xmm1" :: "m" (xmmc.xfe[0]));
      for ( d = 0 ; d < DWID ; d += 32 ) {
	/* Loop unrolled by two */
	asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
	asm volatile("pxor %xmm3,%xmm3"); /* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
	asm volatile("pxor %xmm5,%xmm5"); /* MBZ - used by Q[0] compute */
	asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); /* MBZ - used by Q[1] compute */
	for ( i = n-1 ; i >= 0 ; i-- ) {
	  asm volatile("pcmpgtb %xmm4,%xmm5");
	  asm volatile("pcmpgtb %xmm6,%xmm7");
	  asm volatile("psllw $1,%xmm4");
	  asm volatile("psllw $1,%xmm6");
	  asm volatile("pand %xmm0,%xmm5");
	  asm volatile("pand %xmm0,%xmm7");
	  asm volatile("pand %xmm1,%xmm4");
	  asm volatile("pand %xmm1,%xmm6");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm7,%xmm6");
	  asm volatile("movq %0,%%xmm5" :: "m" (data[i][d]));
	  asm volatile("movq %0,%%xmm7" :: "m" (data[i][d+16]));
	  asm volatile("pxor %xmm5,%xmm2");
	  asm volatile("pxor %xmm7,%xmm3");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm7,%xmm6");
	  asm volatile("pxor %xmm5,%xmm5");
	  asm volatile("pxor %xmm7,%xmm7");
	}
	asm volatile(MOVDQA "%%xmm2,%0" : "=m" (p[d]));
	asm volatile(MOVDQA "%%xmm3,%0" : "=m" (p[d+16]));
	asm volatile(MOVDQA "%%xmm4,%0" : "=m" (q[d]));
	asm volatile(MOVDQA "%%xmm6,%0" : "=m" (q[d+16]));
      }
      asm volatile("sfence");
    }
    after = rdtsc();
    printf("XMM(2):   %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	   *(uint64_t *)q, after-before,
	   ((double)(after-before))/((double)((long long)DWID*n*count)));

    /* Compute using SSE2 16 bytes wide w/prefetch */
    before = rdtsc();
    for ( nc = count ; nc ; nc-- ) {
      asm volatile("movdqa %0,%%xmm0" :: "m" (xmmc.x1d[0]));
      asm volatile("movdqa %0,%%xmm1" :: "m" (xmmc.xfe[0]));
      for ( d = 0 ; d < DWID ; d += 16 ) {
	asm volatile("prefetchnta %0" :: "m" (data[n-1][d]));
	asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
	asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
	asm volatile("pxor %xmm5,%xmm5"); /* MBZ - used by Q[0] compute */
	for ( i = n-1 ; i >= 1 ; i-- ) {
	  asm volatile("movq %0,%%xmm6" :: "m" (data[i][d]));
	  asm volatile("prefetchnta %0" :: "m" (data[i-1][d]));
	  asm volatile("pcmpgtb %xmm4,%xmm5");
	  asm volatile("psllw $1,%xmm4");
	  asm volatile("pand %xmm0,%xmm5");
	  asm volatile("pand %xmm1,%xmm4");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm5,%xmm5");
	  asm volatile("pxor %xmm6,%xmm2");
	  asm volatile("pxor %xmm6,%xmm4");
	}
	asm volatile("movq %0,%%xmm6" :: "m" (data[i][d]));
	asm volatile("pcmpgtb %xmm4,%xmm5");
	asm volatile("psllw $1,%xmm4");
	asm volatile("pand %xmm0,%xmm5");
	asm volatile("pand %xmm1,%xmm4");
	asm volatile("pxor %xmm5,%xmm4");
	asm volatile("pxor %xmm5,%xmm5");
	asm volatile("pxor %xmm6,%xmm2");
	asm volatile("pxor %xmm6,%xmm4");

	asm volatile(MOVDQA "%%xmm2,%0" : "=m" (p[d]));
	asm volatile(MOVDQA "%%xmm4,%0" : "=m" (q[d]));
      }
      asm volatile("sfence");
    }
    after = rdtsc();
    printf("XMMNT(1): %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	   *(uint64_t *)q, after-before,
	   ((double)(after-before))/((double)((long long)DWID*n*count)));

    /* Compute using SSE2 32 bytes wide */
    before = rdtsc();
    for ( nc = count ; nc ; nc-- ) {
      asm volatile("movdqa %0,%%xmm0" :: "m" (xmmc.x1d[0]));
      asm volatile("movdqa %0,%%xmm1" :: "m" (xmmc.xfe[0]));
      for ( d = 0 ; d < DWID ; d += 32 ) {
	/* Loop unrolled by two */
	asm volatile("prefetchnta %0" :: "m" (data[n-1][d]));
	asm volatile("prefetchnta %0" :: "m" (data[n-1][d+16]));
	asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
	asm volatile("pxor %xmm3,%xmm3"); /* P[1] */
	asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
	asm volatile("pxor %xmm5,%xmm5"); /* MBZ - used by Q[0] compute */
	asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */
	asm volatile("pxor %xmm7,%xmm7"); /* MBZ - used by Q[1] compute */
	for ( i = n-1 ; i >= 0 ; i-- ) {
	  asm volatile("pcmpgtb %xmm4,%xmm5");
	  asm volatile("pcmpgtb %xmm6,%xmm7");
	  asm volatile("psllw $1,%xmm4");
	  asm volatile("psllw $1,%xmm6");
	  asm volatile("pand %xmm0,%xmm5");
	  asm volatile("pand %xmm0,%xmm7");
	  asm volatile("pand %xmm1,%xmm4");
	  asm volatile("pand %xmm1,%xmm6");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm7,%xmm6");
	  asm volatile("movq %0,%%xmm5" :: "m" (data[i][d]));
	  asm volatile("movq %0,%%xmm7" :: "m" (data[i][d+16]));
	  asm volatile("prefetchnta %0" :: "m" (data[i-1][d]));
	  asm volatile("prefetchnta %0" :: "m" (data[i-1][d+16]));
	  asm volatile("pxor %xmm5,%xmm2");
	  asm volatile("pxor %xmm7,%xmm3");
	  asm volatile("pxor %xmm5,%xmm4");
	  asm volatile("pxor %xmm7,%xmm6");
	  asm volatile("pxor %xmm5,%xmm5");
	  asm volatile("pxor %xmm7,%xmm7");
	}
	asm volatile(MOVDQA "%%xmm2,%0" : "=m" (p[d]));
	asm volatile(MOVDQA "%%xmm3,%0" : "=m" (p[d+16]));
	asm volatile(MOVDQA "%%xmm4,%0" : "=m" (q[d]));
	asm volatile(MOVDQA "%%xmm6,%0" : "=m" (q[d+16]));
      }
      asm volatile("sfence");
    }
    after = rdtsc();
    printf("XMMNT(2): %16llx, cycles = %12llu, cycles/byte = %.4g\n",
	   *(uint64_t *)q, after-before,
	   ((double)(after-before))/((double)((long long)DWID*n*count)));
  }
#endif /* SSE2 */
#endif /* i386 x86_64 */

  return 0;
}
