/*
    Copyright (c) 2015 Christopher A. Taylor. All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of CM256 nor the names of its contributors may be
      used to endorse or promote products derived from this software without
      specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

#include <cstdio>
#include <cstdlib>
#include <cstring> // memset()/memcpy() are used below

#include "gf256.h"

const uint8_t gf256_ctx::GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = {
    0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4,
    0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa
};

gf256_ctx::gf256_ctx()
    : initialized(false)
{
    gf256_init_();
}

gf256_ctx::~gf256_ctx()
{
}

// Select which polynomial to use
void gf256_ctx::gf255_poly_init(int polynomialIndex)
{
    if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT)
    {
        polynomialIndex = 0;
    }

    Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1;
}
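
// Added illustration (not in the original source): each table entry stores
// bits 1..8 of a degree-8 irreducible polynomial, with the low bit implied.
// The first entry 0x8e therefore expands to (0x8e << 1) | 1 = 0x11d, i.e.
// x^8 + x^4 + x^3 + x^2 + 1.
static_assert(((0x8e << 1) | 1) == 0x11d,
    "expansion of the first generator polynomial entry");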

//-----------------------------------------------------------------------------
// Exponential and Log Tables

// Construct EXP and LOG tables from polynomial
void gf256_ctx::gf256_explog_init()
{
    unsigned poly = Polynomial;
    uint8_t* exptab = GF256_EXP_TABLE;
    uint16_t* logtab = GF256_LOG_TABLE;

    // log(0) is undefined, so store an out-of-range sentinel
    logtab[0] = 512;
    exptab[0] = 1;
    for (unsigned jj = 1; jj < 255; ++jj)
    {
        unsigned next = (unsigned)exptab[jj - 1] * 2;
        if (next >= 256)
            next ^= poly;

        exptab[jj] = static_cast<uint8_t>(next);
        logtab[exptab[jj]] = static_cast<uint16_t>(jj);
    }

    exptab[255] = exptab[0];
    logtab[exptab[255]] = 255;

    // Replicate the EXP table so a sum of two logs can be used as an index
    // without reducing it mod 255
    for (unsigned jj = 256; jj < 2 * 255; ++jj)
    {
        exptab[jj] = exptab[jj % 255];
    }

    exptab[2 * 255] = 1;

    // Zero-fill the rest so indices produced by the log(0) sentinel read as 0
    for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj)
    {
        exptab[jj] = 0;
    }
}
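
/*
    Added sketch (not in the original source): how a scalar multiply falls
    out of these tables. Because EXP is replicated over [0, 2*255), the sum
    of two logarithms can be used directly as an index, with no reduction
    mod 255:

        // returns x * y in GF(256); assumes the tables above are accessible
        if (x == 0 || y == 0)
            return 0;
        return GF256_EXP_TABLE[GF256_LOG_TABLE[x] + GF256_LOG_TABLE[y]];

    Division works the same way with 255 - log(y), which is how
    gf256_muldiv_init() below fills the DIV table.
*/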

//-----------------------------------------------------------------------------
// Multiply and Divide Tables

// Initialize MUL and DIV tables using LOG and EXP tables
void gf256_ctx::gf256_muldiv_init()
{
    // Fill in the 64 KiB x 2 of table memory
    uint8_t* m = GF256_MUL_TABLE;
    uint8_t* d = GF256_DIV_TABLE;

    // Unroll the y = 0 subtable: anything times 0 is 0, and x / 0 is left 0
    for (int x = 0; x < 256; ++x)
    {
        m[x] = d[x] = 0;
    }

    // For each other y value,
    for (int y = 1; y < 256; ++y)
    {
        // Calculate log(y) for mult and 255 - log(y) for div
        const uint8_t log_y = static_cast<uint8_t>(GF256_LOG_TABLE[y]);
        const uint8_t log_yn = 255 - log_y;

        // Next subtable
        m += 256;
        d += 256;

        // Unroll x = 0
        m[0] = 0;
        d[0] = 0;

        // Calculate x * y and x / y via the replicated EXP table
        for (int x = 1; x < 256; ++x)
        {
            uint16_t log_x = GF256_LOG_TABLE[x];

            m[x] = GF256_EXP_TABLE[log_x + log_y];
            d[x] = GF256_EXP_TABLE[log_x + log_yn];
        }
    }
}
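
/*
    Added note (not in the original source): both tables are laid out as 256
    rows of 256 bytes, one row per value of y, so that

        GF256_MUL_TABLE[((unsigned)y << 8) + x] == x * y
        GF256_DIV_TABLE[((unsigned)y << 8) + x] == x / y

    This row layout is exactly what gf256_mul_mem() below relies on when it
    computes table = GF256_MUL_TABLE + ((unsigned)y << 8).
*/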

//-----------------------------------------------------------------------------
// Inverse Table

// Initialize INV table using DIV table
void gf256_ctx::gf256_inv_init()
{
    for (int x = 0; x < 256; ++x)
    {
        GF256_INV_TABLE[x] = gf256_div(1, static_cast<uint8_t>(x));
    }
}

//-----------------------------------------------------------------------------
// Multiply and Add Memory Tables

/*
    Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256)
    using the SSSE3 SIMD instruction set:

    Consider z = x * y in GF(256). This operation can be performed
    bit-by-bit. Usefully, the partial product of each bit is combined
    linearly with the rest. This means that the 8-bit number x can be split
    into its high and low 4 bits, and partial products can be formed from
    each half. Then the halves can be linearly combined:

        z = x[0..3] * y + x[4..7] * y

    The multiplication of each half can be done efficiently via table
    lookups, and the addition in GF(256) is XOR. There must be two tables
    that map the 16 possible values of the low or high 4 bits of x to the
    two partial products. Each value of y has a different pair of tables:

        z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7])

    This means that we need 16 * 2 * 256 = 8192 bytes of precomputed tables.

    Computing z[] = x[] * y can be performed 16 bytes at a time by using the
    128-bit register operations supported by modern processors.

    This is efficiently realized with the _mm_shuffle_epi8() intrinsic,
    provided by Visual Studio 2010 or newer in <tmmintrin.h>. This intrinsic
    uses the low bits of each mask byte to do a table lookup on each byte.
    Unfortunately the high bit of each mask byte has the special feature
    that it clears the output byte when it is set, so we need to make sure
    it is cleared by masking off the high bit of each byte before using it:

        clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f

    For the low half of the partial product, clear the high bit of each byte
    and perform the table lookup:

        p_lo = _mm_and_si128(x, clr_mask)
        p_lo = _mm_shuffle_epi8(TABLE_LO_y, p_lo)

    For the high half of the partial product, shift the high 4 bits of each
    byte into the low 4 bits, clear the high bit of each byte, and then
    perform the table lookup:

        p_hi = _mm_srli_epi64(x, 4)
        p_hi = _mm_and_si128(p_hi, clr_mask)
        p_hi = _mm_shuffle_epi8(TABLE_HI_y, p_hi)

    Finally add the two partial products to form the product, recalling that
    addition is XOR in a Galois field:

        result = _mm_xor_si128(p_lo, p_hi)

    This crunches 16 bytes of x at a time, and the result can be stored in z.
*/
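
/*
    Added scalar sketch (not part of the original source) of the same
    nibble-split technique. Here lo[] and hi[] are the 16-entry partial
    product tables for a fixed y, as built by gf256_muladd_mem_init() below:

        uint8_t mul_nibble_split(const uint8_t lo[16], const uint8_t hi[16], uint8_t x)
        {
            return lo[x & 0x0f] ^ hi[x >> 4];
        }

    The SSSE3 path evaluates this for 16 bytes of x at once, with
    _mm_shuffle_epi8() performing each 16-entry lookup in one instruction.
*/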

/*
    Intrinsic reference:

    SSSE3, VS2010+, tmmintrin.h:

    GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask);
        Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3)
        instruction pshufb, which shuffles the 16 bytes of a according to
        the 16 byte indices in mask.

    Pseudo-code for PSHUFB (with 128-bit operands):

        for i = 0 to 15 {
            if (SRC[(i * 8)+7] = 1) then
                DEST[(i*8)+7..(i*8)+0] <- 0;
            else
                index[3..0] <- SRC[(i*8)+3..(i*8)+0];
                DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8)+7..(index*8)+0];
            endif
        }

    SSE2, VS2008+, emmintrin.h:

    GF256_M128 _mm_slli_epi64(GF256_M128 a, int count);
        Shifts the 2 signed or unsigned 64-bit integers in a left by count
        bits while shifting in zeros.
    GF256_M128 _mm_srli_epi64(GF256_M128 a, int count);
        Shifts the 2 signed or unsigned 64-bit integers in a right by count
        bits while shifting in zeros.
    GF256_M128 _mm_set1_epi8(char b);
        Sets the 16 signed 8-bit integer values to b.
    GF256_M128 _mm_and_si128(GF256_M128 a, GF256_M128 b);
        Computes the bitwise AND of the 128-bit value in a and the 128-bit
        value in b.
    GF256_M128 _mm_xor_si128(GF256_M128 a, GF256_M128 b);
        Computes the bitwise XOR of the 128-bit value in a and the 128-bit
        value in b.
*/

// Initialize the MM256 tables using gf256_mul()
void gf256_ctx::gf256_muladd_mem_init()
{
    for (int y = 0; y < 256; ++y)
    {
        uint8_t lo[16], hi[16];

        // TABLE_LO_Y maps x = 0..15 to the 8-bit partial product x * y;
        // TABLE_HI_Y does the same for the high nibble, (x << 4) * y.
        for (unsigned char x = 0; x < 16; ++x)
        {
            lo[x] = gf256_mul(x, static_cast<uint8_t>(y));
            hi[x] = gf256_mul(x << 4, static_cast<uint8_t>(y));
        }

        const GF256_M128 table_lo = _mm_set_epi8(
            lo[15], lo[14], lo[13], lo[12], lo[11], lo[10], lo[9], lo[8],
            lo[7], lo[6], lo[5], lo[4], lo[3], lo[2], lo[1], lo[0]);
        const GF256_M128 table_hi = _mm_set_epi8(
            hi[15], hi[14], hi[13], hi[12], hi[11], hi[10], hi[9], hi[8],
            hi[7], hi[6], hi[5], hi[4], hi[3], hi[2], hi[1], hi[0]);
        _mm_store_si128(MM256_TABLE_LO_Y + y, table_lo);
        _mm_store_si128(MM256_TABLE_HI_Y + y, table_hi);
    }
}

//-----------------------------------------------------------------------------
// Initialization
//
// Initialize a context, filling in the tables.
//
// Thread-safety / Usage Notes:
//
// It is perfectly safe and encouraged to use a gf256_ctx object from multiple
// threads. gf256_init() is relatively expensive and should only be done once,
// though it takes less than a millisecond.
//
// The gf256_ctx object must be aligned to a 16-byte boundary.
// Simply tag the object with GF256_ALIGNED to achieve this.
//
// Example:
//    static GF256_ALIGNED gf256_ctx TheGF256Context;
//    gf256_init(&TheGF256Context, 0);
//
// Returns 0 on success and other values on failure.

int gf256_ctx::gf256_init_()
{
    // Avoid multiple initialization
    if (initialized)
    {
        return 0;
    }

    if (!IsLittleEndian())
    {
        fprintf(stderr, "gf256_ctx::gf256_init_: Little-endian architecture expected (code won't work without mods)\n");
        return -2;
    }

    gf255_poly_init(DefaultPolynomialIndex);
    gf256_explog_init();
    gf256_muldiv_init();
    gf256_inv_init();
    gf256_muladd_mem_init();

    initialized = true;
    fprintf(stderr, "gf256_ctx::gf256_init_: initialized\n");
    return 0;
}

//-----------------------------------------------------------------------------
// Operations with context

void gf256_ctx::gf256_mul_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, uint8_t y, int bytes)
{
    // Use a single if-statement to handle special cases
    if (y <= 1)
    {
        if (y == 0)
        {
            memset(vz, 0, bytes);
        }
        else if (vz != vx)
        {
            // y = 1: the product is x itself, so just copy it into z
            memcpy(vz, vx, bytes);
        }
        return;
    }

    // Partial product tables; see above
    const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y);
    const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y);

    // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
    const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);

    GF256_M128* GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
    const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // See above comments for details
        GF256_M128 x0 = _mm_loadu_si128(x16);
        GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
        x0 = _mm_srli_epi64(x0, 4);
        GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
        l0 = _mm_shuffle_epi8(table_lo_y, l0);
        h0 = _mm_shuffle_epi8(table_hi_y, h0);
        _mm_storeu_si128(z16, _mm_xor_si128(l0, h0));

        x16++;
        z16++;
        bytes -= 16;
    }

    uint8_t* GF256_RESTRICT z8 = reinterpret_cast<uint8_t*>(z16);
    const uint8_t* GF256_RESTRICT x8 = reinterpret_cast<const uint8_t*>(x16);
    const uint8_t* GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t word = table[x8[0]];
        word |= (uint64_t)table[x8[1]] << 8;
        word |= (uint64_t)table[x8[2]] << 16;
        word |= (uint64_t)table[x8[3]] << 24;
        word |= (uint64_t)table[x8[4]] << 32;
        word |= (uint64_t)table[x8[5]] << 40;
        word |= (uint64_t)table[x8[6]] << 48;
        word |= (uint64_t)table[x8[7]] << 56;
        *(uint64_t*)z8 = word;

        x8 += 8;
        z8 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t word = table[x8[0]];
        word |= (uint32_t)table[x8[1]] << 8;
        word |= (uint32_t)table[x8[2]] << 16;
        word |= (uint32_t)table[x8[3]] << 24;
        *(uint32_t*)z8 = word;

        x8 += 4;
        z8 += 4;
        bytes -= 4;
    }

    // Handle single bytes
    for (int i = bytes; i > 0; i--)
    {
        z8[i - 1] = table[x8[i - 1]];
    }
}
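
/*
    Added usage sketch (not from the original source), assuming a context
    declared and initialized as in the example above:

        static GF256_ALIGNED gf256_ctx ctx;
        uint8_t src[100], dst[100];
        // ... fill src ...
        ctx.gf256_mul_mem(dst, src, 0x42, 100);
        // Now dst[i] == gf256_mul(src[i], 0x42) for every i.
*/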

void gf256_ctx::gf256_muladd_mem(void* GF256_RESTRICT vz, uint8_t y, const void* GF256_RESTRICT vx, int bytes)
{
    // Use a single if-statement to handle special cases
    if (y <= 1)
    {
        // y = 0 adds nothing; y = 1 reduces to z[] ^= x[]
        if (y == 1)
        {
            gf256_add_mem(vz, vx, bytes);
        }
        return;
    }

    // Partial product tables; see above
    const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y);
    const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y);

    // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
    const GF256_M128 clr_mask = _mm_set1_epi8(0x0f);

    GF256_M128* GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
    const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // See above comments for details
        GF256_M128 x0 = _mm_loadu_si128(x16);
        GF256_M128 l0 = _mm_and_si128(x0, clr_mask);
        x0 = _mm_srli_epi64(x0, 4);
        GF256_M128 h0 = _mm_and_si128(x0, clr_mask);
        l0 = _mm_shuffle_epi8(table_lo_y, l0);
        h0 = _mm_shuffle_epi8(table_hi_y, h0);
        const GF256_M128 p0 = _mm_xor_si128(l0, h0);
        const GF256_M128 z0 = _mm_loadu_si128(z16);
        _mm_storeu_si128(z16, _mm_xor_si128(p0, z0));

        x16++;
        z16++;
        bytes -= 16;
    }

    uint8_t* GF256_RESTRICT z8 = reinterpret_cast<uint8_t*>(z16);
    const uint8_t* GF256_RESTRICT x8 = reinterpret_cast<const uint8_t*>(x16);
    const uint8_t* GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t word = table[x8[0]];
        word |= (uint64_t)table[x8[1]] << 8;
        word |= (uint64_t)table[x8[2]] << 16;
        word |= (uint64_t)table[x8[3]] << 24;
        word |= (uint64_t)table[x8[4]] << 32;
        word |= (uint64_t)table[x8[5]] << 40;
        word |= (uint64_t)table[x8[6]] << 48;
        word |= (uint64_t)table[x8[7]] << 56;
        *(uint64_t*)z8 ^= word;

        x8 += 8;
        z8 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t word = table[x8[0]];
        word |= (uint32_t)table[x8[1]] << 8;
        word |= (uint32_t)table[x8[2]] << 16;
        word |= (uint32_t)table[x8[3]] << 24;
        *(uint32_t*)z8 ^= word;

        x8 += 4;
        z8 += 4;
        bytes -= 4;
    }

    // Handle single bytes
    for (int i = bytes; i > 0; i--)
    {
        z8[i - 1] ^= table[x8[i - 1]];
    }
}
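
/*
    Added usage sketch (not from the original source): gf256_muladd_mem() is
    the natural inner loop of an erasure code, accumulating z[] ^= y * x[]
    with one coefficient per data block. The names below are illustrative:

        for (int i = 0; i < blockCount; ++i)
            ctx.gf256_muladd_mem(parity, coeffs[i], blocks[i], blockBytes);
*/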

//-----------------------------------------------------------------------------
// Static operations

void gf256_ctx::gf256_add_mem(void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes)
{
    GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<GF256_M128*>(vx);
    const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);

    // Handle multiples of 64 bytes
    while (bytes >= 64)
    {
        GF256_M128 x0 = _mm_loadu_si128(x16);
        GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
        GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
        GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
        GF256_M128 y0 = _mm_loadu_si128(y16);
        GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
        GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
        GF256_M128 y3 = _mm_loadu_si128(y16 + 3);

        _mm_storeu_si128(x16, _mm_xor_si128(x0, y0));
        _mm_storeu_si128(x16 + 1, _mm_xor_si128(x1, y1));
        _mm_storeu_si128(x16 + 2, _mm_xor_si128(x2, y2));
        _mm_storeu_si128(x16 + 3, _mm_xor_si128(x3, y3));

        x16 += 4;
        y16 += 4;
        bytes -= 64;
    }

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // x[i] = x[i] xor y[i]
        _mm_storeu_si128(x16,
            _mm_xor_si128(
                _mm_loadu_si128(x16),
                _mm_loadu_si128(y16)));

        x16++;
        y16++;
        bytes -= 16;
    }

    uint8_t* GF256_RESTRICT x1 = reinterpret_cast<uint8_t*>(x16);
    const uint8_t* GF256_RESTRICT y1 = reinterpret_cast<const uint8_t*>(y16);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t* GF256_RESTRICT x8 = reinterpret_cast<uint64_t*>(x1);
        const uint64_t* GF256_RESTRICT y8 = reinterpret_cast<const uint64_t*>(y1);
        *x8 ^= *y8;

        x1 += 8;
        y1 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t* GF256_RESTRICT x4 = reinterpret_cast<uint32_t*>(x1);
        const uint32_t* GF256_RESTRICT y4 = reinterpret_cast<const uint32_t*>(y1);
        *x4 ^= *y4;

        x1 += 4;
        y1 += 4;
        bytes -= 4;
    }

    // Handle final bytes
    for (int i = bytes; i > 0; i--)
    {
        x1[i - 1] ^= y1[i - 1];
    }
}

void gf256_ctx::gf256_add2_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes)
{
    GF256_M128* GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
    const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
    const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // z[i] = z[i] xor x[i] xor y[i]
        _mm_storeu_si128(z16,
            _mm_xor_si128(
                _mm_loadu_si128(z16),
                _mm_xor_si128(
                    _mm_loadu_si128(x16),
                    _mm_loadu_si128(y16))));

        x16++;
        y16++;
        z16++;
        bytes -= 16;
    }

    uint8_t* GF256_RESTRICT z1 = reinterpret_cast<uint8_t*>(z16);
    const uint8_t* GF256_RESTRICT x1 = reinterpret_cast<const uint8_t*>(x16);
    const uint8_t* GF256_RESTRICT y1 = reinterpret_cast<const uint8_t*>(y16);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t* GF256_RESTRICT z8 = reinterpret_cast<uint64_t*>(z1);
        const uint64_t* GF256_RESTRICT x8 = reinterpret_cast<const uint64_t*>(x1);
        const uint64_t* GF256_RESTRICT y8 = reinterpret_cast<const uint64_t*>(y1);
        *z8 ^= *x8 ^ *y8;

        x1 += 8;
        y1 += 8;
        z1 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t* GF256_RESTRICT z4 = reinterpret_cast<uint32_t*>(z1);
        const uint32_t* GF256_RESTRICT x4 = reinterpret_cast<const uint32_t*>(x1);
        const uint32_t* GF256_RESTRICT y4 = reinterpret_cast<const uint32_t*>(y1);
        *z4 ^= *x4 ^ *y4;

        x1 += 4;
        y1 += 4;
        z1 += 4;
        bytes -= 4;
    }

    // Handle final bytes
    for (int i = bytes; i > 0; i--)
    {
        z1[i - 1] ^= x1[i - 1] ^ y1[i - 1];
    }
}

void gf256_ctx::gf256_addset_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes)
{
    GF256_M128* GF256_RESTRICT z16 = reinterpret_cast<GF256_M128*>(vz);
    const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<const GF256_M128*>(vx);
    const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128*>(vy);

    // Handle multiples of 64 bytes
    while (bytes >= 64)
    {
        GF256_M128 x0 = _mm_loadu_si128(x16);
        GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
        GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
        GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
        GF256_M128 y0 = _mm_loadu_si128(y16);
        GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
        GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
        GF256_M128 y3 = _mm_loadu_si128(y16 + 3);

        _mm_storeu_si128(z16, _mm_xor_si128(x0, y0));
        _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1));
        _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2));
        _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, y3));

        x16 += 4;
        y16 += 4;
        z16 += 4;
        bytes -= 64;
    }

    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // z[i] = x[i] xor y[i]
        _mm_storeu_si128(z16,
            _mm_xor_si128(
                _mm_loadu_si128(x16),
                _mm_loadu_si128(y16)));

        x16++;
        y16++;
        z16++;
        bytes -= 16;
    }

    uint8_t* GF256_RESTRICT z1 = reinterpret_cast<uint8_t*>(z16);
    const uint8_t* GF256_RESTRICT x1 = reinterpret_cast<const uint8_t*>(x16);
    const uint8_t* GF256_RESTRICT y1 = reinterpret_cast<const uint8_t*>(y16);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t* GF256_RESTRICT z8 = reinterpret_cast<uint64_t*>(z1);
        const uint64_t* GF256_RESTRICT x8 = reinterpret_cast<const uint64_t*>(x1);
        const uint64_t* GF256_RESTRICT y8 = reinterpret_cast<const uint64_t*>(y1);
        *z8 = *x8 ^ *y8;

        x1 += 8;
        y1 += 8;
        z1 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t* GF256_RESTRICT z4 = reinterpret_cast<uint32_t*>(z1);
        const uint32_t* GF256_RESTRICT x4 = reinterpret_cast<const uint32_t*>(x1);
        const uint32_t* GF256_RESTRICT y4 = reinterpret_cast<const uint32_t*>(y1);
        *z4 = *x4 ^ *y4;

        x1 += 4;
        y1 += 4;
        z1 += 4;
        bytes -= 4;
    }

    // Handle final bytes
    for (int i = bytes; i > 0; i--)
    {
        z1[i - 1] = x1[i - 1] ^ y1[i - 1];
    }
}

void gf256_memswap(void* GF256_RESTRICT vx, void* GF256_RESTRICT vy, int bytes)
{
    GF256_M128* GF256_RESTRICT x16 = reinterpret_cast<GF256_M128*>(vx);
    GF256_M128* GF256_RESTRICT y16 = reinterpret_cast<GF256_M128*>(vy);

    // Handle blocks of 16 bytes
    while (bytes >= 16)
    {
        GF256_M128 x0 = _mm_loadu_si128(x16);
        GF256_M128 y0 = _mm_loadu_si128(y16);
        _mm_storeu_si128(x16, y0);
        _mm_storeu_si128(y16, x0);

        bytes -= 16;
        ++x16;
        ++y16;
    }

    uint8_t* GF256_RESTRICT x1 = reinterpret_cast<uint8_t*>(x16);
    uint8_t* GF256_RESTRICT y1 = reinterpret_cast<uint8_t*>(y16);

    // Handle a block of 8 bytes
    if (bytes >= 8)
    {
        uint64_t* GF256_RESTRICT x8 = reinterpret_cast<uint64_t*>(x1);
        uint64_t* GF256_RESTRICT y8 = reinterpret_cast<uint64_t*>(y1);

        uint64_t temp = *x8;
        *x8 = *y8;
        *y8 = temp;

        x1 += 8;
        y1 += 8;
        bytes -= 8;
    }

    // Handle a block of 4 bytes
    if (bytes >= 4)
    {
        uint32_t* GF256_RESTRICT x4 = reinterpret_cast<uint32_t*>(x1);
        uint32_t* GF256_RESTRICT y4 = reinterpret_cast<uint32_t*>(y1);

        uint32_t temp = *x4;
        *x4 = *y4;
        *y4 = temp;

        x1 += 4;
        y1 += 4;
        bytes -= 4;
    }

    // Handle final bytes
    uint8_t temp;

    for (int i = bytes; i > 0; i--)
    {
        temp = x1[i - 1];
        x1[i - 1] = y1[i - 1];
        y1[i - 1] = temp;
    }
}