/* Copyright (c) 2015 Christopher A. Taylor. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of CM256 nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "gf256.h" const uint8_t gf256_ctx::GF256_GEN_POLY[GF256_GEN_POLY_COUNT] = { 0x8e, 0x95, 0x96, 0xa6, 0xaf, 0xb1, 0xb2, 0xb4, 0xb8, 0xc3, 0xc6, 0xd4, 0xe1, 0xe7, 0xf3, 0xfa, }; gf256_ctx::gf256_ctx() : initialized(false) { gf256_init_(); } gf256_ctx::~gf256_ctx() { } // Select which polynomial to use void gf256_ctx::gf255_poly_init(int polynomialIndex) { if (polynomialIndex < 0 || polynomialIndex >= GF256_GEN_POLY_COUNT) { polynomialIndex = 0; } Polynomial = (GF256_GEN_POLY[polynomialIndex] << 1) | 1; } //----------------------------------------------------------------------------- // Exponential and Log Tables // Construct EXP and LOG tables from polynomial void gf256_ctx::gf256_explog_init() { unsigned poly = Polynomial; uint8_t* exptab = GF256_EXP_TABLE; uint16_t* logtab = GF256_LOG_TABLE; logtab[0] = 512; exptab[0] = 1; for (unsigned jj = 1; jj < 255; ++jj) { unsigned next = (unsigned)exptab[jj - 1] * 2; if (next >= 256) next ^= poly; exptab[jj] = static_cast(next); logtab[exptab[jj]] = static_cast(jj); } exptab[255] = exptab[0]; logtab[exptab[255]] = 255; for (unsigned jj = 256; jj < 2 * 255; ++jj) { exptab[jj] = exptab[jj % 255]; } exptab[2 * 255] = 1; for (unsigned jj = 2 * 255 + 1; jj < 4 * 255; ++jj) { exptab[jj] = 0; } } //----------------------------------------------------------------------------- // Multiply and Divide Tables // Initialize MUL and DIV tables using LOG and EXP tables void gf256_ctx::gf256_muldiv_init() { // Allocate table memory 65KB x 2 uint8_t* m = GF256_MUL_TABLE; uint8_t* d = GF256_DIV_TABLE; // Unroll y = 0 subtable for (int x = 0; x < 256; ++x) { m[x] = d[x] = 0; } // For each other y value, for (int y = 1; y < 256; ++y) { // Calculate log(y) for mult and 255 - log(y) for div const uint8_t log_y = static_cast(GF256_LOG_TABLE[y]); const uint8_t log_yn = 255 - log_y; // Next subtable m += 256; d += 256; // Unroll x = 0 m[0] = 0; d[0] = 0; // Calculate x * y, x / y for (int x = 1; x < 256; ++x) { uint16_t log_x = GF256_LOG_TABLE[x]; m[x] = GF256_EXP_TABLE[log_x + log_y]; d[x] = GF256_EXP_TABLE[log_x + log_yn]; } } } //----------------------------------------------------------------------------- // Inverse Table // Initialize INV table using DIV table void gf256_ctx::gf256_inv_init() { for (int x = 0; x < 256; ++x) { GF256_INV_TABLE[x] = gf256_div(1, static_cast(x)); } } //----------------------------------------------------------------------------- // Multiply and Add Memory Tables /* Fast algorithm to compute m[1..8] = a[1..8] * b in GF(256) using SSE3 SIMD instruction set: Consider z = x * y in GF(256). This operation can be performed bit-by-bit. Usefully, the partial product of each bit is combined linearly with the rest. This means that the 8-bit number x can be split into its high and low 4 bits, and partial products can be formed from each half. Then the halves can be linearly combined: z = x[0..3] * y + x[4..7] * y The multiplication of each half can be done efficiently via table lookups, and the addition in GF(256) is XOR. There must be two tables that map 16 input elements for the low or high 4 bits of x to the two partial products. Each value for y has a different set of two tables: z = TABLE_LO_y(x[0..3]) xor TABLE_HI_y(x[4..7]) This means that we need 16 * 2 * 256 = 8192 bytes for precomputed tables. Computing z[] = x[] * y can be performed 16 bytes at a time by using the 128-bit register operations supported by modern processors. This is efficiently realized in SSE3 using the _mm_shuffle_epi8() function provided by Visual Studio 2010 or newer in . This function uses the low bits to do a table lookup on each byte. Unfortunately the high bit of each mask byte has the special feature that it clears the output byte when it is set, so we need to make sure it's cleared by masking off the high bit of each byte before using it: clr_mask = _mm_set1_epi8(0x0f) = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f For the low half of the partial product, clear the high bit of each byte and perform the table lookup: p_lo = _mm_and_si128(x, clr_mask) p_lo = _mm_shuffle_epi8(p_lo, TABLE_LO_y) For the high half of the partial product, shift the high 4 bits of each byte into the low 4 bits and clear the high bit of each byte, and then perform the table lookup: p_hi = _mm_srli_epi64(x, 4) p_hi = _mm_and_si128(p_hi, clr_mask) p_hi = _mm_shuffle_epi8(p_hi, TABLE_HI_y) Finally add the two partial products to form the product, recalling that addition is XOR in a Galois field: result = _mm_xor_si128(p_lo, p_hi) This crunches 16 bytes of x at a time, and the result can be stored in z. */ /* Intrinsic reference: SSE3, VS2010+, tmmintrin.h: GF256_M128 _mm_shuffle_epi8(GF256_M128 a, GF256_M128 mask); Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pshufb. This instruction shuffles 16-byte parameters from a 128-bit parameter. Pseudo-code for PSHUFB (with 128 bit operands): for i = 0 to 15 { if (SRC[(i * 8)+7] = 1 ) then DEST[(i*8)+7..(i*8)+0] <- 0; else index[3..0] <- SRC[(i*8)+3 .. (i*8)+0]; DEST[(i*8)+7..(i*8)+0] <- DEST[(index*8+7)..(index*8+0)]; endif } SSE2, VS2008+, emmintrin.h: GF256_M128 _mm_slli_epi64 (GF256_M128 a, int count); Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while shifting in zeros. GF256_M128 _mm_srli_epi64 (GF256_M128 a, int count); Shifts the 2 signed or unsigned 64-bit integers in a right by count bits while shifting in zeros. GF256_M128 _mm_set1_epi8 (char b); Sets the 16 signed 8-bit integer values to b. GF256_M128 _mm_and_si128 (GF256_M128 a, GF256_M128 b); Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. GF256_M128 _mm_xor_si128 ( GF256_M128 a, GF256_M128 b); Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. */ // Initialize the MM256 tables using gf256_mul() void gf256_ctx::gf256_muladd_mem_init() { for (int y = 0; y < 256; ++y) { uint8_t lo[16], hi[16]; // TABLE_LO_Y maps 0..15 to 8-bit partial product based on y. for (unsigned char x = 0; x < 16; ++x) { lo[x] = gf256_mul(x, static_cast(y)); hi[x] = gf256_mul(x << 4, static_cast(y)); } const GF256_M128 table_lo = _mm_set_epi8( lo[15], lo[14], lo[13], lo[12], lo[11], lo[10], lo[9], lo[8], lo[7], lo[6], lo[5], lo[4], lo[3], lo[2], lo[1], lo[0]); const GF256_M128 table_hi = _mm_set_epi8( hi[15], hi[14], hi[13], hi[12], hi[11], hi[10], hi[9], hi[8], hi[7], hi[6], hi[5], hi[4], hi[3], hi[2], hi[1], hi[0]); _mm_store_si128(MM256_TABLE_LO_Y + y, table_lo); _mm_store_si128(MM256_TABLE_HI_Y + y, table_hi); } } //----------------------------------------------------------------------------- // Initialization // // Initialize a context, filling in the tables. // // Thread-safety / Usage Notes: // // It is perfectly safe and encouraged to use a gf256_ctx object from multiple // threads. The gf256_init() is relatively expensive and should only be done // once, though it will take less than a millisecond. // // The gf256_ctx object must be aligned to 16 byte boundary. // Simply tag the object with GF256_ALIGNED to achieve this. // // Example: // static GF256_ALIGNED gf256_ctx TheGF256Context; // gf256_init(&TheGF256Context, 0); // // Returns 0 on success and other values on failure. int gf256_ctx::gf256_init_() { // Avoid multiple initialization if (initialized) { return 0; } if (!IsLittleEndian()) { fprintf(stderr, "gf256_ctx::gf256_init_: Little Endian architecture expected (code won't work without mods)\n"); return -2; } gf255_poly_init(DefaultPolynomialIndex); gf256_explog_init(); gf256_muldiv_init(); gf256_inv_init(); gf256_muladd_mem_init(); initialized = true; fprintf(stderr, "gf256_ctx::gf256_init_: initialized\n"); return 0; } //----------------------------------------------------------------------------- // Operations with context void gf256_ctx::gf256_mul_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, uint8_t y, int bytes) { // Use a single if-statement to handle special cases if (y <= 1) { if (y == 0) { memset(vz, 0, bytes); } return; } // Partial product tables; see above const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y); const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y); // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); GF256_M128* GF256_RESTRICT z16 = reinterpret_cast(vz); const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); // Handle multiples of 16 bytes while (bytes >= 16) { // See above comments for details GF256_M128 x0 = _mm_loadu_si128(x16); GF256_M128 l0 = _mm_and_si128(x0, clr_mask); x0 = _mm_srli_epi64(x0, 4); GF256_M128 h0 = _mm_and_si128(x0, clr_mask); l0 = _mm_shuffle_epi8(table_lo_y, l0); h0 = _mm_shuffle_epi8(table_hi_y, h0); _mm_storeu_si128(z16, _mm_xor_si128(l0, h0)); x16++; z16++; bytes -= 16; } uint8_t* GF256_RESTRICT z8 = reinterpret_cast(z16); const uint8_t* GF256_RESTRICT x8 = reinterpret_cast(x16); const uint8_t* GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t word = table[x8[0]]; word |= (uint64_t)table[x8[1]] << 8; word |= (uint64_t)table[x8[2]] << 16; word |= (uint64_t)table[x8[3]] << 24; word |= (uint64_t)table[x8[4]] << 32; word |= (uint64_t)table[x8[5]] << 40; word |= (uint64_t)table[x8[6]] << 48; word |= (uint64_t)table[x8[7]] << 56; *(uint64_t*)z8 = word; x8 += 8; z8 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t word = table[x8[0]]; word |= (uint32_t)table[x8[1]] << 8; word |= (uint32_t)table[x8[2]] << 16; word |= (uint32_t)table[x8[3]] << 24; *(uint32_t*)z8 = word; x8 += 4; z8 += 4; bytes -= 4; } // Handle single bytes for (int i = bytes; i > 0; i--) { z8[i - 1] = table[x8[i - 1]]; } } void gf256_ctx::gf256_muladd_mem(void* GF256_RESTRICT vz, uint8_t y, const void* GF256_RESTRICT vx, int bytes) { // Use a single if-statement to handle special cases if (y <= 1) { if (y == 1) { gf256_add_mem(vz, vx, bytes); } return; } // Partial product tables; see above const GF256_M128 table_lo_y = _mm_load_si128(MM256_TABLE_LO_Y + y); const GF256_M128 table_hi_y = _mm_load_si128(MM256_TABLE_HI_Y + y); // clr_mask = 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f const GF256_M128 clr_mask = _mm_set1_epi8(0x0f); GF256_M128* GF256_RESTRICT z16 = reinterpret_cast(vz); const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); // Handle multiples of 16 bytes while (bytes >= 16) { // See above comments for details GF256_M128 x0 = _mm_loadu_si128(x16); GF256_M128 l0 = _mm_and_si128(x0, clr_mask); x0 = _mm_srli_epi64(x0, 4); GF256_M128 h0 = _mm_and_si128(x0, clr_mask); l0 = _mm_shuffle_epi8(table_lo_y, l0); h0 = _mm_shuffle_epi8(table_hi_y, h0); const GF256_M128 p0 = _mm_xor_si128(l0, h0); const GF256_M128 z0 = _mm_loadu_si128(z16); _mm_storeu_si128(z16, _mm_xor_si128(p0, z0)); x16++; z16++; bytes -= 16; } uint8_t* GF256_RESTRICT z8 = reinterpret_cast(z16); const uint8_t* GF256_RESTRICT x8 = reinterpret_cast(x16); const uint8_t* GF256_RESTRICT table = GF256_MUL_TABLE + ((unsigned)y << 8); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t word = table[x8[0]]; word |= (uint64_t)table[x8[1]] << 8; word |= (uint64_t)table[x8[2]] << 16; word |= (uint64_t)table[x8[3]] << 24; word |= (uint64_t)table[x8[4]] << 32; word |= (uint64_t)table[x8[5]] << 40; word |= (uint64_t)table[x8[6]] << 48; word |= (uint64_t)table[x8[7]] << 56; *(uint64_t*)z8 ^= word; x8 += 8; z8 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t word = table[x8[0]]; word |= (uint32_t)table[x8[1]] << 8; word |= (uint32_t)table[x8[2]] << 16; word |= (uint32_t)table[x8[3]] << 24; *(uint32_t*)z8 ^= word; x8 += 4; z8 += 4; bytes -= 4; } // Handle single bytes for (int i = bytes; i > 0; i--) { z8[i - 1] ^= table[x8[i - 1]]; } } //----------------------------------------------------------------------------- // Static operations void gf256_ctx::gf256_add_mem(void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes) { GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast(vy); // Handle multiples of 64 bytes while (bytes >= 64) { GF256_M128 x0 = _mm_loadu_si128(x16); GF256_M128 x1 = _mm_loadu_si128(x16 + 1); GF256_M128 x2 = _mm_loadu_si128(x16 + 2); GF256_M128 x3 = _mm_loadu_si128(x16 + 3); GF256_M128 y0 = _mm_loadu_si128(y16); GF256_M128 y1 = _mm_loadu_si128(y16 + 1); GF256_M128 y2 = _mm_loadu_si128(y16 + 2); GF256_M128 y3 = _mm_loadu_si128(y16 + 3); _mm_storeu_si128(x16, _mm_xor_si128(x0, y0)); _mm_storeu_si128(x16 + 1, _mm_xor_si128(x1, y1)); _mm_storeu_si128(x16 + 2, _mm_xor_si128(x2, y2)); _mm_storeu_si128(x16 + 3, _mm_xor_si128(x3, y3)); x16 += 4; y16 += 4; bytes -= 64; } // Handle multiples of 16 bytes while (bytes >= 16) { // x[i] = x[i] xor y[i] _mm_storeu_si128(x16, _mm_xor_si128( _mm_loadu_si128(x16), _mm_loadu_si128(y16))); x16++; y16++; bytes -= 16; } uint8_t* GF256_RESTRICT x1 = reinterpret_cast(x16); const uint8_t* GF256_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t* GF256_RESTRICT x8 = reinterpret_cast(x1); const uint64_t* GF256_RESTRICT y8 = reinterpret_cast(y1); *x8 ^= *y8; x1 += 8; y1 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t* GF256_RESTRICT x4 = reinterpret_cast(x1); const uint32_t* GF256_RESTRICT y4 = reinterpret_cast(y1); *x4 ^= *y4; x1 += 4; y1 += 4; bytes -= 4; } // Handle final bytes for (int i = bytes; i > 0; i--) { x1[i - 1] ^= y1[i - 1]; } } void gf256_ctx::gf256_add2_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes) { GF256_M128* GF256_RESTRICT z16 = reinterpret_cast(vz); const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast(vy); // Handle multiples of 16 bytes while (bytes >= 16) { // z[i] = x[i] xor y[i] _mm_storeu_si128(z16, _mm_xor_si128( _mm_loadu_si128(z16), _mm_xor_si128( _mm_loadu_si128(x16), _mm_loadu_si128(y16)))); x16++; y16++; z16++; bytes -= 16; } uint8_t* GF256_RESTRICT z1 = reinterpret_cast(z16); const uint8_t* GF256_RESTRICT x1 = reinterpret_cast(x16); const uint8_t* GF256_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t* GF256_RESTRICT z8 = reinterpret_cast(z1); const uint64_t* GF256_RESTRICT x8 = reinterpret_cast(x1); const uint64_t* GF256_RESTRICT y8 = reinterpret_cast(y1); *z8 ^= *x8 ^ *y8; x1 += 8; y1 += 8; z1 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t* GF256_RESTRICT z4 = reinterpret_cast(z1); const uint32_t* GF256_RESTRICT x4 = reinterpret_cast(x1); const uint32_t* GF256_RESTRICT y4 = reinterpret_cast(y1); *z4 ^= *x4 ^ *y4; x1 += 4; y1 += 4; z1 += 4; bytes -= 4; } // Handle final bytes for (int i = bytes; i > 0; i--) { z1[i - 1] ^= x1[i - 1] ^ y1[i - 1]; } } void gf256_ctx::gf256_addset_mem(void* GF256_RESTRICT vz, const void* GF256_RESTRICT vx, const void* GF256_RESTRICT vy, int bytes) { GF256_M128* GF256_RESTRICT z16 = reinterpret_cast(vz); const GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); const GF256_M128* GF256_RESTRICT y16 = reinterpret_cast(vy); // Handle multiples of 64 bytes while (bytes >= 64) { GF256_M128 x0 = _mm_loadu_si128(x16); GF256_M128 x1 = _mm_loadu_si128(x16 + 1); GF256_M128 x2 = _mm_loadu_si128(x16 + 2); GF256_M128 x3 = _mm_loadu_si128(x16 + 3); GF256_M128 y0 = _mm_loadu_si128(y16); GF256_M128 y1 = _mm_loadu_si128(y16 + 1); GF256_M128 y2 = _mm_loadu_si128(y16 + 2); GF256_M128 y3 = _mm_loadu_si128(y16 + 3); _mm_storeu_si128(z16, _mm_xor_si128(x0, y0)); _mm_storeu_si128(z16 + 1, _mm_xor_si128(x1, y1)); _mm_storeu_si128(z16 + 2, _mm_xor_si128(x2, y2)); _mm_storeu_si128(z16 + 3, _mm_xor_si128(x3, y3)); x16 += 4; y16 += 4; z16 += 4; bytes -= 64; } // Handle multiples of 16 bytes while (bytes >= 16) { // z[i] = x[i] xor y[i] _mm_storeu_si128(z16, _mm_xor_si128( _mm_loadu_si128(x16), _mm_loadu_si128(y16))); x16++; y16++; z16++; bytes -= 16; } uint8_t* GF256_RESTRICT z1 = reinterpret_cast(z16); const uint8_t* GF256_RESTRICT x1 = reinterpret_cast(x16); const uint8_t* GF256_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t* GF256_RESTRICT z8 = reinterpret_cast(z1); const uint64_t* GF256_RESTRICT x8 = reinterpret_cast(x1); const uint64_t* GF256_RESTRICT y8 = reinterpret_cast(y1); *z8 = *x8 ^ *y8; x1 += 8; y1 += 8; z1 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t* GF256_RESTRICT z4 = reinterpret_cast(z1); const uint32_t* GF256_RESTRICT x4 = reinterpret_cast(x1); const uint32_t* GF256_RESTRICT y4 = reinterpret_cast(y1); *z4 = *x4 ^ *y4; x1 += 4; y1 += 4; z1 += 4; bytes -= 4; } // Handle final bytes for (int i = bytes; i > 0; i--) { z1[i - 1] = x1[i - 1] ^ y1[i - 1]; } } void gf256_memswap(void* GF256_RESTRICT vx, void* GF256_RESTRICT vy, int bytes) { GF256_M128* GF256_RESTRICT x16 = reinterpret_cast(vx); GF256_M128* GF256_RESTRICT y16 = reinterpret_cast(vy); // Handle blocks of 16 bytes while (bytes >= 16) { GF256_M128 x0 = _mm_loadu_si128(x16); GF256_M128 y0 = _mm_loadu_si128(y16); _mm_storeu_si128(x16, y0); _mm_storeu_si128(y16, x0); bytes -= 16; ++x16; ++y16; } uint8_t* GF256_RESTRICT x1 = reinterpret_cast(x16); uint8_t* GF256_RESTRICT y1 = reinterpret_cast(y16); // Handle a block of 8 bytes if (bytes >= 8) { uint64_t* GF256_RESTRICT x8 = reinterpret_cast(x1); uint64_t* GF256_RESTRICT y8 = reinterpret_cast(y1); uint64_t temp = *x8; *x8 = *y8; *y8 = temp; x1 += 8; y1 += 8; bytes -= 8; } // Handle a block of 4 bytes if (bytes >= 4) { uint32_t* GF256_RESTRICT x4 = reinterpret_cast(x1); uint32_t* GF256_RESTRICT y4 = reinterpret_cast(y1); uint32_t temp = *x4; *x4 = *y4; *y4 = temp; x1 += 4; y1 += 4; bytes -= 4; } // Handle final bytes uint8_t temp; for (int i = bytes; i > 0; i--) { temp = x1[i - 1]; x1[i - 1] = y1[i - 1]; y1[i - 1] = temp; } }