Viewing File: /home/ubuntu/efiexchange-node-base/node_modules/keccak/src/libkeccak-32/KeccakP-1600-inplace32BI.c

/*
Implementation by Ronny Van Keer, hereby denoted as "the implementer".

For more information, feedback or questions, please refer to our website:
https://keccak.team/

To the extent possible under law, the implementer has waived all copyright
and related or neighboring rights to the source code in this file.
http://creativecommons.org/publicdomain/zero/1.0/

---

This file implements Keccak-p[1600] in a SnP-compatible way.
Please refer to SnP-documentation.h for more details.

This implementation comes with KeccakP-1600-SnP.h in the same folder.
Please refer to LowLevel.build for the exact list of other files it must be combined with.
*/

#include    <string.h>
#include "brg_endian.h"
#include "KeccakP-1600-SnP.h"
#include "SnP-Relaned.h"

/* Byte and 32-bit word types used throughout this file. */
typedef unsigned char UINT8;
typedef unsigned int UINT32;
/* WARNING: on 8-bit and 16-bit platforms, this should be replaced by: */
/* typedef unsigned long       UINT32; */

/*
 * Rotate the 32-bit value `a` left by `offset` bits.
 * `a` is converted to UINT32 as a whole (the argument is parenthesised before
 * the cast, so compound expressions are safe to pass).
 * `offset` must be in the range 1..31: with 0 the right shift would be by 32
 * bits, which is undefined for a 32-bit operand.
 * `a` is expanded twice, so it must not have side effects.
 */
#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))

/*
 * Bit un-shuffle of two 32-bit words.
 * Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002.
 *
 * After expansion, temp0 holds `low` with its even-indexed bits packed into
 * bits 0..15 and its odd-indexed bits packed into bits 16..31; temp1 holds the
 * same rearrangement of `high`. `temp` is scratch.
 * `low` and `high` are each evaluated exactly once. The expansion ends with a
 * ';', so call sites do not need a trailing semicolon.
 */
#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \
        temp0 = (low); \
        temp = ((temp0 >>  1) ^ temp0) & 0x22222222UL; \
        temp0 ^= temp ^ (temp <<  1); \
        temp = ((temp0 >>  2) ^ temp0) & 0x0C0C0C0CUL; \
        temp0 ^= temp ^ (temp <<  2); \
        temp = ((temp0 >>  4) ^ temp0) & 0x00F000F0UL; \
        temp0 ^= temp ^ (temp <<  4); \
        temp = ((temp0 >>  8) ^ temp0) & 0x0000FF00UL; \
        temp0 ^= temp ^ (temp <<  8); \
        temp1 = (high); \
        temp = ((temp1 >>  1) ^ temp1) & 0x22222222UL; \
        temp1 ^= temp ^ (temp <<  1); \
        temp = ((temp1 >>  2) ^ temp1) & 0x0C0C0C0CUL; \
        temp1 ^= temp ^ (temp <<  2); \
        temp = ((temp1 >>  4) ^ temp1) & 0x00F000F0UL; \
        temp1 ^= temp ^ (temp <<  4); \
        temp = ((temp1 >>  8) ^ temp1) & 0x0000FF00UL; \
        temp1 ^= temp ^ (temp <<  8);

/*
 * Convert the lane (low, high) to its interleaved form and XOR it into
 * (even, odd): `even` receives the lower halves of temp0/temp1 as left by
 * prepareToBitInterleaving, `odd` receives their upper halves.
 * temp, temp0 and temp1 are scratch variables supplied by the caller.
 * The expansion ends with a ';'.
 */
#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \
        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
        even ^= (temp1 << 16) | (temp0 & 0x0000FFFF); \
        odd ^= (temp1 & 0xFFFF0000) | (temp0 >> 16);

/*
 * Convert the lane (low, high) to its interleaved form and AND it into
 * (even, odd): `even` is masked with the lower halves of temp0/temp1 as left
 * by prepareToBitInterleaving, `odd` with their upper halves.
 * temp, temp0 and temp1 are scratch variables supplied by the caller.
 * The expansion ends with a ';'.
 */
#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \
        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
        even &= (temp1 << 16) | (temp0 & 0x0000FFFF); \
        odd &= (temp1 & 0xFFFF0000) | (temp0 >> 16);

/*
 * Convert the lane (low, high) to its interleaved form and store it in
 * (even, odd), replacing their previous contents: `even` gets the lower
 * halves of temp0/temp1 as left by prepareToBitInterleaving, `odd` gets their
 * upper halves.
 * temp, temp0 and temp1 are scratch variables supplied by the caller.
 * The expansion ends with a ';'.
 */
#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \
        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
        even = (temp1 << 16) | (temp0 & 0x0000FFFF); \
        odd = (temp1 & 0xFFFF0000) | (temp0 >> 16);

/*
 * Bit shuffle turning an interleaved (even, odd) pair back into two ordinary
 * 32-bit words.
 * Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002.
 *
 * The lower halves of `even` and `odd` are first gathered into temp0 and the
 * upper halves into temp1; each word is then shuffled so that its lower 16
 * bits land on the even bit positions and its upper 16 bits on the odd ones.
 * On exit temp0 is the low word and temp1 the high word; `temp` is scratch.
 * `even` and `odd` are each evaluated exactly once. The expansion ends with
 * a ';', so call sites do not need a trailing semicolon.
 */
#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
        temp0 = (even); \
        temp1 = (odd); \
        temp = (temp1 << 16) | (temp0 & 0x0000FFFF); \
        temp1 = (temp1 & 0xFFFF0000) | (temp0 >> 16); \
        temp0 = temp; \
        temp = ((temp0 >>  8) ^ temp0) & 0x0000FF00UL; \
        temp0 ^= temp ^ (temp <<  8); \
        temp = ((temp0 >>  4) ^ temp0) & 0x00F000F0UL; \
        temp0 ^= temp ^ (temp <<  4); \
        temp = ((temp0 >>  2) ^ temp0) & 0x0C0C0C0CUL; \
        temp0 ^= temp ^ (temp <<  2); \
        temp = ((temp0 >>  1) ^ temp0) & 0x22222222UL; \
        temp0 ^= temp ^ (temp <<  1); \
        temp = ((temp1 >>  8) ^ temp1) & 0x0000FF00UL; \
        temp1 ^= temp ^ (temp <<  8); \
        temp = ((temp1 >>  4) ^ temp1) & 0x00F000F0UL; \
        temp1 ^= temp ^ (temp <<  4); \
        temp = ((temp1 >>  2) ^ temp1) & 0x0C0C0C0CUL; \
        temp1 ^= temp ^ (temp <<  2); \
        temp = ((temp1 >>  1) ^ temp1) & 0x22222222UL; \
        temp1 ^= temp ^ (temp <<  1);

/*
 * De-interleave (even, odd) and store the resulting words in `low` and `high`.
 * temp, temp0 and temp1 are scratch variables supplied by the caller.
 * `high` is assigned first here; both assignments are independent.
 * The expansion ends with a ';'.
 */
#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \
        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
        low = temp0; \
        high = temp1;

/*
 * De-interleave (even, odd), XOR the resulting low/high words with
 * (lowIn, highIn) and store the results in (lowOut, highOut).
 * The low word is read and written before the high word.
 * temp, temp0 and temp1 are scratch variables supplied by the caller.
 * The expansion ends with a ';'.
 */
#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \
        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
        lowOut = temp0 ^ lowIn; \
        highOut = temp1 ^ highIn;

/*
 * Clear `length` bytes, starting at byte `offset`, of lane `lanePosition` in
 * the bit-interleaved state; the other bytes of that lane are left unchanged.
 *
 * state        - state buffer, accessed as an array of UINT32 half-lanes
 *                (must be suitably aligned for UINT32).
 * lanePosition - index of the 8-byte lane to modify.
 * offset       - first byte within the lane to clear.
 * length       - number of bytes to clear; offset + length must not exceed 8.
 */
void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length)
{
    UINT8 laneAsBytes[8];
    UINT32 low, high;
    UINT32 temp, temp0, temp1;
    UINT32 *stateAsHalfLanes = (UINT32*)state;

    /* Byte mask for the lane: 0x00 over the bytes to clear, 0xFF elsewhere. */
    memset(laneAsBytes, 0xFF, 8);
    memset(laneAsBytes+offset, 0x00, length);
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    /* memcpy rather than a UINT32* cast: the byte array is neither guaranteed
       to be UINT32-aligned nor allowed to be read through a UINT32 lvalue. */
    memcpy(&low, laneAsBytes+0, 4);
    memcpy(&high, laneAsBytes+4, 4);
#else
    low = laneAsBytes[0]
        | ((UINT32)(laneAsBytes[1]) << 8)
        | ((UINT32)(laneAsBytes[2]) << 16)
        | ((UINT32)(laneAsBytes[3]) << 24);
    high = laneAsBytes[4]
        | ((UINT32)(laneAsBytes[5]) << 8)
        | ((UINT32)(laneAsBytes[6]) << 16)
        | ((UINT32)(laneAsBytes[7]) << 24);
#endif
    /* Interleave the mask and AND it into the lane's two half-lanes. */
    toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
}

/* ---------------------------------------------------------------- */

/*
 * Set the whole state to zero.
 * state - buffer of at least 200 bytes (1600 bits).
 */
void KeccakP1600_Initialize(void *state)
{
    enum { stateSizeInBytes = 1600 / 8 };

    memset(state, 0, stateSizeInBytes);
}

/* ---------------------------------------------------------------- */

void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
{
    unsigned int lanePosition = offset/8;
    unsigned int offsetInLane = offset%8;
    UINT32 low, high;
    UINT32 temp, temp0, temp1;
    UINT32 *stateAsHalfLanes = (UINT32*)state;

    if (offsetInLane < 4) {
        low = (UINT32)byte << (offsetInLane*8);
        high = 0;
    }
    else {
        low = 0;
        high = (UINT32)byte << ((offsetInLane-4)*8);
    }
    toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
}

/* ---------------------------------------------------------------- */

/*
 * XOR `length` bytes from `data` into lane `lanePosition` of the
 * bit-interleaved state, starting at byte `offset` within the lane.
 *
 * state        - state buffer, accessed as an array of UINT32 half-lanes
 *                (must be suitably aligned for UINT32).
 * lanePosition - index of the 8-byte lane to modify.
 * data         - source bytes.
 * offset       - first byte within the lane to modify.
 * length       - number of bytes; offset + length must not exceed 8.
 */
void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
{
    UINT8 laneAsBytes[8];
    UINT32 low, high;
    UINT32 temp, temp0, temp1;
    UINT32 *stateAsHalfLanes = (UINT32*)state;

    /* Zero-padded copy of the lane: XORing zero leaves the other bytes as is. */
    memset(laneAsBytes, 0, 8);
    memcpy(laneAsBytes+offset, data, length);
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    /* memcpy rather than a UINT32* cast: the byte array is neither guaranteed
       to be UINT32-aligned nor allowed to be read through a UINT32 lvalue. */
    memcpy(&low, laneAsBytes+0, 4);
    memcpy(&high, laneAsBytes+4, 4);
#else
    low = laneAsBytes[0]
        | ((UINT32)(laneAsBytes[1]) << 8)
        | ((UINT32)(laneAsBytes[2]) << 16)
        | ((UINT32)(laneAsBytes[3]) << 24);
    high = laneAsBytes[4]
        | ((UINT32)(laneAsBytes[5]) << 8)
        | ((UINT32)(laneAsBytes[6]) << 16)
        | ((UINT32)(laneAsBytes[7]) << 24);
#endif
    toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
}

/* ---------------------------------------------------------------- */

void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    const UINT32 * pI = (const UINT32 *)data;
    UINT32 * pS = (UINT32*)state;
    UINT32 t, x0, x1;
    int i;
    for (i = laneCount-1; i >= 0; --i) {
#ifdef NO_MISALIGNED_ACCESSES
        UINT32 low;
        UINT32 high;
        memcpy(&low, pI++, 4);
        memcpy(&high, pI++, 4);
        toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1);
#else
        toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
#endif
    }
#else
    unsigned int lanePosition;
    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
        UINT8 laneAsBytes[8];
        memcpy(laneAsBytes, data+lanePosition*8, 8);
        UINT32 low = laneAsBytes[0]
            | ((UINT32)(laneAsBytes[1]) << 8)
            | ((UINT32)(laneAsBytes[2]) << 16)
            | ((UINT32)(laneAsBytes[3]) << 24);
        UINT32 high = laneAsBytes[4]
            | ((UINT32)(laneAsBytes[5]) << 8)
            | ((UINT32)(laneAsBytes[6]) << 16)
            | ((UINT32)(laneAsBytes[7]) << 24);
        UINT32 even, odd, temp, temp0, temp1;
        UINT32 *stateAsHalfLanes = (UINT32*)state;
        toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
    }
#endif
}

/* ---------------------------------------------------------------- */

/*
 * Entry point for adding `length` bytes of `data` at byte `offset` of the state.
 * The work is delegated to SnP_AddBytes, which is not defined in this file
 * (presumably a helper macro from SnP-Relaned.h - confirm). It is handed the
 * whole-lane routine KeccakP1600_AddLanes, the partial-lane routine
 * KeccakP1600_AddBytesInLane, and 8 (presumably the lane size in bytes).
 */
void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
{
    SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
}

/* ---------------------------------------------------------------- */

/*
 * Replace `length` bytes of lane `lanePosition`, starting at byte `offset`
 * within the lane, with the bytes from `data`.
 * Done in two steps: the target bytes are first cleared, then `data` is XORed
 * into the now-zero bytes, so the result equals `data`.
 */
void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
{
    KeccakP1600_SetBytesInLaneToZero(state, lanePosition, offset, length);
    KeccakP1600_AddBytesInLane(state, lanePosition, data, offset, length);
}

/* ---------------------------------------------------------------- */

void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    const UINT32 * pI = (const UINT32 *)data;
    UINT32 * pS = (UINT32 *)state;
    UINT32 t, x0, x1;
    int i;
    for (i = laneCount-1; i >= 0; --i) {
#ifdef NO_MISALIGNED_ACCESSES
        UINT32 low;
        UINT32 high;
        memcpy(&low, pI++, 4);
        memcpy(&high, pI++, 4);
        toBitInterleavingAndSet(low, high, *(pS++), *(pS++), t, x0, x1);
#else
        toBitInterleavingAndSet(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1)
#endif
    }
#else
    unsigned int lanePosition;
    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
        UINT8 laneAsBytes[8];
        memcpy(laneAsBytes, data+lanePosition*8, 8);
        UINT32 low = laneAsBytes[0]
            | ((UINT32)(laneAsBytes[1]) << 8)
            | ((UINT32)(laneAsBytes[2]) << 16)
            | ((UINT32)(laneAsBytes[3]) << 24);
        UINT32 high = laneAsBytes[4]
            | ((UINT32)(laneAsBytes[5]) << 8)
            | ((UINT32)(laneAsBytes[6]) << 16)
            | ((UINT32)(laneAsBytes[7]) << 24);
        UINT32 even, odd, temp, temp0, temp1;
        UINT32 *stateAsHalfLanes = (UINT32*)state;
        toBitInterleavingAndSet(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
    }
#endif
}

/* ---------------------------------------------------------------- */

/*
 * Entry point for overwriting `length` bytes of the state, starting at byte
 * `offset`, with `data`.
 * The work is delegated to SnP_OverwriteBytes, which is not defined in this
 * file (presumably a helper macro from SnP-Relaned.h - confirm). It is handed
 * the whole-lane routine KeccakP1600_OverwriteLanes, the partial-lane routine
 * KeccakP1600_OverwriteBytesInLane, and 8 (presumably the lane size in bytes).
 */
void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
{
    SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
}

/* ---------------------------------------------------------------- */

void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
{
    UINT32 *stateAsHalfLanes = (UINT32*)state;
    unsigned int i;

    for(i=0; i<byteCount/8; i++) {
        stateAsHalfLanes[i*2+0] = 0;
        stateAsHalfLanes[i*2+1] = 0;
    }
    if (byteCount%8 != 0)
        KeccakP1600_SetBytesInLaneToZero(state, byteCount/8, 0, byteCount%8);
}

/* ---------------------------------------------------------------- */

/*
 * Copy `length` bytes of lane `lanePosition`, starting at byte `offset`
 * within the lane, from the bit-interleaved state into `data`.
 *
 * state        - state buffer, read as an array of UINT32 half-lanes
 *                (must be suitably aligned for UINT32); not modified.
 * lanePosition - index of the 8-byte lane to read.
 * data         - destination for the extracted bytes.
 * offset       - first byte within the lane to extract.
 * length       - number of bytes; offset + length must not exceed 8.
 */
void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
{
    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
    UINT32 low, high, temp, temp0, temp1;
    UINT8 laneAsBytes[8];

    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    /* memcpy rather than a UINT32* cast: the byte array is neither guaranteed
       to be UINT32-aligned nor allowed to be written through a UINT32 lvalue. */
    memcpy(laneAsBytes+0, &low, 4);
    memcpy(laneAsBytes+4, &high, 4);
#else
    laneAsBytes[0] = low & 0xFF;
    laneAsBytes[1] = (low >> 8) & 0xFF;
    laneAsBytes[2] = (low >> 16) & 0xFF;
    laneAsBytes[3] = (low >> 24) & 0xFF;
    laneAsBytes[4] = high & 0xFF;
    laneAsBytes[5] = (high >> 8) & 0xFF;
    laneAsBytes[6] = (high >> 16) & 0xFF;
    laneAsBytes[7] = (high >> 24) & 0xFF;
#endif
    memcpy(data, laneAsBytes+offset, length);
}

/* ---------------------------------------------------------------- */

/*
 * Copy the first `laneCount` lanes of the bit-interleaved state into `data`,
 * each lane as 8 little-endian bytes.
 *
 * state     - state buffer, read as an array of UINT32 half-lanes
 *             (must be suitably aligned for UINT32); not modified.
 * data      - destination for laneCount*8 bytes.
 * laneCount - number of lanes to extract; 0 is a no-op.
 *
 * On little-endian builds without NO_MISALIGNED_ACCESSES, `data` is written
 * directly as 32-bit words; with NO_MISALIGNED_ACCESSES the words are stored
 * with memcpy through the byte pointer so no misaligned UINT32 pointer is formed.
 */
void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
{
    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
    UINT32 temp, temp0, temp1;
    unsigned int lanePosition;

    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
        unsigned char *laneAsBytes = data+lanePosition*8;
        UINT32 low, high;

        /* Plain indexed operands: the macro never sees an argument with side effects. */
        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#ifdef NO_MISALIGNED_ACCESSES
        memcpy(laneAsBytes+0, &low, 4);
        memcpy(laneAsBytes+4, &high, 4);
#else
        ((UINT32 *)laneAsBytes)[0] = low;
        ((UINT32 *)laneAsBytes)[1] = high;
#endif
#else
        laneAsBytes[0] = (unsigned char)(low & 0xFF);
        laneAsBytes[1] = (unsigned char)((low >> 8) & 0xFF);
        laneAsBytes[2] = (unsigned char)((low >> 16) & 0xFF);
        laneAsBytes[3] = (unsigned char)((low >> 24) & 0xFF);
        laneAsBytes[4] = (unsigned char)(high & 0xFF);
        laneAsBytes[5] = (unsigned char)((high >> 8) & 0xFF);
        laneAsBytes[6] = (unsigned char)((high >> 16) & 0xFF);
        laneAsBytes[7] = (unsigned char)((high >> 24) & 0xFF);
#endif
    }
}

/* ---------------------------------------------------------------- */

/*
 * Entry point for extracting `length` bytes of the state, starting at byte
 * `offset`, into `data`.
 * The work is delegated to SnP_ExtractBytes, which is not defined in this
 * file (presumably a helper macro from SnP-Relaned.h - confirm). It is handed
 * the whole-lane routine KeccakP1600_ExtractLanes, the partial-lane routine
 * KeccakP1600_ExtractBytesInLane, and 8 (presumably the lane size in bytes).
 */
void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
{
    SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
}

/* ---------------------------------------------------------------- */

/*
 * XOR `length` bytes of lane `lanePosition` (starting at byte `offset` within
 * the lane) with `input` and write the result to `output`:
 * output[i] = input[i] ^ laneByte[offset+i].
 *
 * state        - state buffer, read as an array of UINT32 half-lanes
 *                (must be suitably aligned for UINT32); not modified.
 * lanePosition - index of the 8-byte lane to read.
 * input        - bytes to combine with the state bytes.
 * output       - destination for the XORed bytes.
 * offset       - first byte within the lane to use.
 * length       - number of bytes; offset + length must not exceed 8.
 */
void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
    UINT32 low, high, temp, temp0, temp1;
    UINT8 laneAsBytes[8];
    unsigned int i;

    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    /* memcpy rather than a UINT32* cast: the byte array is neither guaranteed
       to be UINT32-aligned nor allowed to be written through a UINT32 lvalue. */
    memcpy(laneAsBytes+0, &low, 4);
    memcpy(laneAsBytes+4, &high, 4);
#else
    laneAsBytes[0] = low & 0xFF;
    laneAsBytes[1] = (low >> 8) & 0xFF;
    laneAsBytes[2] = (low >> 16) & 0xFF;
    laneAsBytes[3] = (low >> 24) & 0xFF;
    laneAsBytes[4] = high & 0xFF;
    laneAsBytes[5] = (high >> 8) & 0xFF;
    laneAsBytes[6] = (high >> 16) & 0xFF;
    laneAsBytes[7] = (high >> 24) & 0xFF;
#endif
    for(i=0; i<length; i++)
        output[i] = input[i] ^ laneAsBytes[offset+i];
}

/* ---------------------------------------------------------------- */

void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
{
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
    const UINT32 * pI = (const UINT32 *)input;
    UINT32 * pO = (UINT32 *)output;
    const UINT32 * pS = (const UINT32 *)state;
    UINT32 t, x0, x1;
    int i;
    for (i = laneCount-1; i >= 0; --i) {
#ifdef NO_MISALIGNED_ACCESSES
        UINT32 low;
        UINT32 high;
        fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1);
        *(pO++) = *(pI++) ^ low;
        *(pO++) = *(pI++) ^ high;
#else
        fromBitInterleavingAndXOR(*(pS++), *(pS++), *(pI++), *(pI++), *(pO++), *(pO++), t, x0, x1)
#endif
    }
#else
    unsigned int lanePosition;
    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
        UINT32 *stateAsHalfLanes = (UINT32*)state;
        UINT32 low, high, temp, temp0, temp1;
        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
        UINT8 laneAsBytes[8];
        laneAsBytes[0] = low & 0xFF;
        laneAsBytes[1] = (low >> 8) & 0xFF;
        laneAsBytes[2] = (low >> 16) & 0xFF;
        laneAsBytes[3] = (low >> 24) & 0xFF;
        laneAsBytes[4] = high & 0xFF;
        laneAsBytes[5] = (high >> 8) & 0xFF;
        laneAsBytes[6] = (high >> 16) & 0xFF;
        laneAsBytes[7] = (high >> 24) & 0xFF;
        ((UINT32*)(output+lanePosition*8))[0] = ((UINT32*)(input+lanePosition*8))[0] ^ (*(const UINT32*)(laneAsBytes+0));
        ((UINT32*)(output+lanePosition*8))[1] = ((UINT32*)(input+lanePosition*8))[0] ^ (*(const UINT32*)(laneAsBytes+4));
    }
#endif
}
/* ---------------------------------------------------------------- */

/*
 * Entry point for XORing `length` bytes of the state, starting at byte
 * `offset`, with `input` and writing the result to `output`.
 * The work is delegated to SnP_ExtractAndAddBytes, which is not defined in
 * this file (presumably a helper macro from SnP-Relaned.h - confirm). It is
 * handed the whole-lane routine KeccakP1600_ExtractAndAddLanes, the
 * partial-lane routine KeccakP1600_ExtractAndAddBytesInLane, and 8
 * (presumably the lane size in bytes).
 */
void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
}

/* ---------------------------------------------------------------- */

/*
 * Table of 32-bit words: 24 pairs (one pair per row below) followed by a
 * single extra word 0x000000FF, 2*24+1 entries in total.
 * The KeccakRoundN() macros in this file consume two consecutive words per
 * round through `pRoundConstants`, XORing the first into Aba0 and the second
 * into Aba1.
 * NOTE(review): presumably `pRoundConstants` walks this table and the final
 * 0xFF word serves as an end marker - confirm against the permutation
 * function, which is outside this view.
 */
static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =
{
    0x00000001UL,    0x00000000UL,
    0x00000000UL,    0x00000089UL,
    0x00000000UL,    0x8000008bUL,
    0x00000000UL,    0x80008080UL,
    0x00000001UL,    0x0000008bUL,
    0x00000001UL,    0x00008000UL,
    0x00000001UL,    0x80008088UL,
    0x00000001UL,    0x80000082UL,
    0x00000000UL,    0x0000000bUL,
    0x00000000UL,    0x0000000aUL,
    0x00000001UL,    0x00008082UL,
    0x00000000UL,    0x00008003UL,
    0x00000001UL,    0x0000808bUL,
    0x00000001UL,    0x8000000bUL,
    0x00000001UL,    0x8000008aUL,
    0x00000001UL,    0x80000081UL,
    0x00000000UL,    0x80000081UL,
    0x00000000UL,    0x80000008UL,
    0x00000000UL,    0x00000083UL,
    0x00000000UL,    0x80008003UL,
    0x00000001UL,    0x80008088UL,
    0x00000000UL,    0x80000088UL,
    0x00000001UL,    0x00008000UL,
    0x00000000UL,    0x80008082UL,
    0x000000FFUL
};

/*
 * One round, operating in place on the 50 half-lane variables Aba0..Asu1.
 * The expanding code must declare those variables as well as the temporaries
 * Ba..Bu, Cw..Cz, Da0..Du1 and the pointer pRoundConstants.
 *
 * First part (up to the empty continuation line): five-way XORs of half-lanes
 * are accumulated in Cx/Cy/Cz/Cw and combined, using ROL32 by 1 on one of the
 * halves, into Da0..Du1.
 * Second part: ten groups of five half-lanes. Each group XORs its inputs with
 * the matching D value, rotates them with ROL32 (two inputs in the whole macro
 * are used unrotated), and writes X ^ ((~Y) & Z) combinations back into the
 * same five variables it read. In the first two groups one word is read from
 * *pRoundConstants (post-incremented) and XORed into Aba0, respectively Aba1.
 * NOTE(review): this matches the theta / rho-pi / chi / iota structure of a
 * Keccak-p round on bit-interleaved lanes - confirm against the specification.
 *
 * Unlike KeccakRound1() and KeccakRound2(), this expansion does NOT end with
 * a ';', so the call site has to supply one.
 */
#define KeccakRound0() \
        Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
        Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
        Da0 = Cx^ROL32(Du1, 1); \
        Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
        Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
        Da1 = Cz^Du0; \
        Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
        Do0 = Cw^ROL32(Cz, 1); \
        Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
        Do1 = Cy^Cx; \
        Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
        De0 = Cx^ROL32(Cy, 1); \
        Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
        De1 = Cz^Cw; \
        Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
        Di0 = Du0^ROL32(Cy, 1); \
        Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
        Di1 = Du1^Cw; \
        Du0 = Cw^ROL32(Cz, 1); \
        Du1 = Cy^Cx; \
\
        Ba = (Aba0^Da0); \
        Be = ROL32((Age0^De0), 22); \
        Bi = ROL32((Aki1^Di1), 22); \
        Bo = ROL32((Amo1^Do1), 11); \
        Bu = ROL32((Asu0^Du0),  7); \
        Aba0 =   Ba ^((~Be)&  Bi ); \
        Aba0 ^= *(pRoundConstants++); \
        Age0 =   Be ^((~Bi)&  Bo ); \
        Aki1 =   Bi ^((~Bo)&  Bu ); \
        Amo1 =   Bo ^((~Bu)&  Ba ); \
        Asu0 =   Bu ^((~Ba)&  Be ); \
        Ba = (Aba1^Da1); \
        Be = ROL32((Age1^De1), 22); \
        Bi = ROL32((Aki0^Di0), 21); \
        Bo = ROL32((Amo0^Do0), 10); \
        Bu = ROL32((Asu1^Du1),  7); \
        Aba1 =   Ba ^((~Be)&  Bi ); \
        Aba1 ^= *(pRoundConstants++); \
        Age1 =   Be ^((~Bi)&  Bo ); \
        Aki0 =   Bi ^((~Bo)&  Bu ); \
        Amo0 =   Bo ^((~Bu)&  Ba ); \
        Asu1 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Aka1^Da1),  2); \
        Bo = ROL32((Ame1^De1), 23); \
        Bu = ROL32((Asi1^Di1), 31); \
        Ba = ROL32((Abo0^Do0), 14); \
        Be = ROL32((Agu0^Du0), 10); \
        Aka1 =   Ba ^((~Be)&  Bi ); \
        Ame1 =   Be ^((~Bi)&  Bo ); \
        Asi1 =   Bi ^((~Bo)&  Bu ); \
        Abo0 =   Bo ^((~Bu)&  Ba ); \
        Agu0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Aka0^Da0),  1); \
        Bo = ROL32((Ame0^De0), 22); \
        Bu = ROL32((Asi0^Di0), 30); \
        Ba = ROL32((Abo1^Do1), 14); \
        Be = ROL32((Agu1^Du1), 10); \
        Aka0 =   Ba ^((~Be)&  Bi ); \
        Ame0 =   Be ^((~Bi)&  Bo ); \
        Asi0 =   Bi ^((~Bo)&  Bu ); \
        Abo1 =   Bo ^((~Bu)&  Ba ); \
        Agu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Asa0^Da0),  9); \
        Ba = ROL32((Abe1^De1),  1); \
        Be = ROL32((Agi0^Di0),  3); \
        Bi = ROL32((Ako1^Do1), 13); \
        Bo = ROL32((Amu0^Du0),  4); \
        Asa0 =   Ba ^((~Be)&  Bi ); \
        Abe1 =   Be ^((~Bi)&  Bo ); \
        Agi0 =   Bi ^((~Bo)&  Bu ); \
        Ako1 =   Bo ^((~Bu)&  Ba ); \
        Amu0 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Asa1^Da1),  9); \
        Ba = (Abe0^De0); \
        Be = ROL32((Agi1^Di1),  3); \
        Bi = ROL32((Ako0^Do0), 12); \
        Bo = ROL32((Amu1^Du1),  4); \
        Asa1 =   Ba ^((~Be)&  Bi ); \
        Abe0 =   Be ^((~Bi)&  Bo ); \
        Agi1 =   Bi ^((~Bo)&  Bu ); \
        Ako0 =   Bo ^((~Bu)&  Ba ); \
        Amu1 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Aga0^Da0), 18); \
        Bi = ROL32((Ake0^De0),  5); \
        Bo = ROL32((Ami1^Di1),  8); \
        Bu = ROL32((Aso0^Do0), 28); \
        Ba = ROL32((Abu1^Du1), 14); \
        Aga0 =   Ba ^((~Be)&  Bi ); \
        Ake0 =   Be ^((~Bi)&  Bo ); \
        Ami1 =   Bi ^((~Bo)&  Bu ); \
        Aso0 =   Bo ^((~Bu)&  Ba ); \
        Abu1 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Aga1^Da1), 18); \
        Bi = ROL32((Ake1^De1),  5); \
        Bo = ROL32((Ami0^Di0),  7); \
        Bu = ROL32((Aso1^Do1), 28); \
        Ba = ROL32((Abu0^Du0), 13); \
        Aga1 =   Ba ^((~Be)&  Bi ); \
        Ake1 =   Be ^((~Bi)&  Bo ); \
        Ami0 =   Bi ^((~Bo)&  Bu ); \
        Aso1 =   Bo ^((~Bu)&  Ba ); \
        Abu0 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Ama1^Da1), 21); \
        Bu = ROL32((Ase0^De0),  1); \
        Ba = ROL32((Abi0^Di0), 31); \
        Be = ROL32((Ago1^Do1), 28); \
        Bi = ROL32((Aku1^Du1), 20); \
        Ama1 =   Ba ^((~Be)&  Bi ); \
        Ase0 =   Be ^((~Bi)&  Bo ); \
        Abi0 =   Bi ^((~Bo)&  Bu ); \
        Ago1 =   Bo ^((~Bu)&  Ba ); \
        Aku1 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Ama0^Da0), 20); \
        Bu = ROL32((Ase1^De1),  1); \
        Ba = ROL32((Abi1^Di1), 31); \
        Be = ROL32((Ago0^Do0), 27); \
        Bi = ROL32((Aku0^Du0), 19); \
        Ama0 =   Ba ^((~Be)&  Bi ); \
        Ase1 =   Be ^((~Bi)&  Bo ); \
        Abi1 =   Bi ^((~Bo)&  Bu ); \
        Ago0 =   Bo ^((~Bu)&  Ba ); \
        Aku0 =   Bu ^((~Ba)&  Be )

/*
 * One round, operating in place on the 50 half-lane variables Aba0..Asu1.
 * The expanding code must declare those variables as well as the temporaries
 * Ba..Bu, Cw..Cz, Da0..Du1 and the pointer pRoundConstants.
 *
 * Same statement structure as KeccakRound0(), but the state variables appear
 * in a different arrangement: the five-way XORs and the groups of five pick
 * different variable names.
 * NOTE(review): presumably this accounts for where the previous in-place
 * round left each half-lane - confirm against the caller, which is outside
 * this view.
 *
 * First part (up to the empty continuation line): five-way XORs are combined,
 * with ROL32 by 1, into Da0..Du1. Second part: ten groups of five half-lanes
 * are XORed with a D value, rotated with ROL32 and written back as
 * X ^ ((~Y) & Z) into the same five variables; two words are read from
 * *pRoundConstants (post-incremented) and XORed into Aba0 and Aba1.
 *
 * This expansion ends with a ';'.
 */
#define KeccakRound1() \
        Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \
        Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \
        Da0 = Cx^ROL32(Du1, 1); \
        Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \
        Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \
        Da1 = Cz^Du0; \
        Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \
        Do0 = Cw^ROL32(Cz, 1); \
        Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \
        Do1 = Cy^Cx; \
        Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \
        De0 = Cx^ROL32(Cy, 1); \
        Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \
        De1 = Cz^Cw; \
        Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \
        Di0 = Du0^ROL32(Cy, 1); \
        Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \
        Di1 = Du1^Cw; \
        Du0 = Cw^ROL32(Cz, 1); \
        Du1 = Cy^Cx; \
\
        Ba = (Aba0^Da0); \
        Be = ROL32((Ame1^De0), 22); \
        Bi = ROL32((Agi1^Di1), 22); \
        Bo = ROL32((Aso1^Do1), 11); \
        Bu = ROL32((Aku1^Du0),  7); \
        Aba0 =   Ba ^((~Be)&  Bi ); \
        Aba0 ^= *(pRoundConstants++); \
        Ame1 =   Be ^((~Bi)&  Bo ); \
        Agi1 =   Bi ^((~Bo)&  Bu ); \
        Aso1 =   Bo ^((~Bu)&  Ba ); \
        Aku1 =   Bu ^((~Ba)&  Be ); \
        Ba = (Aba1^Da1); \
        Be = ROL32((Ame0^De1), 22); \
        Bi = ROL32((Agi0^Di0), 21); \
        Bo = ROL32((Aso0^Do0), 10); \
        Bu = ROL32((Aku0^Du1),  7); \
        Aba1 =   Ba ^((~Be)&  Bi ); \
        Aba1 ^= *(pRoundConstants++); \
        Ame0 =   Be ^((~Bi)&  Bo ); \
        Agi0 =   Bi ^((~Bo)&  Bu ); \
        Aso0 =   Bo ^((~Bu)&  Ba ); \
        Aku0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Asa1^Da1),  2); \
        Bo = ROL32((Ake1^De1), 23); \
        Bu = ROL32((Abi1^Di1), 31); \
        Ba = ROL32((Amo1^Do0), 14); \
        Be = ROL32((Agu0^Du0), 10); \
        Asa1 =   Ba ^((~Be)&  Bi ); \
        Ake1 =   Be ^((~Bi)&  Bo ); \
        Abi1 =   Bi ^((~Bo)&  Bu ); \
        Amo1 =   Bo ^((~Bu)&  Ba ); \
        Agu0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Asa0^Da0),  1); \
        Bo = ROL32((Ake0^De0), 22); \
        Bu = ROL32((Abi0^Di0), 30); \
        Ba = ROL32((Amo0^Do1), 14); \
        Be = ROL32((Agu1^Du1), 10); \
        Asa0 =   Ba ^((~Be)&  Bi ); \
        Ake0 =   Be ^((~Bi)&  Bo ); \
        Abi0 =   Bi ^((~Bo)&  Bu ); \
        Amo0 =   Bo ^((~Bu)&  Ba ); \
        Agu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Ama1^Da0),  9); \
        Ba = ROL32((Age1^De1),  1); \
        Be = ROL32((Asi1^Di0),  3); \
        Bi = ROL32((Ako0^Do1), 13); \
        Bo = ROL32((Abu1^Du0),  4); \
        Ama1 =   Ba ^((~Be)&  Bi ); \
        Age1 =   Be ^((~Bi)&  Bo ); \
        Asi1 =   Bi ^((~Bo)&  Bu ); \
        Ako0 =   Bo ^((~Bu)&  Ba ); \
        Abu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Ama0^Da1),  9); \
        Ba = (Age0^De0); \
        Be = ROL32((Asi0^Di1),  3); \
        Bi = ROL32((Ako1^Do0), 12); \
        Bo = ROL32((Abu0^Du1),  4); \
        Ama0 =   Ba ^((~Be)&  Bi ); \
        Age0 =   Be ^((~Bi)&  Bo ); \
        Asi0 =   Bi ^((~Bo)&  Bu ); \
        Ako1 =   Bo ^((~Bu)&  Ba ); \
        Abu0 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Aka1^Da0), 18); \
        Bi = ROL32((Abe1^De0),  5); \
        Bo = ROL32((Ami0^Di1),  8); \
        Bu = ROL32((Ago1^Do0), 28); \
        Ba = ROL32((Asu1^Du1), 14); \
        Aka1 =   Ba ^((~Be)&  Bi ); \
        Abe1 =   Be ^((~Bi)&  Bo ); \
        Ami0 =   Bi ^((~Bo)&  Bu ); \
        Ago1 =   Bo ^((~Bu)&  Ba ); \
        Asu1 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Aka0^Da1), 18); \
        Bi = ROL32((Abe0^De1),  5); \
        Bo = ROL32((Ami1^Di0),  7); \
        Bu = ROL32((Ago0^Do1), 28); \
        Ba = ROL32((Asu0^Du0), 13); \
        Aka0 =   Ba ^((~Be)&  Bi ); \
        Abe0 =   Be ^((~Bi)&  Bo ); \
        Ami1 =   Bi ^((~Bo)&  Bu ); \
        Ago0 =   Bo ^((~Bu)&  Ba ); \
        Asu0 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Aga1^Da1), 21); \
        Bu = ROL32((Ase0^De0),  1); \
        Ba = ROL32((Aki1^Di0), 31); \
        Be = ROL32((Abo1^Do1), 28); \
        Bi = ROL32((Amu1^Du1), 20); \
        Aga1 =   Ba ^((~Be)&  Bi ); \
        Ase0 =   Be ^((~Bi)&  Bo ); \
        Aki1 =   Bi ^((~Bo)&  Bu ); \
        Abo1 =   Bo ^((~Bu)&  Ba ); \
        Amu1 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Aga0^Da0), 20); \
        Bu = ROL32((Ase1^De1),  1); \
        Ba = ROL32((Aki0^Di1), 31); \
        Be = ROL32((Abo0^Do0), 27); \
        Bi = ROL32((Amu0^Du0), 19); \
        Aga0 =   Ba ^((~Be)&  Bi ); \
        Ase1 =   Be ^((~Bi)&  Bo ); \
        Aki0 =   Bi ^((~Bo)&  Bu ); \
        Abo0 =   Bo ^((~Bu)&  Ba ); \
        Amu0 =   Bu ^((~Ba)&  Be );

/*
 * One round, operating in place on the 50 half-lane variables Aba0..Asu1.
 * The expanding code must declare those variables as well as the temporaries
 * Ba..Bu, Cw..Cz, Da0..Du1 and the pointer pRoundConstants.
 *
 * Same statement structure as KeccakRound0() and KeccakRound1(), with yet
 * another arrangement of the state variable names in the five-way XORs and
 * in the groups of five.
 * NOTE(review): presumably this accounts for where the previous in-place
 * round left each half-lane - confirm against the caller, which is outside
 * this view.
 *
 * First part (up to the empty continuation line): five-way XORs are combined,
 * with ROL32 by 1, into Da0..Du1. Second part: ten groups of five half-lanes
 * are XORed with a D value, rotated with ROL32 and written back as
 * X ^ ((~Y) & Z) into the same five variables; two words are read from
 * *pRoundConstants (post-incremented) and XORed into Aba0 and Aba1.
 *
 * This expansion ends with a ';'.
 */
#define KeccakRound2() \
        Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \
        Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \
        Da0 = Cx^ROL32(Du1, 1); \
        Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \
        Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \
        Da1 = Cz^Du0; \
        Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \
        Do0 = Cw^ROL32(Cz, 1); \
        Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \
        Do1 = Cy^Cx; \
        Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \
        De0 = Cx^ROL32(Cy, 1); \
        Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \
        De1 = Cz^Cw; \
        Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \
        Di0 = Du0^ROL32(Cy, 1); \
        Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \
        Di1 = Du1^Cw; \
        Du0 = Cw^ROL32(Cz, 1); \
        Du1 = Cy^Cx; \
\
        Ba = (Aba0^Da0); \
        Be = ROL32((Ake1^De0), 22); \
        Bi = ROL32((Asi0^Di1), 22); \
        Bo = ROL32((Ago0^Do1), 11); \
        Bu = ROL32((Amu1^Du0),  7); \
        Aba0 =   Ba ^((~Be)&  Bi ); \
        Aba0 ^= *(pRoundConstants++); \
        Ake1 =   Be ^((~Bi)&  Bo ); \
        Asi0 =   Bi ^((~Bo)&  Bu ); \
        Ago0 =   Bo ^((~Bu)&  Ba ); \
        Amu1 =   Bu ^((~Ba)&  Be ); \
        Ba = (Aba1^Da1); \
        Be = ROL32((Ake0^De1), 22); \
        Bi = ROL32((Asi1^Di0), 21); \
        Bo = ROL32((Ago1^Do0), 10); \
        Bu = ROL32((Amu0^Du1),  7); \
        Aba1 =   Ba ^((~Be)&  Bi ); \
        Aba1 ^= *(pRoundConstants++); \
        Ake0 =   Be ^((~Bi)&  Bo ); \
        Asi1 =   Bi ^((~Bo)&  Bu ); \
        Ago1 =   Bo ^((~Bu)&  Ba ); \
        Amu0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Ama0^Da1),  2); \
        Bo = ROL32((Abe0^De1), 23); \
        Bu = ROL32((Aki0^Di1), 31); \
        Ba = ROL32((Aso1^Do0), 14); \
        Be = ROL32((Agu0^Du0), 10); \
        Ama0 =   Ba ^((~Be)&  Bi ); \
        Abe0 =   Be ^((~Bi)&  Bo ); \
        Aki0 =   Bi ^((~Bo)&  Bu ); \
        Aso1 =   Bo ^((~Bu)&  Ba ); \
        Agu0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Ama1^Da0),  1); \
        Bo = ROL32((Abe1^De0), 22); \
        Bu = ROL32((Aki1^Di0), 30); \
        Ba = ROL32((Aso0^Do1), 14); \
        Be = ROL32((Agu1^Du1), 10); \
        Ama1 =   Ba ^((~Be)&  Bi ); \
        Abe1 =   Be ^((~Bi)&  Bo ); \
        Aki1 =   Bi ^((~Bo)&  Bu ); \
        Aso0 =   Bo ^((~Bu)&  Ba ); \
        Agu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Aga1^Da0),  9); \
        Ba = ROL32((Ame0^De1),  1); \
        Be = ROL32((Abi1^Di0),  3); \
        Bi = ROL32((Ako1^Do1), 13); \
        Bo = ROL32((Asu1^Du0),  4); \
        Aga1 =   Ba ^((~Be)&  Bi ); \
        Ame0 =   Be ^((~Bi)&  Bo ); \
        Abi1 =   Bi ^((~Bo)&  Bu ); \
        Ako1 =   Bo ^((~Bu)&  Ba ); \
        Asu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Aga0^Da1),  9); \
        Ba = (Ame1^De0); \
        Be = ROL32((Abi0^Di1),  3); \
        Bi = ROL32((Ako0^Do0), 12); \
        Bo = ROL32((Asu0^Du1),  4); \
        Aga0 =   Ba ^((~Be)&  Bi ); \
        Ame1 =   Be ^((~Bi)&  Bo ); \
        Abi0 =   Bi ^((~Bo)&  Bu ); \
        Ako0 =   Bo ^((~Bu)&  Ba ); \
        Asu0 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Asa1^Da0), 18); \
        Bi = ROL32((Age1^De0),  5); \
        Bo = ROL32((Ami1^Di1),  8); \
        Bu = ROL32((Abo1^Do0), 28); \
        Ba = ROL32((Aku0^Du1), 14); \
        Asa1 =   Ba ^((~Be)&  Bi ); \
        Age1 =   Be ^((~Bi)&  Bo ); \
        Ami1 =   Bi ^((~Bo)&  Bu ); \
        Abo1 =   Bo ^((~Bu)&  Ba ); \
        Aku0 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Asa0^Da1), 18); \
        Bi = ROL32((Age0^De1),  5); \
        Bo = ROL32((Ami0^Di0),  7); \
        Bu = ROL32((Abo0^Do1), 28); \
        Ba = ROL32((Aku1^Du0), 13); \
        Asa0 =   Ba ^((~Be)&  Bi ); \
        Age0 =   Be ^((~Bi)&  Bo ); \
        Ami0 =   Bi ^((~Bo)&  Bu ); \
        Abo0 =   Bo ^((~Bu)&  Ba ); \
        Aku1 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Aka0^Da1), 21); \
        Bu = ROL32((Ase0^De0),  1); \
        Ba = ROL32((Agi1^Di0), 31); \
        Be = ROL32((Amo0^Do1), 28); \
        Bi = ROL32((Abu0^Du1), 20); \
        Aka0 =   Ba ^((~Be)&  Bi ); \
        Ase0 =   Be ^((~Bi)&  Bo ); \
        Agi1 =   Bi ^((~Bo)&  Bu ); \
        Amo0 =   Bo ^((~Bu)&  Ba ); \
        Abu0 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Aka1^Da0), 20); \
        Bu = ROL32((Ase1^De1),  1); \
        Ba = ROL32((Agi0^Di1), 31); \
        Be = ROL32((Amo1^Do0), 27); \
        Bi = ROL32((Abu1^Du0), 19); \
        Aka1 =   Ba ^((~Be)&  Bi ); \
        Ase1 =   Be ^((~Bi)&  Bo ); \
        Agi0 =   Bi ^((~Bo)&  Bu ); \
        Amo1 =   Bo ^((~Bu)&  Ba ); \
        Abu1 =   Bu ^((~Ba)&  Be );

#define KeccakRound3() \
        Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \
        Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \
        Da0 = Cx^ROL32(Du1, 1); \
        Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \
        Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \
        Da1 = Cz^Du0; \
        Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \
        Do0 = Cw^ROL32(Cz, 1); \
        Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \
        Do1 = Cy^Cx; \
        Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \
        De0 = Cx^ROL32(Cy, 1); \
        Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \
        De1 = Cz^Cw; \
        Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \
        Di0 = Du0^ROL32(Cy, 1); \
        Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \
        Di1 = Du1^Cw; \
        Du0 = Cw^ROL32(Cz, 1); \
        Du1 = Cy^Cx; \
\
        Ba = (Aba0^Da0); \
        Be = ROL32((Abe0^De0), 22); \
        Bi = ROL32((Abi0^Di1), 22); \
        Bo = ROL32((Abo0^Do1), 11); \
        Bu = ROL32((Abu0^Du0),  7); \
        Aba0 =   Ba ^((~Be)&  Bi ); \
        Aba0 ^= *(pRoundConstants++); \
        Abe0 =   Be ^((~Bi)&  Bo ); \
        Abi0 =   Bi ^((~Bo)&  Bu ); \
        Abo0 =   Bo ^((~Bu)&  Ba ); \
        Abu0 =   Bu ^((~Ba)&  Be ); \
        Ba = (Aba1^Da1); \
        Be = ROL32((Abe1^De1), 22); \
        Bi = ROL32((Abi1^Di0), 21); \
        Bo = ROL32((Abo1^Do0), 10); \
        Bu = ROL32((Abu1^Du1),  7); \
        Aba1 =   Ba ^((~Be)&  Bi ); \
        Aba1 ^= *(pRoundConstants++); \
        Abe1 =   Be ^((~Bi)&  Bo ); \
        Abi1 =   Bi ^((~Bo)&  Bu ); \
        Abo1 =   Bo ^((~Bu)&  Ba ); \
        Abu1 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Aga0^Da1),  2); \
        Bo = ROL32((Age0^De1), 23); \
        Bu = ROL32((Agi0^Di1), 31); \
        Ba = ROL32((Ago0^Do0), 14); \
        Be = ROL32((Agu0^Du0), 10); \
        Aga0 =   Ba ^((~Be)&  Bi ); \
        Age0 =   Be ^((~Bi)&  Bo ); \
        Agi0 =   Bi ^((~Bo)&  Bu ); \
        Ago0 =   Bo ^((~Bu)&  Ba ); \
        Agu0 =   Bu ^((~Ba)&  Be ); \
        Bi = ROL32((Aga1^Da0),  1); \
        Bo = ROL32((Age1^De0), 22); \
        Bu = ROL32((Agi1^Di0), 30); \
        Ba = ROL32((Ago1^Do1), 14); \
        Be = ROL32((Agu1^Du1), 10); \
        Aga1 =   Ba ^((~Be)&  Bi ); \
        Age1 =   Be ^((~Bi)&  Bo ); \
        Agi1 =   Bi ^((~Bo)&  Bu ); \
        Ago1 =   Bo ^((~Bu)&  Ba ); \
        Agu1 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Aka0^Da0),  9); \
        Ba = ROL32((Ake0^De1),  1); \
        Be = ROL32((Aki0^Di0),  3); \
        Bi = ROL32((Ako0^Do1), 13); \
        Bo = ROL32((Aku0^Du0),  4); \
        Aka0 =   Ba ^((~Be)&  Bi ); \
        Ake0 =   Be ^((~Bi)&  Bo ); \
        Aki0 =   Bi ^((~Bo)&  Bu ); \
        Ako0 =   Bo ^((~Bu)&  Ba ); \
        Aku0 =   Bu ^((~Ba)&  Be ); \
        Bu = ROL32((Aka1^Da1),  9); \
        Ba = (Ake1^De0); \
        Be = ROL32((Aki1^Di1),  3); \
        Bi = ROL32((Ako1^Do0), 12); \
        Bo = ROL32((Aku1^Du1),  4); \
        Aka1 =   Ba ^((~Be)&  Bi ); \
        Ake1 =   Be ^((~Bi)&  Bo ); \
        Aki1 =   Bi ^((~Bo)&  Bu ); \
        Ako1 =   Bo ^((~Bu)&  Ba ); \
        Aku1 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Ama0^Da0), 18); \
        Bi = ROL32((Ame0^De0),  5); \
        Bo = ROL32((Ami0^Di1),  8); \
        Bu = ROL32((Amo0^Do0), 28); \
        Ba = ROL32((Amu0^Du1), 14); \
        Ama0 =   Ba ^((~Be)&  Bi ); \
        Ame0 =   Be ^((~Bi)&  Bo ); \
        Ami0 =   Bi ^((~Bo)&  Bu ); \
        Amo0 =   Bo ^((~Bu)&  Ba ); \
        Amu0 =   Bu ^((~Ba)&  Be ); \
        Be = ROL32((Ama1^Da1), 18); \
        Bi = ROL32((Ame1^De1),  5); \
        Bo = ROL32((Ami1^Di0),  7); \
        Bu = ROL32((Amo1^Do1), 28); \
        Ba = ROL32((Amu1^Du0), 13); \
        Ama1 =   Ba ^((~Be)&  Bi ); \
        Ame1 =   Be ^((~Bi)&  Bo ); \
        Ami1 =   Bi ^((~Bo)&  Bu ); \
        Amo1 =   Bo ^((~Bu)&  Ba ); \
        Amu1 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Asa0^Da1), 21); \
        Bu = ROL32((Ase0^De0),  1); \
        Ba = ROL32((Asi0^Di0), 31); \
        Be = ROL32((Aso0^Do1), 28); \
        Bi = ROL32((Asu0^Du1), 20); \
        Asa0 =   Ba ^((~Be)&  Bi ); \
        Ase0 =   Be ^((~Bi)&  Bo ); \
        Asi0 =   Bi ^((~Bo)&  Bu ); \
        Aso0 =   Bo ^((~Bu)&  Ba ); \
        Asu0 =   Bu ^((~Ba)&  Be ); \
        Bo = ROL32((Asa1^Da0), 20); \
        Bu = ROL32((Ase1^De1),  1); \
        Ba = ROL32((Asi1^Di1), 31); \
        Be = ROL32((Aso1^Do0), 27); \
        Bi = ROL32((Asu1^Du0), 19); \
        Asa1 =   Ba ^((~Be)&  Bi ); \
        Ase1 =   Be ^((~Bi)&  Bo ); \
        Asi1 =   Bi ^((~Bo)&  Bu ); \
        Aso1 =   Bo ^((~Bu)&  Ba ); \
        Asu1 =   Bu ^((~Ba)&  Be );

/*
 * Applies the last nRounds rounds of the Keccak-p[1600] round sequence to
 * the state, in place.
 *
 * state   : the permutation state, accessed here as 50 UINT32 words
 *           (stateAsHalfLanes[0..49]).
 *           NOTE(review): the cast assumes the buffer is suitably aligned
 *           for UINT32 access - confirm against the callers.
 * nRounds : number of rounds to run, 1..24. The round-constant pointer
 *           starts (24-nRounds)*2 words into KeccakF1600RoundConstants_int2
 *           (two words per round) and the loop stops when it reaches a
 *           word equal to 0xFF (presumably the table terminator).
 *
 * Values of nRounds outside 1..24 leave the state untouched. Without this
 * check, 0 would still execute the loop body once (four rounds, reading past
 * the round constants) and anything above 24 would form an out-of-range
 * table pointer.
 *
 * The round macros work in a cycle of four. When nRounds is not a multiple
 * of four, the state words are first rearranged (SwapPI13 / SwapPI2 / SwapEO)
 * and the first pass enters the cycle part-way through; every later pass
 * runs all four macros.
 */
void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds)
{
    UINT32 Da0, De0, Di0, Do0, Du0;
    UINT32 Da1, De1, Di1, Do1, Du1;
    UINT32 Ba, Be, Bi, Bo, Bu;
    UINT32 Cx, Cy, Cz, Cw;
    const UINT32 *pRoundConstants;
    UINT32 *stateAsHalfLanes = (UINT32*)state;
    #define Aba0 stateAsHalfLanes[ 0]
    #define Aba1 stateAsHalfLanes[ 1]
    #define Abe0 stateAsHalfLanes[ 2]
    #define Abe1 stateAsHalfLanes[ 3]
    #define Abi0 stateAsHalfLanes[ 4]
    #define Abi1 stateAsHalfLanes[ 5]
    #define Abo0 stateAsHalfLanes[ 6]
    #define Abo1 stateAsHalfLanes[ 7]
    #define Abu0 stateAsHalfLanes[ 8]
    #define Abu1 stateAsHalfLanes[ 9]
    #define Aga0 stateAsHalfLanes[10]
    #define Aga1 stateAsHalfLanes[11]
    #define Age0 stateAsHalfLanes[12]
    #define Age1 stateAsHalfLanes[13]
    #define Agi0 stateAsHalfLanes[14]
    #define Agi1 stateAsHalfLanes[15]
    #define Ago0 stateAsHalfLanes[16]
    #define Ago1 stateAsHalfLanes[17]
    #define Agu0 stateAsHalfLanes[18]
    #define Agu1 stateAsHalfLanes[19]
    #define Aka0 stateAsHalfLanes[20]
    #define Aka1 stateAsHalfLanes[21]
    #define Ake0 stateAsHalfLanes[22]
    #define Ake1 stateAsHalfLanes[23]
    #define Aki0 stateAsHalfLanes[24]
    #define Aki1 stateAsHalfLanes[25]
    #define Ako0 stateAsHalfLanes[26]
    #define Ako1 stateAsHalfLanes[27]
    #define Aku0 stateAsHalfLanes[28]
    #define Aku1 stateAsHalfLanes[29]
    #define Ama0 stateAsHalfLanes[30]
    #define Ama1 stateAsHalfLanes[31]
    #define Ame0 stateAsHalfLanes[32]
    #define Ame1 stateAsHalfLanes[33]
    #define Ami0 stateAsHalfLanes[34]
    #define Ami1 stateAsHalfLanes[35]
    #define Amo0 stateAsHalfLanes[36]
    #define Amo1 stateAsHalfLanes[37]
    #define Amu0 stateAsHalfLanes[38]
    #define Amu1 stateAsHalfLanes[39]
    #define Asa0 stateAsHalfLanes[40]
    #define Asa1 stateAsHalfLanes[41]
    #define Ase0 stateAsHalfLanes[42]
    #define Ase1 stateAsHalfLanes[43]
    #define Asi0 stateAsHalfLanes[44]
    #define Asi1 stateAsHalfLanes[45]
    #define Aso0 stateAsHalfLanes[46]
    #define Aso1 stateAsHalfLanes[47]
    #define Asu0 stateAsHalfLanes[48]
    #define Asu1 stateAsHalfLanes[49]

    /*
     * Reject round counts the loop below cannot handle before doing any
     * pointer arithmetic on the round-constant table: zero rounds is the
     * identity, and more than 24 has no constants to draw from.
     */
    if ( nRounds == 0 || nRounds > 24 )
        return;

    /* Skip the constants of the (24 - nRounds) rounds that are not run. */
    pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2;

    /* From here on only the position within the 4-round cycle matters. */
    nRounds &= 3;
    switch ( nRounds )
    {
        /* The swap helpers borrow Ba, Be, Bi, Bo as scratch words. */
        #define I0 Ba
        #define I1 Be
        #define T0 Bi
        #define T1 Bo
        /*
         * Rotates four two-word entries: in0 <- in1 <- in2 <- in3 <- old in0.
         * Each eoN (0 or 1) selects whether the pair stored into inN keeps
         * its word order (0) or has its two words exchanged (1).
         */
        #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \
            I0 = (in0)[0]; I1 = (in0)[1];       \
            T0 = (in1)[0]; T1 = (in1)[1];       \
            (in0)[eo0] = T0; (in0)[eo0^1] = T1; \
            T0 = (in2)[0]; T1 = (in2)[1];       \
            (in1)[eo1] = T0; (in1)[eo1^1] = T1; \
            T0 = (in3)[0]; T1 = (in3)[1];       \
            (in2)[eo2] = T0; (in2)[eo2^1] = T1; \
            (in3)[eo3] = I0; (in3)[eo3^1] = I1
        /*
         * Exchanges in0 with in1 and in2 with in3 (two-word entries), storing
         * every pair with its two words exchanged.
         */
        #define SwapPI2( in0,in1,in2,in3 ) \
            I0 = (in0)[0]; I1 = (in0)[1]; \
            T0 = (in1)[0]; T1 = (in1)[1]; \
            (in0)[1] = T0; (in0)[0] = T1; \
            (in1)[1] = I0; (in1)[0] = I1; \
            I0 = (in2)[0]; I1 = (in2)[1]; \
            T0 = (in3)[0]; T1 = (in3)[1]; \
            (in2)[1] = T0; (in2)[0] = T1; \
            (in3)[1] = I0; (in3)[0] = I1
        /* Exchanges the two words of a single entry. */
        #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0

        case 1:
            SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 );
            SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 );
            SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 );
            SwapEO( Ami0, Ami1 );
            SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 );
            SwapEO( Ako0, Ako1 );
            SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 );
            break;

        case 2:
            SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 );
            SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 );
            SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 );
            SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 );
            SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 );
            break;

        case 3:
            SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 );
            SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 );
            SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 );
            SwapEO( Ami0, Ami1 );
            SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 );
            SwapEO( Ako0, Ako1 );
            SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 );
            break;

        default:
            /* Multiple of four rounds: the state is used as it is. */
            break;
        #undef I0
        #undef I1
        #undef T0
        #undef T1
        #undef SwapPI13
        #undef SwapPI2
        #undef SwapEO
    }

    do
    {
        /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
        switch ( nRounds )
        {
            case 0: KeccakRound0(); /* fall through */
            case 3: KeccakRound1(); /* fall through */
            case 2: KeccakRound2(); /* fall through */
            case 1: KeccakRound3();
        }
        /* Only the first pass may start mid-cycle; later passes run all four. */
        nRounds = 0;
    }
    while ( *pRoundConstants != 0xFF );

    #undef Aba0
    #undef Aba1
    #undef Abe0
    #undef Abe1
    #undef Abi0
    #undef Abi1
    #undef Abo0
    #undef Abo1
    #undef Abu0
    #undef Abu1
    #undef Aga0
    #undef Aga1
    #undef Age0
    #undef Age1
    #undef Agi0
    #undef Agi1
    #undef Ago0
    #undef Ago1
    #undef Agu0
    #undef Agu1
    #undef Aka0
    #undef Aka1
    #undef Ake0
    #undef Ake1
    #undef Aki0
    #undef Aki1
    #undef Ako0
    #undef Ako1
    #undef Aku0
    #undef Aku1
    #undef Ama0
    #undef Ama1
    #undef Ame0
    #undef Ame1
    #undef Ami0
    #undef Ami1
    #undef Amo0
    #undef Amo1
    #undef Amu0
    #undef Amu1
    #undef Asa0
    #undef Asa1
    #undef Ase0
    #undef Ase1
    #undef Asi0
    #undef Asi1
    #undef Aso0
    #undef Aso1
    #undef Asu0
    #undef Asu1
}

/* ---------------------------------------------------------------- */

/* Runs KeccakP1600_Permute_Nrounds on the state with a round count of 12. */
void KeccakP1600_Permute_12rounds(void *state)
{
    const unsigned int roundCount = 12;

    KeccakP1600_Permute_Nrounds(state, roundCount);
}

/* ---------------------------------------------------------------- */

/* Runs KeccakP1600_Permute_Nrounds on the state with a round count of 24. */
void KeccakP1600_Permute_24rounds(void *state)
{
    const unsigned int roundCount = 24;

    KeccakP1600_Permute_Nrounds(state, roundCount);
}
/* Back to Directory File Manager (file-viewer footer captured with the source; not part of the code) */