diff --git a/cmake/astrobwt.cmake b/cmake/astrobwt.cmake index 06485779..6f48d13e 100644 --- a/cmake/astrobwt.cmake +++ b/cmake/astrobwt.cmake @@ -23,6 +23,7 @@ if (WITH_ASTROBWT) else() if (CMAKE_SIZEOF_VOID_P EQUAL 8) add_definitions(/DASTROBWT_AVX2) + list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/xmm6int/salsa20_xmm6int-avx2.c) if (CMAKE_C_COMPILER_ID MATCHES MSVC) enable_language(ASM_MASM) list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.asm) diff --git a/src/crypto/astrobwt/AstroBWT.cpp b/src/crypto/astrobwt/AstroBWT.cpp index 26b80225..5dee1165 100644 --- a/src/crypto/astrobwt/AstroBWT.cpp +++ b/src/crypto/astrobwt/AstroBWT.cpp @@ -70,7 +70,17 @@ static void Salsa20_XORKeyStream(const void* key, void* output, size_t size) { const uint64_t iv = 0; ZeroTier::Salsa20 s(key, &iv); - s.XORKeyStream(output, size); + s.XORKeyStream(output, static_cast(size)); + memset(static_cast(output) - 16, 0, 16); + memset(static_cast(output) + size, 0, 16); +} + +extern "C" int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key); + +static void Salsa20_XORKeyStream_AVX256(const void* key, void* output, size_t size) +{ + const uint64_t iv = 0; + salsa20_stream_avx2(output, size, &iv, key); memset(static_cast(output) - 16, 0, 16); memset(static_cast(output) + size, 0, 16); } @@ -167,13 +177,16 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, uint8_t* stage2_result = (uint8_t*)(tmp_indices); #ifdef ASTROBWT_AVX2 - if (hasAVX2 && avx2) + if (hasAVX2 && avx2) { SHA3_256_AVX2_ASM(input_data, input_size, key); + Salsa20_XORKeyStream_AVX256(key, stage1_output, STAGE1_SIZE); + } else #endif + { sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key)); - - Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE); + Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE); + } sort_indices(STAGE1_SIZE + 1, stage1_output, indices, tmp_indices); @@ -196,7 +209,15 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, return false; } - Salsa20_XORKeyStream(key, stage2_output, stage2_size); +#ifdef ASTROBWT_AVX2 + if (hasAVX2 && avx2) { + Salsa20_XORKeyStream_AVX256(key, stage2_output, stage2_size); + } + else +#endif + { + Salsa20_XORKeyStream(key, stage2_output, stage2_size); + } sort_indices(stage2_size + 1, stage2_output, indices, tmp_indices); diff --git a/src/crypto/astrobwt/xmm6int/salsa20_xmm6int-avx2.c b/src/crypto/astrobwt/xmm6int/salsa20_xmm6int-avx2.c new file mode 100644 index 00000000..fe2047d4 --- /dev/null +++ b/src/crypto/astrobwt/xmm6int/salsa20_xmm6int-avx2.c @@ -0,0 +1,105 @@ +/* + * ISC License + * + * Copyright (c) 2013-2021 + * Frank Denis + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#ifdef __GNUC__ +#pragma GCC target("sse2") +#pragma GCC target("ssse3") +#pragma GCC target("sse4.1") +#pragma GCC target("avx2") +#endif + +#include +#include +#include +#include + +#define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +#define LOAD32_LE(p) *((uint32_t*)(p)) +#define STORE32_LE(dst, src) memcpy((dst), &(src), sizeof(uint32_t)) + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + +#include "u8.h" +#include "u4.h" +#include "u1.h" +#include "u0.h" +} + +int salsa20_stream_avx2(void* c, uint64_t clen, const void* iv, const void* key) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + + salsa_keysetup(&ctx, (const uint8_t*)key); + salsa_ivsetup(&ctx, (const uint8_t*)iv, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, (const uint8_t*)c, (uint8_t*)c, clen); + + return 0; +} diff --git a/src/crypto/astrobwt/xmm6int/u0.h b/src/crypto/astrobwt/xmm6int/u0.h new file mode 100644 index 00000000..ab93f742 --- /dev/null +++ b/src/crypto/astrobwt/xmm6int/u0.h @@ -0,0 +1,193 @@ +if (bytes > 0) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + uint8_t partialblock[64]; + + unsigned int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + *(uint32_t *) (partialblock + (A * 4)) = in##A; \ + *(uint32_t *) (partialblock + (B * 4)) = in##B; \ + *(uint32_t *) (partialblock + (C * 4)) = in##C; \ + *(uint32_t *) (partialblock + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } +} diff --git a/src/crypto/astrobwt/xmm6int/u1.h b/src/crypto/astrobwt/xmm6int/u1.h new file mode 100644 index 00000000..e82521cd --- /dev/null +++ b/src/crypto/astrobwt/xmm6int/u1.h @@ -0,0 +1,207 @@ +while (bytes >= 64) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + + uint32_t in8; + uint32_t in9; + int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + in##A ^= *(const uint32_t *) (m + (A * 4)); \ + in##B ^= *(const uint32_t *) (m + (B * 4)); \ + in##C ^= *(const uint32_t *) (m + (C * 4)); \ + in##D ^= *(const uint32_t *) (m + (D * 4)); \ + *(uint32_t *) (c + (A * 4)) = in##A; \ + *(uint32_t *) (c + (B * 4)) = in##B; \ + *(uint32_t *) (c + (C * 4)) = in##C; \ + *(uint32_t *) (c + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + in8 = x[8]; + in9 = x[13]; + in8++; + if (in8 == 0) { + in9++; + } + x[8] = in8; + x[13] = in9; + + c += 64; + m += 64; + bytes -= 64; +} diff --git a/src/crypto/astrobwt/xmm6int/u4.h b/src/crypto/astrobwt/xmm6int/u4.h new file mode 100644 index 00000000..474f4860 --- /dev/null +++ b/src/crypto/astrobwt/xmm6int/u4.h @@ -0,0 +1,547 @@ +if (bytes >= 256) { + __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, + z15; + __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, + orig9, orig10, orig11, orig12, orig13, orig14, orig15; + + uint32_t in8; + uint32_t in9; + int i; + + /* element broadcast immediate for _mm_shuffle_epi32 are in order: + 0x00, 0x55, 0xaa, 0xff */ + z0 = _mm_loadu_si128((const __m128i *) (x + 0)); + z5 = _mm_shuffle_epi32(z0, 0x55); + z10 = _mm_shuffle_epi32(z0, 0xaa); + z15 = _mm_shuffle_epi32(z0, 0xff); + z0 = _mm_shuffle_epi32(z0, 0x00); + z1 = _mm_loadu_si128((const __m128i *) (x + 4)); + z6 = _mm_shuffle_epi32(z1, 0xaa); + z11 = _mm_shuffle_epi32(z1, 0xff); + z12 = _mm_shuffle_epi32(z1, 0x00); + z1 = _mm_shuffle_epi32(z1, 0x55); + z2 = _mm_loadu_si128((const __m128i *) (x + 8)); + z7 = _mm_shuffle_epi32(z2, 0xff); + z13 = _mm_shuffle_epi32(z2, 0x55); + z2 = _mm_shuffle_epi32(z2, 0xaa); + /* no z8 -> first half of the nonce, will fill later */ + z3 = _mm_loadu_si128((const __m128i *) (x + 12)); + z4 = _mm_shuffle_epi32(z3, 0x00); + z14 = _mm_shuffle_epi32(z3, 0xaa); + z3 = _mm_shuffle_epi32(z3, 0xff); + /* no z9 -> second half of the nonce, will fill later */ + orig0 = z0; + orig1 = z1; + orig2 = z2; + orig3 = z3; + orig4 = z4; + orig5 = z5; + orig6 = z6; + orig7 = z7; + orig10 = z10; + orig11 = z11; + orig12 = z12; + orig13 = z13; + orig14 = z14; + orig15 = z15; + + while (bytes >= 256) { + /* vector implementation for z8 and z9 */ + /* not sure if it helps for only 4 blocks */ + const __m128i addv8 = _mm_set_epi64x(1, 0); + const __m128i addv9 = _mm_set_epi64x(3, 2); + __m128i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + t8 = _mm_set1_epi64x(in89); + t9 = _mm_set1_epi64x(in89); + + z8 = _mm_add_epi64(addv8, t8); + z9 = _mm_add_epi64(addv9, t9); + + t8 = _mm_unpacklo_epi32(z8, z9); + t9 = _mm_unpackhi_epi32(z8, z9); + + z8 = _mm_unpacklo_epi32(t8, t9); + z9 = _mm_unpackhi_epi32(t8, t9); + + orig8 = z8; + orig9 = z9; + + in89 += 4; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm_add_epi32(y4, z0); + r4 = y4; + y4 = _mm_slli_epi32(y4, 7); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 25); + z4 = _mm_xor_si128(z4, r4); + + y9 = z1; + y9 = _mm_add_epi32(y9, z5); + r9 = y9; + y9 = _mm_slli_epi32(y9, 7); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 25); + z9 = _mm_xor_si128(z9, r9); + + y8 = z0; + y8 = _mm_add_epi32(y8, z4); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z5; + y13 = _mm_add_epi32(y13, z9); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y12 = z4; + y12 = _mm_add_epi32(y12, z8); + r12 = y12; + y12 = _mm_slli_epi32(y12, 13); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 19); + z12 = _mm_xor_si128(z12, r12); + + y1 = z9; + y1 = _mm_add_epi32(y1, z13); + r1 = y1; + y1 = _mm_slli_epi32(y1, 13); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 19); + z1 = _mm_xor_si128(z1, r1); + + y0 = z8; + y0 = _mm_add_epi32(y0, z12); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z13; + y5 = _mm_add_epi32(y5, z1); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y14 = z6; + y14 = _mm_add_epi32(y14, z10); + r14 = y14; + y14 = _mm_slli_epi32(y14, 7); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 25); + z14 = _mm_xor_si128(z14, r14); + + y3 = z11; + y3 = _mm_add_epi32(y3, z15); + r3 = y3; + y3 = _mm_slli_epi32(y3, 7); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 25); + z3 = _mm_xor_si128(z3, r3); + + y2 = z10; + y2 = _mm_add_epi32(y2, z14); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z15; + y7 = _mm_add_epi32(y7, z3); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y6 = z14; + y6 = _mm_add_epi32(y6, z2); + r6 = y6; + y6 = _mm_slli_epi32(y6, 13); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 19); + z6 = _mm_xor_si128(z6, r6); + + y11 = z3; + y11 = _mm_add_epi32(y11, z7); + r11 = y11; + y11 = _mm_slli_epi32(y11, 13); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 19); + z11 = _mm_xor_si128(z11, r11); + + y10 = z2; + y10 = _mm_add_epi32(y10, z6); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y1 = z3; + y1 = _mm_add_epi32(y1, z0); + r1 = y1; + y1 = _mm_slli_epi32(y1, 7); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 25); + z1 = _mm_xor_si128(z1, r1); + + y15 = z7; + y15 = _mm_add_epi32(y15, z11); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + + y6 = z4; + y6 = _mm_add_epi32(y6, z5); + r6 = y6; + y6 = _mm_slli_epi32(y6, 7); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 25); + z6 = _mm_xor_si128(z6, r6); + + y2 = z0; + y2 = _mm_add_epi32(y2, z1); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z5; + y7 = _mm_add_epi32(y7, z6); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y3 = z1; + y3 = _mm_add_epi32(y3, z2); + r3 = y3; + y3 = _mm_slli_epi32(y3, 13); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 19); + z3 = _mm_xor_si128(z3, r3); + + y4 = z6; + y4 = _mm_add_epi32(y4, z7); + r4 = y4; + y4 = _mm_slli_epi32(y4, 13); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 19); + z4 = _mm_xor_si128(z4, r4); + + y0 = z2; + y0 = _mm_add_epi32(y0, z3); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z7; + y5 = _mm_add_epi32(y5, z4); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y11 = z9; + y11 = _mm_add_epi32(y11, z10); + r11 = y11; + y11 = _mm_slli_epi32(y11, 7); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 25); + z11 = _mm_xor_si128(z11, r11); + + y12 = z14; + y12 = _mm_add_epi32(y12, z15); + r12 = y12; + y12 = _mm_slli_epi32(y12, 7); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 25); + z12 = _mm_xor_si128(z12, r12); + + y8 = z10; + y8 = _mm_add_epi32(y8, z11); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z15; + y13 = _mm_add_epi32(y13, z12); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y9 = z11; + y9 = _mm_add_epi32(y9, z8); + r9 = y9; + y9 = _mm_slli_epi32(y9, 13); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 19); + z9 = _mm_xor_si128(z9, r9); + + y14 = z12; + y14 = _mm_add_epi32(y14, z13); + r14 = y14; + y14 = _mm_slli_epi32(y14, 13); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 19); + z14 = _mm_xor_si128(z14, r14); + + y10 = z8; + y10 = _mm_add_epi32(y10, z9); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y15 = z13; + y15 = _mm_add_epi32(y15, z14); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + } + +/* store data ; this macro replicates the original amd64-xmm6 code */ +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro replaces shuffle+mov by a direct extract; not much + * difference */ +#define ONEQUAD_EXTRACT(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 1); \ + in##B = _mm_extract_epi32(z##B, 1); \ + in##C = _mm_extract_epi32(z##C, 1); \ + in##D = _mm_extract_epi32(z##D, 1); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 2); \ + in##B = _mm_extract_epi32(z##B, 2); \ + in##C = _mm_extract_epi32(z##C, 2); \ + in##D = _mm_extract_epi32(z##D, 2); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 3); \ + in##B = _mm_extract_epi32(z##B, 3); \ + in##C = _mm_extract_epi32(z##C, 3); \ + in##D = _mm_extract_epi32(z##D, 3); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro first transpose data in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + y##A = _mm_unpacklo_epi32(z##A, z##B); \ + y##B = _mm_unpacklo_epi32(z##C, z##D); \ + y##C = _mm_unpackhi_epi32(z##A, z##B); \ + y##D = _mm_unpackhi_epi32(z##C, z##D); \ + z##A = _mm_unpacklo_epi64(y##A, y##B); \ + z##B = _mm_unpackhi_epi64(y##A, y##B); \ + z##C = _mm_unpacklo_epi64(y##C, y##D); \ + z##D = _mm_unpackhi_epi64(y##C, y##D); \ + y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \ + _mm_storeu_si128((__m128i *) (c + 0), y##A); \ + y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \ + _mm_storeu_si128((__m128i *) (c + 64), y##B); \ + y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \ + _mm_storeu_si128((__m128i *) (c + 128), y##C); \ + y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \ + _mm_storeu_si128((__m128i *) (c + 192), y##D) + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_EXTRACT +#undef ONEQUAD_SHUFFLE + + bytes -= 256; + c += 256; + m += 256; + } +} diff --git a/src/crypto/astrobwt/xmm6int/u8.h b/src/crypto/astrobwt/xmm6int/u8.h new file mode 100644 index 00000000..581b22c2 --- /dev/null +++ b/src/crypto/astrobwt/xmm6int/u8.h @@ -0,0 +1,477 @@ +if (bytes >= 512) { + __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i z0 = _mm256_set1_epi32(x[0]); + __m256i z5 = _mm256_set1_epi32(x[1]); + __m256i z10 = _mm256_set1_epi32(x[2]); + __m256i z15 = _mm256_set1_epi32(x[3]); + __m256i z12 = _mm256_set1_epi32(x[4]); + __m256i z1 = _mm256_set1_epi32(x[5]); + __m256i z6 = _mm256_set1_epi32(x[6]); + __m256i z11 = _mm256_set1_epi32(x[7]); + __m256i z8; /* useless */ + __m256i z13 = _mm256_set1_epi32(x[9]); + __m256i z2 = _mm256_set1_epi32(x[10]); + __m256i z7 = _mm256_set1_epi32(x[11]); + __m256i z4 = _mm256_set1_epi32(x[12]); + __m256i z9; /* useless */ + __m256i z14 = _mm256_set1_epi32(x[14]); + __m256i z3 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = z0; + __m256i orig1 = z1; + __m256i orig2 = z2; + __m256i orig3 = z3; + __m256i orig4 = z4; + __m256i orig5 = z5; + __m256i orig6 = z6; + __m256i orig7 = z7; + __m256i orig8; + __m256i orig9; + __m256i orig10 = z10; + __m256i orig11 = z11; + __m256i orig12 = z12; + __m256i orig13 = z13; + __m256i orig14 = z14; + __m256i orig15 = z15; + + uint32_t in8; + uint32_t in9; + int i; + + while (bytes >= 512) { + /* vector implementation for z8 and z9 */ + /* faster than the naive version for 8 blocks */ + const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; /* see arrays above for the address translation */ + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + + z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); + + t8 = _mm256_add_epi64(addv8, z8); + t9 = _mm256_add_epi64(addv9, z9); + + z8 = _mm256_unpacklo_epi32(t8, t9); + z9 = _mm256_unpackhi_epi32(t8, t9); + + t8 = _mm256_unpacklo_epi32(z8, z9); + t9 = _mm256_unpackhi_epi32(z8, z9); + + /* required because unpack* are intra-lane */ + z8 = _mm256_permutevar8x32_epi32(t8, permute); + z9 = _mm256_permutevar8x32_epi32(t9, permute); + + orig8 = z8; + orig9 = z9; + + in89 += 8; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm256_add_epi32(y4, z0); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 7); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 25); + z4 = _mm256_xor_si256(z4, r4); + + y9 = z1; + y9 = _mm256_add_epi32(y9, z5); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 7); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 25); + z9 = _mm256_xor_si256(z9, r9); + + y8 = z0; + y8 = _mm256_add_epi32(y8, z4); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z5; + y13 = _mm256_add_epi32(y13, z9); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y12 = z4; + y12 = _mm256_add_epi32(y12, z8); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 13); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 19); + z12 = _mm256_xor_si256(z12, r12); + + y1 = z9; + y1 = _mm256_add_epi32(y1, z13); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 13); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 19); + z1 = _mm256_xor_si256(z1, r1); + + y0 = z8; + y0 = _mm256_add_epi32(y0, z12); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z13; + y5 = _mm256_add_epi32(y5, z1); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y14 = z6; + y14 = _mm256_add_epi32(y14, z10); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 7); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 25); + z14 = _mm256_xor_si256(z14, r14); + + y3 = z11; + y3 = _mm256_add_epi32(y3, z15); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 7); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 25); + z3 = _mm256_xor_si256(z3, r3); + + y2 = z10; + y2 = _mm256_add_epi32(y2, z14); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z15; + y7 = _mm256_add_epi32(y7, z3); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y6 = z14; + y6 = _mm256_add_epi32(y6, z2); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 13); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 19); + z6 = _mm256_xor_si256(z6, r6); + + y11 = z3; + y11 = _mm256_add_epi32(y11, z7); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 13); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 19); + z11 = _mm256_xor_si256(z11, r11); + + y10 = z2; + y10 = _mm256_add_epi32(y10, z6); + r10 = y10; + y10 = _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y1 = z3; + y1 = _mm256_add_epi32(y1, z0); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 7); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 25); + z1 = _mm256_xor_si256(z1, r1); + + y15 = z7; + y15 = _mm256_add_epi32(y15, z11); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + + y6 = z4; + y6 = _mm256_add_epi32(y6, z5); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 7); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 25); + z6 = _mm256_xor_si256(z6, r6); + + y2 = z0; + y2 = _mm256_add_epi32(y2, z1); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z5; + y7 = _mm256_add_epi32(y7, z6); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y3 = z1; + y3 = _mm256_add_epi32(y3, z2); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 13); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 19); + z3 = _mm256_xor_si256(z3, r3); + + y4 = z6; + y4 = _mm256_add_epi32(y4, z7); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 13); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 19); + z4 = _mm256_xor_si256(z4, r4); + + y0 = z2; + y0 = _mm256_add_epi32(y0, z3); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z7; + y5 = _mm256_add_epi32(y5, z4); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y11 = z9; + y11 = _mm256_add_epi32(y11, z10); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 7); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 25); + z11 = _mm256_xor_si256(z11, r11); + + y12 = z14; + y12 = _mm256_add_epi32(y12, z15); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 7); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 25); + z12 = _mm256_xor_si256(z12, r12); + + y8 = z10; + y8 = _mm256_add_epi32(y8, z11); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z15; + y13 = _mm256_add_epi32(y13, z12); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y9 = z11; + y9 = _mm256_add_epi32(y9, z8); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 13); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 19); + z9 = _mm256_xor_si256(z9, r9); + + y14 = z12; + y14 = _mm256_add_epi32(y14, z13); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 13); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 19); + z14 = _mm256_xor_si256(z14, r14); + + y10 = z8; + y10 = _mm256_add_epi32(y10, z9); + r10 = y10; + y10 = _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y15 = z13; + y15 = _mm256_add_epi32(y15, z14); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + } + +/* store data ; this macro first transpose data in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \ + y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \ + y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \ + y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \ + y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \ + y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \ + y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \ + y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \ + y##A = _mm256_xor_si256(y##A, \ + _mm256_loadu_si256((const __m256i*) (m + 0))); \ + y##B = _mm256_xor_si256( \ + y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + y##C = _mm256_xor_si256( \ + y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + y##D = _mm256_xor_si256( \ + y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + y##A2 = _mm256_xor_si256( \ + y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + y##B2 = _mm256_xor_si256( \ + y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + y##C2 = _mm256_xor_si256( \ + y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + y##D2 = _mm256_xor_si256( \ + y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), y##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), y##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), y##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), y##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \ + _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \ + } + + ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); + m += 32; + c += 32; + ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); + m -= 32; + c -= 32; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_UNPCK +#undef ONEOCTO + + bytes -= 512; + c += 512; + m += 512; + } +}