Compiler fix
This commit is contained in:
parent
efb322df66
commit
e87d5111a2
19 changed files with 1401 additions and 279 deletions
|
@ -55,7 +55,7 @@ public:
|
|||
size_t threads() const override
|
||||
{
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
return m_ghHelper ? 2 : 1;
|
||||
return ((m_algorithm.family() == Algorithm::GHOSTRIDER) && m_ghHelper) ? 2 : 1;
|
||||
# else
|
||||
return 1;
|
||||
# endif
|
||||
|
|
|
@ -30,6 +30,12 @@
|
|||
#endif
|
||||
|
||||
|
||||
#include "crypto/cn/CryptoNight_monero.h"
|
||||
#ifdef XMRIG_VAES
|
||||
# include "crypto/cn/CryptoNight_x86_vaes.h"
|
||||
#endif
|
||||
|
||||
|
||||
#include "backend/cpu/platform/BasicCpuInfo.h"
|
||||
#include "3rdparty/rapidjson/document.h"
|
||||
#include "crypto/common/Assembly.h"
|
||||
|
@ -294,6 +300,9 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
|
|||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
cn_sse41_enabled = has(FLAG_SSE41);
|
||||
cn_vaes_enabled = has(FLAG_VAES);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -66,6 +66,10 @@
|
|||
} while (0)
|
||||
|
||||
|
||||
bool cn_sse41_enabled = false;
|
||||
bool cn_vaes_enabled = false;
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
|
@ -97,6 +101,27 @@ cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm = nullptr;
|
|||
cn_mainloop_fun cn_upx2_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_upx2_double_mainloop_asm = nullptr;
|
||||
|
||||
cn_mainloop_fun cn_gr0_single_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr1_single_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr2_single_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr3_single_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr4_single_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr5_single_mainloop_asm = nullptr;
|
||||
|
||||
cn_mainloop_fun cn_gr0_double_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr1_double_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr2_double_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr3_double_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr4_double_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr5_double_mainloop_asm = nullptr;
|
||||
|
||||
cn_mainloop_fun cn_gr0_quad_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr1_quad_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr2_quad_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr3_quad_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr4_quad_mainloop_asm = nullptr;
|
||||
cn_mainloop_fun cn_gr5_quad_mainloop_asm = nullptr;
|
||||
|
||||
|
||||
template<Algorithm::Id SOURCE_ALGO = Algorithm::CN_2, typename T, typename U>
|
||||
static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask = CnAlgo<Algorithm::CN_HALF>().mask())
|
||||
|
@ -136,7 +161,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma
|
|||
|
||||
static void patchAsmVariants()
|
||||
{
|
||||
const int allocation_size = 131072;
|
||||
constexpr size_t allocation_size = 0x20000;
|
||||
auto base = static_cast<uint8_t *>(VirtualMemory::allocateExecutableMemory(allocation_size, false));
|
||||
|
||||
cn_half_mainloop_ivybridge_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x0000);
|
||||
|
@ -173,6 +198,29 @@ static void patchAsmVariants()
|
|||
cn_upx2_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x15000);
|
||||
# endif
|
||||
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
cn_gr0_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x16000);
|
||||
cn_gr1_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x16800);
|
||||
cn_gr2_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x17000);
|
||||
cn_gr3_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x17800);
|
||||
cn_gr4_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x18000);
|
||||
cn_gr5_single_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x18800);
|
||||
|
||||
cn_gr0_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x19000);
|
||||
cn_gr1_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x19800);
|
||||
cn_gr2_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1A000);
|
||||
cn_gr3_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1A800);
|
||||
cn_gr4_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1B000);
|
||||
cn_gr5_double_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1B800);
|
||||
|
||||
cn_gr0_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1C000);
|
||||
cn_gr1_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1C800);
|
||||
cn_gr2_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1D000);
|
||||
cn_gr3_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1D800);
|
||||
cn_gr4_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1E000);
|
||||
cn_gr5_quad_mainloop_asm = reinterpret_cast<cn_mainloop_fun> (base + 0x1E800);
|
||||
# endif
|
||||
|
||||
{
|
||||
constexpr uint32_t ITER = CnAlgo<Algorithm::CN_HALF>().iterations();
|
||||
|
||||
|
@ -230,7 +278,30 @@ static void patchAsmVariants()
|
|||
patchCode<Algorithm::CN_RWZ>(cn_upx2_mainloop_asm, cnv2_rwz_mainloop_asm, ITER, MASK);
|
||||
patchCode<Algorithm::CN_RWZ>(cn_upx2_double_mainloop_asm, cnv2_rwz_double_mainloop_asm, ITER, MASK);
|
||||
}
|
||||
#endif
|
||||
# endif
|
||||
|
||||
# ifdef XMRIG_ALGO_GHOSTRIDER
|
||||
patchCode<Algorithm::CN_1>(cn_gr0_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr1_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr2_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr3_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr4_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr5_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
|
||||
|
||||
patchCode<Algorithm::CN_1>(cn_gr0_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr1_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr2_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr3_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr4_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr5_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
|
||||
|
||||
patchCode<Algorithm::CN_1>(cn_gr0_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_0>().iterations(), CnAlgo<Algorithm::CN_GR_0>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr1_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_1>().iterations(), CnAlgo<Algorithm::CN_GR_1>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr2_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_2>().iterations(), CnAlgo<Algorithm::CN_GR_2>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr3_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_3>().iterations(), CnAlgo<Algorithm::CN_GR_3>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr4_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_4>().iterations(), CnAlgo<Algorithm::CN_GR_4>().mask());
|
||||
patchCode<Algorithm::CN_1>(cn_gr5_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo<Algorithm::CN_GR_5>().iterations(), CnAlgo<Algorithm::CN_GR_5>().mask());
|
||||
# endif
|
||||
|
||||
VirtualMemory::protectRX(base, allocation_size);
|
||||
VirtualMemory::flushInstructionCache(base, allocation_size);
|
||||
|
|
|
@ -52,8 +52,10 @@ struct cryptonight_r_data {
|
|||
struct cryptonight_ctx {
|
||||
alignas(16) uint8_t state[224];
|
||||
alignas(16) uint8_t *memory;
|
||||
const uint32_t* tweak1_table;
|
||||
uint64_t tweak1_2;
|
||||
|
||||
uint8_t unused[40];
|
||||
uint8_t unused[24];
|
||||
const uint32_t *saes_table;
|
||||
|
||||
cn_mainloop_fun_ms_abi generated_code;
|
||||
|
|
|
@ -204,4 +204,7 @@
|
|||
v4_random_math(code##part, r##part); \
|
||||
}
|
||||
|
||||
extern bool cn_sse41_enabled;
|
||||
extern bool cn_vaes_enabled;
|
||||
|
||||
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */
|
||||
|
|
|
@ -295,8 +295,8 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
|
|||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_explode_scratchpad_vaes<ALGO>(ctx);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes(ctx, props.memory(), props.half_mem());
|
||||
return;
|
||||
}
|
||||
# endif
|
||||
|
@ -409,8 +409,8 @@ static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx)
|
|||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_implode_scratchpad_vaes<ALGO>(ctx);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes(ctx, props.memory(), props.half_mem());
|
||||
return;
|
||||
}
|
||||
# endif
|
||||
|
@ -634,9 +634,31 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
|
|||
cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
|
||||
}
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height);
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
|
||||
inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
if (!SOFT_AES) {
|
||||
switch (ALGO) {
|
||||
case Algorithm::CN_GR_0:
|
||||
case Algorithm::CN_GR_1:
|
||||
case Algorithm::CN_GR_2:
|
||||
case Algorithm::CN_GR_3:
|
||||
case Algorithm::CN_GR_4:
|
||||
case Algorithm::CN_GR_5:
|
||||
if (cn_sse41_enabled) {
|
||||
cryptonight_single_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr size_t MASK = props.mask();
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
@ -822,13 +844,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
|
|||
|
||||
|
||||
#ifdef XMRIG_FEATURE_ASM
|
||||
extern "C" void cnv1_single_mainloop_asm(cryptonight_ctx * *ctx);
|
||||
extern "C" void cnv1_double_mainloop_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv1_quad_mainloop_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx);
|
||||
extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx);
|
||||
extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx **ctx);
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
@ -865,6 +890,28 @@ extern cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm;
|
|||
extern cn_mainloop_fun cn_upx2_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_upx2_double_mainloop_asm;
|
||||
|
||||
extern cn_mainloop_fun cn_gr0_single_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr1_single_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr2_single_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr3_single_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr4_single_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr5_single_mainloop_asm;
|
||||
|
||||
extern cn_mainloop_fun cn_gr0_double_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr1_double_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr2_double_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr3_double_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr4_double_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr5_double_mainloop_asm;
|
||||
|
||||
extern cn_mainloop_fun cn_gr0_quad_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr1_quad_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr2_quad_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr3_quad_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr4_quad_mainloop_asm;
|
||||
extern cn_mainloop_fun cn_gr5_quad_mainloop_asm;
|
||||
|
||||
|
||||
} // namespace xmrig
|
||||
|
||||
|
||||
|
@ -1017,8 +1064,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
@ -1065,8 +1112,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
@ -1090,9 +1137,126 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
namespace xmrig {
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
static NOINLINE void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
||||
if (BASE == Algorithm::CN_1 && size < 43) {
|
||||
memset(output, 0, 32);
|
||||
return;
|
||||
}
|
||||
|
||||
keccak(input, size, ctx[0]->state);
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
ctx[0]->tweak1_2 = tweak1_2_0;
|
||||
ctx[0]->tweak1_table = tweak1_table;
|
||||
if (ALGO == Algorithm::CN_GR_0) cn_gr0_single_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_1) cn_gr1_single_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_2) cn_gr2_single_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_3) cn_gr3_single_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_4) cn_gr4_single_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_5) cn_gr5_single_mainloop_asm(ctx);
|
||||
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
static NOINLINE void cryptonight_double_hash_gr_sse41(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
||||
if (BASE == Algorithm::CN_1 && size < 43) {
|
||||
memset(output, 0, 64);
|
||||
return;
|
||||
}
|
||||
|
||||
keccak(input, size, ctx[0]->state);
|
||||
keccak(input + size, size, ctx[1]->state);
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
ctx[1]->first_half = true;
|
||||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
}
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
VARIANT1_INIT(1);
|
||||
|
||||
ctx[0]->tweak1_2 = tweak1_2_0;
|
||||
ctx[1]->tweak1_2 = tweak1_2_1;
|
||||
|
||||
ctx[0]->tweak1_table = tweak1_table;
|
||||
|
||||
if (ALGO == Algorithm::CN_GR_0) cn_gr0_double_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_1) cn_gr1_double_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_2) cn_gr2_double_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_3) cn_gr3_double_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_4) cn_gr4_double_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_5) cn_gr5_double_mainloop_asm(ctx);
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
}
|
||||
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
|
||||
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
|
||||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES>
|
||||
inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
if (!SOFT_AES) {
|
||||
switch (ALGO) {
|
||||
case Algorithm::CN_GR_0:
|
||||
case Algorithm::CN_GR_1:
|
||||
case Algorithm::CN_GR_2:
|
||||
case Algorithm::CN_GR_3:
|
||||
case Algorithm::CN_GR_4:
|
||||
case Algorithm::CN_GR_5:
|
||||
if (cn_sse41_enabled) {
|
||||
cryptonight_double_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr size_t MASK = props.mask();
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
@ -1130,8 +1294,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
@ -1334,8 +1498,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
@ -1352,27 +1516,14 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
|
||||
static inline void cryptonight_monero_tweak_gr(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i cx)
|
||||
{
|
||||
__m128i tmp = _mm_xor_si128(bx0, cx);
|
||||
mem_out[0] = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
||||
uint64_t vh = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
mem_out[1] = vh ^ tweak1_table[static_cast<uint32_t>(vh) >> 24];
|
||||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES>
|
||||
void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
|
||||
template<Algorithm::Id ALGO>
|
||||
static NOINLINE void cryptonight_quad_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr size_t MASK = props.mask();
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
||||
if (BASE == Algorithm::CN_1 && size < 43) {
|
||||
memset(output, 0, 64);
|
||||
memset(output, 0, 32 * 4);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1381,21 +1532,6 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
|
|||
keccak(input + size * 2, size, ctx[2]->state);
|
||||
keccak(input + size * 3, size, ctx[3]->state);
|
||||
|
||||
uint8_t* l0 = ctx[0]->memory;
|
||||
uint8_t* l1 = ctx[1]->memory;
|
||||
uint8_t* l2 = ctx[2]->memory;
|
||||
uint8_t* l3 = ctx[3]->memory;
|
||||
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
|
||||
uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx[1]->state);
|
||||
uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx[2]->state);
|
||||
uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx[3]->state);
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
VARIANT1_INIT(1);
|
||||
VARIANT1_INIT(2);
|
||||
VARIANT1_INIT(3);
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
ctx[1]->first_half = true;
|
||||
|
@ -1404,156 +1540,51 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[2]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[3]);
|
||||
}
|
||||
|
||||
uint64_t al0 = h0[0] ^ h0[4];
|
||||
uint64_t al1 = h1[0] ^ h1[4];
|
||||
uint64_t al2 = h2[0] ^ h2[4];
|
||||
uint64_t al3 = h3[0] ^ h3[4];
|
||||
VARIANT1_INIT(0); ctx[0]->tweak1_2 = tweak1_2_0;
|
||||
VARIANT1_INIT(1); ctx[1]->tweak1_2 = tweak1_2_1;
|
||||
VARIANT1_INIT(2); ctx[2]->tweak1_2 = tweak1_2_2;
|
||||
VARIANT1_INIT(3); ctx[3]->tweak1_2 = tweak1_2_3;
|
||||
|
||||
uint64_t ah0 = h0[1] ^ h0[5];
|
||||
uint64_t ah1 = h1[1] ^ h1[5];
|
||||
uint64_t ah2 = h2[1] ^ h2[5];
|
||||
uint64_t ah3 = h3[1] ^ h3[5];
|
||||
ctx[0]->tweak1_table = tweak1_table;
|
||||
|
||||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
|
||||
__m128i bx20 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
|
||||
__m128i bx30 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
|
||||
|
||||
uint64_t idx0 = al0;
|
||||
uint64_t idx1 = al1;
|
||||
uint64_t idx2 = al2;
|
||||
uint64_t idx3 = al3;
|
||||
|
||||
__m128i cx0, cx1, cx2, cx3;
|
||||
|
||||
if (!SOFT_AES) {
|
||||
cx0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[idx0 & MASK]));
|
||||
cx1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l1[idx1 & MASK]));
|
||||
cx2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l2[idx2 & MASK]));
|
||||
cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < props.iterations(); i++) {
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
|
||||
const __m128i ax2 = _mm_set_epi64x(ah2, al2);
|
||||
const __m128i ax3 = _mm_set_epi64x(ah3, al3);
|
||||
|
||||
if (SOFT_AES) {
|
||||
cx0 = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx1 = soft_aesenc(&l1[idx1 & MASK], ax1, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx2 = soft_aesenc(&l2[idx2 & MASK], ax2, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx3 = soft_aesenc(&l3[idx3 & MASK], ax3, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
}
|
||||
else {
|
||||
cx0 = _mm_aesenc_si128(cx0, ax0);
|
||||
cx1 = _mm_aesenc_si128(cx1, ax1);
|
||||
cx2 = _mm_aesenc_si128(cx2, ax2);
|
||||
cx3 = _mm_aesenc_si128(cx3, ax3);
|
||||
if (MASK > 131072) {
|
||||
_mm_prefetch((const char*)(&l0[_mm_cvtsi128_si32(cx0) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l1[_mm_cvtsi128_si32(cx1) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l2[_mm_cvtsi128_si32(cx2) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l3[_mm_cvtsi128_si32(cx3) & MASK]), _MM_HINT_T0);
|
||||
}
|
||||
}
|
||||
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, cx0);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, cx1);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l2[idx2 & MASK], l2, idx2 & MASK, ax2, bx20, cx2);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l3[idx3 & MASK], l3, idx3 & MASK, ax3, bx30, cx3);
|
||||
|
||||
idx0 = _mm_cvtsi128_si64(cx0);
|
||||
idx1 = _mm_cvtsi128_si64(cx1);
|
||||
idx2 = _mm_cvtsi128_si64(cx2);
|
||||
idx3 = _mm_cvtsi128_si64(cx3);
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
|
||||
cl = ((uint64_t*)&l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*)&l0[idx0 & MASK])[1];
|
||||
lo = __umul128(idx0, cl, &hi);
|
||||
al0 += hi;
|
||||
ah0 += lo;
|
||||
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
|
||||
al0 ^= cl;
|
||||
ah0 ^= ch;
|
||||
idx0 = al0;
|
||||
bx00 = cx0;
|
||||
if (!SOFT_AES) cx0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[idx0 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l1[idx1 & MASK])[0];
|
||||
ch = ((uint64_t*)&l1[idx1 & MASK])[1];
|
||||
lo = __umul128(idx1, cl, &hi);
|
||||
al1 += hi;
|
||||
ah1 += lo;
|
||||
((uint64_t*)&l1[idx1 & MASK])[0] = al1;
|
||||
((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1;
|
||||
al1 ^= cl;
|
||||
ah1 ^= ch;
|
||||
idx1 = al1;
|
||||
bx10 = cx1;
|
||||
if (!SOFT_AES) cx1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l1[idx1 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l2[idx2 & MASK])[0];
|
||||
ch = ((uint64_t*)&l2[idx2 & MASK])[1];
|
||||
lo = __umul128(idx2, cl, &hi);
|
||||
al2 += hi;
|
||||
ah2 += lo;
|
||||
((uint64_t*)&l2[idx2 & MASK])[0] = al2;
|
||||
((uint64_t*)&l2[idx2 & MASK])[1] = ah2 ^ tweak1_2_2;
|
||||
al2 ^= cl;
|
||||
ah2 ^= ch;
|
||||
idx2 = al2;
|
||||
bx20 = cx2;
|
||||
if (!SOFT_AES) cx2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l2[idx2 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l3[idx3 & MASK])[0];
|
||||
ch = ((uint64_t*)&l3[idx3 & MASK])[1];
|
||||
lo = __umul128(idx3, cl, &hi);
|
||||
al3 += hi;
|
||||
ah3 += lo;
|
||||
((uint64_t*)&l3[idx3 & MASK])[0] = al3;
|
||||
((uint64_t*)&l3[idx3 & MASK])[1] = ah3 ^ tweak1_2_3;
|
||||
al3 ^= cl;
|
||||
ah3 ^= ch;
|
||||
idx3 = al3;
|
||||
bx30 = cx3;
|
||||
if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
|
||||
}
|
||||
if (ALGO == Algorithm::CN_GR_0) cn_gr0_quad_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_1) cn_gr1_quad_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_2) cn_gr2_quad_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_3) cn_gr3_quad_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_4) cn_gr4_quad_mainloop_asm(ctx);
|
||||
if (ALGO == Algorithm::CN_GR_5) cn_gr5_quad_mainloop_asm(ctx);
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
|
||||
if (!props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
{
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[2]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[3]);
|
||||
}
|
||||
|
||||
keccakf(h0, 24);
|
||||
keccakf(h1, 24);
|
||||
keccakf(h2, 24);
|
||||
keccakf(h3, 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[2]->state), 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[3]->state), 24);
|
||||
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
|
||||
|
@ -1755,12 +1786,23 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
|
|||
template<Algorithm::Id ALGO, bool SOFT_AES>
|
||||
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
const auto arch = Cpu::info()->arch();
|
||||
if ((arch >= ICpuInfo::ARCH_ZEN) && (arch <= ICpuInfo::ARCH_ZEN3)) {
|
||||
if ((ALGO == Algorithm::CN_GR_0) || (ALGO == Algorithm::CN_GR_1) || (ALGO == Algorithm::CN_GR_2) || (ALGO == Algorithm::CN_GR_3) || (ALGO == Algorithm::CN_GR_4) || (ALGO == Algorithm::CN_GR_5)) {
|
||||
cryptonight_quad_hash_zen<ALGO, SOFT_AES>(input, size, output, ctx, height);
|
||||
if (!SOFT_AES) {
|
||||
switch (ALGO) {
|
||||
case Algorithm::CN_GR_0:
|
||||
case Algorithm::CN_GR_1:
|
||||
case Algorithm::CN_GR_2:
|
||||
case Algorithm::CN_GR_3:
|
||||
case Algorithm::CN_GR_4:
|
||||
case Algorithm::CN_GR_5:
|
||||
if (cn_sse41_enabled) {
|
||||
cryptonight_quad_hash_gr_sse41<ALGO>(input, size, output, ctx, height);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
@ -1788,9 +1830,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
@ -1851,9 +1893,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
|
|||
}
|
||||
|
||||
# ifdef XMRIG_VAES
|
||||
if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) {
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[0], ctx[1]);
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(ctx[2], ctx[3]);
|
||||
if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) {
|
||||
cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem());
|
||||
cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem());
|
||||
}
|
||||
else
|
||||
# endif
|
||||
|
|
|
@ -162,12 +162,9 @@ static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256
|
|||
namespace xmrig {
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
|
||||
NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);
|
||||
const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1);
|
||||
|
||||
__m256i xin01, xin23, xin45, xin67;
|
||||
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
@ -177,7 +174,7 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
|
|||
|
||||
vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
if (props.half_mem() && !ctx->first_half) {
|
||||
if (half_mem && !ctx->first_half) {
|
||||
const __m256i* p = reinterpret_cast<const __m256i*>(ctx->save_state);
|
||||
xin01 = _mm256_loadu_si256(p + 0);
|
||||
xin23 = _mm256_loadu_si256(p + 1);
|
||||
|
@ -226,7 +223,7 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
|
|||
prefetch_ptr = output;
|
||||
}
|
||||
|
||||
if (props.half_mem() && ctx->first_half) {
|
||||
if (half_mem && ctx->first_half) {
|
||||
__m256i* p = reinterpret_cast<__m256i*>(ctx->save_state);
|
||||
_mm256_storeu_si256(p + 0, xin01);
|
||||
_mm256_storeu_si256(p + 1, xin23);
|
||||
|
@ -238,12 +235,9 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx)
|
|||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
|
||||
NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
|
||||
const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1);
|
||||
|
||||
__m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
|
||||
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
@ -257,7 +251,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig
|
|||
vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
{
|
||||
const bool b = props.half_mem() && !ctx1->first_half && !ctx2->first_half;
|
||||
const bool b = half_mem && !ctx1->first_half && !ctx2->first_half;
|
||||
const __m128i* p1 = b ? reinterpret_cast<const __m128i*>(ctx1->save_state) : (input1 + 4);
|
||||
const __m128i* p2 = b ? reinterpret_cast<const __m128i*>(ctx2->save_state) : (input2 + 4);
|
||||
xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0);
|
||||
|
@ -315,7 +309,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig
|
|||
prefetch_ptr2 = output2;
|
||||
}
|
||||
|
||||
if (props.half_mem() && ctx1->first_half && ctx2->first_half) {
|
||||
if (half_mem && ctx1->first_half && ctx2->first_half) {
|
||||
__m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state);
|
||||
__m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state);
|
||||
_mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0);
|
||||
|
@ -332,12 +326,9 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig
|
|||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
|
||||
NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1);
|
||||
const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1);
|
||||
|
||||
__m256i xout01, xout23, xout45, xout67;
|
||||
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
@ -353,11 +344,11 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
|
|||
xout67 = _mm256_loadu_si256(output + 5);
|
||||
|
||||
const __m256i* input_begin = input;
|
||||
for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
|
||||
if (props.half_mem() && (part == 1)) {
|
||||
for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) {
|
||||
if (half_mem && (part == 1)) {
|
||||
input = input_begin;
|
||||
ctx->first_half = false;
|
||||
cn_explode_scratchpad_vaes<ALGO>(ctx);
|
||||
cn_explode_scratchpad_vaes(ctx, memory, half_mem);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < N;) {
|
||||
|
@ -399,12 +390,9 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx)
|
|||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2)
|
||||
NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
|
||||
const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1);
|
||||
|
||||
__m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
|
||||
__m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
@ -428,13 +416,13 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig
|
|||
|
||||
const __m128i* input_begin1 = input1;
|
||||
const __m128i* input_begin2 = input2;
|
||||
for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
|
||||
if (props.half_mem() && (part == 1)) {
|
||||
for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) {
|
||||
if (half_mem && (part == 1)) {
|
||||
input1 = input_begin1;
|
||||
input2 = input_begin2;
|
||||
ctx1->first_half = false;
|
||||
ctx2->first_half = false;
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(ctx1, ctx2);
|
||||
cn_explode_scratchpad_vaes_double(ctx1, ctx2, memory, half_mem);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < N;) {
|
||||
|
@ -487,44 +475,4 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig
|
|||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
void VAES_Instance()
|
||||
{
|
||||
cn_explode_scratchpad_vaes<ALGO>(nullptr);
|
||||
cn_explode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
|
||||
cn_implode_scratchpad_vaes<ALGO>(nullptr);
|
||||
cn_implode_scratchpad_vaes_double<ALGO>(nullptr, nullptr);
|
||||
}
|
||||
|
||||
|
||||
void (*vaes_instances[])() = {
|
||||
VAES_Instance<Algorithm::CN_0>,
|
||||
VAES_Instance<Algorithm::CN_1>,
|
||||
VAES_Instance<Algorithm::CN_2>,
|
||||
VAES_Instance<Algorithm::CN_R>,
|
||||
VAES_Instance<Algorithm::CN_FAST>,
|
||||
VAES_Instance<Algorithm::CN_HALF>,
|
||||
VAES_Instance<Algorithm::CN_XAO>,
|
||||
VAES_Instance<Algorithm::CN_RTO>,
|
||||
VAES_Instance<Algorithm::CN_RWZ>,
|
||||
VAES_Instance<Algorithm::CN_ZLS>,
|
||||
VAES_Instance<Algorithm::CN_DOUBLE>,
|
||||
VAES_Instance<Algorithm::CN_CCX>,
|
||||
VAES_Instance<Algorithm::CN_LITE_0>,
|
||||
VAES_Instance<Algorithm::CN_LITE_1>,
|
||||
VAES_Instance<Algorithm::CN_HEAVY_0>,
|
||||
VAES_Instance<Algorithm::CN_HEAVY_TUBE>,
|
||||
VAES_Instance<Algorithm::CN_HEAVY_XHV>,
|
||||
VAES_Instance<Algorithm::CN_PICO_0>,
|
||||
VAES_Instance<Algorithm::CN_PICO_TLO>,
|
||||
VAES_Instance<Algorithm::CN_UPX2>,
|
||||
VAES_Instance<Algorithm::CN_GR_0>,
|
||||
VAES_Instance<Algorithm::CN_GR_1>,
|
||||
VAES_Instance<Algorithm::CN_GR_2>,
|
||||
VAES_Instance<Algorithm::CN_GR_3>,
|
||||
VAES_Instance<Algorithm::CN_GR_4>,
|
||||
VAES_Instance<Algorithm::CN_GR_5>,
|
||||
};
|
||||
|
||||
|
||||
} // xmrig
|
||||
|
|
|
@ -36,10 +36,10 @@ struct cryptonight_ctx;
|
|||
namespace xmrig {
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx);
|
||||
template<Algorithm::Id ALGO> void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);
|
||||
template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx);
|
||||
template<Algorithm::Id ALGO> void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2);
|
||||
void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem);
|
||||
void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem);
|
||||
void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem);
|
||||
void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem);
|
||||
|
||||
|
||||
} // xmrig
|
||||
|
|
132
src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc
Normal file
132
src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc
Normal file
|
@ -0,0 +1,132 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 32
|
||||
mov rdx, QWORD PTR [rcx]
|
||||
mov r8, QWORD PTR [rcx+8]
|
||||
mov r12d, 524288
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
mov rbx, QWORD PTR [rdx+32]
|
||||
xor rbx, QWORD PTR [rdx]
|
||||
mov rsi, QWORD PTR [rdx+40]
|
||||
mov r10, rbx
|
||||
xor rsi, QWORD PTR [rdx+8]
|
||||
and r10d, 2097136
|
||||
mov rdi, QWORD PTR [r8+32]
|
||||
xor rdi, QWORD PTR [r8]
|
||||
movq xmm3, rbx
|
||||
mov rbp, QWORD PTR [r8+40]
|
||||
mov r9, rdi
|
||||
xor rbp, QWORD PTR [r8+8]
|
||||
movq xmm0, rsi
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
and r9d, 2097136
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movq xmm4, rdi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
mov r14, QWORD PTR [rdx+224]
|
||||
mov r13, QWORD PTR [rdx+232]
|
||||
mov r15, QWORD PTR [r8+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rbp
|
||||
movq xmm5, rax
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
movq xmm0, rcx
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov rcx, QWORD PTR [r8+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
movdqu xmm1, XMMWORD PTR [r14+r10]
|
||||
movq xmm6, rax
|
||||
punpcklqdq xmm5, xmm0
|
||||
mov rax, QWORD PTR [rdx+240]
|
||||
movq xmm0, rcx
|
||||
movdqu xmm2, XMMWORD PTR [r15+r9]
|
||||
mov QWORD PTR [rsp], rax
|
||||
mov rax, QWORD PTR [r8+240]
|
||||
mov QWORD PTR [rsp+8], rax
|
||||
punpcklqdq xmm6, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_double:
|
||||
aesenc xmm1, xmm3
|
||||
aesenc xmm2, xmm4
|
||||
movdqa xmm0, xmm1
|
||||
movq r11, xmm2
|
||||
pxor xmm0, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
movq QWORD PTR [r14+r10], xmm0
|
||||
pextrq rcx, xmm0, 1
|
||||
mov eax, ecx
|
||||
movdqa xmm0, xmm2
|
||||
shr rax, 24
|
||||
pxor xmm0, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
mov eax, DWORD PTR [r13+rax*4]
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [r14+r10+8], rax
|
||||
movq QWORD PTR [r15+r9], xmm0
|
||||
pextrq rcx, xmm0, 1
|
||||
mov eax, ecx
|
||||
shr rax, 24
|
||||
mov eax, DWORD PTR [r13+rax*4]
|
||||
xor rax, rcx
|
||||
movq rcx, xmm1
|
||||
mov QWORD PTR [r15+r9+8], rax
|
||||
mov r9, rcx
|
||||
and r9d, 2097136
|
||||
mov r10, QWORD PTR [r14+r9]
|
||||
mov r8, QWORD PTR [r14+r9+8]
|
||||
mov rax, r10
|
||||
mul rcx
|
||||
add rsi, rax
|
||||
add rbx, rdx
|
||||
mov rax, QWORD PTR [rsp]
|
||||
mov QWORD PTR [r14+r9], rbx
|
||||
xor rax, rsi
|
||||
mov QWORD PTR [r14+r9+8], rax
|
||||
xor rsi, r8
|
||||
xor rbx, r10
|
||||
mov r8, r11
|
||||
and r8d, 2097136
|
||||
mov r10, rbx
|
||||
and r10d, 2097136
|
||||
movq xmm3, rbx
|
||||
pinsrq xmm3, rsi, 1
|
||||
mov r9, QWORD PTR [r15+r8]
|
||||
mov rcx, QWORD PTR [r15+r8+8]
|
||||
mov rax, r9
|
||||
movdqu xmm1, XMMWORD PTR [r14+r10]
|
||||
mul r11
|
||||
add rbp, rax
|
||||
add rdi, rdx
|
||||
mov rax, QWORD PTR [rsp+8]
|
||||
mov QWORD PTR [r15+r8], rdi
|
||||
xor rax, rbp
|
||||
xor rdi, r9
|
||||
mov QWORD PTR [r15+r8+8], rax
|
||||
mov r9, rdi
|
||||
xor rbp, rcx
|
||||
and r9d, 2097136
|
||||
movq xmm4, rdi
|
||||
pinsrq xmm4, rbp, 1
|
||||
movdqu xmm2, XMMWORD PTR [r15+r9]
|
||||
sub r12, 1
|
||||
jne main_loop_cnv1_double
|
||||
|
||||
mov rbx, QWORD PTR [rsp+80]
|
||||
mov rbp, QWORD PTR [rsp+88]
|
||||
mov rsi, QWORD PTR [rsp+96]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
add rsp, 32
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
263
src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc
Normal file
263
src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc
Normal file
|
@ -0,0 +1,263 @@
|
|||
mov rax, rsp
|
||||
mov QWORD PTR [rax+8], rbx
|
||||
mov QWORD PTR [rax+16], rbp
|
||||
mov QWORD PTR [rax+24], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 144
|
||||
mov r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+8]
|
||||
mov r10, QWORD PTR [rcx+16]
|
||||
mov r11, QWORD PTR [rcx+24]
|
||||
mov rbp, QWORD PTR [r8+224]
|
||||
mov r13, QWORD PTR [r8+232]
|
||||
mov r14, QWORD PTR [r9+224]
|
||||
mov r15, QWORD PTR [r10+224]
|
||||
mov r12, QWORD PTR [r11+224]
|
||||
mov rcx, QWORD PTR [r8+40]
|
||||
xor rcx, QWORD PTR [r8+8]
|
||||
mov rbx, QWORD PTR [r8+32]
|
||||
xor rbx, QWORD PTR [r8]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+40]
|
||||
xor rcx, QWORD PTR [r9+8]
|
||||
movq xmm1, rbx
|
||||
movaps XMMWORD PTR [rax-56], xmm6
|
||||
movaps XMMWORD PTR [rax-72], xmm7
|
||||
movaps XMMWORD PTR [rax-88], xmm8
|
||||
movaps XMMWORD PTR [rax-104], xmm9
|
||||
movaps XMMWORD PTR [rax-120], xmm10
|
||||
movaps XMMWORD PTR [rsp+48], xmm11
|
||||
movaps XMMWORD PTR [rsp+32], xmm12
|
||||
and ebx, 2097136
|
||||
mov rsi, QWORD PTR [r10+32]
|
||||
movq xmm2, rdi
|
||||
mov rax, QWORD PTR [r8+240]
|
||||
and edi, 2097136
|
||||
xor rsi, QWORD PTR [r10]
|
||||
mov rdx, QWORD PTR [r8+56]
|
||||
xor rdx, QWORD PTR [r8+24]
|
||||
mov QWORD PTR [rsp], rax
|
||||
mov rax, QWORD PTR [r9+240]
|
||||
movq xmm3, rsi
|
||||
mov QWORD PTR [rsp+8], rax
|
||||
and esi, 2097136
|
||||
mov rax, QWORD PTR [r10+240]
|
||||
punpcklqdq xmm1, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+40]
|
||||
xor rcx, QWORD PTR [r10+8]
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
mov rax, QWORD PTR [r11+240]
|
||||
punpcklqdq xmm2, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+24], rax
|
||||
mov rcx, QWORD PTR [r11+40]
|
||||
xor rcx, QWORD PTR [r11+8]
|
||||
mov rax, QWORD PTR [r11+32]
|
||||
xor rax, QWORD PTR [r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+48]
|
||||
xor rcx, QWORD PTR [r8+16]
|
||||
movq xmm4, rax
|
||||
and eax, 2097136
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rdx
|
||||
mov rdx, QWORD PTR [r9+56]
|
||||
xor rdx, QWORD PTR [r9+24]
|
||||
movq xmm5, rcx
|
||||
mov rcx, QWORD PTR [r9+48]
|
||||
xor rcx, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm0, rdx
|
||||
mov rdx, QWORD PTR [r10+56]
|
||||
xor rdx, QWORD PTR [r10+24]
|
||||
movq xmm6, rcx
|
||||
mov rcx, QWORD PTR [r10+48]
|
||||
xor rcx, QWORD PTR [r10+16]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rdx
|
||||
mov rdx, QWORD PTR [r11+56]
|
||||
movq xmm7, rcx
|
||||
punpcklqdq xmm7, xmm0
|
||||
xor rdx, QWORD PTR [r11+24]
|
||||
mov rcx, QWORD PTR [r11+48]
|
||||
xor rcx, QWORD PTR [r11+16]
|
||||
mov r11d, 524288
|
||||
movdqu xmm9, XMMWORD PTR [rbp+rbx]
|
||||
movdqu xmm10, XMMWORD PTR [r14+rdi]
|
||||
movq xmm0, rdx
|
||||
movdqu xmm11, XMMWORD PTR [r15+rsi]
|
||||
movdqu xmm12, XMMWORD PTR [r12+rax]
|
||||
movq xmm8, rcx
|
||||
punpcklqdq xmm8, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_quad:
|
||||
aesenc xmm9, xmm1
|
||||
aesenc xmm10, xmm2
|
||||
aesenc xmm11, xmm3
|
||||
aesenc xmm12, xmm4
|
||||
movd ecx, xmm9
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+rbp]
|
||||
movd ecx, xmm10
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r14]
|
||||
movd ecx, xmm11
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r15]
|
||||
movd ecx, xmm12
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r12]
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm5
|
||||
movdqa xmm5, xmm9
|
||||
movq QWORD PTR [rbp+rbx], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm10
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm6
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [rbp+rbx+8], rcx
|
||||
movq rbx, xmm1
|
||||
movq QWORD PTR [r14+rdi], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm11
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm7
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r14+rdi+8], rcx
|
||||
movq rdi, xmm2
|
||||
movq QWORD PTR [r15+rsi], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm12
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm8
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r15+rsi+8], rcx
|
||||
movq QWORD PTR [r12+rax], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
shr rcx, 24
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r12+rax+8], rcx
|
||||
movq rcx, xmm9
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
mov r9, QWORD PTR [rbp+r8]
|
||||
mov r10, QWORD PTR [rbp+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm1, 1
|
||||
add rcx, rax
|
||||
add rbx, rdx
|
||||
mov rax, QWORD PTR [rsp]
|
||||
mov QWORD PTR [rbp+r8], rbx
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [rbp+r8+8], rax
|
||||
xor rcx, r10
|
||||
xor rbx, r9
|
||||
movq xmm1, rbx
|
||||
and ebx, 2097136
|
||||
pinsrq xmm1, rcx, 1
|
||||
movq rcx, xmm10
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqu xmm9, XMMWORD PTR [rbp+rbx]
|
||||
mov r9, QWORD PTR [r14+r8]
|
||||
mov r10, QWORD PTR [r14+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm2, 1
|
||||
add rcx, rax
|
||||
add rdi, rdx
|
||||
mov rax, QWORD PTR [rsp+8]
|
||||
mov QWORD PTR [r14+r8], rdi
|
||||
xor rax, rcx
|
||||
xor rdi, r9
|
||||
mov QWORD PTR [r14+r8+8], rax
|
||||
xor rcx, r10
|
||||
movq xmm2, rdi
|
||||
and edi, 2097136
|
||||
pinsrq xmm2, rcx, 1
|
||||
movq rcx, xmm11
|
||||
movq rsi, xmm3
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqa xmm6, xmm10
|
||||
movdqa xmm7, xmm11
|
||||
movdqa xmm8, xmm12
|
||||
movdqu xmm10, XMMWORD PTR [r14+rdi]
|
||||
mov r9, QWORD PTR [r15+r8]
|
||||
mov r10, QWORD PTR [r15+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm3, 1
|
||||
add rcx, rax
|
||||
add rsi, rdx
|
||||
mov rax, QWORD PTR [rsp+16]
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [r15+r8], rsi
|
||||
mov QWORD PTR [r15+r8+8], rax
|
||||
xor rcx, r10
|
||||
xor rsi, r9
|
||||
movq xmm3, rsi
|
||||
and esi, 2097136
|
||||
pinsrq xmm3, rcx, 1
|
||||
movq rcx, xmm12
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqu xmm11, XMMWORD PTR [r15+rsi]
|
||||
mov r9, QWORD PTR [r12+r8]
|
||||
mov r10, QWORD PTR [r12+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
mov rcx, rax
|
||||
movq rax, xmm4
|
||||
add rax, rdx
|
||||
mov QWORD PTR [r12+r8], rax
|
||||
xor rax, r9
|
||||
pextrq rdx, xmm4, 1
|
||||
add rdx, rcx
|
||||
mov rcx, QWORD PTR [rsp+24]
|
||||
xor rcx, rdx
|
||||
xor rdx, r10
|
||||
movq xmm4, rax
|
||||
mov QWORD PTR [r12+r8+8], rcx
|
||||
and eax, 2097136
|
||||
pinsrq xmm4, rdx, 1
|
||||
movdqu xmm12, XMMWORD PTR [r12+rax]
|
||||
sub r11, 1
|
||||
jne main_loop_cnv1_quad
|
||||
|
||||
movaps xmm7, XMMWORD PTR [rsp+112]
|
||||
lea r11, QWORD PTR [rsp+144]
|
||||
mov rbx, QWORD PTR [r11+48]
|
||||
mov rbp, QWORD PTR [r11+56]
|
||||
mov rsi, QWORD PTR [r11+64]
|
||||
movaps xmm6, XMMWORD PTR [r11-16]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm9, XMMWORD PTR [r11-64]
|
||||
movaps xmm10, XMMWORD PTR [r11-80]
|
||||
movaps xmm11, XMMWORD PTR [r11-96]
|
||||
movaps xmm12, XMMWORD PTR [r11-112]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
66
src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc
Normal file
66
src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc
Normal file
|
@ -0,0 +1,66 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov rdx, QWORD PTR [rcx]
|
||||
mov esi, 524288
|
||||
mov r11, QWORD PTR [rdx+32]
|
||||
xor r11, QWORD PTR [rdx]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
mov rbx, QWORD PTR [rdx+40]
|
||||
xor rbx, QWORD PTR [rdx+8]
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
mov rbp, QWORD PTR [rdx+240]
|
||||
mov r14, QWORD PTR [rdx+232]
|
||||
movq xmm2, rax
|
||||
pinsrq xmm2, rcx, 1
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_single:
|
||||
mov r8, r11
|
||||
and r8d, 2097136
|
||||
movdqu xmm1, XMMWORD PTR [rdi+r8]
|
||||
movq xmm0, r11
|
||||
pinsrq xmm0, rbx, 1
|
||||
aesenc xmm1, xmm0
|
||||
movq r15, xmm1
|
||||
mov r9, r15
|
||||
and r9d, 2097136
|
||||
movdqa xmm0, xmm1
|
||||
pxor xmm0, xmm2
|
||||
movdqa xmm2, xmm1
|
||||
movq QWORD PTR [rdi+r8], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov eax, edx
|
||||
shr rax, 24
|
||||
mov ecx, DWORD PTR [r14+rax*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [rdi+r8+8], rcx
|
||||
mov r10, QWORD PTR [rdi+r9]
|
||||
mov r8, QWORD PTR [rdi+r9+8]
|
||||
mov rax, r10
|
||||
mul r15
|
||||
add rbx, rax
|
||||
add r11, rdx
|
||||
mov QWORD PTR [rdi+r9], r11
|
||||
mov rax, rbx
|
||||
xor rbx, r8
|
||||
xor r11, r10
|
||||
xor rax, rbp
|
||||
mov QWORD PTR [rdi+r9+8], rax
|
||||
sub rsi, 1
|
||||
jne main_loop_cnv1_single
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
mov rbx, QWORD PTR [rsp+8]
|
||||
mov rbp, QWORD PTR [rsp+16]
|
||||
mov rsi, QWORD PTR [rsp+24]
|
||||
mov rdi, QWORD PTR [rsp+32]
|
|
@ -11,6 +11,9 @@
|
|||
# define FN_PREFIX(fn) fn
|
||||
.section .text
|
||||
#endif
|
||||
.global FN_PREFIX(cnv1_single_mainloop_asm)
|
||||
.global FN_PREFIX(cnv1_double_mainloop_asm)
|
||||
.global FN_PREFIX(cnv1_quad_mainloop_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
|
@ -19,6 +22,33 @@
|
|||
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
|
||||
.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm)
|
||||
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv1_single_mainloop_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn1/cnv1_single_main_loop.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv1_double_mainloop_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn1/cnv1_double_main_loop.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv1_quad_mainloop_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn1/cnv1_quad_main_loop.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv1_single_mainloop_asm
|
||||
PUBLIC cnv1_double_mainloop_asm
|
||||
PUBLIC cnv1_quad_mainloop_asm
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
|
@ -6,6 +9,27 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
|
|||
PUBLIC cnv2_rwz_mainloop_asm
|
||||
PUBLIC cnv2_rwz_double_mainloop_asm
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_single_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_single_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_single_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_double_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_double_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_double_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_quad_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_quad_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_quad_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||
|
|
132
src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc
Normal file
132
src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc
Normal file
|
@ -0,0 +1,132 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 32
|
||||
mov rdx, QWORD PTR [rcx]
|
||||
mov r8, QWORD PTR [rcx+8]
|
||||
mov r12d, 524288
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
mov rbx, QWORD PTR [rdx+32]
|
||||
xor rbx, QWORD PTR [rdx]
|
||||
mov rsi, QWORD PTR [rdx+40]
|
||||
mov r10, rbx
|
||||
xor rsi, QWORD PTR [rdx+8]
|
||||
and r10d, 2097136
|
||||
mov rdi, QWORD PTR [r8+32]
|
||||
xor rdi, QWORD PTR [r8]
|
||||
movd xmm3, rbx
|
||||
mov rbp, QWORD PTR [r8+40]
|
||||
mov r9, rdi
|
||||
xor rbp, QWORD PTR [r8+8]
|
||||
movd xmm0, rsi
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
and r9d, 2097136
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
movd xmm4, rdi
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
mov r14, QWORD PTR [rdx+224]
|
||||
mov r13, QWORD PTR [rdx+232]
|
||||
mov r15, QWORD PTR [r8+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rbp
|
||||
movd xmm5, rax
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov rax, QWORD PTR [r8+48]
|
||||
movd xmm0, rcx
|
||||
xor rax, QWORD PTR [r8+16]
|
||||
mov rcx, QWORD PTR [r8+56]
|
||||
xor rcx, QWORD PTR [r8+24]
|
||||
movdqu xmm1, XMMWORD PTR [r14+r10]
|
||||
movd xmm6, rax
|
||||
punpcklqdq xmm5, xmm0
|
||||
mov rax, QWORD PTR [rdx+240]
|
||||
movd xmm0, rcx
|
||||
movdqu xmm2, XMMWORD PTR [r15+r9]
|
||||
mov QWORD PTR [rsp], rax
|
||||
mov rax, QWORD PTR [r8+240]
|
||||
mov QWORD PTR [rsp+8], rax
|
||||
punpcklqdq xmm6, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_double:
|
||||
aesenc xmm1, xmm3
|
||||
aesenc xmm2, xmm4
|
||||
movdqa xmm0, xmm1
|
||||
movd r11, xmm2
|
||||
pxor xmm0, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
movd QWORD PTR [r14+r10], xmm0
|
||||
pextrq rcx, xmm0, 1
|
||||
mov eax, ecx
|
||||
movdqa xmm0, xmm2
|
||||
shr rax, 24
|
||||
pxor xmm0, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
mov eax, DWORD PTR [r13+rax*4]
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [r14+r10+8], rax
|
||||
movd QWORD PTR [r15+r9], xmm0
|
||||
pextrq rcx, xmm0, 1
|
||||
mov eax, ecx
|
||||
shr rax, 24
|
||||
mov eax, DWORD PTR [r13+rax*4]
|
||||
xor rax, rcx
|
||||
movd rcx, xmm1
|
||||
mov QWORD PTR [r15+r9+8], rax
|
||||
mov r9, rcx
|
||||
and r9d, 2097136
|
||||
mov r10, QWORD PTR [r14+r9]
|
||||
mov r8, QWORD PTR [r14+r9+8]
|
||||
mov rax, r10
|
||||
mul rcx
|
||||
add rsi, rax
|
||||
add rbx, rdx
|
||||
mov rax, QWORD PTR [rsp]
|
||||
mov QWORD PTR [r14+r9], rbx
|
||||
xor rax, rsi
|
||||
mov QWORD PTR [r14+r9+8], rax
|
||||
xor rsi, r8
|
||||
xor rbx, r10
|
||||
mov r8, r11
|
||||
and r8d, 2097136
|
||||
mov r10, rbx
|
||||
and r10d, 2097136
|
||||
movd xmm3, rbx
|
||||
pinsrq xmm3, rsi, 1
|
||||
mov r9, QWORD PTR [r15+r8]
|
||||
mov rcx, QWORD PTR [r15+r8+8]
|
||||
mov rax, r9
|
||||
movdqu xmm1, XMMWORD PTR [r14+r10]
|
||||
mul r11
|
||||
add rbp, rax
|
||||
add rdi, rdx
|
||||
mov rax, QWORD PTR [rsp+8]
|
||||
mov QWORD PTR [r15+r8], rdi
|
||||
xor rax, rbp
|
||||
xor rdi, r9
|
||||
mov QWORD PTR [r15+r8+8], rax
|
||||
mov r9, rdi
|
||||
xor rbp, rcx
|
||||
and r9d, 2097136
|
||||
movd xmm4, rdi
|
||||
pinsrq xmm4, rbp, 1
|
||||
movdqu xmm2, XMMWORD PTR [r15+r9]
|
||||
sub r12, 1
|
||||
jne main_loop_cnv1_double
|
||||
|
||||
mov rbx, QWORD PTR [rsp+80]
|
||||
mov rbp, QWORD PTR [rsp+88]
|
||||
mov rsi, QWORD PTR [rsp+96]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
add rsp, 32
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
263
src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc
Normal file
263
src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc
Normal file
|
@ -0,0 +1,263 @@
|
|||
mov rax, rsp
|
||||
mov QWORD PTR [rax+8], rbx
|
||||
mov QWORD PTR [rax+16], rbp
|
||||
mov QWORD PTR [rax+24], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 144
|
||||
mov r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+8]
|
||||
mov r10, QWORD PTR [rcx+16]
|
||||
mov r11, QWORD PTR [rcx+24]
|
||||
mov rbp, QWORD PTR [r8+224]
|
||||
mov r13, QWORD PTR [r8+232]
|
||||
mov r14, QWORD PTR [r9+224]
|
||||
mov r15, QWORD PTR [r10+224]
|
||||
mov r12, QWORD PTR [r11+224]
|
||||
mov rcx, QWORD PTR [r8+40]
|
||||
xor rcx, QWORD PTR [r8+8]
|
||||
mov rbx, QWORD PTR [r8+32]
|
||||
xor rbx, QWORD PTR [r8]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+40]
|
||||
xor rcx, QWORD PTR [r9+8]
|
||||
movd xmm1, rbx
|
||||
movaps XMMWORD PTR [rax-56], xmm6
|
||||
movaps XMMWORD PTR [rax-72], xmm7
|
||||
movaps XMMWORD PTR [rax-88], xmm8
|
||||
movaps XMMWORD PTR [rax-104], xmm9
|
||||
movaps XMMWORD PTR [rax-120], xmm10
|
||||
movaps XMMWORD PTR [rsp+48], xmm11
|
||||
movaps XMMWORD PTR [rsp+32], xmm12
|
||||
and ebx, 2097136
|
||||
mov rsi, QWORD PTR [r10+32]
|
||||
movd xmm2, rdi
|
||||
mov rax, QWORD PTR [r8+240]
|
||||
and edi, 2097136
|
||||
xor rsi, QWORD PTR [r10]
|
||||
mov rdx, QWORD PTR [r8+56]
|
||||
xor rdx, QWORD PTR [r8+24]
|
||||
mov QWORD PTR [rsp], rax
|
||||
mov rax, QWORD PTR [r9+240]
|
||||
movd xmm3, rsi
|
||||
mov QWORD PTR [rsp+8], rax
|
||||
and esi, 2097136
|
||||
mov rax, QWORD PTR [r10+240]
|
||||
punpcklqdq xmm1, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+40]
|
||||
xor rcx, QWORD PTR [r10+8]
|
||||
mov QWORD PTR [rsp+16], rax
|
||||
mov rax, QWORD PTR [r11+240]
|
||||
punpcklqdq xmm2, xmm0
|
||||
movd xmm0, rcx
|
||||
mov QWORD PTR [rsp+24], rax
|
||||
mov rcx, QWORD PTR [r11+40]
|
||||
xor rcx, QWORD PTR [r11+8]
|
||||
mov rax, QWORD PTR [r11+32]
|
||||
xor rax, QWORD PTR [r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
mov rcx, QWORD PTR [r8+48]
|
||||
xor rcx, QWORD PTR [r8+16]
|
||||
movd xmm4, rax
|
||||
and eax, 2097136
|
||||
punpcklqdq xmm4, xmm0
|
||||
movd xmm0, rdx
|
||||
mov rdx, QWORD PTR [r9+56]
|
||||
xor rdx, QWORD PTR [r9+24]
|
||||
movd xmm5, rcx
|
||||
mov rcx, QWORD PTR [r9+48]
|
||||
xor rcx, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm5, xmm0
|
||||
movd xmm0, rdx
|
||||
mov rdx, QWORD PTR [r10+56]
|
||||
xor rdx, QWORD PTR [r10+24]
|
||||
movd xmm6, rcx
|
||||
mov rcx, QWORD PTR [r10+48]
|
||||
xor rcx, QWORD PTR [r10+16]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movd xmm0, rdx
|
||||
mov rdx, QWORD PTR [r11+56]
|
||||
movd xmm7, rcx
|
||||
punpcklqdq xmm7, xmm0
|
||||
xor rdx, QWORD PTR [r11+24]
|
||||
mov rcx, QWORD PTR [r11+48]
|
||||
xor rcx, QWORD PTR [r11+16]
|
||||
mov r11d, 524288
|
||||
movdqu xmm9, XMMWORD PTR [rbp+rbx]
|
||||
movdqu xmm10, XMMWORD PTR [r14+rdi]
|
||||
movd xmm0, rdx
|
||||
movdqu xmm11, XMMWORD PTR [r15+rsi]
|
||||
movdqu xmm12, XMMWORD PTR [r12+rax]
|
||||
movd xmm8, rcx
|
||||
punpcklqdq xmm8, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_quad:
|
||||
aesenc xmm9, xmm1
|
||||
aesenc xmm10, xmm2
|
||||
aesenc xmm11, xmm3
|
||||
aesenc xmm12, xmm4
|
||||
movd ecx, xmm9
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+rbp]
|
||||
movd ecx, xmm10
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r14]
|
||||
movd ecx, xmm11
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r15]
|
||||
movd ecx, xmm12
|
||||
and ecx, 2097136
|
||||
prefetcht0 BYTE PTR [rcx+r12]
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm5
|
||||
movdqa xmm5, xmm9
|
||||
movd QWORD PTR [rbp+rbx], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm10
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm6
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [rbp+rbx+8], rcx
|
||||
movd rbx, xmm1
|
||||
movd QWORD PTR [r14+rdi], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm11
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm7
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r14+rdi+8], rcx
|
||||
movd rdi, xmm2
|
||||
movd QWORD PTR [r15+rsi], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
movdqa xmm0, xmm12
|
||||
shr rcx, 24
|
||||
pxor xmm0, xmm8
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r15+rsi+8], rcx
|
||||
movd QWORD PTR [r12+rax], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov ecx, edx
|
||||
shr rcx, 24
|
||||
mov ecx, DWORD PTR [r13+rcx*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [r12+rax+8], rcx
|
||||
movd rcx, xmm9
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
mov r9, QWORD PTR [rbp+r8]
|
||||
mov r10, QWORD PTR [rbp+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm1, 1
|
||||
add rcx, rax
|
||||
add rbx, rdx
|
||||
mov rax, QWORD PTR [rsp]
|
||||
mov QWORD PTR [rbp+r8], rbx
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [rbp+r8+8], rax
|
||||
xor rcx, r10
|
||||
xor rbx, r9
|
||||
movd xmm1, rbx
|
||||
and ebx, 2097136
|
||||
pinsrq xmm1, rcx, 1
|
||||
movd rcx, xmm10
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqu xmm9, XMMWORD PTR [rbp+rbx]
|
||||
mov r9, QWORD PTR [r14+r8]
|
||||
mov r10, QWORD PTR [r14+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm2, 1
|
||||
add rcx, rax
|
||||
add rdi, rdx
|
||||
mov rax, QWORD PTR [rsp+8]
|
||||
mov QWORD PTR [r14+r8], rdi
|
||||
xor rax, rcx
|
||||
xor rdi, r9
|
||||
mov QWORD PTR [r14+r8+8], rax
|
||||
xor rcx, r10
|
||||
movd xmm2, rdi
|
||||
and edi, 2097136
|
||||
pinsrq xmm2, rcx, 1
|
||||
movd rcx, xmm11
|
||||
movd rsi, xmm3
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqa xmm6, xmm10
|
||||
movdqa xmm7, xmm11
|
||||
movdqa xmm8, xmm12
|
||||
movdqu xmm10, XMMWORD PTR [r14+rdi]
|
||||
mov r9, QWORD PTR [r15+r8]
|
||||
mov r10, QWORD PTR [r15+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
pextrq rcx, xmm3, 1
|
||||
add rcx, rax
|
||||
add rsi, rdx
|
||||
mov rax, QWORD PTR [rsp+16]
|
||||
xor rax, rcx
|
||||
mov QWORD PTR [r15+r8], rsi
|
||||
mov QWORD PTR [r15+r8+8], rax
|
||||
xor rcx, r10
|
||||
xor rsi, r9
|
||||
movd xmm3, rsi
|
||||
and esi, 2097136
|
||||
pinsrq xmm3, rcx, 1
|
||||
movd rcx, xmm12
|
||||
mov r8, rcx
|
||||
and r8d, 2097136
|
||||
movdqu xmm11, XMMWORD PTR [r15+rsi]
|
||||
mov r9, QWORD PTR [r12+r8]
|
||||
mov r10, QWORD PTR [r12+r8+8]
|
||||
mov rax, r9
|
||||
mul rcx
|
||||
mov rcx, rax
|
||||
movd rax, xmm4
|
||||
add rax, rdx
|
||||
mov QWORD PTR [r12+r8], rax
|
||||
xor rax, r9
|
||||
pextrq rdx, xmm4, 1
|
||||
add rdx, rcx
|
||||
mov rcx, QWORD PTR [rsp+24]
|
||||
xor rcx, rdx
|
||||
xor rdx, r10
|
||||
movd xmm4, rax
|
||||
mov QWORD PTR [r12+r8+8], rcx
|
||||
and eax, 2097136
|
||||
pinsrq xmm4, rdx, 1
|
||||
movdqu xmm12, XMMWORD PTR [r12+rax]
|
||||
sub r11, 1
|
||||
jne main_loop_cnv1_quad
|
||||
|
||||
movaps xmm7, XMMWORD PTR [rsp+112]
|
||||
lea r11, QWORD PTR [rsp+144]
|
||||
mov rbx, QWORD PTR [r11+48]
|
||||
mov rbp, QWORD PTR [r11+56]
|
||||
mov rsi, QWORD PTR [r11+64]
|
||||
movaps xmm6, XMMWORD PTR [r11-16]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm9, XMMWORD PTR [r11-64]
|
||||
movaps xmm10, XMMWORD PTR [r11-80]
|
||||
movaps xmm11, XMMWORD PTR [r11-96]
|
||||
movaps xmm12, XMMWORD PTR [r11-112]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
66
src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc
Normal file
66
src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc
Normal file
|
@ -0,0 +1,66 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
mov rdx, QWORD PTR [rcx]
|
||||
mov esi, 524288
|
||||
mov r11, QWORD PTR [rdx+32]
|
||||
xor r11, QWORD PTR [rdx]
|
||||
mov rdi, QWORD PTR [rdx+224]
|
||||
mov rbx, QWORD PTR [rdx+40]
|
||||
xor rbx, QWORD PTR [rdx+8]
|
||||
mov rcx, QWORD PTR [rdx+56]
|
||||
xor rcx, QWORD PTR [rdx+24]
|
||||
mov rax, QWORD PTR [rdx+48]
|
||||
xor rax, QWORD PTR [rdx+16]
|
||||
mov rbp, QWORD PTR [rdx+240]
|
||||
mov r14, QWORD PTR [rdx+232]
|
||||
movd xmm2, rax
|
||||
pinsrq xmm2, rcx, 1
|
||||
|
||||
ALIGN(64)
|
||||
main_loop_cnv1_single:
|
||||
mov r8, r11
|
||||
and r8d, 2097136
|
||||
movdqu xmm1, XMMWORD PTR [rdi+r8]
|
||||
movd xmm0, r11
|
||||
pinsrq xmm0, rbx, 1
|
||||
aesenc xmm1, xmm0
|
||||
movd r15, xmm1
|
||||
mov r9, r15
|
||||
and r9d, 2097136
|
||||
movdqa xmm0, xmm1
|
||||
pxor xmm0, xmm2
|
||||
movdqa xmm2, xmm1
|
||||
movd QWORD PTR [rdi+r8], xmm0
|
||||
pextrq rdx, xmm0, 1
|
||||
mov eax, edx
|
||||
shr rax, 24
|
||||
mov ecx, DWORD PTR [r14+rax*4]
|
||||
xor rcx, rdx
|
||||
mov QWORD PTR [rdi+r8+8], rcx
|
||||
mov r10, QWORD PTR [rdi+r9]
|
||||
mov r8, QWORD PTR [rdi+r9+8]
|
||||
mov rax, r10
|
||||
mul r15
|
||||
add rbx, rax
|
||||
add r11, rdx
|
||||
mov QWORD PTR [rdi+r9], r11
|
||||
mov rax, rbx
|
||||
xor rbx, r8
|
||||
xor r11, r10
|
||||
xor rax, rbp
|
||||
mov QWORD PTR [rdi+r9+8], rax
|
||||
sub rsi, 1
|
||||
jne main_loop_cnv1_single
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
mov rbx, QWORD PTR [rsp+8]
|
||||
mov rbp, QWORD PTR [rsp+16]
|
||||
mov rsi, QWORD PTR [rsp+24]
|
||||
mov rdi, QWORD PTR [rsp+32]
|
|
@ -1,6 +1,9 @@
|
|||
#define ALIGN(x) .align 64
|
||||
.intel_syntax noprefix
|
||||
.section .text
|
||||
.global cnv1_single_mainloop_asm
|
||||
.global cnv1_double_mainloop_asm
|
||||
.global cnv1_quad_mainloop_asm
|
||||
.global cnv2_mainloop_ivybridge_asm
|
||||
.global cnv2_mainloop_ryzen_asm
|
||||
.global cnv2_mainloop_bulldozer_asm
|
||||
|
@ -9,6 +12,24 @@
|
|||
.global cnv2_rwz_double_mainloop_asm
|
||||
.global cnv2_upx_double_mainloop_zen3_asm
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_single_mainloop_asm:
|
||||
#include "../cn1/cnv1_single_main_loop.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_double_mainloop_asm:
|
||||
#include "../cn1/cnv1_double_main_loop.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_quad_mainloop_asm:
|
||||
#include "../cn1/cnv1_quad_main_loop.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm:
|
||||
#include "../cn2/cnv2_main_loop_ivybridge.inc"
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv1_single_mainloop_asm
|
||||
PUBLIC cnv1_double_mainloop_asm
|
||||
PUBLIC cnv1_quad_mainloop_asm
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
|
@ -6,28 +9,49 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
|
|||
PUBLIC cnv2_rwz_mainloop_asm
|
||||
PUBLIC cnv2_rwz_double_mainloop_asm
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv1_single_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_single_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_single_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_double_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_double_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_double_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv1_quad_mainloop_asm PROC
|
||||
INCLUDE cn1/cnv1_quad_main_loop.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv1_quad_mainloop_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
|
|
|
@ -42,14 +42,40 @@ set(SOURCES
|
|||
ghostrider.cpp
|
||||
)
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES GNU)
|
||||
# gcc 11.2.0 crashes with -ftree-vrp
|
||||
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Ofast -fno-tree-vrp")
|
||||
|
||||
# gcc 11.2.0 creates incorrect code with -O3
|
||||
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-O2")
|
||||
|
||||
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Ofast -Wno-unused-const-variable")
|
||||
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os")
|
||||
elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)
|
||||
set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Os -fno-tree-vrp")
|
||||
set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Os -Wno-unused-const-variable")
|
||||
set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "-Os")
|
||||
endif()
|
||||
|
||||
include_directories(.)
|
||||
|
|
Loading…
Reference in a new issue