AVX2 optimized code for AstroBWT
Added the "astrobwt-avx2" parameter to config.json; it is turned off ("false") by default. It gives a 4-5% speedup on CPUs with proper AVX2 support (AMD Ryzen starting with Zen2, Intel Core starting with Haswell). There will be no speedup on the following CPUs: Intel Pentium/Celeron (they don't support AVX2) and AMD Zen/Zen+ (they have only half-speed AVX). Note: the GCC-compiled version is faster without AVX2, while the MSVC-compiled version is faster with AVX2.
This commit is contained in:
parent
8698b73036
commit
e22f798085
14 changed files with 563 additions and 15 deletions
|
@ -23,6 +23,16 @@ if (WITH_ASTROBWT)
|
|||
src/crypto/astrobwt/salsa20_ref/salsa20.c
|
||||
)
|
||||
else()
|
||||
# 64-bit builds only: compile the hand-written AVX2 SHA3-256 routine and
# define ASTROBWT_AVX2 so AstroBWT.cpp can call it.
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
    add_definitions(/DASTROBWT_AVX2)

    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
        # MASM is only needed (and only available) with the MSVC toolchain;
        # enabling it unconditionally breaks configuration for GCC/Clang.
        enable_language(ASM_MASM)
        list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.asm)
    else()
        # The .S variant is preprocessed and assembled by the C compiler driver.
        enable_language(ASM)
        list(APPEND SOURCES_CRYPTO src/crypto/astrobwt/sha3_256_avx2.S)
    endif()
endif()
|
||||
|
||||
list(APPEND HEADERS_CRYPTO
|
||||
src/crypto/astrobwt/Salsa20.hpp
|
||||
)
|
||||
|
|
|
@ -52,6 +52,7 @@ static const char *kArgon2Impl = "argon2-impl";
|
|||
|
||||
#ifdef XMRIG_ALGO_ASTROBWT
|
||||
static const char* kAstroBWTMaxSize = "astrobwt-max-size";
|
||||
static const char* kAstroBWTAVX2 = "astrobwt-avx2";
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -94,6 +95,7 @@ rapidjson::Value xmrig::CpuConfig::toJSON(rapidjson::Document &doc) const
|
|||
|
||||
# ifdef XMRIG_ALGO_ASTROBWT
|
||||
obj.AddMember(StringRef(kAstroBWTMaxSize), m_astrobwtMaxSize, allocator);
|
||||
obj.AddMember(StringRef(kAstroBWTAVX2), m_astrobwtAVX2, allocator);
|
||||
# endif
|
||||
|
||||
m_threads.toJSON(obj, doc);
|
||||
|
@ -148,12 +150,20 @@ void xmrig::CpuConfig::read(const rapidjson::Value &value)
|
|||
# endif
|
||||
|
||||
# ifdef XMRIG_ALGO_ASTROBWT
|
||||
const auto& obj = Json::getValue(value, kAstroBWTMaxSize);
|
||||
if (obj.IsNull() || !obj.IsInt()) {
|
||||
const auto& astroBWTMaxSize = Json::getValue(value, kAstroBWTMaxSize);
|
||||
if (astroBWTMaxSize.IsNull() || !astroBWTMaxSize.IsInt()) {
|
||||
m_shouldSave = true;
|
||||
}
|
||||
else {
|
||||
m_astrobwtMaxSize = std::min(std::max(obj.GetInt(), 400), 1200);
|
||||
m_astrobwtMaxSize = std::min(std::max(astroBWTMaxSize.GetInt(), 400), 1200);
|
||||
}
|
||||
|
||||
const auto& astroBWTAVX2 = Json::getValue(value, kAstroBWTAVX2);
|
||||
if (astroBWTAVX2.IsNull() || !astroBWTAVX2.IsBool()) {
|
||||
m_shouldSave = true;
|
||||
}
|
||||
else {
|
||||
m_astrobwtAVX2 = astroBWTAVX2.GetBool();
|
||||
}
|
||||
# endif
|
||||
|
||||
|
|
|
@ -60,6 +60,7 @@ public:
|
|||
inline const String &argon2Impl() const { return m_argon2Impl; }
|
||||
inline const Threads<CpuThreads> &threads() const { return m_threads; }
|
||||
inline int astrobwtMaxSize() const { return m_astrobwtMaxSize; }
|
||||
inline bool astrobwtAVX2() const { return m_astrobwtAVX2; }
|
||||
inline int priority() const { return m_priority; }
|
||||
inline uint32_t limit() const { return m_limit; }
|
||||
|
||||
|
@ -77,6 +78,7 @@ private:
|
|||
bool m_shouldSave = false;
|
||||
bool m_yield = true;
|
||||
int m_astrobwtMaxSize = 550;
|
||||
bool m_astrobwtAVX2 = false;
|
||||
int m_memoryPool = 0;
|
||||
int m_priority = -1;
|
||||
String m_argon2Impl;
|
||||
|
|
|
@ -39,6 +39,7 @@ xmrig::CpuLaunchData::CpuLaunchData(const Miner *miner, const Algorithm &algorit
|
|||
hwAES(config.isHwAES()),
|
||||
yield(config.isYield()),
|
||||
astrobwtMaxSize(config.astrobwtMaxSize()),
|
||||
astrobwtAVX2(config.astrobwtAVX2()),
|
||||
priority(config.priority()),
|
||||
affinity(thread.affinity()),
|
||||
miner(miner),
|
||||
|
|
|
@ -62,6 +62,7 @@ public:
|
|||
const bool hwAES;
|
||||
const bool yield;
|
||||
const int astrobwtMaxSize;
|
||||
const bool astrobwtAVX2;
|
||||
const int priority;
|
||||
const int64_t affinity;
|
||||
const Miner *miner;
|
||||
|
|
|
@ -81,6 +81,7 @@ xmrig::CpuWorker<N>::CpuWorker(size_t id, const CpuLaunchData &data) :
|
|||
m_yield(data.yield),
|
||||
m_av(data.av()),
|
||||
m_astrobwtMaxSize(data.astrobwtMaxSize * 1000),
|
||||
m_astrobwtAVX2(data.astrobwtAVX2),
|
||||
m_miner(data.miner),
|
||||
m_ctx()
|
||||
{
|
||||
|
@ -262,7 +263,7 @@ void xmrig::CpuWorker<N>::start()
|
|||
{
|
||||
# ifdef XMRIG_ALGO_ASTROBWT
|
||||
if (job.algorithm().family() == Algorithm::ASTROBWT) {
|
||||
if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize))
|
||||
if (!astrobwt::astrobwt_dero(m_job.blob(), job.size(), m_ctx[0]->memory, m_hash, m_astrobwtMaxSize, m_astrobwtAVX2))
|
||||
valid = false;
|
||||
}
|
||||
else
|
||||
|
|
|
@ -74,6 +74,7 @@ private:
|
|||
const bool m_yield;
|
||||
const CnHash::AlgoVariant m_av;
|
||||
const int m_astrobwtMaxSize;
|
||||
const bool m_astrobwtAVX2;
|
||||
const Miner *m_miner;
|
||||
cryptonight_ctx *m_ctx[N];
|
||||
uint8_t m_hash[N * 32]{ 0 };
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
#include "core/Miner.h"
|
||||
#include "crypto/common/Nonce.h"
|
||||
#include "crypto/rx/Rx.h"
|
||||
#include "crypto/astrobwt/AstroBWT.h"
|
||||
#include "rapidjson/document.h"
|
||||
#include "version.h"
|
||||
|
||||
|
@ -242,6 +243,10 @@ public:
|
|||
# endif
|
||||
|
||||
|
||||
# ifdef XMRIG_ALGO_ASTROBWT
|
||||
// Forwards the current job to astrobwt::init() and returns its result.
inline bool initAstroBWT() { return astrobwt::init(job); }
|
||||
# endif
|
||||
|
||||
Algorithm algorithm;
|
||||
Algorithms algorithms;
|
||||
bool active = false;
|
||||
|
@ -454,10 +459,14 @@ void xmrig::Miner::setJob(const Job &job, bool donate)
|
|||
d_ptr->userJobId = job.id();
|
||||
}
|
||||
|
||||
bool ready = true;
|
||||
|
||||
# ifdef XMRIG_ALGO_RANDOMX
|
||||
const bool ready = d_ptr->initRX();
|
||||
# else
|
||||
constexpr const bool ready = true;
|
||||
ready &= d_ptr->initRX();
|
||||
# endif
|
||||
|
||||
# ifdef XMRIG_ALGO_ASTROBWT
|
||||
ready &= d_ptr->initAstroBWT();
|
||||
# endif
|
||||
|
||||
mutex.unlock();
|
||||
|
|
|
@ -30,6 +30,10 @@
|
|||
#include "AstroBWT.h"
|
||||
#include "sha3.h"
|
||||
#include "crypto/cn/CryptoNight.h"
|
||||
#include "base/net/stratum/Job.h"
|
||||
#include "base/crypto/Algorithm.h"
|
||||
#include "base/io/log/Log.h"
|
||||
#include "backend/cpu/Cpu.h"
|
||||
#include <limits>
|
||||
|
||||
constexpr int STAGE1_SIZE = 147253;
|
||||
|
@ -38,6 +42,18 @@ constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE &
|
|||
constexpr int COUNTING_SORT_BITS = 10;
|
||||
constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;
|
||||
|
||||
static bool astrobwtInitialized = false;
|
||||
|
||||
#ifdef ASTROBWT_AVX2
|
||||
static bool hasAVX2 = false;
|
||||
|
||||
extern "C"
|
||||
#ifdef __GNUC__
|
||||
__attribute__((ms_abi))
|
||||
#endif
|
||||
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
#include <stdlib.h>
|
||||
|
@ -155,7 +171,25 @@ void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indi
|
|||
}
|
||||
}
|
||||
|
||||
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size)
|
||||
/**
 * One-time AstroBWT setup, invoked whenever a new job arrives.
 *
 * Does nothing for jobs of other algorithm families or once the setup has
 * already run. On the first AstroBWT job it records, when built with
 * ASTROBWT_AVX2, whether the CPU reports AVX2 so astrobwt_dero() may use the
 * assembly SHA3 routine.
 *
 * @param job  job whose algorithm family selects whether setup is needed.
 * @return always true.
 */
bool xmrig::astrobwt::init(const xmrig::Job& job)
{
    const bool isAstroJob = (job.algorithm().family() == xmrig::Algorithm::ASTROBWT);

    // Nothing to do for foreign algorithms or on repeated calls.
    if (!isAstroJob || astrobwtInitialized) {
        return true;
    }

#   ifdef ASTROBWT_AVX2
    // The flag is only ever raised here; it starts out false.
    if (xmrig::Cpu::info()->hasAVX2()) {
        hasAVX2 = true;
    }
#   endif

    astrobwtInitialized = true;

    return true;
}
|
||||
|
||||
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2)
|
||||
{
|
||||
uint8_t key[32];
|
||||
uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
|
||||
|
@ -166,7 +200,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
|
|||
uint8_t* stage1_result = (uint8_t*)(tmp_indices);
|
||||
uint8_t* stage2_result = (uint8_t*)(tmp_indices);
|
||||
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
|
||||
#ifdef ASTROBWT_AVX2
|
||||
if (hasAVX2 && avx2)
|
||||
SHA3_256_AVX2_ASM(input_data, input_size, key);
|
||||
else
|
||||
#endif
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
|
||||
|
||||
Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
|
||||
|
||||
|
@ -178,7 +217,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
|
|||
stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
|
||||
}
|
||||
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
|
||||
#ifdef ASTROBWT_AVX2
|
||||
if (hasAVX2 && avx2)
|
||||
SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key);
|
||||
else
|
||||
#endif
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
|
||||
|
||||
const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff);
|
||||
if (stage2_size > stage2_max_size)
|
||||
|
@ -203,7 +247,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
|
|||
stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
|
||||
}
|
||||
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
|
||||
#ifdef ASTROBWT_AVX2
|
||||
if (hasAVX2 && avx2)
|
||||
SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash);
|
||||
else
|
||||
#endif
|
||||
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -211,5 +260,5 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
|
|||
template<>
|
||||
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
|
||||
{
|
||||
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max());
|
||||
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true);
|
||||
}
|
||||
|
|
|
@ -33,9 +33,14 @@
|
|||
struct cryptonight_ctx;
|
||||
|
||||
|
||||
namespace xmrig { namespace astrobwt {
|
||||
namespace xmrig {
|
||||
|
||||
bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size);
|
||||
class Job;
|
||||
|
||||
namespace astrobwt {
|
||||
|
||||
bool init(const Job&);
|
||||
bool astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2);
|
||||
|
||||
template<Algorithm::Id ALGO>
|
||||
void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
|
||||
|
@ -44,4 +49,4 @@ template<>
|
|||
void single_hash<Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
|
||||
|
||||
|
||||
}} // namespace xmrig::argon2
|
||||
}} // namespace xmrig::astrobwt
|
||||
|
|
50
src/crypto/astrobwt/sha3_256_avx2.S
Normal file
50
src/crypto/astrobwt/sha3_256_avx2.S
Normal file
|
@ -0,0 +1,50 @@
|
|||
;# XMRig
|
||||
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
;# Copyright 2018-2019 tevador <tevador@gmail.com>
|
||||
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
;#
|
||||
;# This program is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# This program is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
;#
|
||||
|
||||
.intel_syntax noprefix
#if defined(__APPLE__)
.text
#define DECL(x) _##x
#else
.section .text
#define DECL(x) x
#endif

;# Map the MASM-flavoured directives used by the shared .inc files onto GAS.
#define ALIGN .balign
#define dq .quad

.global DECL(SHA3_256_AVX2_ASM)

;# The shared .inc defines the entry point under its undecorated name. On Apple
;# targets the exported C symbol carries a leading underscore, so define that
;# decorated label at the same address. The alignment is repeated here so the
;# ALIGN 64 inside the .inc adds no padding between the two labels.
#if defined(__APPLE__)
ALIGN 64
DECL(SHA3_256_AVX2_ASM):
#endif

#include "sha3_256_avx2.inc"

;# Local helper called by the hashing routine: loads the table base addresses
;# (biased by 96 to match the displacements used by the round code) and then
;# falls through into the included permutation body.
KeccakF1600_AVX2_ASM:
	lea r8,[rip+rot_left+96]
	lea r9,[rip+rot_right+96]
	lea r10,[rip+rndc]

#include "sha3_256_keccakf1600_avx2.inc"

;# Tell the GNU linker that this object does not need an executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
|
42
src/crypto/astrobwt/sha3_256_avx2.asm
Normal file
42
src/crypto/astrobwt/sha3_256_avx2.asm
Normal file
|
@ -0,0 +1,42 @@
|
|||
;# XMRig
|
||||
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
;# Copyright 2018-2019 tevador <tevador@gmail.com>
|
||||
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
;#
|
||||
;# This program is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# This program is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
;#
|
||||
|
||||
;# MASM wrapper: places the shared SHA3-256 AVX2 code in its own segment,
;# declared with PAGE alignment and READ EXECUTE attributes, and exports the
;# entry point SHA3_256_AVX2_ASM.
_SHA3_256_AVX2_ASM SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC SHA3_256_AVX2_ASM
|
||||
|
||||
;# Entry point and absorb/pad logic shared with the GAS build.
include sha3_256_avx2.inc
|
||||
|
||||
;# Local helper called by the hashing routine. It loads the table addresses
;# (biased by 96, matching the -96..+64 displacements used by the round code)
;# and falls through into the included permutation body.
KeccakF1600_AVX2_ASM:
|
||||
lea r8,[rot_left+96]
|
||||
lea r9,[rot_right+96]
|
||||
lea r10,[rndc]
|
||||
|
||||
include sha3_256_keccakf1600_avx2.inc
|
||||
|
||||
_SHA3_256_AVX2_ASM ENDS
|
||||
END
|
164
src/crypto/astrobwt/sha3_256_avx2.inc
Normal file
164
src/crypto/astrobwt/sha3_256_avx2.inc
Normal file
|
@ -0,0 +1,164 @@
|
|||
;# XMRig
|
||||
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
;# Copyright 2018-2019 tevador <tevador@gmail.com>
|
||||
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
;#
|
||||
;# This program is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# This program is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
;#
|
||||
|
||||
;# SHA3_256_AVX2_ASM(in, inBytes, out)
;# Arguments arrive in rcx (input pointer), rdx (input length in bytes) and
;# r8 (output pointer); the C++ declaration forces ms_abi on GCC builds.
;# Absorbs the input 8 bytes at a time into a 25-qword state kept on the
;# stack, runs KeccakF1600_AVX2_ASM after every 17 qwords (136 bytes), pads
;# the final block and copies the first 32 bytes of the state to out.
;# NOTE(review): shlx below is a BMI2 instruction while callers only test for
;# AVX2 - confirm every supported AVX2 CPU also reports BMI2.
ALIGN 64
|
||||
SHA3_256_AVX2_ASM:
|
||||
vzeroupper
|
||||
|
||||
;# Save rbx/rsi/rdi in the home space the caller reserves above the return
;# address under the Microsoft x64 convention, then push the other
;# callee-saved general purpose registers.
mov qword ptr [rsp+8],rbx
|
||||
mov qword ptr [rsp+16],rsi
|
||||
mov qword ptr [rsp+24],rdi
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
;# Save xmm6-xmm15 (non-volatile in the Microsoft x64 convention) in two
;# 80-byte stack areas; they are restored in reverse before returning.
sub rsp, 80
|
||||
movdqu xmmword ptr [rsp+64], xmm6
|
||||
movdqu xmmword ptr [rsp+48], xmm7
|
||||
movdqu xmmword ptr [rsp+32], xmm8
|
||||
movdqu xmmword ptr [rsp+16], xmm9
|
||||
movdqu xmmword ptr [rsp+0], xmm10
|
||||
sub rsp, 80
|
||||
movdqu xmmword ptr [rsp+64], xmm11
|
||||
movdqu xmmword ptr [rsp+48], xmm12
|
||||
movdqu xmmword ptr [rsp+32], xmm13
|
||||
movdqu xmmword ptr [rsp+16], xmm14
|
||||
movdqu xmmword ptr [rsp+0], xmm15
|
||||
|
||||
;# Workspace: rbp is a 32-byte aligned pointer inside a fresh 320-byte frame.
;# [rbp+0 .. rbp+16]   17-byte table mapping input lane number 0..16 to the
;#                     qword slot of the state that lane is XORed into.
;# [rbp+32 .. rbp+255] state area, zeroed by the seven vmovdqa stores below.
;# Register roles from here on: rsi = input cursor, r13 = output pointer,
;# r14 = number of whole qwords (inBytes >> 3), r12 = leftover bytes
;# (inBytes & 7), rbx = lane counter within the current block, rdi = 0.
sub rsp,320
|
||||
lea rbp,[rsp+64]
|
||||
and rbp,-32
|
||||
vpxor xmm0,xmm0,xmm0
|
||||
xor edi,edi
|
||||
mov dword ptr [rbp],50462976
|
||||
mov r12,rdx
|
||||
mov dword ptr [rbp+4],169150212
|
||||
mov r14,rdx
|
||||
mov dword ptr [rbp+8],218436623
|
||||
shr r14,3
|
||||
and r12d,7
|
||||
mov dword ptr [rbp+12],135009046
|
||||
mov r13,r8
|
||||
mov byte ptr [rbp+16],9
|
||||
mov rsi,rcx
|
||||
mov ebx,edi
|
||||
vmovdqa ymmword ptr [rbp+32],ymm0
|
||||
vmovdqa ymmword ptr [rbp+64],ymm0
|
||||
vmovdqa ymmword ptr [rbp+96],ymm0
|
||||
vmovdqa ymmword ptr [rbp+128],ymm0
|
||||
vmovdqa ymmword ptr [rbp+160],ymm0
|
||||
vmovdqa ymmword ptr [rbp+192],ymm0
|
||||
vmovdqa ymmword ptr [rbp+224],ymm0
|
||||
test r14,r14
|
||||
je sha3_main_loop_end
|
||||
|
||||
;# Absorb loop: XOR one input qword into the slot selected by the lane table.
sha3_main_loop:
|
||||
movzx eax,byte ptr [rbp+rbx]
|
||||
lea rcx,[rbp+32]
|
||||
lea rcx,[rcx+rax*8]
|
||||
mov rax,qword ptr [rsi]
|
||||
xor qword ptr [rcx],rax
|
||||
lea r15,[rbx+1]
|
||||
cmp rbx,16
|
||||
jne skip_keccak
|
||||
|
||||
;# Lane 16 was just absorbed, so the 17-qword block is complete: permute.
lea rcx,[rbp+32]
|
||||
call KeccakF1600_AVX2_ASM
|
||||
|
||||
;# Next lane counter: 0 (rdi) after a full block, otherwise rbx+1 (r15).
skip_keccak:
|
||||
cmp rbx,16
|
||||
mov rax,rdi
|
||||
cmovne rax,r15
|
||||
add rsi,8
|
||||
mov rbx,rax
|
||||
sub r14,1
|
||||
jne sha3_main_loop
|
||||
|
||||
;# Gather the r12 leftover bytes into rdi, lowest byte first (rdi starts at 0).
sha3_main_loop_end:
|
||||
mov rdx,rdi
|
||||
test r12,r12
|
||||
je sha3_tail_loop_end
|
||||
mov r8,rdi
|
||||
|
||||
sha3_tail_loop:
|
||||
movzx eax,byte ptr [rdx+rsi]
|
||||
inc rdx
|
||||
shlx rcx,rax,r8
|
||||
or rdi,rcx
|
||||
add r8,8
|
||||
cmp rdx,r12
|
||||
jb sha3_tail_loop
|
||||
|
||||
;# Padding: XOR the tail bytes plus the value 6 placed in the byte right after
;# them into the current lane slot, then set bit 63 of the qword at rbp+104
;# (state slot 9, the slot the lane table assigns to lane 16).
sha3_tail_loop_end:
|
||||
movzx eax,byte ptr [rbp+rbx]
|
||||
lea rdx,[rbp+32]
|
||||
lea rdx,[rdx+rax*8]
|
||||
mov ecx,6
|
||||
lea rax,[r12*8]
|
||||
shlx rcx,rcx,rax
|
||||
xor rcx,qword ptr [rdx]
|
||||
mov eax,1
|
||||
shl rax,63
|
||||
xor rcx,rdi
|
||||
mov qword ptr [rdx],rcx
|
||||
xor qword ptr [rbp+104],rax
|
||||
|
||||
;# Final permutation of the padded block.
lea rcx,[rbp+32]
|
||||
call KeccakF1600_AVX2_ASM
|
||||
|
||||
;# Copy the first 32 bytes of the state to the output buffer.
vmovups ymm0,ymmword ptr [rbp+32]
|
||||
vmovups ymmword ptr [r13],ymm0
|
||||
vzeroupper
|
||||
|
||||
;# Epilogue: release the workspace and restore registers in reverse order.
add rsp,320
|
||||
|
||||
movdqu xmm15, xmmword ptr [rsp]
|
||||
movdqu xmm14, xmmword ptr [rsp+16]
|
||||
movdqu xmm13, xmmword ptr [rsp+32]
|
||||
movdqu xmm12, xmmword ptr [rsp+48]
|
||||
movdqu xmm11, xmmword ptr [rsp+64]
|
||||
add rsp, 80
|
||||
movdqu xmm10, xmmword ptr [rsp]
|
||||
movdqu xmm9, xmmword ptr [rsp+16]
|
||||
movdqu xmm8, xmmword ptr [rsp+32]
|
||||
movdqu xmm7, xmmword ptr [rsp+48]
|
||||
movdqu xmm6, xmmword ptr [rsp+64]
|
||||
add rsp, 80
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
mov rbx,qword ptr [rsp+8]
|
||||
mov rsi,qword ptr [rsp+16]
|
||||
mov rdi,qword ptr [rsp+24]
|
||||
|
||||
ret
|
203
src/crypto/astrobwt/sha3_256_keccakf1600_avx2.inc
Normal file
203
src/crypto/astrobwt/sha3_256_keccakf1600_avx2.inc
Normal file
|
@ -0,0 +1,203 @@
|
|||
;# XMRig
|
||||
;# Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
|
||||
;# Copyright 2012-2014 pooler <pooler@litecoinpool.org>
|
||||
;# Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
||||
;# Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
||||
;# Copyright 2016 Jay D Dee <jayddee246@gmail.com>
|
||||
;# Copyright 2017-2019 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
||||
;# Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
||||
;# Copyright 2018-2019 tevador <tevador@gmail.com>
|
||||
;# Copyright 2000 Transmeta Corporation <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2004-2008 H. Peter Anvin <https://github.com/intel/msr-tools>
|
||||
;# Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
|
||||
;# Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
||||
;#
|
||||
;# This program is free software: you can redistribute it and/or modify
|
||||
;# it under the terms of the GNU General Public License as published by
|
||||
;# the Free Software Foundation, either version 3 of the License, or
|
||||
;# (at your option) any later version.
|
||||
;#
|
||||
;# This program is distributed in the hope that it will be useful,
|
||||
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;# GNU General Public License for more details.
|
||||
;#
|
||||
;# You should have received a copy of the GNU General Public License
|
||||
;# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
;#
|
||||
|
||||
;# Body of KeccakF1600_AVX2_ASM; the label and the loads of r8, r9 and r10
;# live in the including .S/.asm file.
;# In:  rcx = pointer to the 25-qword state, r8 = rot_left+96,
;#      r9 = rot_right+96, r10 = rndc.
;# Runs 24 rounds (eax counts down) with the state held in ymm0-ymm6 and
;# writes it back. Clobbers eax, rcx, r10 and ymm0-ymm15.
mov eax,24
|
||||
;# Bias rcx by 96 so the whole state is reachable with displacements -96..+72.
lea rcx,[rcx+96]
|
||||
;# ymm0 = qword 0 broadcast to all four lanes; ymm1-ymm6 = four qwords each.
vpbroadcastq ymm0,QWORD PTR [rcx-96]
|
||||
vmovdqu ymm1,YMMWORD PTR [rcx-88]
|
||||
vmovdqu ymm2,YMMWORD PTR [rcx-56]
|
||||
vmovdqu ymm3,YMMWORD PTR [rcx-24]
|
||||
vmovdqu ymm4,YMMWORD PTR [rcx+8]
|
||||
vmovdqu ymm5,YMMWORD PTR [rcx+40]
|
||||
vmovdqu ymm6,YMMWORD PTR [rcx+72]
|
||||
|
||||
ALIGN 64
|
||||
;# One round per iteration. Each vpsllvq/vpsrlvq/vpor triple below rotates the
;# four qwords of a register by the per-lane counts read via r8 and r9.
Loop_avx2:
|
||||
vpshufd ymm13,ymm2,78
|
||||
vpxor ymm12,ymm5,ymm3
|
||||
vpxor ymm9,ymm4,ymm6
|
||||
vpxor ymm12,ymm12,ymm1
|
||||
vpxor ymm12,ymm12,ymm9
|
||||
vpermq ymm11,ymm12,147
|
||||
vpxor ymm13,ymm13,ymm2
|
||||
vpermq ymm7,ymm13,78
|
||||
vpsrlq ymm8,ymm12,63
|
||||
vpaddq ymm9,ymm12,ymm12
|
||||
vpor ymm8,ymm8,ymm9
|
||||
vpermq ymm15,ymm8,57
|
||||
vpxor ymm14,ymm8,ymm11
|
||||
vpermq ymm14,ymm14,0
|
||||
vpxor ymm13,ymm13,ymm0
|
||||
vpxor ymm13,ymm13,ymm7
|
||||
vpsrlq ymm7,ymm13,63
|
||||
vpaddq ymm8,ymm13,ymm13
|
||||
vpor ymm8,ymm8,ymm7
|
||||
vpxor ymm2,ymm2,ymm14
|
||||
vpxor ymm0,ymm0,ymm14
|
||||
vpblendd ymm15,ymm15,ymm8,192
|
||||
vpblendd ymm11,ymm11,ymm13,3
|
||||
vpxor ymm15,ymm15,ymm11
|
||||
vpsllvq ymm10,ymm2,YMMWORD PTR [r8-96]
|
||||
vpsrlvq ymm2,ymm2,YMMWORD PTR [r9-96]
|
||||
vpor ymm2,ymm2,ymm10
|
||||
vpxor ymm3,ymm3,ymm15
|
||||
vpsllvq ymm11,ymm3,YMMWORD PTR [r8-32]
|
||||
vpsrlvq ymm3,ymm3,YMMWORD PTR [r9-32]
|
||||
vpor ymm3,ymm3,ymm11
|
||||
vpxor ymm4,ymm4,ymm15
|
||||
vpsllvq ymm12,ymm4,YMMWORD PTR [r8]
|
||||
vpsrlvq ymm4,ymm4,YMMWORD PTR [r9]
|
||||
vpor ymm4,ymm4,ymm12
|
||||
vpxor ymm5,ymm5,ymm15
|
||||
vpsllvq ymm13,ymm5,YMMWORD PTR [r8+32]
|
||||
vpsrlvq ymm5,ymm5,YMMWORD PTR [r9+32]
|
||||
vpor ymm5,ymm5,ymm13
|
||||
vpxor ymm6,ymm6,ymm15
|
||||
vpermq ymm10,ymm2,141
|
||||
vpermq ymm11,ymm3,141
|
||||
vpsllvq ymm14,ymm6,YMMWORD PTR [r8+64]
|
||||
vpsrlvq ymm8,ymm6,YMMWORD PTR [r9+64]
|
||||
vpor ymm8,ymm8,ymm14
|
||||
vpxor ymm1,ymm1,ymm15
|
||||
vpermq ymm12,ymm4,27
|
||||
vpermq ymm13,ymm5,114
|
||||
vpsllvq ymm15,ymm1,YMMWORD PTR [r8-64]
|
||||
vpsrlvq ymm9,ymm1,YMMWORD PTR [r9-64]
|
||||
vpor ymm9,ymm9,ymm15
|
||||
vpsrldq ymm14,ymm8,8
|
||||
vpandn ymm7,ymm8,ymm14
|
||||
vpblendd ymm3,ymm9,ymm13,12
|
||||
vpblendd ymm15,ymm11,ymm9,12
|
||||
vpblendd ymm5,ymm10,ymm11,12
|
||||
vpblendd ymm14,ymm9,ymm10,12
|
||||
vpblendd ymm3,ymm3,ymm11,48
|
||||
vpblendd ymm15,ymm15,ymm12,48
|
||||
vpblendd ymm5,ymm5,ymm9,48
|
||||
vpblendd ymm14,ymm14,ymm13,48
|
||||
vpblendd ymm3,ymm3,ymm12,192
|
||||
vpblendd ymm15,ymm15,ymm13,192
|
||||
vpblendd ymm5,ymm5,ymm13,192
|
||||
vpblendd ymm14,ymm14,ymm11,192
|
||||
vpandn ymm3,ymm3,ymm15
|
||||
vpandn ymm5,ymm5,ymm14
|
||||
vpblendd ymm6,ymm12,ymm9,12
|
||||
vpblendd ymm15,ymm10,ymm12,12
|
||||
vpxor ymm3,ymm3,ymm10
|
||||
vpblendd ymm6,ymm6,ymm10,48
|
||||
vpblendd ymm15,ymm15,ymm11,48
|
||||
vpxor ymm5,ymm5,ymm12
|
||||
vpblendd ymm6,ymm6,ymm11,192
|
||||
vpblendd ymm15,ymm15,ymm9,192
|
||||
vpandn ymm6,ymm6,ymm15
|
||||
vpxor ymm6,ymm6,ymm13
|
||||
vpermq ymm4,ymm8,30
|
||||
vpblendd ymm15,ymm4,ymm0,48
|
||||
vpermq ymm1,ymm8,57
|
||||
vpblendd ymm1,ymm1,ymm0,192
|
||||
vpandn ymm1,ymm1,ymm15
|
||||
vpblendd ymm2,ymm11,ymm12,12
|
||||
vpblendd ymm14,ymm13,ymm11,12
|
||||
vpblendd ymm2,ymm2,ymm13,48
|
||||
vpblendd ymm14,ymm14,ymm10,48
|
||||
vpblendd ymm2,ymm2,ymm10,192
|
||||
vpblendd ymm14,ymm14,ymm12,192
|
||||
vpandn ymm2,ymm2,ymm14
|
||||
vpxor ymm2,ymm2,ymm9
|
||||
vpermq ymm7,ymm7,0
|
||||
vpermq ymm3,ymm3,27
|
||||
vpermq ymm5,ymm5,141
|
||||
vpermq ymm6,ymm6,114
|
||||
vpblendd ymm4,ymm13,ymm10,12
|
||||
vpblendd ymm14,ymm12,ymm13,12
|
||||
vpblendd ymm4,ymm4,ymm12,48
|
||||
vpblendd ymm14,ymm14,ymm9,48
|
||||
vpblendd ymm4,ymm4,ymm9,192
|
||||
vpblendd ymm14,ymm14,ymm10,192
|
||||
vpandn ymm4,ymm4,ymm14
|
||||
vpxor ymm0,ymm0,ymm7
|
||||
vpxor ymm1,ymm1,ymm8
|
||||
vpxor ymm4,ymm4,ymm11
|
||||
;# XOR this round's constant row into ymm0, then advance r10 to the next row.
vpxor ymm0,ymm0,YMMWORD PTR [r10]
|
||||
lea r10,[r10+32]
|
||||
dec eax
|
||||
jnz Loop_avx2
|
||||
|
||||
;# Write the state back in the same layout it was loaded from.
vmovq QWORD PTR [rcx-96],xmm0
|
||||
vmovdqu YMMWORD PTR [rcx-88],ymm1
|
||||
vmovdqu YMMWORD PTR [rcx-56],ymm2
|
||||
vmovdqu YMMWORD PTR [rcx-24],ymm3
|
||||
vmovdqu YMMWORD PTR [rcx+8],ymm4
|
||||
vmovdqu YMMWORD PTR [rcx+40],ymm5
|
||||
vmovdqu YMMWORD PTR [rcx+72],ymm6
|
||||
|
||||
ret
|
||||
|
||||
;# Left-shift counts for the rotations: six rows of four qwords. The round
;# code reads them through r8 = rot_left+96, so displacements -96, -64, -32,
;# 0, +32 and +64 select rows 0..5 (used for ymm2, ymm1, ymm3, ymm4, ymm5 and
;# ymm6 respectively).
ALIGN 32
|
||||
rot_left:
|
||||
dq 3, 18, 36, 41
|
||||
dq 1, 62, 28, 27
|
||||
dq 45, 6, 56, 39
|
||||
dq 10, 61, 55, 8
|
||||
dq 2, 15, 25, 20
|
||||
dq 44, 43, 21, 14
|
||||
|
||||
;# Matching right-shift counts (64 - n) for the same rows, read through r9.
ALIGN 32
|
||||
rot_right:
|
||||
dq 64-3, 64-18, 64-36, 64-41
|
||||
dq 64-1, 64-62, 64-28, 64-27
|
||||
dq 64-45, 64-6, 64-56, 64-39
|
||||
dq 64-10, 64-61, 64-55, 64-8
|
||||
dq 64-2, 64-15, 64-25, 64-20
|
||||
dq 64-44, 64-43, 21+22, 64-14
|
||||
|
||||
;# Round constants: 24 rows, one per round, each holding the same qword four
;# times. The round loop XORs one 32-byte row into ymm0 and steps r10 by 32.
ALIGN 32
|
||||
rndc:
|
||||
dq 1, 1, 1, 1
|
||||
dq 32898, 32898, 32898, 32898
|
||||
dq 9223372036854808714, 9223372036854808714, 9223372036854808714, 9223372036854808714
|
||||
dq 9223372039002292224, 9223372039002292224, 9223372039002292224, 9223372039002292224
|
||||
dq 32907, 32907, 32907, 32907
|
||||
dq 2147483649, 2147483649, 2147483649, 2147483649
|
||||
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
|
||||
dq 9223372036854808585, 9223372036854808585, 9223372036854808585, 9223372036854808585
|
||||
dq 138, 138, 138, 138
|
||||
dq 136, 136, 136, 136
|
||||
dq 2147516425, 2147516425, 2147516425, 2147516425
|
||||
dq 2147483658, 2147483658, 2147483658, 2147483658
|
||||
dq 2147516555, 2147516555, 2147516555, 2147516555
|
||||
dq 9223372036854775947, 9223372036854775947, 9223372036854775947, 9223372036854775947
|
||||
dq 9223372036854808713, 9223372036854808713, 9223372036854808713, 9223372036854808713
|
||||
dq 9223372036854808579, 9223372036854808579, 9223372036854808579, 9223372036854808579
|
||||
dq 9223372036854808578, 9223372036854808578, 9223372036854808578, 9223372036854808578
|
||||
dq 9223372036854775936, 9223372036854775936, 9223372036854775936, 9223372036854775936
|
||||
dq 32778, 32778, 32778, 32778
|
||||
dq 9223372039002259466, 9223372039002259466, 9223372039002259466, 9223372039002259466
|
||||
dq 9223372039002292353, 9223372039002292353, 9223372039002292353, 9223372039002292353
|
||||
dq 9223372036854808704, 9223372036854808704, 9223372036854808704, 9223372036854808704
|
||||
dq 2147483649, 2147483649, 2147483649, 2147483649
|
||||
dq 9223372039002292232, 9223372039002292232, 9223372039002292232, 9223372039002292232
|
Loading…
Reference in a new issue