From 9dc2525ce1cc21765ea9f5998d8a93fcbbd8a55c Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 6 Sep 2019 11:43:02 +0700 Subject: [PATCH] Restored OpenCL interleave. --- CMakeLists.txt | 1 + src/backend/common/Worker.h | 2 +- src/backend/opencl/OclConfig.cpp | 16 ++- src/backend/opencl/OclInterleave.cpp | 118 ++++++++++++++++++++++ src/backend/opencl/OclInterleave.h | 63 ++++++++++++ src/backend/opencl/OclLaunchData.h | 2 + src/backend/opencl/OclThread.h | 5 +- src/backend/opencl/OclWorker.cpp | 46 +++++++-- src/backend/opencl/OclWorker.h | 9 ++ src/backend/opencl/opencl.cmake | 6 ++ src/backend/opencl/wrappers/OclDevice.cpp | 4 +- 11 files changed, 258 insertions(+), 14 deletions(-) create mode 100644 src/backend/opencl/OclInterleave.cpp create mode 100644 src/backend/opencl/OclInterleave.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 73d08b76..e16e5521 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ option(WITH_ASM "Enable ASM PoW implementations" ON) option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF) option(WITH_OPENCL "Enable OpenCL backend" OFF) option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON) +option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF) option(BUILD_STATIC "Build static binary" OFF) option(ARM_TARGET "Force use specific ARM target 8 or 7" 0) diff --git a/src/backend/common/Worker.h b/src/backend/common/Worker.h index 5f5df925..f55ec8aa 100644 --- a/src/backend/common/Worker.h +++ b/src/backend/common/Worker.h @@ -28,7 +28,7 @@ #include -#include +#include #include "backend/common/interfaces/IWorker.h" diff --git a/src/backend/opencl/OclConfig.cpp b/src/backend/opencl/OclConfig.cpp index b78af3fd..4bd2ab84 100644 --- a/src/backend/opencl/OclConfig.cpp +++ b/src/backend/opencl/OclConfig.cpp @@ -158,7 +158,7 @@ std::vector xmrig::OclConfig::get(const Miner *miner, cons return out; } - out.reserve(threads.count()); + out.reserve(threads.count() * 2); for (const OclThread &thread : threads.data()) { if (thread.index() >= devices.size()) { @@ -166,8 +166,18 @@ std::vector xmrig::OclConfig::get(const Miner *miner, cons continue; } - for (int64_t affinity : thread.threads()) { - out.emplace_back(miner, algorithm, *this, platform, thread, devices[thread.index()], affinity); + if (thread.threads().size() > 1) { + auto interleave = std::make_shared(thread.threads().size()); + + for (int64_t affinity : thread.threads()) { + OclLaunchData data(miner, algorithm, *this, platform, thread, devices[thread.index()], affinity); + data.interleave = interleave; + + out.emplace_back(data); + } + } + else { + out.emplace_back(miner, algorithm, *this, platform, thread, devices[thread.index()], thread.threads()[0]); } } diff --git a/src/backend/opencl/OclInterleave.cpp b/src/backend/opencl/OclInterleave.cpp new file mode 100644 index 00000000..d7226f41 --- /dev/null +++ b/src/backend/opencl/OclInterleave.cpp @@ -0,0 +1,118 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#include "backend/opencl/OclInterleave.h" +#include "base/io/log/Log.h" +#include "base/tools/Chrono.h" + + +#include +#include + + +uint64_t xmrig::OclInterleave::adjustDelay(size_t id) +{ + const uint64_t t0 = Chrono::steadyMSecs(); + uint64_t delay = 0; + + { + std::lock_guard lock(m_mutex); + + const uint64_t dt = t0 - m_timestamp; + m_timestamp = t0; + + // The perfect interleaving is when N threads on the same GPU start with T/N interval between each other + // If a thread starts earlier than 0.75*T/N ms after the previous thread, delay it to restore perfect interleaving + if ((dt > 0) && (dt < m_threshold * (m_averageRunTime / m_threads))) { + delay = static_cast(m_averageRunTime / m_threads - dt); + m_threshold = 0.75; + } + } + + if (delay == 0) { + return 0; + } + + if (delay >= 400) { + delay = 200; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + +# ifdef XMRIG_INTERLEAVE_DEBUG + LOG_WARN("Thread #%zu was paused for %" PRIu64 " ms to adjust interleaving", id, delay); +# endif + + return delay; +} + + +uint64_t xmrig::OclInterleave::resumeDelay(size_t id) +{ + uint64_t delay = 0; + + { + constexpr const double firstThreadSpeedupCoeff = 1.25; + + std::lock_guard lock(m_mutex); + delay = static_cast(m_resumeCounter * m_averageRunTime / m_threads / firstThreadSpeedupCoeff); + ++m_resumeCounter; + } + + if (delay == 0) { + return 0; + } + + if (delay > 1000) { + delay = 1000; + } + +# ifdef XMRIG_INTERLEAVE_DEBUG + LOG_WARN("Thread #%zu will be paused for %" PRIu64 " ms to before resuming", id, delay); +# endif + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + + return delay; +} + + +void xmrig::OclInterleave::setResumeCounter(uint32_t value) +{ + std::lock_guard lock(m_mutex); + m_resumeCounter = value; +} + + +void xmrig::OclInterleave::setRunTime(uint64_t time) +{ + // averagingBias = 1.0 - only the last delta time is taken into account + // averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined + // averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined + constexpr double averagingBias = 0.1; + + std::lock_guard lock(m_mutex); + m_averageRunTime = m_averageRunTime * (1.0 - averagingBias) + time * averagingBias; +} diff --git a/src/backend/opencl/OclInterleave.h b/src/backend/opencl/OclInterleave.h new file mode 100644 index 00000000..66327295 --- /dev/null +++ b/src/backend/opencl/OclInterleave.h @@ -0,0 +1,63 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef XMRIG_OCLINTERLEAVE_H +#define XMRIG_OCLINTERLEAVE_H + + +#include +#include + + +namespace xmrig { + + +class OclInterleave +{ +public: + OclInterleave() = delete; + inline OclInterleave(size_t threads) : m_threads(threads) {} + + uint64_t adjustDelay(size_t id); + uint64_t resumeDelay(size_t id); + void setResumeCounter(uint32_t value); + void setRunTime(uint64_t time); + +private: + const size_t m_threads; + double m_averageRunTime = 0.0; + double m_threshold = 0.95; + std::mutex m_mutex; + uint32_t m_resumeCounter = 0; + uint64_t m_timestamp = 0; +}; + + +using OclInterleavePtr = std::shared_ptr; + + +} /* namespace xmrig */ + + +#endif /* XMRIG_OCLINTERLEAVE_H */ diff --git a/src/backend/opencl/OclLaunchData.h b/src/backend/opencl/OclLaunchData.h index 9b63e8e7..eb39de10 100644 --- a/src/backend/opencl/OclLaunchData.h +++ b/src/backend/opencl/OclLaunchData.h @@ -27,6 +27,7 @@ #define XMRIG_OCLLAUNCHDATA_H +#include "backend/opencl/OclInterleave.h" #include "backend/opencl/OclThread.h" #include "backend/opencl/wrappers/OclDevice.h" #include "backend/opencl/wrappers/OclPlatform.h" @@ -64,6 +65,7 @@ public: const OclDevice device; const OclPlatform platform; const OclThread thread; + OclInterleavePtr interleave; }; diff --git a/src/backend/opencl/OclThread.h b/src/backend/opencl/OclThread.h index 96ea10a6..63f4afe5 100644 --- a/src/backend/opencl/OclThread.h +++ b/src/backend/opencl/OclThread.h @@ -26,6 +26,7 @@ #define XMRIG_OCLTHREAD_H +#include "crypto/common/Algorithm.h" #include "rapidjson/fwd.h" @@ -39,7 +40,8 @@ class OclThread { public: OclThread() = delete; - OclThread(uint32_t index, uint32_t intensity, uint32_t worksize, uint32_t stridedIndex, uint32_t memChunk, uint32_t threads) : + OclThread(uint32_t index, uint32_t intensity, uint32_t worksize, uint32_t stridedIndex, uint32_t memChunk, uint32_t threads, const Algorithm &algorithm) : + m_algorithm(algorithm), m_threads(threads, -1), m_index(index), m_memChunk(memChunk), @@ -72,6 +74,7 @@ public: private: inline void setIntensity(uint32_t intensity) { m_intensity = intensity / m_worksize * m_worksize; } + Algorithm m_algorithm; int m_datasetHost = -1; std::vector m_threads; uint32_t m_bfactor = 6; diff --git a/src/backend/opencl/OclWorker.cpp b/src/backend/opencl/OclWorker.cpp index ac8c0b2d..bd4f9126 100644 --- a/src/backend/opencl/OclWorker.cpp +++ b/src/backend/opencl/OclWorker.cpp @@ -24,12 +24,11 @@ */ -#include -#include - - #include "backend/opencl/OclWorker.h" + #include "backend/opencl/runners/OclCnRunner.h" +#include "base/io/log/Log.h" +#include "base/tools/Chrono.h" #include "core/Miner.h" #include "crypto/common/Nonce.h" #include "net/JobResults.h" @@ -40,6 +39,10 @@ #endif +#include +#include + + namespace xmrig { @@ -57,7 +60,8 @@ xmrig::OclWorker::OclWorker(size_t id, const OclLaunchData &data) : Worker(id, data.affinity, -1), m_algorithm(data.algorithm), m_miner(data.miner), - m_intensity(data.thread.intensity()) + m_intensity(data.thread.intensity()), + m_interleave(data.interleave) { switch (m_algorithm.family()) { case Algorithm::RANDOM_X: @@ -101,6 +105,10 @@ void xmrig::OclWorker::start() while (Nonce::sequence(Nonce::OPENCL) > 0) { if (Nonce::isPaused()) { + if (m_interleave) { + m_interleave->setResumeCounter(0); + } + do { std::this_thread::sleep_for(std::chrono::milliseconds(200)); } @@ -110,11 +118,19 @@ void xmrig::OclWorker::start() break; } + if (m_interleave) { + m_interleave->resumeDelay(m_id); + } + consumeJob(); } while (!Nonce::isOutdated(Nonce::OPENCL, m_job.sequence())) { - storeStats(); + if (m_interleave) { + m_interleave->adjustDelay(m_id); + } + + const uint64_t t = Chrono::steadyMSecs(); if (!m_runner->run(*m_job.nonce(), results)) { return; @@ -125,8 +141,8 @@ void xmrig::OclWorker::start() } m_job.nextRound(roundSize(m_intensity), m_intensity); - m_count += m_intensity; + storeStats(t); std::this_thread::yield(); } @@ -144,3 +160,19 @@ void xmrig::OclWorker::consumeJob() m_job.add(m_miner->job(), Nonce::sequence(Nonce::OPENCL), roundSize(m_intensity) * m_intensity); m_runner->set(m_job.currentJob(), m_job.blob()); } + + +void xmrig::OclWorker::storeStats(uint64_t t) +{ + if (Nonce::isPaused()) { + return; + } + + m_count += m_intensity; + + if (m_interleave) { + m_interleave->setRunTime(Chrono::steadyMSecs() - t); + } + + Worker::storeStats(); +} diff --git a/src/backend/opencl/OclWorker.h b/src/backend/opencl/OclWorker.h index b158935e..dcee4f0e 100644 --- a/src/backend/opencl/OclWorker.h +++ b/src/backend/opencl/OclWorker.h @@ -42,20 +42,29 @@ class IOclRunner; class OclWorker : public Worker { public: + OclWorker() = delete; + OclWorker(const OclWorker &other) = delete; + OclWorker(OclWorker &&other) = delete; OclWorker(size_t id, const OclLaunchData &data); + ~OclWorker() override; + OclWorker &operator=(const OclWorker &other) = delete; + OclWorker &operator=(OclWorker &&other) = delete; + protected: bool selfTest() override; void start() override; private: void consumeJob(); + void storeStats(uint64_t ts); const Algorithm m_algorithm; const Miner *m_miner; const uint32_t m_intensity; IOclRunner *m_runner = nullptr; + OclInterleavePtr m_interleave; WorkerJob<1> m_job; }; diff --git a/src/backend/opencl/opencl.cmake b/src/backend/opencl/opencl.cmake index 8c062e95..547d5be7 100644 --- a/src/backend/opencl/opencl.cmake +++ b/src/backend/opencl/opencl.cmake @@ -13,6 +13,7 @@ if (WITH_OPENCL) src/backend/opencl/OclBackend.h src/backend/opencl/OclCache.h src/backend/opencl/OclConfig.h + src/backend/opencl/OclInterleave.h src/backend/opencl/OclLaunchData.h src/backend/opencl/OclThread.h src/backend/opencl/OclThreads.h @@ -38,6 +39,7 @@ if (WITH_OPENCL) src/backend/opencl/OclBackend.cpp src/backend/opencl/OclCache.cpp src/backend/opencl/OclConfig.cpp + src/backend/opencl/OclInterleave.cpp src/backend/opencl/OclLaunchData.cpp src/backend/opencl/OclThread.cpp src/backend/opencl/OclThreads.cpp @@ -69,6 +71,10 @@ if (WITH_OPENCL) else() remove_definitions(/DXMRIG_STRICT_OPENCL_CACHE) endif() + + if (WITH_INTERLEAVE_DEBUG_LOG) + add_definitions(/DXMRIG_INTERLEAVE_DEBUG) + endif() else() remove_definitions(/DXMRIG_FEATURE_OPENCL) diff --git a/src/backend/opencl/wrappers/OclDevice.cpp b/src/backend/opencl/wrappers/OclDevice.cpp index 179f2b69..184ab240 100644 --- a/src/backend/opencl/wrappers/OclDevice.cpp +++ b/src/backend/opencl/wrappers/OclDevice.cpp @@ -186,7 +186,7 @@ void xmrig::OclDevice::generate(const Algorithm &algorithm, OclThreads &threads) const uint32_t memChunk = getMemChunk(algorithm); const uint32_t threadCount = ((globalMem() - intensity * 2 * algorithm.l3()) > 128 * oneMiB) ? 2 : 1; - threads.add(OclThread(index(), intensity, worksize, stridedIndex, memChunk, threadCount)); + threads.add(OclThread(index(), intensity, worksize, stridedIndex, memChunk, threadCount, algorithm)); } @@ -258,7 +258,7 @@ uint32_t xmrig::OclDevice::getPossibleIntensity(const Algorithm &algorithm) cons const size_t minFreeMem = (maxThreads == 40000u ? 512u : 128u) * oneMiB; const size_t availableMem = freeMem() - minFreeMem; const size_t perThread = algorithm.l3() + 224u; - const uint32_t maxIntensity = static_cast(availableMem / perThread); + const auto maxIntensity = static_cast(availableMem / perThread); return std::min(maxThreads, maxIntensity); }