Restored OpenCL interleave.

2019-09-06 11:43:02 +07:00 · 2019-09-06 11:43:02 +07:00 · 9dc2525ce1
commit 9dc2525ce1
parent 0e362f38bc
11 changed files with 258 additions and 14 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,6 +16,7 @@ option(WITH_ASM             "Enable ASM PoW implementations" ON)
 option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF)
 option(WITH_OPENCL          "Enable OpenCL backend" OFF)
 option(WITH_STRICT_CACHE    "Enable strict checks for OpenCL cache" ON)
+option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)

 option(BUILD_STATIC         "Build static binary" OFF)
 option(ARM_TARGET           "Force use specific ARM target 8 or 7" 0)
--- a/src/backend/common/Worker.h
+++ b/src/backend/common/Worker.h
@ -28,7 +28,7 @@


 #include <atomic>
-#include <stdint.h>
+#include <cstdint>


 #include "backend/common/interfaces/IWorker.h"
--- a/src/backend/opencl/OclConfig.cpp
+++ b/src/backend/opencl/OclConfig.cpp
@ -158,7 +158,7 @@ std::vector<xmrig::OclLaunchData> xmrig::OclConfig::get(const Miner *miner, cons
        return out;
    }

-    out.reserve(threads.count());
+    out.reserve(threads.count() * 2);

    for (const OclThread &thread : threads.data()) {
        if (thread.index() >= devices.size()) {
@ -166,8 +166,18 @@ std::vector<xmrig::OclLaunchData> xmrig::OclConfig::get(const Miner *miner, cons
            continue;
        }

-        for (int64_t affinity : thread.threads()) {
-            out.emplace_back(miner, algorithm, *this, platform, thread, devices[thread.index()], affinity);
+        if (thread.threads().size() > 1) {
+            auto interleave = std::make_shared<OclInterleave>(thread.threads().size());
+
+            for (int64_t affinity : thread.threads()) {
+                OclLaunchData data(miner, algorithm, *this, platform, thread, devices[thread.index()], affinity);
+                data.interleave = interleave;
+
+                out.emplace_back(data);
+            }
+        }
+        else {
+           out.emplace_back(miner, algorithm, *this, platform, thread, devices[thread.index()], thread.threads()[0]);
        }
    }

--- a/src/backend/opencl/OclInterleave.cpp
+++ b/src/backend/opencl/OclInterleave.cpp
@ -0,0 +1,118 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "backend/opencl/OclInterleave.h"
+#include "base/io/log/Log.h"
+#include "base/tools/Chrono.h"
+
+
+#include <cinttypes>
+#include <thread>
+
+
+/**
+ * Pauses the calling worker when it starts a round too soon after the previous
+ * call made through this object.
+ *
+ * With N threads and an average round time T the ideal spacing between starts is
+ * T/N. A start that comes sooner than m_threshold * T/N after the previous one is
+ * put to sleep for the missing part of that slot. Once the first adjustment has
+ * been made the threshold is set to 0.75.
+ *
+ * @param id  worker id, only used by the optional debug log line.
+ * @return    number of milliseconds slept, 0 when no pause was needed.
+ */
+uint64_t xmrig::OclInterleave::adjustDelay(size_t id)
+{
+    const uint64_t now = Chrono::steadyMSecs();
+    uint64_t pause     = 0;
+
+    {
+        std::lock_guard<std::mutex> guard(m_mutex);
+
+        const uint64_t elapsed = now - m_timestamp;
+        const double slot      = m_averageRunTime / m_threads;
+
+        m_timestamp = now;
+
+        if (elapsed > 0 && elapsed < m_threshold * slot) {
+            pause       = static_cast<uint64_t>(slot - elapsed);
+            m_threshold = 0.75;
+        }
+    }
+
+    if (pause != 0) {
+        // A computed pause of 400 ms or more is replaced by a fixed 200 ms sleep.
+        if (pause >= 400) {
+            pause = 200;
+        }
+
+        // The mutex is already released here, other workers are not blocked by the sleep.
+        std::this_thread::sleep_for(std::chrono::milliseconds(pause));
+
+#       ifdef XMRIG_INTERLEAVE_DEBUG
+        LOG_WARN("Thread #%zu was paused for %" PRIu64 " ms to adjust interleaving", id, pause);
+#       endif
+    }
+
+    return pause;
+}
+
+
+/**
+ * Staggers the restart of the workers that share this object.
+ *
+ * Every call takes the current resume counter k, increments it, and sleeps for
+ * k * T / N / 1.25 milliseconds (capped at 1000 ms), where T is the average round
+ * time and N the number of threads. A call that sees k == 0 is not delayed.
+ *
+ * @param id  worker id, only used by the optional debug log line.
+ * @return    number of milliseconds slept, 0 when the caller was not delayed.
+ */
+uint64_t xmrig::OclInterleave::resumeDelay(size_t id)
+{
+    uint64_t delay = 0;
+
+    {
+        // Dividing by a value above 1 shortens every slot a little compared to a plain T/N spacing.
+        constexpr double firstThreadSpeedupCoeff = 1.25;
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+        delay = static_cast<uint64_t>(m_resumeCounter * m_averageRunTime / m_threads / firstThreadSpeedupCoeff);
+        ++m_resumeCounter;
+    }
+
+    if (delay == 0) {
+        return 0;
+    }
+
+    // Never hold a worker back for more than one second.
+    if (delay > 1000) {
+        delay = 1000;
+    }
+
+#   ifdef XMRIG_INTERLEAVE_DEBUG
+    LOG_WARN("Thread #%zu will be paused for %" PRIu64 " ms before resuming", id, delay);
+#   endif
+
+    // The mutex is already released here, other workers are not blocked by the sleep.
+    std::this_thread::sleep_for(std::chrono::milliseconds(delay));
+
+    return delay;
+}
+
+
+/**
+ * Overwrites the counter that resumeDelay() multiplies its delay with.
+ *
+ * @param value  new counter value; with 0 the next resumeDelay() call is not delayed.
+ */
+void xmrig::OclInterleave::setResumeCounter(uint32_t value)
+{
+    const std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_resumeCounter = value;
+}
+
+
+/**
+ * Feeds the duration of one finished round into the moving average of the run time.
+ *
+ * The average is updated as  average = average * (1 - w) + time * w  with w = 0.1,
+ * i.e. the newest sample contributes 10% and the previous average 90%.
+ * A weight of 1.0 would keep only the newest sample, 0.5 would weigh it the same
+ * as everything before it.
+ *
+ * @param time  duration of the last round, in the unit of Chrono::steadyMSecs().
+ */
+void xmrig::OclInterleave::setRunTime(uint64_t time)
+{
+    constexpr double sampleWeight  = 0.1;
+    constexpr double historyWeight = 1.0 - sampleWeight;
+
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    const double history = m_averageRunTime * historyWeight;
+    const double sample  = time * sampleWeight;
+
+    m_averageRunTime = history + sample;
+}
--- a/src/backend/opencl/OclInterleave.h
+++ b/src/backend/opencl/OclInterleave.h
@ -0,0 +1,63 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_OCLINTERLEAVE_H
+#define XMRIG_OCLINTERLEAVE_H
+
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+
+namespace xmrig {
+
+
+/**
+ * Pacing state shared by several OpenCL worker threads so that their rounds can be
+ * started spread out in time instead of all at once.
+ *
+ * Every member function that touches the mutable members takes m_mutex.
+ */
+class OclInterleave
+{
+public:
+    OclInterleave() = delete;
+
+    /// @param threads  number of worker threads that share this object.
+    /// Explicit, so that a plain number never silently converts into a pacing object.
+    explicit OclInterleave(size_t threads) : m_threads(threads) {}
+
+    /// Sleeps when the caller started too soon after the previous call; returns the milliseconds slept.
+    uint64_t adjustDelay(size_t id);
+
+    /// Sleeps according to the current resume counter and increments it; returns the milliseconds slept.
+    uint64_t resumeDelay(size_t id);
+
+    /// Overwrites the resume counter used by resumeDelay().
+    void setResumeCounter(uint32_t value);
+
+    /// Feeds the duration of one finished round into the average run time.
+    void setRunTime(uint64_t time);
+
+private:
+    const size_t m_threads;             // N: number of workers sharing this object
+    double m_averageRunTime   = 0.0;    // T: moving average of the round time, fed by setRunTime()
+    double m_threshold        = 0.95;   // fraction of the T/N slot below which adjustDelay() pauses the caller
+    std::mutex m_mutex;                 // guards all mutable members
+    uint32_t m_resumeCounter  = 0;      // multiplier applied by the next resumeDelay() call
+    uint64_t m_timestamp      = 0;      // time of the latest adjustDelay() call
+};
+
+
+using OclInterleavePtr = std::shared_ptr<OclInterleave>;
+
+
+} /* namespace xmrig */
+
+
+#endif /* XMRIG_OCLINTERLEAVE_H */
--- a/src/backend/opencl/OclLaunchData.h
+++ b/src/backend/opencl/OclLaunchData.h
@ -27,6 +27,7 @@
 #define XMRIG_OCLLAUNCHDATA_H


+#include "backend/opencl/OclInterleave.h"
 #include "backend/opencl/OclThread.h"
 #include "backend/opencl/wrappers/OclDevice.h"
 #include "backend/opencl/wrappers/OclPlatform.h"
@ -64,6 +65,7 @@ public:
    const OclDevice device;
    const OclPlatform platform;
    const OclThread thread;
+    OclInterleavePtr interleave;
 };


--- a/src/backend/opencl/OclThread.h
+++ b/src/backend/opencl/OclThread.h
@ -26,6 +26,7 @@
 #define XMRIG_OCLTHREAD_H


+#include "crypto/common/Algorithm.h"
 #include "rapidjson/fwd.h"


@ -39,7 +40,8 @@ class OclThread
 {
 public:
    OclThread() = delete;
-    OclThread(uint32_t index, uint32_t intensity, uint32_t worksize, uint32_t stridedIndex, uint32_t memChunk, uint32_t threads) :
+    OclThread(uint32_t index, uint32_t intensity, uint32_t worksize, uint32_t stridedIndex, uint32_t memChunk, uint32_t threads, const Algorithm &algorithm) :
+        m_algorithm(algorithm),
        m_threads(threads, -1),
        m_index(index),
        m_memChunk(memChunk),
@ -72,6 +74,7 @@ public:
 private:
    inline void setIntensity(uint32_t intensity)            { m_intensity = intensity / m_worksize * m_worksize; }

+    Algorithm m_algorithm;
    int m_datasetHost       = -1;
    std::vector<int64_t> m_threads;
    uint32_t m_bfactor      = 6;
--- a/src/backend/opencl/OclWorker.cpp
+++ b/src/backend/opencl/OclWorker.cpp
@ -24,12 +24,11 @@
 */


-#include <assert.h>
-#include <thread>
-
-
 #include "backend/opencl/OclWorker.h"
+
 #include "backend/opencl/runners/OclCnRunner.h"
+#include "base/io/log/Log.h"
+#include "base/tools/Chrono.h"
 #include "core/Miner.h"
 #include "crypto/common/Nonce.h"
 #include "net/JobResults.h"
@ -40,6 +39,10 @@
 #endif


+#include <cassert>
+#include <thread>
+
+
 namespace xmrig {


@ -57,7 +60,8 @@ xmrig::OclWorker::OclWorker(size_t id, const OclLaunchData &data) :
    Worker(id, data.affinity, -1),
    m_algorithm(data.algorithm),
    m_miner(data.miner),
-    m_intensity(data.thread.intensity())
+    m_intensity(data.thread.intensity()),
+    m_interleave(data.interleave)
 {
    switch (m_algorithm.family()) {
    case Algorithm::RANDOM_X:
@ -101,6 +105,10 @@ void xmrig::OclWorker::start()

    while (Nonce::sequence(Nonce::OPENCL) > 0) {
        if (Nonce::isPaused()) {
+            if (m_interleave) {
+                m_interleave->setResumeCounter(0);
+            }
+
            do {
                std::this_thread::sleep_for(std::chrono::milliseconds(200));
            }
@ -110,11 +118,19 @@ void xmrig::OclWorker::start()
                break;
            }

+            if (m_interleave) {
+                m_interleave->resumeDelay(m_id);
+            }
+
            consumeJob();
        }

        while (!Nonce::isOutdated(Nonce::OPENCL, m_job.sequence())) {
-            storeStats();
+            if (m_interleave) {
+                m_interleave->adjustDelay(m_id);
+            }
+
+            const uint64_t t = Chrono::steadyMSecs();

            if (!m_runner->run(*m_job.nonce(), results)) {
                return;
@ -125,8 +141,8 @@ void xmrig::OclWorker::start()
            }

            m_job.nextRound(roundSize(m_intensity), m_intensity);
-            m_count += m_intensity;

+            storeStats(t);
            std::this_thread::yield();
        }

@ -144,3 +160,19 @@ void xmrig::OclWorker::consumeJob()
    m_job.add(m_miner->job(), Nonce::sequence(Nonce::OPENCL), roundSize(m_intensity) * m_intensity);
    m_runner->set(m_job.currentJob(), m_job.blob());
 }
+
+
+/**
+ * Accounts for one finished round: adds the intensity to the hash counter, reports
+ * the round duration to the interleave object (when there is one) and then calls
+ * the base class statistics update. Nothing is recorded while mining is paused.
+ *
+ * @param t  Chrono::steadyMSecs() reading taken before the round was started.
+ */
+void xmrig::OclWorker::storeStats(uint64_t t)
+{
+    if (!Nonce::isPaused()) {
+        m_count += m_intensity;
+
+        if (m_interleave) {
+            const uint64_t elapsed = Chrono::steadyMSecs() - t;
+            m_interleave->setRunTime(elapsed);
+        }
+
+        Worker::storeStats();
+    }
+}
--- a/src/backend/opencl/OclWorker.h
+++ b/src/backend/opencl/OclWorker.h
@ -42,20 +42,29 @@ class IOclRunner;
 class OclWorker : public Worker
 {
 public:
+    OclWorker()                       = delete; // a worker is only built from an id plus launch data
+    OclWorker(const OclWorker &other) = delete; // not copy-constructible
+    OclWorker(OclWorker &&other)      = delete; // not move-constructible
    OclWorker(size_t id, const OclLaunchData &data);
+
    ~OclWorker() override;

+    OclWorker &operator=(const OclWorker &other) = delete; // not copy-assignable
+    OclWorker &operator=(OclWorker &&other)      = delete; // not move-assignable
+
 protected:
    bool selfTest() override;
    void start() override;

 private:
    void consumeJob();
+    void storeStats(uint64_t ts); // ts: Chrono::steadyMSecs() reading taken before the round ran

    const Algorithm m_algorithm;
    const Miner *m_miner;
    const uint32_t m_intensity;
    IOclRunner *m_runner = nullptr;
+    OclInterleavePtr m_interleave; // pacing state copied from OclLaunchData::interleave; empty when none was assigned
    WorkerJob<1> m_job;
 };

--- a/src/backend/opencl/opencl.cmake
+++ b/src/backend/opencl/opencl.cmake
@ -13,6 +13,7 @@ if (WITH_OPENCL)
        src/backend/opencl/OclBackend.h
        src/backend/opencl/OclCache.h
        src/backend/opencl/OclConfig.h
+        src/backend/opencl/OclInterleave.h
        src/backend/opencl/OclLaunchData.h
        src/backend/opencl/OclThread.h
        src/backend/opencl/OclThreads.h
@ -38,6 +39,7 @@ if (WITH_OPENCL)
        src/backend/opencl/OclBackend.cpp
        src/backend/opencl/OclCache.cpp
        src/backend/opencl/OclConfig.cpp
+        src/backend/opencl/OclInterleave.cpp
        src/backend/opencl/OclLaunchData.cpp
        src/backend/opencl/OclThread.cpp
        src/backend/opencl/OclThreads.cpp
@ -69,6 +71,10 @@ if (WITH_OPENCL)
   else()
       remove_definitions(/DXMRIG_STRICT_OPENCL_CACHE)
   endif()
+
+   if (WITH_INTERLEAVE_DEBUG_LOG)
+       add_definitions(/DXMRIG_INTERLEAVE_DEBUG)
+   endif()
 else()
    remove_definitions(/DXMRIG_FEATURE_OPENCL)

--- a/src/backend/opencl/wrappers/OclDevice.cpp
+++ b/src/backend/opencl/wrappers/OclDevice.cpp
@ -186,7 +186,7 @@ void xmrig::OclDevice::generate(const Algorithm &algorithm, OclThreads &threads)
    const uint32_t memChunk     = getMemChunk(algorithm);
    const uint32_t threadCount  = ((globalMem() - intensity * 2 * algorithm.l3()) > 128 * oneMiB) ? 2 : 1;

-    threads.add(OclThread(index(), intensity, worksize, stridedIndex, memChunk, threadCount));
+    threads.add(OclThread(index(), intensity, worksize, stridedIndex, memChunk, threadCount, algorithm));
 }


@ -258,7 +258,7 @@ uint32_t xmrig::OclDevice::getPossibleIntensity(const Algorithm &algorithm) cons
    const size_t minFreeMem     = (maxThreads == 40000u ? 512u : 128u) * oneMiB;
    const size_t availableMem   = freeMem() - minFreeMem;
    const size_t perThread      = algorithm.l3() + 224u;
-    const uint32_t maxIntensity = static_cast<uint32_t>(availableMem / perThread);
+    const auto maxIntensity     = static_cast<uint32_t>(availableMem / perThread);

    return std::min<uint32_t>(maxThreads, maxIntensity);
 }