From 5e66efabcf12e9b89265cd81d0526380ffe3111e Mon Sep 17 00:00:00 2001 From: SChernykh Date: Thu, 19 Oct 2023 17:39:25 +0200 Subject: [PATCH] ARM64 JIT: don't use `x18` register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms > The platforms reserve register x18. Don’t use this register. This PR fixes invalid hashes when running on Apple silicon with the latest macOS SDK. --- src/crypto/randomx/jit_compiler_a64.cpp | 54 +++++------ src/crypto/randomx/jit_compiler_a64_static.S | 98 ++++++++++---------- 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/src/crypto/randomx/jit_compiler_a64.cpp b/src/crypto/randomx/jit_compiler_a64.cpp index 530658db..05dac9f7 100644 --- a/src/crypto/randomx/jit_compiler_a64.cpp +++ b/src/crypto/randomx/jit_compiler_a64.cpp @@ -131,8 +131,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -148,16 +148,16 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w18, w18, CacheLineAlignMask + // and w20, w20, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 18 | (18 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (20 << 5) | ((RandomX_CurrentConfig.Log2_DatasetBaseSize - 7) << 10), code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); @@ -189,8 +189,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((RandomX_CurrentConfig.Log2_ScratchpadL3 - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -206,8 +206,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; @@ -477,7 +477,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, } else { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMovImmediate(tmp_reg, imm, code, k); // add dst, src, tmp_reg @@ -526,7 +526,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co uint32_t k = codePos; uint32_t imm = instr.getImm32(); - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? (RandomX_CurrentConfig.ScratchpadL1_Size - 1) : (RandomX_CurrentConfig.ScratchpadL2_Size - 1); emitAddImmediate(tmp_reg, src, imm, code, k); @@ -580,7 +580,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // add dst, dst, tmp_reg @@ -618,7 +618,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -637,7 +637,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -655,7 +655,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -686,7 +686,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // umulh dst, dst, tmp_reg @@ -717,7 +717,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // smulh dst, dst, tmp_reg @@ -735,7 +735,7 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; constexpr uint64_t N = 1ULL << 63; @@ -754,9 +754,9 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) literalPos -= sizeof(uint64_t); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); - if (literal_id < 13) + if (literal_id < 12) { - static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 }; // mul dst, dst, literal_reg emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); @@ -794,7 +794,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -812,7 +812,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // eor dst, dst, tmp_reg @@ -850,7 +850,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) if (src != dst) { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; // sub tmp_reg, xzr, src emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); @@ -878,7 +878,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); @@ -1026,7 +1026,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; // ror tmp_reg, src, imm @@ -1050,7 +1050,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; uint32_t imm = instr.getImm32(); diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index 95a5c92c..e019c6b4 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -72,9 +72,9 @@ # x15 -> "r7" # x16 -> spAddr0 # x17 -> spAddr1 -# x18 -> temporary +# x18 -> unused (platform register, don't touch it) # x19 -> temporary -# x20 -> literal for IMUL_RCP +# x20 -> temporary # x21 -> literal for IMUL_RCP # x22 -> literal for IMUL_RCP # x23 -> literal for IMUL_RCP @@ -109,7 +109,7 @@ DECL(randomx_program_aarch64): # Save callee-saved registers sub sp, sp, 192 stp x16, x17, [sp] - stp x18, x19, [sp, 16] + str x19, [sp, 16] stp x20, x21, [sp, 32] stp x22, x23, [sp, 48] stp x24, x25, [sp, 64] @@ -164,7 +164,6 @@ DECL(randomx_program_aarch64): # Read literals ldr x0, literal_x0 ldr x11, literal_x11 - ldr x20, literal_x20 ldr x21, literal_x21 ldr x22, literal_x22 ldr x23, literal_x23 @@ -196,11 +195,11 @@ DECL(randomx_program_aarch64): DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; - lsr x18, x10, 32 + lsr x20, x10, 32 # Actual mask will be inserted by JIT compiler and w16, w10, 1 - and w17, w18, 1 + and w17, w20, 1 # x16 = scratchpad + spAddr0 # x17 = scratchpad + spAddr1 @@ -208,31 +207,31 @@ DECL(randomx_program_aarch64_main_loop): add x17, x17, x2 # xor integer registers with scratchpad data (spAddr0) - ldp x18, x19, [x16] - eor x4, x4, x18 + ldp x20, x19, [x16] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x16, 16] - eor x6, x6, x18 + ldp x20, x19, [x16, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x16, 32] - eor x12, x12, x18 + ldp x20, x19, [x16, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x16, 48] - eor x14, x14, x18 + ldp x20, x19, [x16, 48] + eor x14, x14, x20 eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x18, x19, [x17] - ins v16.d[0], x18 + ldpsw x20, x19, [x17] + ins v16.d[0], x20 ins v16.d[1], x19 - ldpsw x18, x19, [x17, 8] - ins v17.d[0], x18 + ldpsw x20, x19, [x17, 8] + ins v17.d[0], x20 ins v17.d[1], x19 - ldpsw x18, x19, [x17, 16] - ins v18.d[0], x18 + ldpsw x20, x19, [x17, 16] + ins v18.d[0], x20 ins v18.d[1], x19 - ldpsw x18, x19, [x17, 24] - ins v19.d[0], x18 + ldpsw x20, x19, [x17, 24] + ins v19.d[0], x20 ins v19.d[1], x19 scvtf v16.2d, v16.2d scvtf v17.2d, v17.2d @@ -240,17 +239,17 @@ DECL(randomx_program_aarch64_main_loop): scvtf v19.2d, v19.2d # Load group E registers (spAddr1) - ldpsw x18, x19, [x17, 32] - ins v20.d[0], x18 + ldpsw x20, x19, [x17, 32] + ins v20.d[0], x20 ins v20.d[1], x19 - ldpsw x18, x19, [x17, 40] - ins v21.d[0], x18 + ldpsw x20, x19, [x17, 40] + ins v21.d[0], x20 ins v21.d[1], x19 - ldpsw x18, x19, [x17, 48] - ins v22.d[0], x18 + ldpsw x20, x19, [x17, 48] + ins v22.d[0], x20 ins v22.d[1], x19 - ldpsw x18, x19, [x17, 56] - ins v23.d[0], x18 + ldpsw x20, x19, [x17, 56] + ins v23.d[0], x20 ins v23.d[1], x19 scvtf v20.2d, v20.2d scvtf v21.2d, v21.2d @@ -273,7 +272,6 @@ DECL(randomx_program_aarch64_vm_instructions): literal_x0: .fill 1,8,0 literal_x11: .fill 1,8,0 -literal_x20: .fill 1,8,0 literal_x21: .fill 1,8,0 literal_x22: .fill 1,8,0 literal_x23: .fill 1,8,0 @@ -309,17 +307,17 @@ DECL(randomx_program_aarch64_vm_instructions_end): lsr x10, x9, 32 # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w18, w9 + mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x18, x18, 1 - add x18, x18, x1 + and x20, x20, 1 + add x20, x20, x1 # Prefetch dataset data - prfm pldl2strm, [x18] + prfm pldl2strm, [x20] # mx <-> ma ror x9, x9, 32 @@ -331,17 +329,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_xor_with_dataset_line): # xor integer registers with dataset data - ldp x18, x19, [x10] - eor x4, x4, x18 + ldp x20, x19, [x10] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x10, 16] - eor x6, x6, x18 + ldp x20, x19, [x10, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x10, 32] - eor x12, x12, x18 + ldp x20, x19, [x10, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x10, 48] - eor x14, x14, x18 + ldp x20, x19, [x10, 48] + eor x14, x14, x20 eor x15, x15, x19 DECL(randomx_program_aarch64_update_spMix1): @@ -384,7 +382,7 @@ DECL(randomx_program_aarch64_update_spMix1): # Restore callee-saved registers ldp x16, x17, [sp] - ldp x18, x19, [sp, 16] + ldr x19, [sp, 16] ldp x20, x21, [sp, 32] ldp x22, x23, [sp, 48] ldp x24, x25, [sp, 64] @@ -405,7 +403,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light): stp x2, x30, [sp, 80] # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # mx <-> ma ror x9, x9, 32 @@ -447,8 +445,8 @@ DECL(randomx_program_aarch64_light_dataset_offset): # x3 -> end item DECL(randomx_init_dataset_aarch64): - # Save x30 (return address) - str x30, [sp, -16]! + # Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address) + stp x20, x30, [sp, -16]! # Load pointer to cache memory ldr x0, [x0] @@ -460,8 +458,8 @@ DECL(randomx_init_dataset_aarch64_main_loop): cmp x2, x3 bne DECL(randomx_init_dataset_aarch64_main_loop) - # Restore x30 (return address) - ldr x30, [sp], 16 + # Restore x20 and x30 + ldp x20, x30, [sp], 16 ret