commit
ae6c536e98
3 changed files with 19 additions and 14 deletions
|
@ -377,12 +377,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
|
|||
_mm_store_si128(output + 1, xin1);
|
||||
_mm_store_si128(output + 2, xin2);
|
||||
_mm_store_si128(output + 3, xin3);
|
||||
output += (64 << interleave) / sizeof(__m128i);
|
||||
_mm_store_si128(output + 0, xin4);
|
||||
_mm_store_si128(output + 1, xin5);
|
||||
_mm_store_si128(output + 2, xin6);
|
||||
_mm_store_si128(output + 3, xin7);
|
||||
output += (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
_mm_store_si128(output + output_increment + 0, xin4);
|
||||
_mm_store_si128(output + output_increment + 1, xin5);
|
||||
_mm_store_si128(output + output_increment + 2, xin6);
|
||||
_mm_store_si128(output + output_increment + 3, xin7);
|
||||
|
||||
output += output_increment * 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -414,13 +417,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
|||
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
|
||||
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
|
||||
input += (64 << interleave) / sizeof(__m128i);
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + 0), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + 1), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + 2), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + 3), xout7);
|
||||
input += (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
|
||||
|
||||
input += input_increment * 2;
|
||||
i += 8;
|
||||
|
||||
if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) {
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||
|
||||
stmxcsr DWORD PTR [rsp+24]
|
||||
mov DWORD PTR [rsp+28], 24448
|
||||
mov DWORD PTR [rsp+28], 16256
|
||||
ldmxcsr DWORD PTR [rsp+28]
|
||||
|
||||
mov rcx, QWORD PTR [rbx+56]
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
movdqa XMMWORD PTR [rsp+32], xmm0
|
||||
|
||||
stmxcsr DWORD PTR [rsp+24]
|
||||
mov DWORD PTR [rsp+28], 24448
|
||||
mov DWORD PTR [rsp+28], 16256
|
||||
ldmxcsr DWORD PTR [rsp+28]
|
||||
|
||||
mov rcx, QWORD PTR [rbx+56]
|
||||
|
|
Loading…
Reference in a new issue