Checking patch sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S... error: while searching for: testl %edx, %edx /* Go to special inputs processing branch. */ jne L(SPECIAL_VALUES_BRANCH) # LOE rbx rbp r12 r13 r14 r15 xmm0 /* No registers to restore on fast path. */ ret /* Cold case. edx has 1s where there was a special value that needs to be handled by a atanhf call. Optimize for code size more so than speed here. */ L(SPECIAL_VALUES_BRANCH): # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5 /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on call entry will be 16-byte aligned. */ subq $56, %rsp cfi_def_cfa_offset(64) movups %xmm0, 24(%rsp) movups %xmm5, 40(%rsp) /* Use rbx/rbp for callee save registers as they get short encoding for many instructions (as compared with r12/r13). */ movq %rbx, (%rsp) cfi_offset(rbx, -64) movq %rbp, 8(%rsp) cfi_offset(rbp, -56) /* edx has 1s where there was a special value that needs to be handled by a tanhf call. */ movl %edx, %ebx L(SPECIAL_VALUES_LOOP): # LOE rbx rbp r12 r13 r14 r15 /* use rbp as index for special value that is saved across calls to tanhf. We technically don't need a callee save register here as offset to rsp is always [0, 12] so we can restore rsp by realigning to 64. Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions in the loop. */ xorl %ebp, %ebp bsfl %ebx, %ebp /* Scalar math fucntion call to process special input. */ movss 40(%rsp, %rbp, 4), %xmm0 call atanhf@PLT /* No good way to avoid the store-forwarding fault this will cause on return. `lfence` avoids the SF fault but at greater cost as it serialized stack/callee save restoration. */ movss %xmm0, 24(%rsp, %rbp, 4) leal -1(%rbx), %eax andl %eax, %ebx jnz L(SPECIAL_VALUES_LOOP) # LOE r12 r13 r14 r15 /* All results have been written to 24(%rsp). */ movups 24(%rsp), %xmm0 movq (%rsp), %rbx cfi_restore(rbx) movq 8(%rsp), %rbp cfi_restore(rbp) addq $56, %rsp cfi_def_cfa_offset(8) ret END(_ZGVbN4v_atanhf_sse4) .section .rodata, "a" .align 16 #ifdef __svml_satanh_data_internal_typedef typedef unsigned int VUINT32; typedef struct{ __declspec(align(16)) VUINT32 sOne[4][1]; __declspec(align(16)) VUINT32 SgnMask[4][1]; __declspec(align(16)) VUINT32 sTopMask12[4][1]; __declspec(align(16)) VUINT32 iBrkValue[4][1]; __declspec(align(16)) VUINT32 iOffExpoMask[4][1]; __declspec(align(16)) VUINT32 sPoly[8][4][1]; __declspec(align(16)) VUINT32 sLn2[4][1]; __declspec(align(16)) VUINT32 TinyRange[4][1]; } __svml_satanh_data_internal; #endif __svml_satanh_data_internal: /* sOne = SP 1.0 */ .align 16 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* SgnMask */ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* sTopMask12 */ .align 16 .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000 /* iBrkValue = SP 2/3 */ .align 16 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab /* iOffExpoMask = SP significand mask ==*/ .align 16 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff /* sPoly[] = SP polynomial */ .align 16 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */ .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */ .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */ .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */ .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */ .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */ .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */ /* sLn2 = SP ln(2) */ .align 16 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* TinyRange */ .align 16 .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000 .align 16 .type __svml_satanh_data_internal, @object .size __svml_satanh_data_internal, .-__svm error: patch failed: sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S:206 error: sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S: patch does not apply