Even abs() comes with a performance penalty

Stefan Kanthak via cfe-dev
--- bugs-bunny.c ---
// Copyleft © 2014-2020, Stefan Kanthak <[hidden email]>

#ifdef __amd64__
__int128_t __absti2(__int128_t argument) {
    return argument < 0 ? -argument : argument;
}
#else
long long __absdi2(long long argument) {
#ifdef BUNNY
    return __builtin_llabs(argument);
#else
    return argument < 0 ? -argument : argument;
#endif // BUNNY
}

long __abssi2(long argument) {
#ifdef BUNNY
    return __builtin_labs(argument);
#else
    return argument < 0 ? -argument : argument;
#endif // BUNNY
}
#endif // __amd64__
--- EOF ---

Run clang -c -o- -O3 -S -target amd64-pc-linux bugs-bunny.c

Left: inefficient original code  # right: proper code,
                                 #        faster and 3 bytes shorter

__absti2:      # @__absti2
# %bb.0:                         # .intel_syntax noprefix
      xorl     %edx, %edx        #        mov    rax, rsi
      movq     %rdi, %rax        #        cqo
      negq     %rax              #        mov    rax, rdx
      sbbq     %rsi, %rdx        #        add    rdi, rdx
      testq    %rsi, %rsi        #        adc    rsi, rdx
      cmovnsq  %rdi, %rax        #        xor    rax, rdi
      cmovnsq  %rsi, %rdx        #        xor    rdx, rsi
      retq                       #        ret

CMOVcc introduces a data dependency here, WITHOUT any necessity!
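
In C, the right-hand column is the classic sign-mask identity. A minimal
sketch, with an illustrative function name, assuming the arithmetic right
shift for signed types that all mainstream x86 compilers provide:

__int128_t abs_ti_mask(__int128_t x) {
    // m is the mask cqo builds: 0 if x >= 0, ~0 (all ones) if x < 0
    __int128_t m = x >> 127;
    // x >= 0: (x + 0) ^  0 == x
    // x <  0: (x - 1) ^ ~0 == -x in two's complement
    return (x + m) ^ m;       // the add/adc pair, then the xor pair
}

When abs is inlined among other code, the mask can also be built with
mov+sar into any register instead of cqo, at a small size cost.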


Run clang -c -o- -O3 -S -target i386-pc-linux bugs-bunny.c

Left: inefficient original code  # right: proper code, runs even on real
                                 #        i386, not just PentiumPro+

___abssi2:    # @__abssi2
# %bb.0:                         # .intel_syntax noprefix
      movl    4(%esp), %ecx      #        mov    eax, [esp+4]
      movl    %ecx, %eax         #        cdq
      negl    %eax               #        add    eax, edx
      cmovll  %ecx, %eax         #        xor    eax, edx
      retl                       #        ret


Writing shorter code for __absdi2() for i386 is left as an
exercise to the reader.
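
A C starting point, using the same sign-mask identity (illustrative name;
the intended i386 lowering is an add/adc pair plus two xors, though clang
may well canonicalise the pattern back to its own abs expansion):

long long abs_di_mask(long long x) {
    long long m = x >> 63;    // sign mask: 0 or -1
    return (x + m) ^ m;
}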

Re: Even abs() comes with a performance penalty

Craig Topper via cfe-dev
cmov has been 1 cycle since Sandy Bridge. Moves execute in the register renamer since Ivy Bridge. So mov+neg+cmov should be faster than cdq+add+xor on modern CPUs. Furthermore, cdq really ties the hands of the register allocator so probably doesn't make sense in a larger function with abs mixed with other code.

~Craig
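
One way to test such latency claims is a dependency-chained micro-benchmark;
the sketch below (file name, function names, and iteration count are
illustrative, not from the thread) feeds each result into the next input so
the loop measures latency rather than throughput. Caveat: clang recognises
both C patterns as abs and may lower them identically, so inspect the
disassembly to confirm which instruction sequence was actually measured.

// bench.c -- build both ways, e.g.
//   clang -O2 bench.c  and  clang -O2 -DUSE_MASK bench.c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static inline int64_t abs_cmov(int64_t x) {   // typically mov+neg+cmov
    return x < 0 ? -x : x;
}

static inline int64_t abs_mask(int64_t x) {   // intended: sar/cqo+add+xor
    int64_t m = x >> 63;
    return (x + m) ^ m;
}

int main(void) {
    int64_t acc = 12345;
    clock_t t0 = clock();
    for (long i = 0; i < 1000000000L; i++) {
#ifdef USE_MASK
        acc = abs_mask(acc) - 7;  // serial chain: each input depends
#else
        acc = abs_cmov(acc) - 7;  // on the previous abs result
#endif
    }
    double s = (double)(clock() - t0) / CLOCKS_PER_SEC;
    printf("%.3f s, acc = %lld\n", s, (long long)acc);  // acc defeats DCE
    return 0;
}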


On Sun, Sep 6, 2020 at 12:30 PM Stefan Kanthak via cfe-dev <[hidden email]> wrote:
> [quoted message trimmed; identical to the original post above]

Re: Even abs() comes with a performance penalty

Craig Topper via cfe-dev
Sorry. I made a mistake. Cmov has been 1 cycle since Broadwell.

~Craig


On Sun, Sep 6, 2020 at 12:39 PM Craig Topper <[hidden email]> wrote:
> [quoted messages trimmed; identical to the posts above]
Re: Even abs() comes with a performance penalty

Stefan Kanthak via cfe-dev
"Craig Topper" <[hidden email]> wrote:

> cmov has been 1 cycle since Sandy Bridge.

That doesn't matter here. It's the data dependency it introduces.

> Moves execute in the register renamer since Ivy Bridge.

That's why my code shows 2 of them.

> So mov+neg+cmov should be faster than cdq+add+xor on modern CPUs.

But you forgot sbb + test, and the data dependency: how well does the CPU
speculate past cmovs?

> Furthermore, cdq really ties the hands of the register allocator so probably
> doesn't make sense in a larger function with abs mixed with other code.

The optimiser is free to use mov+sar then, at the expense of +4 or +6 bytes.

Ever heard of trade-off?

Stefan

On Sun, Sep 6, 2020 at 12:30 PM Stefan Kanthak via cfe-dev <[hidden email]> wrote:

> [quoted message trimmed; identical to the original post above]

Re: Even abs() comes with a performance penalty

Stefan Kanthak via cfe-dev
"Craig Topper" <[hidden email]> wrote:

> Sorry. I made a mistake. Cmov has been 1 cycle since Broadwell.

Doesn't matter, no need to worry: all instructions used below run in 1 cycle on
recent CPUs ... just like Jcc. The question/point is rather whether the CPU
can/does speculate ahead.

Stefan

On Sun, Sep 6, 2020 at 12:39 PM Craig Topper <[hidden email]> wrote:

> [quoted messages trimmed; identical to the posts above]

Re: Even abs() comes with a performance penalty

Craig Topper via cfe-dev
I was mostly speaking to abssi2. What data dependency exists for cmov that doesn’t exist for cdq+add+xor?

On Sun, Sep 6, 2020 at 1:19 PM Stefan Kanthak <[hidden email]> wrote:
"Craig Topper" <[hidden email]> wrote:



> Sorry. I made a mistake. Cmov has been 1 cycle since Broadwell.



Doesn't matter, no need to worry: all instructions used below run in 1 cycle on

recent CPUs ... just like Jcc. The question/point is but whether the CPU can/does

speculate ahead.



Stefan



On Sun, Sep 6, 2020 at 12:39 PM Craig Topper <[hidden email]> wrote:



> cmov has been 1 cycle since Sandy Bridge. Moves execute in the register

> renamer since Ivy Bridge. So mov+neg+cmov should be faster than cdq+add+xor

> on modern CPUs. Furthermore, cdq really ties the hands of the register

> allocator so probably doesn't make sense in a larger function with abs

> mixed with other code.

>

> ~Craig

>

>

> On Sun, Sep 6, 2020 at 12:30 PM Stefan Kanthak via cfe-dev <

> [hidden email]> wrote:

>

>> --- bugs-bunny.c ---

>> // Copyleft © 2014-2020, Stefan Kanthak <[hidden email]>

>>

>> #ifdef __amd64__

>> __int128_t __absti2(__int128_t argument) {

>>     return argument < 0 ? -argument : argument;

>> }

>> #else

>> long long __absdi2(long long argument) {

>> #ifdef BUNNY

>>     return __builtin_llabs(argument);

>> #else

>>     return argument < 0 ? -argument : argument;

>> #endif // BUNNY

>> }

>>

>> long __abssi2(long argument) {

>> #ifdef BUNNY

>>     return __builtin_labs(argument);

>> #else

>>     return argument < 0 ? -argument : argument;

>> #endif // BUNNY

>> }

>> #endif // __amd64__

>> --- EOF ---

>>

>> Run clang -c -o- -O3 -S -target amd64-pc-linux bugs-bunny.c

>>

>> Left: inperformant original code # right: proper code,

>>                                  #        faster and 3 bytes shorter

>>

>> __absti2:      # @__absti2

>> # %bb.0:                         # .intel_syntax noprefix

>>       xorl     %edx, %edx        #        mov    rax, rsi

>>       movq     %rdi, %rax        #        cqo

>>       negq     %rax              #        mov    rax, rdx

>>       sbbq     %rsi, %rdx        #        add    rdi, rdx

>>       testq    %rsi, %rsi        #        adc    rsi, rdx

>>       cmovnsq  %rdi, %rax        #        xor    rax, rdi

>>       cmovnsq  %rsi, %rdx        #        xor    rdx, rsi

>>       retq                       #        ret

>>

>> CMOVcc introduces a data dependency here, WITHOUT necessity!

>>

>>

>> Run clang -c -o- -O3 -S -target i386-pc-linux bugs-bunny.c

>>

>> Left: inperformant original code # right: proper code, runs even on real

>>                                  #        i386, not just PentiumPro+

>>

>> ___abssi2:    # @__abssi2

>> # %bb.0:                         # .intel_syntax noprefix

>>       movl    4(%esp), %ecx      #        mov    eax, [esp+4]

>>       movl    %ecx, %eax         #        cdq

>>       negl    %eax               #        add    eax, edx

>>       cmovll  %ecx, %eax         #        xor    eax, edx

>>       retl                       #        ret

>>

>>

>> Writing shorter code for __absdi2() for i386 is left as an

>> exercise to the reader.

>>

>> _______________________________________________

>> cfe-dev mailing list

>> [hidden email]

>> https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev

>>



--
~Craig

_______________________________________________
cfe-dev mailing list
[hidden email]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev