Here's why... FP87 uses a stack of 8 levels, SSE/AVX uses registers (8 in i386 mode, 16 in x86-64 mode - 32 if AVX-512).
To use fp87 you need to think in RPN (Reverse Polish Notation) and be careful about stack usage. Since SysV ABI and MS-ABI requires the cleanup of the fp87 stack (keeping only st(0) if a floating point type is returned by a function), some additional instructions are executed to manage the stack.
Let's take a look in a simple function which extracts the roots of a quadratic equation:
#include <math.h>
// returns # of roots found, filling the first 2 positions of the array
// pointed by roots.
int find_roots_quad( double *roots, double a, double b, double c )
{
double d = b*b - 4*a*c;
// Obviously there are no real roots if a==0 and d < 0.
if ( a == 0.0 || d < 0.0 )
return 0;
double sqr = sqrt( d );
int nr = 1;
a += a;
*roots++ = ( -b + sqr ) / a;
if ( d > 0.0 )
{
*roots = ( -b - sqr ) / a;
nr++;
}
return nr;
}
Here's the x86-64 code, very straightforward:
; Entry:
; RDI = roots
; XMM0 = a
; XMM1 = b
; XMM2 = c
find_roots_quad:
mulsd xmm2, [four]
mulsd xmm2, xmm0 ; xmm2 = 4*a*c
movapd xmm3, xmm1
mulsd xmm3, xmm1
subsd xmm3, xmm2 ; xmm3 = b² - 4*a*c
pxor xmm2, xmm2
comisd xmm0, xmm2 ; a == 0?
je .noroots ; if yes, no roots.
comisd xmm2, xmm3 ; 0 > xmm3?
ja .noroots ; if yes, no roots.
movapd xmm5, xmm3
addsd xmm0, xmm0 ; xmm0 = 2*a
sqrtsd xmm5, xmm5 ; xmm5 = sqrt( b²-4*a*c )
movapd xmm4, xmm5
subsd xmm4, xmm1
divsd xmm4, xmm0 ; xmm4 = ( -b + sqrt( b²-4*a*c ) ) / (2 * a)
movsd [rdi], xmm4 ; store in *root.
comisd xmm3, xmm2 ; xmm3 > 0?
ja .tworoots ; if yes, two roots.
mov eax, 1
ret
align 4
.tworoots:
addsd xmm1, xmm5
xorpd xmm1, [signalbitmask] ; invert the signal bit.
divsd xmm1, xmm0 ; xmm1 = -( b + sqrt( b² - 4*a*c ) / ( 2 * a )
movsd [rdi+8], xmm1 ; store in *(root + 1).
mov eax, 2
ret
align 4
.noroots:
xor eax, eax
ret
section .rodata
align 8
four:
dq 4.0
align 16
signalbitmask:
dq 1 << 63, 0
The i386 code is a little bit more complex. Comments added to make a little bit easier for you:
struc frqstk
resd 1 ; return address.
.roots: resd 1
.a: resq 1
.b: resq 1
.c: resq 1
endstruc
find_roots_quad:
fld qword [esp + frqstk.a] ; st0=a
fld qword [esp + frqstk.b] ; st0=b, st1=a
fld st0 ; st0=b, st1=b, st2=a
fmul st0, st1 ; st0=b², st1=b, st2=a
fld dword [four] ; st0=4, st1=b², st2=b, st3=a
fmul qword [esp + frqstk.c] ; st0=4*c, st1=b², st2=b, st3=a
fmul st0, st3 ; st0=4*a*c, st1=b², st2=b, st3=a
fsubp st1, st0 ; st0=b²-4*a*c, st1=b, st2=a
fldz ; st0=0, st1=b²-4*a*c, st2=b, st3=a
fcomi st0, st3 ; st3 == st0?
je .noroots ; if yes, no roots.
fcomi st0, st1 ; 0 > st1?
ja .noroots ; if yes, no roots.
fld st1 ; st0=b²-4*a*c, st1=0, st2=b²-4*a*c, st3=b, st4=a
mov eax, [esp + frqstk.roots]
fsqrt ; st0=sqrt(b²-4*a*c), st1=0, st2=b²-4*a*c, st3=b, st4=a
fxch st4 ; st0=a, st1=0, st2=b²-4*a*c, st3=b, st4=sqrt(b²-4*a*c)
fadd st0, st0 ; st0=2*a, st1=0, st2=b²-4*a*c, st3=b, st4=sqrt(b²-4*a*c)
fld st4 ; st0=sqrt(b²-4*a*c), st1=2*a, st2=0, st3=b²-4*a*c, st4=b, st5=sqrt(b²-4*a*c)
fsub st0, st4 ; st0=-b+sqrt(b²-4*a*c), st1=2*a, st2=0, st3=b²-4*a*c, st4=b, st5=sqrt(b²-4*a*c)
fdiv st0, st1 ; st0=(-b+sqrt(b²-4*a*c))/2*a, st1=2*a, st2=0, st3=b²-4*a*c, st4=b, st5=sqrt(b²-4*a*c)
fstp qword [eax] ; store st0 in *roots.
; st0=2*a, st1=0, st2=b²-4*a*c, st3=b, st4=sqrt(b²-4*a*c)
fxch st2 ; st0=b²-4*a*c, st1=0, st2=2*a, st3=b, st4=sqrt(b²-4*a*c)
fcomip st0, st1 ; st0 > 0?
; st0=0, st1=2*a, st2=b, st3=sqrt(b²-4*a*c)
fstp st0 ; st0=2*a, st1=b, st2=sqrt(b²-4*a*c)
ja .two_roots ; if yes, 2 roots.
fstp st0 ; pop all remaining 3 elements.
fstp st0
fstp st0
mov eax, 1 ; 1 root
ret
align 4
.two_roots:
fxch st1 ; st0=b, st1=2*a, st2=sqrt(b²-4*a*c)
faddp st2, st0 ; st0=2*a, st1=b+sqrt(b²-4*a*c)
fxch st1 ; st0=b+sqrt(b²-4*a*c), st1=2*a
fchs ; st0=-(b+sqrt(b²-4*a*c)), st1=2*a
fdivrp st1, st0 ; st0=-(b+sqrt(b²-4*a*c))/(2*a)
fstp qword [eax+8] ; store st0 at *(roots + 1).
mov eax, 2 ; 2 roots
ret
align 4
.noroots:
fstp st0 ; pops elements from stack.
fstp st0
fstp st0
fstp st0
xor eax, eax ; no roots.
ret
section .rodata
align 4
four:
dd 4.0
The fp87 is needed on those rare occasions where you must deal with extended precision (long double).