Hey guys, I need your help.
Well, I'm working on one project. I have to make two types of codes in NASM for SSE:
1. Vector*Matrix multiplication.
2. Matrix*Vector multiplication.
These codes need to be created for any size of the Matrix and Vector so I need to use loops, because with SSE I'm able to use 4 numbers within one xmm register (working with single precision floats). In case of, for example, Matrix 16*16 and Vector 16*1 I will have to perform the computation in loop 4 times. The computation itself could be made several ways. The critical for me is the loop. I use NASM and the output of my code is library (.dll). After the library is made, I use Matlab then to compare computations this way:
a) define the Matrix and Vector (using random numbers generated by Matlab) and define the size of the Matrix and Vector
b) perform computation in Matlab
c) perform computation using mentioned .dll from assembler
d) compare result
Well, after all these steps I found out that the output is different. The problem is, probably, in the loops because when I was debugging .exe file from assembler in OllyDbg, the computation of the M=4*4, V=4*1 was correct.
There are two types of loops I need. One for V*M and one for M*V.
My code for V*M:
(this one uses inside the loops shuffps instructions, this is only example, I have codes with unpack instructions and also with the transposition of matrix and so on)
______________________________________CODE V*M__________________________________________________________
vxm_sse1:
push dword ebp
mov dword ebp,esp
sub esp,4
mov dword [ebp - 4],3
mov eax,16
mul dword [ebp + 24]
push eax
mul dword [ebp - 4]
mov dword [ebp - 4],eax
pop eax
mov esi,[ebp + 12] ;input vector pointer
mov ecx,[ebp + 20] ;length of the input vector
mov edx,[ebp + 8] ;Matrix pointer
.invec1: ;1st loop
movups xmm0,[esi]
mov edi,[ebp + 16] ;output vector pointer
mov ebx,[ebp + 24] ;length of the output vector
.radmat1: ;2nd loop inside the 1st one
movups xmm4,[edx] ;begin of the computation with shufps instructions
add edx,eax
movups xmm5,[edx]
add edx,eax
movups xmm6,[edx]
add edx,eax
movups xmm7,[edx]
movups xmm2,[edi]
movups xmm1,xmm0
shufps xmm1,xmm1,0x00
mulps xmm1,xmm4
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0x55
mulps xmm1,xmm5
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xAA
mulps xmm1,xmm6
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xFF
mulps xmm1,xmm7
addps xmm2,xmm1
movups [edi],xmm2
add edi,16
sub edx,dword [ebp - 4]
add edx,16
dec ebx
jnz .radmat1 ;end of the 2nd loop inside the 1st one
add esi,16
add edx,dword [ebp - 4]
dec ecx
jnz .invec1 ;end of the 1st loop
mov dword esp,ebp
pop dword ebp
ret 20
______________________________________/CODE V*M_________________________________________________________
My code for M*V (not V*M):
(this one uses probably the most easy was - horizontal adds - haddps instructions)
______________________________________CODE V*M__________________________________________________________
mxv_sse4:
push dword ebp
mov dword ebp,esp
sub esp,4
mov dword [ebp - 4],8
mov eax,16
mul dword [ebp + 24]
push eax
mul dword [ebp - 4]
mov dword [ebp - 4],eax
pop eax
mov edx,[ebp + 8] ;Matrix pointer
mov ebx,[ebp + 24] ;length of the output vector
mov esi,[ebp + 12] ;input vector pointer
.invec4: ;1st loop
movups xmm0,[esi]
mov edi,[ebp + 16] ;output vector pointer
mov ecx,[ebp + 20] ;length of the input vector
.radmat4: ;2nd loop inside the 1st one
movups xmm4,[edx] ;begin of the computation with haddps instructions
add edx,eax
movups xmm5,[edx]
add edx,eax
movups xmm6,[edx]
add edx,eax
movups xmm7,[edx]
movups xmm2,[edi]
mulps xmm4,xmm0
mulps xmm5,xmm0
mulps xmm6,xmm0
mulps xmm7,xmm0
haddps xmm4,xmm5
haddps xmm6,xmm7
haddps xmm4,xmm6
addps xmm2,xmm4
movups [edi],xmm2
add edi,16
add edx,eax
dec ecx
jnz .radmat4 ;end of the 2nd loop inside the 1st one
add esi,16
sub edx,dword [ebp - 4]
add edx,16
dec ebx
jnz .invec4 ;end of the 1st loop
mov dword esp,ebp
pop dword ebp
ret 20
______________________________________/CODE V*M_________________________________________________________
Like I said, I think the problem is in loops, computations inside the loops look to be good. I would be really glad if you could be able to help me.
Thank you! Cheers!