Author Topic: [NASM] Matrix/Vector multiplication loops with SSE (Read 19878 times)

coldflame · « **on:** May 06, 2015, 03:19:57 PM »

Hey guys, I need your help.

Well, I'm working on one project. I have to make two types of codes in NASM for SSE:
1. Vector*Matrix multiplication.
2. Matrix*Vector multiplication.

These routines have to work for any size of matrix and vector, so I need loops: with SSE one xmm register holds 4 single-precision floats, so for a 16*16 matrix and a 16*1 vector, for example, the computation has to run in a loop 4 times. The computation itself can be done in several ways; the critical part for me is the loop. I use NASM and build the code as a library (.dll). Once the library is built, I use Matlab to compare the computations like this:

a) define the Matrix and Vector (using random numbers generated by Matlab) and define the size of the Matrix and Vector
b) perform computation in Matlab
c) perform computation using mentioned .dll from assembler
d) compare result

Well, after all these steps I found that the outputs differ. The problem is probably in the loops, because when I debugged the .exe built from the assembly in OllyDbg, the computation for M=4*4, V=4*1 was correct.

There are two types of loops I need. One for V*M and one for M*V.
My code for V*M:
(this one uses shufps instructions inside the loops; it is only an example - I also have versions that use unpack instructions, a transposed matrix, and so on)

______________________________________CODE V*M__________________________________________________________
;-----------------------------------------------------------------------
; vxm_sse1 -- row vector times matrix, single precision, SSE (shufps).
;
;       out[j] += sum over i of in[i] * M[i][j]
;
; ABI:   32-bit, five dword stack arguments, callee pops them (ret 20).
; In:    [ebp +  8] = matrix pointer; walked as rows of 4*out_len floats
;        [ebp + 12] = input vector pointer
;        [ebp + 16] = output vector pointer
;        [ebp + 20] = input length, counted in blocks of 4 floats
;        [ebp + 24] = output length, counted in blocks of 4 floats
; Out:   output vector updated in place. The routine ADDS into it, so the
;        caller must clear it first to get a plain product.
; Clobb: eax, ecx, edx, xmm0-xmm2, xmm4-xmm7, flags
;        ebx, esi, edi are saved and restored (callee-saved registers).
; Notes: all loads/stores are unaligned (movups); no alignment required.
;        A zero length in either argument returns without touching memory.
;-----------------------------------------------------------------------
vxm_sse1:
        push    ebp
        mov     ebp, esp
        sub     esp, 4                  ; [ebp - 4] = 3 * row stride (bytes)
        push    ebx                     ; callee-saved: the caller expects
        push    esi                     ; these three to survive the call
        push    edi

        mov     ecx, [ebp + 20]         ; ecx = input blocks still to process
        test    ecx, ecx
        jz      .done                   ; dec/jnz would wrap on a zero count
        mov     eax, [ebp + 24]
        test    eax, eax
        jz      .done

        shl     eax, 4                  ; eax = row stride = 16 * out_len bytes
        lea     edx, [eax + eax*2]
        mov     [ebp - 4], edx          ; 3 strides: distance row 0 -> row 3

        mov     esi, [ebp + 12]         ; esi = current input block
        mov     edx, [ebp + 8]          ; edx = current position in the matrix

.invec1:                                ; one pass per 4 input elements
        movups  xmm0, [esi]             ; xmm0 = in[4k .. 4k+3]
        mov     edi, [ebp + 16]         ; restart at out[0]; sums accumulate
        mov     ebx, [ebp + 24]         ; ebx = output blocks left in this pass

.radmat1:                               ; one pass per 4 output elements
        movups  xmm4, [edx]             ; 4 columns of row 4k
        add     edx, eax
        movups  xmm5, [edx]             ; ... of row 4k+1
        add     edx, eax
        movups  xmm6, [edx]             ; ... of row 4k+2
        add     edx, eax
        movups  xmm7, [edx]             ; ... of row 4k+3
        movups  xmm2, [edi]             ; running sums for these 4 outputs

        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0x00        ; broadcast in[4k]
        mulps   xmm1, xmm4
        addps   xmm2, xmm1
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0x55        ; broadcast in[4k+1]
        mulps   xmm1, xmm5
        addps   xmm2, xmm1
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0xAA        ; broadcast in[4k+2]
        mulps   xmm1, xmm6
        addps   xmm2, xmm1
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0xFF        ; broadcast in[4k+3]
        mulps   xmm1, xmm7
        addps   xmm2, xmm1
        movups  [edi], xmm2

        add     edi, 16                 ; next 4 outputs
        sub     edx, [ebp - 4]          ; back up from row 4k+3 to row 4k ...
        add     edx, 16                 ; ... and step to the next 4 columns

        dec     ebx
        jnz     .radmat1

        ; Invariant: the inner loop moved edx by exactly one row stride, so
        ; edx is at the start of row 4k+1. Three more strides reach row 4k+4.
        add     esi, 16
        add     edx, [ebp - 4]
        dec     ecx
        jnz     .invec1

.done:
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, ebp
        pop     ebp
        ret     20
______________________________________/CODE V*M_________________________________________________________

My code for M*V (not V*M):
(this one probably uses the easiest way - horizontal adds with haddps instructions)

______________________________________CODE M*V__________________________________________________________
;-----------------------------------------------------------------------
; mxv_sse4 -- matrix times column vector, single precision, SSE3 (haddps).
;
;       out[i] += sum over j of M[i][j] * in[j]
;
; ABI:   32-bit, five dword stack arguments, callee pops them (ret 20).
; In:    [ebp +  8] = matrix pointer; walked as rows of 4*in_len floats
;        [ebp + 12] = input vector pointer
;        [ebp + 16] = output vector pointer
;        [ebp + 20] = input length, counted in blocks of 4 floats
;        [ebp + 24] = output length, counted in blocks of 4 floats
; Out:   output vector updated in place. The routine ADDS into it, so the
;        caller must clear it first to get a plain product.
; Clobb: eax, ecx, edx, xmm0, xmm2, xmm4-xmm7, flags
;        ebx, esi, edi are saved and restored (callee-saved registers).
; Notes: haddps needs SSE3. All loads/stores are unaligned (movups).
;        A zero length in either argument returns without touching memory.
;
; Loop shape: the outer loop takes one block of 4 input elements (4 matrix
; columns); the inner loop walks every group of 4 rows for that column
; block. The rewind after the inner loop is therefore
; 4 * stride * out_len bytes -- it depends on the matrix height and cannot
; be a fixed multiple of the stride.
;-----------------------------------------------------------------------
mxv_sse4:
        push    ebp
        mov     ebp, esp
        sub     esp, 4                  ; [ebp - 4] = bytes walked per inner pass
        push    ebx                     ; callee-saved: the caller expects
        push    esi                     ; these three to survive the call
        push    edi

        mov     ebx, [ebp + 20]         ; ebx = input (column) blocks left
        test    ebx, ebx
        jz      .done                   ; dec/jnz would wrap on a zero count
        mov     ecx, [ebp + 24]
        test    ecx, ecx
        jz      .done

        mov     eax, ebx
        shl     eax, 4                  ; eax = row stride = 16 * in_len bytes
        imul    ecx, eax                ; stride * out_len
        shl     ecx, 2                  ; * 4 rows per group
        mov     [ebp - 4], ecx          ; = distance one inner pass moves edx

        mov     edx, [ebp + 8]          ; edx = current position in the matrix
        mov     esi, [ebp + 12]         ; esi = current input block

.invec4:                                ; one pass per 4 input elements
        movups  xmm0, [esi]             ; xmm0 = in[4k .. 4k+3]
        mov     edi, [ebp + 16]         ; restart at out[0]; sums accumulate
        mov     ecx, [ebp + 24]         ; ecx = row groups left in this pass

.radmat4:                               ; one pass per 4 output elements
        movups  xmm4, [edx]             ; row 4g,   columns 4k..4k+3
        add     edx, eax
        movups  xmm5, [edx]             ; row 4g+1
        add     edx, eax
        movups  xmm6, [edx]             ; row 4g+2
        add     edx, eax
        movups  xmm7, [edx]             ; row 4g+3
        movups  xmm2, [edi]             ; running sums for these 4 outputs

        mulps   xmm4, xmm0
        mulps   xmm5, xmm0
        mulps   xmm6, xmm0
        mulps   xmm7, xmm0
        haddps  xmm4, xmm5              ; pairwise sums of rows 4g, 4g+1
        haddps  xmm6, xmm7              ; pairwise sums of rows 4g+2, 4g+3
        haddps  xmm4, xmm6              ; xmm4 = one partial dot product per row
        addps   xmm2, xmm4
        movups  [edi], xmm2

        add     edi, 16                 ; next 4 outputs
        add     edx, eax                ; row 4g+3 -> row 4g+4
        dec     ecx
        jnz     .radmat4

        ; Invariant: edx is now one full column pass (4*stride*out_len bytes)
        ; past where this pass began. Rewind and step to the next 4 columns.
        add     esi, 16
        sub     edx, [ebp - 4]
        add     edx, 16
        dec     ebx
        jnz     .invec4

.done:
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, ebp
        pop     ebp
        ret     20
______________________________________/CODE M*V_________________________________________________________

Like I said, I think the problem is in loops, computations inside the loops look to be good. I would be really glad if you could be able to help me.

Thank you! Cheers!

Rob Neff · « **Reply #1 on:** May 06, 2015, 05:27:15 PM »

Assuming your SSE code is correct - I didn't check - it would appear the following, which I shortened your code to highlight, is the bug:

Code: [Select]

	.invec1:                                   ;1st loop
            movups xmm0,[esi]

            ;;;;
            ;;;; HERE, YOU ARE RESETTING POINTER BACK TO BEGINNING OF VECTOR FOR EACH LOOP.
            ;;;; THUS YOU LOSE ALL PREVIOUS COMPUTATIONS DONE.
            ;;;;
            mov edi,[ebp + 16]		           ;output vector pointer

            mov ebx,[ebp + 24]		           ;length of the output vector

           .radmat1:                  ;2nd loop inside the 1st one
               .
               .
               .
               dec ebx
               jnz .radmat1               ;end of the 2nd loop inside the 1st one

      	 add esi,16
         add edx,dword [ebp - 4]
         dec ecx
         jnz .invec1                                 ;end of the 1st loop

I'm pretty sure you didn't intend to overwrite previous computations as indicated by my comment within the code above.

coldflame · « **Reply #2 on:** May 06, 2015, 08:39:45 PM »

Thank you Rob, I will try to fix it and test it.

coldflame · « **Reply #3 on:** May 06, 2015, 09:15:47 PM »

Well, after the test Matlab still shows a different result from the assembled library function. I really do not know where the hell the problem is.

Rob Neff · « **Reply #4 on:** May 07, 2015, 02:16:31 PM »

Start out small. Use a simple inc opcode within the inner loop. Then verify that the output is 1 more than the input. Once you have the loop working properly then insert your SSE code. That way, only the math changes, not the logic. Hope that helps.

coldflame · « **Reply #5 on:** May 11, 2015, 12:49:28 PM »

I just noticed that the part with resetting the pointer is not a problem. Yes, I'm resetting the pointer back to the beginning, but after that, in the 2nd loop, I load the old values into xmm2:

movups xmm4,[edx] ;begin of the computation with shufps instructions
add edx,eax
movups xmm5,[edx]
add edx,eax
movups xmm6,[edx]
add edx,eax
movups xmm7,[edx]
movups xmm2,[edi] ---------> here

But thank you Rob for answers.

NASM - The Netwide Assembler

News:

Author Topic: [NASM] Matrix/Vector multiplication loops with SSE (Read 19878 times)

coldflame

[NASM] Matrix/Vector multiplication loops with SSE

Rob Neff

Re: [NASM] Matrix/Vector multiplication loops with SSE

coldflame

Re: [NASM] Matrix/Vector multiplication loops with SSE

coldflame

Re: [NASM] Matrix/Vector multiplication loops with SSE

Rob Neff

Re: [NASM] Matrix/Vector multiplication loops with SSE

coldflame

Re: [NASM] Matrix/Vector multiplication loops with SSE