Author Topic: Program crashes when calling external DLL x64  (Read 5989 times)

Offline coldflame

  • Jr. Member
  • *
  • Posts: 15
Program crashes when calling external DLL x64
« on: March 23, 2017, 06:42:56 PM »

I have a DLL created in NASM and linked with GoLink. There's one function that computes Vector x Matrix multiplication. When I try to call this function in external program (Matlab), it crashes. I think there is some detail I missed in my code. I'd appreciate every help.

The code below is one that computes big matrix (in my case 4096x4096) multiplication in single precision. The final result is computed in cycles - each cycle works with 4x4 submatrix (accroding to length of the xmm registers).

To make it as clear as possible, I added some notes to help you quickly understand the code. However, I have a feeling that something is wrong with stack or just with parameter passing. The computation itself is correct (tested with debugger).

Also, in debugger the whole computation works fine, even the jump back from the function works. I have no idea why Matlab crashes.

Code: [Select]
global vxm_sse         
export vxm_sse       

section .data

section .text


mov rax,1

;fastcall x64
;rcx = Matrix pointer
;rdx => rsi = Input vector pointer
;r8 -> rax = Output vector pointer
;r9 = length of the vector


push rbp
lea rbp,[rsp + 20h]                        ; stack shadow space

lea rsi,[rdx]          ; input vector pointer from rdx to rsi

                mov qword r13,3
              mov rax,8
          mul r9                 ; RAX = value needed for Matrix pointer movement (jump to the first value in next matrix row)

              push rax
          mul qword r13
        mov qword r13,rax                             ; R13 = value needed for Matrix pointer movement (jump 3 rows back
        pop rax              ; to the value used in the beginning of the cycle)

        mov r12,rax ; RAX to R12 (RAX will be needed to get the output vector "outside" the function
        mov r11,r9 ; R9 to R11 = both now contains length of the vectors (needed for cycle decrementation)

invec1:                                              ; first loop
movups xmm0,[rsi]          ; input subvector 4x1 to xmm0
mov r10,r11 ; R11 to R10 = needed for cycle decrementation
lea rax,[r8] ; output vector pointer moved into RAX

radmat1:                                   ; second loop
movups xmm4,[rcx]           ; submatrix 4x4 from RCX to xmm4-7
add rcx,r12
movups xmm5,[rcx]
add rcx,r12
movups xmm6,[rcx]
add rcx,r12
movups xmm7,[rcx]
                                movups xmm2,[rax]          ; output subvector into xmm2

                                movups xmm1,xmm0
shufps xmm1,xmm1,0x00
mulps xmm1,xmm4
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0x55
mulps xmm1,xmm5
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xAA
mulps xmm1,xmm6
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xFF
mulps xmm1,xmm7
addps xmm2,xmm1
movups [rax],xmm2                  ; result of the partial computation into output vector pointer

add rax,16
sub rcx,r13
add rcx,16             

dec r10
jnz radmat1

add rsi,16
        sub rcx,r13
        dec r9
        jnz invec1

        lea rsp,[rbp - 20h] 
pop rbp