Hello,
I have a DLL created in NASM and linked with GoLink. There's one function that computes Vector x Matrix multiplication. When I try to call this function in external program (Matlab), it crashes. I think there is some detail I missed in my code. I'd appreciate every help.
The code below is one that computes big matrix (in my case 4096x4096) multiplication in single precision. The final result is computed in cycles - each cycle works with 4x4 submatrix (accroding to length of the xmm registers).
To make it as clear as possible, I added some notes to help you quickly understand the code. However, I have a feeling that something is wrong with stack or just with parameter passing. The computation itself is correct (tested with debugger).
Also, in debugger the whole computation works fine, even the jump back from the function works. I have no idea why Matlab crashes.
global vxm_sse
export vxm_sse
section .data
section .text
start:
DllMain:
mov rax,1
ret
;fastcall x64
;rcx = Matrix pointer
;rdx => rsi = Input vector pointer
;r8 -> rax = Output vector pointer
;r9 = length of the vector
vxm_sse:
push rbp
lea rbp,[rsp + 20h] ; stack shadow space
lea rsi,[rdx] ; input vector pointer from rdx to rsi
mov qword r13,3
mov rax,8
mul r9 ; RAX = value needed for Matrix pointer movement (jump to the first value in next matrix row)
push rax
mul qword r13
mov qword r13,rax ; R13 = value needed for Matrix pointer movement (jump 3 rows back
pop rax ; to the value used in the beginning of the cycle)
mov r12,rax ; RAX to R12 (RAX will be needed to get the output vector "outside" the function
mov r11,r9 ; R9 to R11 = both now contains length of the vectors (needed for cycle decrementation)
invec1: ; first loop
movups xmm0,[rsi] ; input subvector 4x1 to xmm0
mov r10,r11 ; R11 to R10 = needed for cycle decrementation
lea rax,[r8] ; output vector pointer moved into RAX
radmat1: ; second loop
movups xmm4,[rcx] ; submatrix 4x4 from RCX to xmm4-7
add rcx,r12
movups xmm5,[rcx]
add rcx,r12
movups xmm6,[rcx]
add rcx,r12
movups xmm7,[rcx]
movups xmm2,[rax] ; output subvector into xmm2
movups xmm1,xmm0
shufps xmm1,xmm1,0x00
mulps xmm1,xmm4
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0x55
mulps xmm1,xmm5
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xAA
mulps xmm1,xmm6
addps xmm2,xmm1
movups xmm1,xmm0
shufps xmm1,xmm1,0xFF
mulps xmm1,xmm7
addps xmm2,xmm1
movups [rax],xmm2 ; result of the partial computation into output vector pointer
add rax,16
sub rcx,r13
add rcx,16
dec r10
jnz radmat1
add rsi,16
sub rcx,r13
dec r9
jnz invec1
lea rsp,[rbp - 20h]
pop rbp
ret