NASM - The Netwide Assembler

NASM Forum => Programming with NASM => Topic started by: paml27 on March 04, 2019, 10:25:32 PM

Title: Windows multithreading problem
Post by: paml27 on March 04, 2019, 10:25:32 PM
I am creating four threads for a four-core 64-bit Windows 7 machine in NASM 64, using Windows API calls.  The complete reproducible code is below (it is written as a DLL and the entry point is Main_Entry_fn). 

The four threads are created in a loop by calls to CreateThread.  All threads call Test_Function where they write the stride value (32 in this example).  All threads are set to execute immediately on creation. 

Each thread processes an input array and writes output to an output buffer.  The first thread starts at data element 0, the second at 1, etc.  Each jumps by four (32 bytes), so that thread 1 processes 0, 4, 8, etc and thread 2 processes 1, 5, 9, etc. 

In my simple Test_Function I write the number 32 (the stride value) to each of eight locations (0, 4; 1, 5; 2, 6; 3, 7),  so I can test the function of the threads. 

The problem is that thread 1 never executes.  Threads 2-4 usually return a value, but not always.  When I call GetCurrentThreadId it returns an ID for threads 2-4, but not for thread 1.  When query the return handles from CreateThread, all four handles are returned. 

There should be no need for a lock on the data reads and writes; the output buffer is shared between all threads, but all threads writes data to unique locations in the buffer. 

Update:  I just modified the code to create only one thread, and that thread executes now whereas if I create all four threads it doesn't. 

; Header Section
[BITS 64]
[default rel]
extern malloc, calloc, realloc, free
global Main_Entry_fn
export Main_Entry_fn
global FreeMem_fn
export FreeMem_fn
extern CreateThread, CloseHandle, ResumeThread
extern WaitForMultipleObjects, GetCurrentThreadId, GetLastError

section .data align=16
Return_Pointer_Array: dq 0, 0, 0
list_of_results_ptr: dq 0
list_of_results_ctr: dq 0
list_of_results_length: dq 0
data_master_ptr: dq 0
initial_dynamic_length: dq 0
numbers_ptr: dq 0
numbers_length: dq 0
numbers_ctr: dq 0
ThreadCount:  times 4 dq 0
ThreadInfo: times 8 dq 0
ThreadHandles: times 4 dq 0
ThreadAttributes: times 4 dq 0
StartByte: dq 0
stride: dq 32

section .text


; Populate the ThreadInfo array with vars to pass
; ThreadInfo: length, startbyte, stride, vars into registers on entry to each core

mov rdi,ThreadInfo
mov rax,0 ;ThreadInfoLength
mov [rdi],rax      ; length (number of vars into registers plus 3 elements)
mov rax,[stride]
mov [rdi+16],rax   ; 8 x number of cores (32 in this example)
mov [rdi+24],r15

mov rbp,rsp ; preserve caller's stack frame
sub rsp,56 ; Shadow space (was 32)
; _____


mov rdi,ThreadInfo
mov rax,[StartByte]
mov [rdi+8],rax   ; 0, 8, 16, or 24
; _____
; Create Threads

mov rcx,ThreadAttributes   ; lpThreadAttributes (Security Attributes)
mov rdx,0            ; dwStackSize
mov r8,Test_Function      ; lpStartAddress (function pointer)
mov r9,ThreadInfo      ; lpParameter (array of data passed to each core)

mov rax,0
mov [rsp+32],rax         ; use default creation flags
mov rdi,ThreadCount
mov [rsp+40],rdi         ; ThreadID

call CreateThread

; Move the handle into ThreadHandles array
mov rdi,ThreadHandles
mov rcx,[StartByte]
mov [rdi+rcx],rax

mov rax,[StartByte]
add rax,8
mov [StartByte],rax

mov rbx,32
cmp rax,rbx
jl label_0

; _____
; Wait

mov rcx,4 ;rax         ; number of handles
mov rdx,ThreadHandles      ; pointer to handles array
mov r8,0            ; wait for all threads to complete
mov r9,5000         ; milliseconds to wait

call WaitForMultipleObjects

; _____

mov rsp,rbp ; Unwind the stack
jmp label_900



; Populate registers
mov rdi,rcx
mov r15,[rdi+24]

mov rcx,[rdi+8] ; start byte
mov rax,[rdi+16] ; stride
cvtsi2sd xmm0,rax
movsd [r15+rcx],xmm0
add rcx,32
movsd [r15+rcx],xmm0


; __________

mov rdi,ThreadHandles
mov rcx,[rdi+0]
call CloseHandle
mov rcx,[rdi+8]
call CloseHandle
mov rcx,[rdi+16]
call CloseHandle
mov rcx,[rdi+24]
call CloseHandle

mov rdi,Return_Pointer_Array
mov rax,r15
mov [rdi+0],rax
mov rax,80 ;r14
mov [rdi+8],rax
mov rax,rdi
;Free the memory
;The pointer is passed back in rcx (of course)
sub rsp,40
call free
add rsp,40
; __________
; Main Entry
push rdi
push rbp
push rbx

push r15
xor r15,r15

mov [numbers_ptr],rcx
mov [data_master_ptr],rdx
; Now assign lengths
lea rdi,[data_master_ptr]
mov rbp,[rdi]
xor rcx,rcx
movsd xmm0,qword[rbp+rcx]
cvttsd2si rax,xmm0
mov [numbers_length],rax
add rcx,8
; __________
; malloc for dynamic arrays
lea rdi,[data_master_ptr]
mov rbp,[rdi]
movsd xmm0,qword[rbp]
cvttsd2si rax,xmm0
mov r8,rax
;Allocate 10 times size of input array
;but use the size of input array if it exceeds 10MB
mov rdx,10
mul rdx
mov rdx,10000000
cmp rax,rdx
jl malloc_next
mov rax,r8
mov rax,50000000
mov [initial_dynamic_length],rax
mov rcx,qword[initial_dynamic_length] ; Initial size
xor rax,rax
sub rsp,40
call malloc
mov qword [list_of_results_ptr],rax
add rsp,40
mov rax,qword[initial_dynamic_length]
mov [list_of_results_length],rax
; __________

lea rdi,[rel list_of_results_ptr]
mov r15,qword[rdi]
; __________

call Init_Cores_fn

pop r15
pop rbx
pop rbp
pop rdi
Title: Re: Windows multithreading problem
Post by: paml27 on March 06, 2019, 07:23:37 PM
The answer to this question is at 
Title: Re: Windows multithreading problem
Post by: Frank Kotler on March 06, 2019, 09:23:06 PM
Hi paml27,

Thanks for sharing the answer!