; For your study: a minimal Linux x86-64 NASM program that prints an unsigned 64-bit integer in decimal.
; Assembler setup: 64-bit code, NASM syntax.
        bits    64
        default rel                     ; bare [label] operands assemble as RIP-relative

; Read-only data.
        section .rodata
nl:     db      0x0A                    ; one line-feed byte; _start writes it after the number

; Code.
        section .text
        global  _start                  ; export the entry symbol for the linker
;-----------------------------------------------------------------------
; _start -- process entry point (Linux x86-64, raw syscalls, no libc).
;
; Prints the decimal value 12345678 followed by a newline on stdout,
; then exits with status 0.
;
; Stack: the SysV AMD64 ABI guarantees RSP is 16-byte aligned at process
;        entry, so the CALL below (which pushes an 8-byte return address)
;        already hands the callee the standard entry state
;        (RSP % 16 == 8).  No manual RSP adjustment is needed -- and none
;        has to be undone afterwards.
;-----------------------------------------------------------------------
        align   4
_start:
.SYS_WRITE  equ 1                       ; Linux x86-64 syscall number: write
.SYS_EXIT   equ 60                      ; Linux x86-64 syscall number: exit
.STDOUT     equ 1                       ; file descriptor of standard output

        mov     edi, 12345678           ; number to print (32-bit write zero-extends into RDI)
        call    printUint64Decimal

        ; write(STDOUT, nl, 1) -- emit the trailing newline.
        mov     eax, .SYS_WRITE
        mov     edi, .STDOUT
        lea     rsi, [rel nl]
        mov     edx, 1                  ; length: exactly one byte
        syscall

        ; exit(0) -- this syscall never returns.
        mov     eax, .SYS_EXIT
        xor     edi, edi                ; status = 0
        syscall
;-----------------------------------------------------------------------
; printUint64Decimal -- write an unsigned 64-bit integer to stdout in
;                       decimal (no padding, no trailing newline).
;
; ABI:   SysV AMD64, Linux syscalls
; In:    rdi = value to print
; Out:   rax = result of the write syscall, returned unchanged
;              (bytes written, or a negative errno value)
; Clobb: rcx, rdx, rsi, rdi, r8, r9, r11, flags
; Stack: 24-byte local buffer.  A 64-bit value has at most 20 decimal
;        digits; 24 is used so that, from the ABI-standard entry state
;        (RSP % 16 == 8), RSP ends up 16-byte aligned.
;
; Digits are produced least-significant first and stored backwards from
; the END of the buffer, so the finished string is contiguous and in
; reading order starting at RSI.
;-----------------------------------------------------------------------
        align   4
printUint64Decimal:
.BUF_SIZE   equ 24                      ; local digit buffer size in bytes
.SYS_WRITE  equ 1                       ; Linux x86-64 syscall number: write
.STDOUT     equ 1                       ; file descriptor of standard output

        sub     rsp, .BUF_SIZE
        lea     rsi, [rsp + .BUF_SIZE]  ; rsi = write cursor, starts one past the buffer end
        mov     r9, rsi                 ; r9  = fixed end-of-string marker (for the length)

        ; Reciprocal of 10 for division by multiplication:
        ; 0xCCCCCCCCCCCCCCCD = ceil(2^67 / 10).  Taking the high 64 bits of
        ; n * M and shifting right 3 more bits yields floor(n / 10) for
        ; every 64-bit n, avoiding the much slower DIV instruction.
        mov     r8, 0xcccccccccccccccd

.loop:
        ; Invariant: rdi = remaining value, rsi = address of the last digit stored.
        mov     rax, rdi
        mul     r8                      ; rdx:rax = n * M
        shr     rdx, 3                  ; rdx = q = n / 10

        lea     rax, [rdx + rdx*4]      ; rax = q * 5
        add     rax, rax                ; rax = q * 10
        mov     rcx, rdi
        sub     rcx, rax                ; rcx = n - q*10 = n % 10  (0..9)

        add     cl, '0'                 ; digit -> ASCII
        dec     rsi
        mov     [rsi], cl               ; store digits right-to-left

        mov     rdi, rdx                ; n = q
        test    rdx, rdx
        jnz     .loop                   ; more digits while the quotient is non-zero
                                        ; (so an input of 0 still prints a single '0')

        ; write(STDOUT, rsi, r9 - rsi)
        mov     eax, .SYS_WRITE
        mov     edi, .STDOUT
        mov     rdx, r9
        sub     rdx, rsi                ; length = end marker - first digit
        syscall

        add     rsp, .BUF_SIZE
        ret