Below follows my first subroutine ever written in assembly language. Figured I'd share it with all the noobs (like myself). After reading the forums and following the links that Frank has provided for optimization pointers and whatnot, what follows is what I came up with. It is tested, and is significantly faster than any libc version that I ran it against. I *think* I got everything right (no core files), but I'm sure there may be room for improvement. I implemented loop unrolling, branching hints, and dword copies (as opposed to byte copies).
Anyways, hope this helps someone who's like me and still "learning the ropes" as I like to put it. Comments welcomed naturally.
hints.inc:
%ifndef HINTS_INC
%define HINTS_INC

;-----------------------------------------------------------------------
; Static branch-hint prefixes.
;
; Invoke one of these macros on the line directly before a conditional
; jump. Each macro emits exactly one byte, which the assembler places
; immediately in front of the jump so that it acts as a prefix of that
; instruction.
;
; The two byte values are the CS (2Eh) and DS (3Eh) segment-override
; prefix encodings, which double as "not taken" / "taken" hints on a
; conditional jump. Whether a given CPU acts on the hint depends on
; its microarchitecture; the intent is only to nudge branch prediction.
;-----------------------------------------------------------------------

%macro hint.nobranch 0
        db      0x2E                    ; hint: the jump that follows is not expected to be taken
%endmacro

%macro hint.branch 0
        db      0x3E                    ; hint: the jump that follows is expected to be taken
%endmacro

%endif
bzero.s:
%include 'hints.inc'
section .text
global byte_zero:function
;-----------------------------------------------------------------------
; byte_zero -- fill a buffer with zero bytes.
;
; Called as byte_zero(dest, length) with both arguments on the stack;
; the routine returns with a plain `ret`, so the caller removes them.
;
; In:    [ebp+8]  = destination address
;        [ebp+12] = length in bytes, treated as an unsigned count
; Out:   eax = address one past the last byte written (dest + length)
; Clobb: eax, ecx, flags
;
; Strategy: zero 1-3 leading bytes until the write cursor is dword
; aligned, zero the bulk with a 4x unrolled dword loop, then finish
; the 0-3 trailing bytes one at a time.
;
; Register roles: eax = write cursor, ecx = bytes still to be zeroed.
;-----------------------------------------------------------------------
byte_zero:
        push    ebp
        mov     ebp, esp                ; frame pointer for argument access
        mov     eax, [ebp+8]            ; eax = destination
        mov     ecx, [ebp+12]           ; ecx = length
        test    ecx, ecx
        hint.nobranch
        jz      .done                   ; length 0: nothing to write
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup                ; fewer than 4 bytes: byte loop only

        ; ---- Bring the cursor to a dword boundary (ecx >= 4 here) ----
        test    eax, 3
        hint.branch
        jz      .aligned                ; already aligned, ecx >= 4 still holds
        test    eax, 1
        jz      .aligned_16             ; even address (4n+2): a word store suffices
        mov     byte [eax], 0           ; odd address: one byte makes it even
        sub     ecx, 1
        add     eax, 1
        test    eax, 3
        jz      .recheck                ; address was 4n+3, now on a dword boundary

.aligned_16:
        ; Cursor is at 4n+2; one word store reaches the dword boundary.
        mov     word [eax], 0
        sub     ecx, 2
        add     eax, 2

.recheck:
        ; Alignment consumed 1-3 bytes, so fewer than 4 may remain.
        ; Without this test a short buffer (length 4-6) would receive a
        ; dword store past its end and ecx would wrap below zero.
        cmp     ecx, 4
        jb      .cleanup

.aligned:
        ; Invariant at every dword store: eax is dword aligned, ecx >= 4.
        ; 1st iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup
        ; 2nd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup
        ; 3rd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup
        ; 4th iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.branch
        jae     .aligned                ; unsigned: lengths >= 2^31 stay in the dword loop

.cleanup:
        ; Trailing bytes: ecx is 0-3 when reached from the dword loop,
        ; or the whole (short) length when reached from the entry checks.
        test    ecx, ecx
        jz      .done
        mov     byte [eax], 0
        sub     ecx, 1
        add     eax, 1
        jmp     .cleanup

.done:
        mov     esp, ebp                ; tear down the frame
        pop     ebp
        ret
Trivial example:
############
char b[15];
(void)byte_zero(&b, 15);
############