NASM - The Netwide Assembler
NASM Forum => Example Code => Topic started by: pprocacci on May 18, 2010, 09:25:19 AM
-
Below follows my first subroutine ever written in assembly language. Figured I'd share with all the noobs (like myself). After reading the forums and following the links that Frank has provided for optimization pointers and whatnot, what follows is what I came up with. Tested, and it is significantly faster than any libc version that I ran this against. I *think* I got everything right (no core files), but I'm sure there may be room for improvement. I implemented loop unrolling, branching hints, and dword copies (as opposed to byte copies).
Anyways, hope this helps someone who's like me and still "learning the ropes" as I like to put it. Comments welcomed naturally.
hints.inc:
%ifndef HINTS_INC
%define HINTS_INC
;-----------------------------------------------------------------------
; Branch-hint prefix macros.
;
; Put one of these on the line directly before a conditional jump.
; Each emits a single prefix byte, so the byte becomes part of the
; jump instruction that follows it:
;
;   hint.nobranch  -> 2Eh : we expect the jump NOT to be taken
;   hint.branch    -> 3Eh : we expect the jump to be taken
;
; The intent is to give branch prediction a small head start.
; NOTE(review): whether a given CPU honours these prefixes (or simply
; ignores them) depends on the processor family -- confirm on the
; target before counting on a speedup.
;-----------------------------------------------------------------------
%macro hint.nobranch 0
        db      0x2E
%endmacro

%macro hint.branch 0
        db      0x3E
%endmacro
%endif
bzero.s:
%include 'hints.inc'
section .text
global byte_zero:function
;-----------------------------------------------------------------------
; byte_zero - fill a buffer with zero bytes
;
; C view: byte_zero(dst, len)
; ABI:    32-bit, both arguments on the stack, read through an ebp frame
; In:     [ebp+8]  = dst, first byte of the buffer
;         [ebp+12] = len, number of bytes, treated as UNSIGNED
; Out:    eax = dst + len (one past the last byte written)
; Clobb:  eax, ecx, flags
;
; Strategy:
;   1. head  - store 1 byte and/or 1 word until the cursor is 4-byte
;              aligned (only attempted when len >= 4)
;   2. body  - store dwords in a 4x unrolled loop
;   3. tail  - store the remaining 0..3 bytes one at a time
;
; The head can consume up to 3 bytes, so the remaining length is
; re-checked before the first dword store.  Without that check a short
; unaligned buffer (e.g. dst % 4 == 3, len == 4) would be overrun and
; the counter would wrap below zero.
;-----------------------------------------------------------------------
byte_zero:
        push    ebp
        mov     ebp, esp                ; frame is only used to reach the args
        mov     eax, [ebp+8]            ; eax = write cursor
        mov     ecx, [ebp+12]           ; ecx = bytes remaining (unsigned)

        cmp     ecx, 4
        hint.nobranch
        jb      .tail                   ; 0..3 bytes: no dword fits (tail handles 0)

        ; ---- head: bring the cursor up to a 4-byte boundary ----------
        ; Invariant here: ecx >= 4.
        test    eax, 3
        hint.branch
        jz      .dword_loop             ; already aligned, ecx >= 4 still holds

        test    eax, 1
        jz      .head_word              ; even address => cursor % 4 == 2

        mov     byte [eax], 0           ; odd address: one byte makes it even
        sub     ecx, 1                  ; ecx >= 3 from here
        add     eax, 1
        test    eax, 3
        jz      .head_done              ; that byte was enough

.head_word:
        ; cursor % 4 == 2 and ecx >= 3, so a word store stays in bounds
        mov     word [eax], 0
        sub     ecx, 2
        add     eax, 2

.head_done:
        ; cursor is 4-byte aligned, but the head may have left < 4 bytes
        cmp     ecx, 4
        hint.nobranch
        jb      .tail

        ; ---- body: 4x unrolled dword stores --------------------------
        ; Invariant at every store below: ecx >= 4.
.dword_loop:
        ; 1st iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .tail                   ; unsigned: len is a size, not a signed int

        ; 2nd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .tail

        ; 3rd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .tail

        ; 4th iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.branch
        jae     .dword_loop             ; common case: keep streaming dwords

        ; ---- tail: 0..3 leftover bytes -------------------------------
.tail:
        test    ecx, ecx
        jz      .done
.tail_byte:
        mov     byte [eax], 0
        add     eax, 1
        sub     ecx, 1
        jnz     .tail_byte

.done:
        pop     ebp                     ; esp never moved after the prologue
        ret
Trivial example:
############
char b[15];
/* Pass the array itself (it decays to char *), not &b, and let sizeof
   supply the length so it cannot drift from the declaration. */
(void)byte_zero(b, sizeof b);
############
-
I have an update to my little routine that I'd like to provide. I was again reading a post from Frank regarding (un)signed opcodes and whatnot, and made sure all my conditional jumps treat the length as an unsigned int. The post I am referring to is referenced here:
http://forum.nasm.us/index.php?topic=809.0
In short:
"A minor nit: jnge and jle, which you use in validating the digits entered, are signed conditionals - jnae and jbe would be the unsigned instructions. Since you probably won't encounter a negative number here, it shouldn't ever cause a problem, but it isn't "right". (IMHO)"
There are probably other minor things I'm missing as well, and I'm not going to bore everyone to death! So without further ado, a patch.....
diff -u byte_zero.s.orig byte_zero.s
--- byte_zero.s.orig 2010-05-18 20:22:14.000000000 -0500
+++ byte_zero.s 2010-05-18 20:39:14.000000000 -0500
@@ -19,13 +19,13 @@
mov eax, [ebp+8] ; Destination
mov ecx, [ebp+12] ; Length
- cmp ecx, 0 ; Length 0?
+ cmp ecx, 0 ; Length 0?
hint.nobranch ; We are probably not going to branch.
- je .done ; Yep, we are done.
+ je .done ; Yep, we are done.
cmp ecx, 4 ; Length, less than 4?
hint.nobranch ; We are probably not going to branch.
- jl .cleanup ; Yep, let's cleanup
+ jb .cleanup ; Yep, let's cleanup
mov edx, eax ; Is the address
and edx, 3 ; 32 bit aligned?
@@ -57,7 +57,7 @@
add eax, 4 ; Add 4 to address
cmp ecx, 4 ; If length is less than 4
hint.nobranch ; Probably not going to branch
- jl .cleanup ; cleanup.
+ jb .cleanup ; cleanup.
; 2nd iteration
mov dword [eax], 0
@@ -65,7 +65,7 @@
add eax, 4
cmp ecx, 4
hint.nobranch
- jl .cleanup
+ jb .cleanup
; 3rd iteration
mov dword [eax], 0
@@ -73,7 +73,7 @@
add eax, 4
cmp ecx, 4
hint.nobranch
- jl .cleanup
+ jb .cleanup
; 4th iteration
mov dword [eax], 0
@@ -81,11 +81,10 @@
add eax, 4
cmp ecx, 4
hint.branch
- jge .aligned
+ jae .aligned
.cleanup:
- cmp ecx, 0 ; Is ecx zero?
- jz .done ; Yep, we are done.
+ jecxz .done ; Jump to done if ecx is 0
mov byte [eax], 0 ; Move a zero byte
sub ecx, 1 ; Subtract one from length
add eax, 1 ; Add one to the address.