Author Topic: My byte_zero example  (Read 22179 times)

Offline pprocacci

  • Jr. Member
  • *
  • Posts: 11
My byte_zero example
« on: May 18, 2010, 09:25:19 AM »
Below follows my first subroutine ever written in assembly language.  Figured I'd share with all the noobs (like myself).  After reading the forums and following the links that Frank has provided for optimization pointers and whatnot, what follows is what I came up with.  Tested, and is significantly faster than any libc version that I ran this against.  I *think* I got everything right (no core files), but I'm sure there may be room for improvement.  I implemented loop unrolling, branching hints, and dword copies (as opposed to byte copies).

Anyways, hope this helps someone who's like me and still "learning the ropes" as I like to put it.  Comments welcomed naturally.

hints.inc:
Code: [Select]
%ifndef HINTS_INC
%define HINTS_INC

;-----------------------------------------------------------------------
; Branch-hint helpers.
;
; Each macro takes no arguments and emits exactly one byte.  Place the
; macro on the line directly before a conditional jump so that the byte
; becomes a prefix of that jump.  The byte tells the processor which way
; the jump is expected to go, which is intended to help branch
; prediction a little.
;-----------------------------------------------------------------------

; hint.nobranch -- the following jump is NOT expected to be taken.
%macro hint.nobranch 0
        db      0x2E
%endmacro

; hint.branch -- the following jump IS expected to be taken.
%macro hint.branch 0
        db      0x3E
%endmacro

%endif


bzero.s:
Code: [Select]
%include        'hints.inc'

section .text
global byte_zero:function

;-----------------------------------------------------------------------
; void byte_zero(void *dst, unsigned int len)
;
; Stores zero into the len bytes starting at dst.
;
; ABI:   32-bit, arguments on the stack
; In:    [ebp+8]  = dst  (any alignment)
;        [ebp+12] = len  (byte count, treated as UNSIGNED)
; Out:   no defined return value (eax is left one past the last byte
;        written)
; Clobb: eax, ecx, flags
;
; Strategy: write 0-3 leading bytes so the cursor becomes 4-byte aligned,
; clear the bulk with dword stores (loop unrolled four times), then
; finish the 0-3 trailing bytes one at a time.
;
; Uses hint.nobranch / hint.branch from hints.inc.
;-----------------------------------------------------------------------
byte_zero:
        push    ebp
        mov     ebp, esp                ; frame pointer; esp is not moved again

        mov     eax, [ebp+8]            ; eax = write cursor (dst)
        mov     ecx, [ebp+12]           ; ecx = bytes still to clear (len)

        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup                ; 0..3 bytes (including len == 0): byte loop only

        ; From here ecx >= 4.
        test    eax, 3
        hint.branch
        jz      .aligned                ; already dword aligned, ecx >= 4 still holds

        ; Alignment preamble.  It consumes at most 1 + 2 = 3 bytes and we
        ; entered with ecx >= 4, so the count cannot underflow here.
        test    eax, 1
        jz      .even
        mov     byte [eax], 0
        sub     ecx, 1
        add     eax, 1
.even:
        test    eax, 2
        jz      .dword_check
        mov     word [eax], 0           ; eax is 2-byte aligned here
        sub     ecx, 2
        add     eax, 2

.dword_check:
        ; The preamble may have used up to 3 of the bytes, so fewer than 4
        ; may remain.  Without this check the dword store below would run
        ; past the end of the buffer and wrap the count.
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup

.aligned:
        ; Invariant on entry to every iteration: eax % 4 == 0, ecx >= 4.
        ; 1st iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup                ; unsigned compare: ecx is a length
        ; 2nd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup
        ; 3rd iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
        jb      .cleanup
        ; 4th iteration
        mov     dword [eax], 0
        sub     ecx, 4
        add     eax, 4
        cmp     ecx, 4
        hint.branch
        jae     .aligned

.cleanup:
        ; Invariant: 0 <= ecx <= 3 bytes remain.
        test    ecx, ecx
        jz      .done
        mov     byte [eax], 0
        sub     ecx, 1
        add     eax, 1
        jmp     .cleanup
.done:
        pop     ebp                     ; esp still equals ebp, nothing else to unwind
        ret

Trivial example:

############
char b[15];
(void)byte_zero(&b, 15);
############
« Last Edit: May 18, 2010, 09:32:14 AM by pprocacci »

Offline pprocacci

  • Jr. Member
  • *
  • Posts: 11
Re: My byte_zero example
« Reply #1 on: May 19, 2010, 01:44:08 AM »
I have an update to my little routine that I'd like to provide.  Again, I was reading a post from Frank regarding (un)signed opcodes and whatnot, and ensured all my opcodes worked on unsigned ints.  The post I am referring to is referenced here:

http://forum.nasm.us/index.php?topic=809.0

In short:

"A minor nit: jnge and jle, which you use in validating the digits entered, are signed conditionals - jnae and jbe would be the unsigned instructions. Since you probably won't encounter a negative number here, it shouldn't ever cause a problem, but it isn't "right". (IMHO)"

There are probably other minor things I'm missing as well, and I'm not going to bore everyone to death!  So without further ado, a patch.....

diff -u byte_zero.s.orig byte_zero.s
Code: [Select]
--- byte_zero.s.orig    2010-05-18 20:22:14.000000000 -0500
+++ byte_zero.s 2010-05-18 20:39:14.000000000 -0500
@@ -19,13 +19,13 @@
        mov     eax, [ebp+8]    ; Destination
        mov     ecx, [ebp+12]   ; Length
 
-       cmp     ecx, 0          ; Length 0?
+       cmp     ecx, 0          ; Length 0?
        hint.nobranch           ; We are probably not going to branch.
-       je      .done           ; Yep, we are done.
+       je      .done           ; Yep, we are done.
 
        cmp     ecx, 4          ; Length, less than 4?
        hint.nobranch           ; We are probably not going to branch.
-       jl      .cleanup        ; Yep, let's cleanup
+       jb      .cleanup        ; Yep, let's cleanup
 
        mov     edx, eax        ; Is the address
        and     edx, 3          ; 32 bit aligned?
@@ -57,7 +57,7 @@
        add     eax, 4          ; Add 4 to address
        cmp     ecx, 4          ; If length is less than 4
        hint.nobranch           ; Probably not going to branch
-       jl      .cleanup        ; cleanup.
+       jb      .cleanup        ; cleanup.
 
        ; 2nd iteration
        mov     dword [eax], 0
@@ -65,7 +65,7 @@
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
-       jl      .cleanup
+       jb      .cleanup
 
        ; 3rd iteration
        mov     dword [eax], 0
@@ -73,7 +73,7 @@
        add     eax, 4
        cmp     ecx, 4
        hint.nobranch
-       jl      .cleanup
+       jb      .cleanup
 
        ; 4th iteration
        mov     dword [eax], 0
@@ -81,11 +81,10 @@
        add     eax, 4
        cmp     ecx, 4
        hint.branch
-       jge     .aligned
+       jae     .aligned
 
 .cleanup:
-       cmp     ecx, 0          ; Is ecx zero?
-       jz      .done           ; Yep, we are done.
+       jecxz   .done           ; Jump to done if ecx is 0
        mov     byte [eax], 0   ; Move a zero byte
        sub     ecx, 1          ; Subtract one from length
        add     eax, 1          ; Add one to the address.