Author Topic: how many registers can i use in intel i7 ?  (Read 11144 times)


  • Guest
how many registers can i use in intel i7 ?
« on: June 02, 2009, 06:08:41 PM »
How many 64-bit registers can I use on an Intel i7 CPU as temporary storage, so that I can feed their contents into XMM registers later? I currently use only XMM0-15, MM0-7, and R8-15. I know I can also use RAX, RBX, RCX, RDX and the eight registers inside the FPU (ST0-ST7), but which others can I use? Can I use the stack registers? Thanks in advance.

I have attached my application code in case it is needed.

/*
 * pipe_mult_ushort: element-wise multiply of packed 16-bit words.
 *
 * Loads 0xC0 bytes from the buffer addressed through edi and 0xC0 bytes from
 * the buffer addressed through esi, multiplies them pairwise with pmullw
 * (which keeps the low 16 bits of each product), and writes the products back
 * over the edi buffer at the same offsets (0x00-0xBF).
 *
 * data  - bound to the "D" (di) register; read and then overwritten.
 * rands - bound to the "S" (si) register; only read.
 *
 * Register usage as shown:
 *   xmm0-xmm15 : offsets 0x00-0x7F (even register = data, odd = rands)
 *   mm0-mm7    : offsets 0x80-0x9F (even register = data, odd = rands)
 *   r8-r15     : offsets 0xA0-0xBF, staged in general-purpose registers and
 *                moved into mm0-mm7 once those have been stored.
 *
 * movdqa requires 16-byte aligned addresses, so both buffers must be
 * 16-byte aligned.
 *
 * NOTE(review): this listing looks truncated - no opening '{' for the
 *   function body is visible, the asm statement is never closed, and the
 *   clobber list ends on a trailing comma. Confirm against the full file.
 * NOTE(review): only one ':' precedes "D"(data),"S"(rands) as shown. GCC
 *   extended asm expects outputs after the first ':' and inputs after the
 *   second, so these operands appear to sit in the output position - confirm
 *   that a ':' was not lost.
 * NOTE(review): addresses are formed from the 32-bit edi/esi although
 *   r8-r15 and xmm8-xmm15 exist only in 64-bit mode, where pointers are
 *   64 bits wide - verify the upper address bits are not being dropped.
 * NOTE(review): the visible clobbers name only xmm0-xmm7; xmm8-xmm15,
 *   mm0-mm7, r8-r15 and memory are also modified - confirm the rest of the
 *   list declares them.
 * NOTE(review): mm0-mm7 alias the x87 ST registers and no emms is visible
 *   after the MMX code - confirm one is issued before any x87 use.
 */
void pipe_mult_ushort(ushort *data,ushort *rands)
__asm__ __volatile__(".intel_syntax noprefix\n\t"
      //// Phase 1: pull as much data as possible into CPU registers up front
      //// so that the memory loads are issued before any arithmetic starts.

"movdqa xmm0,[edi]\n\t"         // xmm0 <- data bytes 0x00-0x0F
                "movdqa xmm1,[esi]\n\t"         // xmm1 <- rands bytes 0x00-0x0F

"movdqa xmm2,0x10[edi]\n\t"     // xmm2 <- data, xmm3 <- rands (0x10-0x1F)
                "movdqa xmm3,0x10[esi]\n\t"

"movdqa xmm4,0x20[edi]\n\t"     // xmm4 <- data, xmm5 <- rands (0x20-0x2F)
                "movdqa xmm5,0x20[esi]\n\t"

"movdqa xmm6,0x30[edi]\n\t"     // xmm6 <- data, xmm7 <- rands (0x30-0x3F)
                "movdqa xmm7,0x30[esi]\n\t"

"movdqa xmm8,0x40[edi]\n\t"     // xmm8 <- data, xmm9 <- rands (0x40-0x4F)
                "movdqa xmm9,0x40[esi]\n\t"

"movdqa xmm10,0x50[edi]\n\t"    // xmm10 <- data, xmm11 <- rands (0x50-0x5F)
                "movdqa xmm11,0x50[esi]\n\t"

"movdqa xmm12,0x60[edi]\n\t"    // xmm12 <- data, xmm13 <- rands (0x60-0x6F)
                "movdqa xmm13,0x60[esi]\n\t"

"movdqa xmm14,0x70[edi]\n\t"    // xmm14 <- data, xmm15 <- rands (0x70-0x7F)
                "movdqa xmm15,0x70[esi]\n\t"

"movq mm0,0x80[edi]\n\t"   // mm0-mm7 <- 8-byte chunks at 0x80-0x9F (even = data, odd = rands)
      "movq mm1,0x80[esi]\n\t"
      "movq mm2,0x88[edi]\n\t"
      "movq mm3,0x88[esi]\n\t"
      "movq mm4,0x90[edi]\n\t"
      "movq mm5,0x90[esi]\n\t"
      "movq mm6,0x98[edi]\n\t"
      "movq mm7,0x98[esi]\n\t"

"movq  r8,0xA0[edi]\n\t"   // r8-r15 <- 8-byte chunks at 0xA0-0xBF, held here until mm0-mm7 are free
      "movq  r9,0xA0[esi]\n\t"
      "movq r10,0xA8[edi]\n\t"
      "movq r11,0xA8[esi]\n\t"
      "movq r12,0xB0[edi]\n\t"
      "movq r13,0xB0[esi]\n\t"
      "movq r14,0xB8[edi]\n\t"
      "movq r15,0xB8[esi]\n\t"

// Phase 2: every register used for staging is now filled; start the arithmetic.
      // xmm pairs first: even register *= odd register (low 16 bits of each word product)
                "pmullw xmm0,xmm1\n\t"          // xmm0  = xmm0  * xmm1
                "pmullw xmm2,xmm3\n\t"          // xmm2  = xmm2  * xmm3
                "pmullw xmm4,xmm5\n\t"          // xmm4  = xmm4  * xmm5
                "pmullw xmm6,xmm7\n\t"          // xmm6  = xmm6  * xmm7
                "pmullw xmm8,xmm9\n\t"          // xmm8  = xmm8  * xmm9
                "pmullw xmm10,xmm11\n\t"        // xmm10 = xmm10 * xmm11
                "pmullw xmm12,xmm13\n\t"        // xmm12 = xmm12 * xmm13
                "pmullw xmm14,xmm15\n\t"        // xmm14 = xmm14 * xmm15

// mm pairs second
                "pmullw mm0,mm1\n\t"        // mm0 = mm0 * mm1
                "pmullw mm2,mm3\n\t"        // mm2 = mm2 * mm3
                "pmullw mm4,mm5\n\t"        // mm4 = mm4 * mm5
                "pmullw mm6,mm7\n\t"        // mm6 = mm6 * mm7

// Phase 3: write the xmm products back over the data buffer
                "movdqa [edi],xmm0\n\t"         // xmm0  -> data 0x00
                "movdqa 0x10[edi],xmm2\n\t"     // xmm2  -> data 0x10
                "movdqa 0x20[edi],xmm4\n\t"     // xmm4  -> data 0x20
                "movdqa 0x30[edi],xmm6\n\t"     // xmm6  -> data 0x30
                "movdqa 0x40[edi],xmm8\n\t"     // xmm8  -> data 0x40
                "movdqa 0x50[edi],xmm10\n\t"    // xmm10 -> data 0x50
                "movdqa 0x60[edi],xmm12\n\t"    // xmm12 -> data 0x60
                "movdqa 0x70[edi],xmm14\n\t"    // xmm14 -> data 0x70

// write the mm products back over the data buffer
                "movq 0x80[edi],mm0\n\t"       // mm0 -> data 0x80
                "movq 0x88[edi],mm2\n\t"       // mm2 -> data 0x88
                "movq 0x90[edi],mm4\n\t"       // mm4 -> data 0x90
                "movq 0x98[edi],mm6\n\t"       // mm6 -> data 0x98

// Phase 4: the mm registers have been stored, so reuse them for the
      // values that were parked in r8-r15
      "movq mm0,r8\n\t"      // mm0 <- r8  (data  0xA0)
      "movq mm1,r9\n\t"      // mm1 <- r9  (rands 0xA0)
      "movq mm2,r10\n\t"      // mm2 <- r10 (data  0xA8)
      "movq mm3,r11\n\t"      // mm3 <- r11 (rands 0xA8)
      "movq mm4,r12\n\t"      // mm4 <- r12 (data  0xB0)
      "movq mm5,r13\n\t"      // mm5 <- r13 (rands 0xB0)
      "movq mm6,r14\n\t"      // mm6 <- r14 (data  0xB8)
      "movq mm7,r15\n\t"      // mm7 <- r15 (rands 0xB8)
      // multiply the second batch of mm pairs
                "pmullw mm0,mm1\n\t"          // mm0 = mm0 * mm1
                "pmullw mm2,mm3\n\t"          // mm2 = mm2 * mm3
                "pmullw mm4,mm5\n\t"          // mm4 = mm4 * mm5
                "pmullw mm6,mm7\n\t"          // mm6 = mm6 * mm7
      // write the second batch of mm products back over the data buffer
                "movq 0xA0[edi],mm0\n\t"       // mm0 -> data 0xA0
                "movq 0xA8[edi],mm2\n\t"       // mm2 -> data 0xA8
                "movq 0xB0[edi],mm4\n\t"       // mm4 -> data 0xB0
                "movq 0xB8[edi],mm6\n\t"       // mm6 -> data 0xB8

                     // operands: data is pinned to the di register, rands to the si register
                     :  "D" (data) ,"S" (rands)
                     // registers reported as modified (list continues past this line)
                     :  "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",



#include "pipe_line_math.h"

/*
 * Number of elements in each benchmark array: 256^3 * 24 = 402,653,184.
 * The expansion is parenthesized so the macro keeps its value when used
 * inside a larger expression (e.g. `x % _ARRAY_SIZE_` or `x / _ARRAY_SIZE_`),
 * which the bare product `256*256*256*24` would not.
 */
#define _ARRAY_SIZE_ (256*256*256*24)

/*
 * Presumably the number of elements handled per pipeline call.
 * NOTE(review): the inline-asm routine in this file touches offsets
 * 0x00-0xBF of its buffers per call - confirm that this value matches.
 */
#define _ELTS_PER_PIPE_ (112)

/*
 * Input arrays for the benchmark. Both are 16-byte aligned because the
 * movdqa instruction used on them faults on unaligned addresses.
 */
ushort __attribute__ ((aligned (16))) rands[_ARRAY_SIZE_];
ushort __attribute__ ((aligned (16))) data[_ARRAY_SIZE_];

/* Timestamps printed by main() as the start (tspec1) and end (tspec2) of the timed run. */
struct timespec tspec1;
struct timespec tspec2;

main() {
ulong i,max;
double diff;

for (i=0;i<_ARRAY_SIZE_;i++) { /// fill with any data

    for (i=0;i,&rands);
    for (i=0;i,&rands);  // one more time
    printf("time pipeline multiply:\nstart: %d:%d\n  end: %d:%d ; total diff: %f\n",tspec1.tv_sec,tspec1.tv_nsec,tspec2.tv_sec,tspec2.tv_nsec,diff);
    printf("sample data:\n");
    for (i=0;i<64;i++) {
        if (!((i+1)%16))    printf("\n");