/*
 * From: "George C. Moschovitis"
 * To:   djgpp AT sun DOT soe DOT clarkson DOT edu (djgpp)
 * Date: Wed, 5 Jul 1995 19:56:37 +0300 (EET DST)
 *
 * Proposal for a memset tuned for the 486/Pentium, by
 * George (tmL) Moshovitis / ETD.  memset is a generally useful routine
 * (e.g. for drawing scanlines in triangle fillers in gfx code), so it
 * is worth making fast.
 *
 * The original posting was explicitly untested pseudocode.  This
 * revision keeps its structure (byte-wise alignment prologue, unrolled
 * 16-byte dword loop, byte-wise tail instead of "rep stosb") but makes
 * it assemble with GNU as and behave like memset for every length.
 *
 * Author's open suggestion (not implemented here): provide an inline-asm
 * memset in string.h that passes the arguments in registers and jumps
 * straight to the fill code, avoiding the stack loads below.
 *
 * Syntax: AT&T (GNU as), 32-bit x86.
 */

        .file   "memset.s"
        .text
        .globl  _memset

#-----------------------------------------------------------------------
# void *memset(void *dest, int c, size_t n)
#
# ABI:    32-bit C calling convention, arguments on the stack.
# In:     after the "pushl %edi" below:
#           8(%esp)  = dest
#           12(%esp) = c   (only the low byte is used)
#           16(%esp) = n   (treated as unsigned)
# Out:    %eax = dest
# Clobb:  %eax, %ecx, %edx, flags.  %edi is saved and restored.
# Stack:  4 bytes (saved %edi); %esp is used as the frame pointer.
#
# Author's (unmeasured) claim: the explicit byte loop is faster than
# "rep stosb" on the 486, and probably on the Pentium as well.
#-----------------------------------------------------------------------

        # 2^4 = 16-byte alignment: the 486 cache line is 16 bytes (the
        # Pentium's is 32, but 16 was judged sufficient by the author).
        # .p2align is used because plain ".align 4" means 4 bytes on some
        # object formats and 2^4 bytes on others.
        .p2align 4
_memset:
        pushl   %edi
        movl    8(%esp), %edi           # edi = dest (write cursor)
        movl    12(%esp), %eax          # al  = fill byte
        movl    16(%esp), %ecx          # ecx = n (bytes remaining)

        cmpl    $15, %ecx
        jbe     Lmemset_tail            # unsigned: n <= 15 -> bytes only

        # Replicate the fill byte into all four bytes of eax.
        movb    %al, %ah                # ax  = cc
        movl    %eax, %edx
        sall    $16, %eax               # high word of eax = cc
        movw    %dx, %ax                # eax = cccc

        # Store up to 3 single bytes until edi is dword aligned.
        # n >= 16 here, so ecx cannot underflow.  Unrolled to save jumps.
        .rept   3
        testl   $3, %edi
        jz      Lmemset_aligned
        movb    %al, (%edi)
        incl    %edi
        decl    %ecx
        .endr

Lmemset_aligned:
        movl    %ecx, %edx              # edx = bytes left after alignment
        shrl    $4, %ecx                # ecx = number of whole 16-byte blocks
        jz      Lmemset_rest            # alignment may have left fewer than 16

        .p2align 4
Lmemset_blocks:                         # invariant: ecx > 0, edi dword aligned
        movl    %eax, (%edi)
        movl    %eax, 4(%edi)
        movl    %eax, 8(%edi)
        movl    %eax, 12(%edi)
        addl    $16, %edi
        decl    %ecx
        jnz     Lmemset_blocks

Lmemset_rest:
        movl    %edx, %ecx
        andl    $15, %ecx               # ecx = bytes left over after the blocks

Lmemset_tail:                           # ecx = 0..15 bytes still to store
        testl   %ecx, %ecx
        jz      Lmemset_done            # nothing left (also covers n == 0)

Lmemset_bytes:                          # "RISCified" rep stosb
        movb    %al, (%edi)
        incl    %edi                    # inc rewrites ZF, so it must come
        decl    %ecx                    # before the dec that jnz tests
        jnz     Lmemset_bytes

Lmemset_done:
        movl    8(%esp), %eax           # return dest
        popl    %edi
        ret