From: "George C. Moschovitis" <gmoscho AT alexander DOT cc DOT ece DOT ntua DOT gr>
To: djgpp AT sun DOT soe DOT clarkson DOT edu (djgpp)
Date: Wed, 5 Jul 1995 19:56:37 +0300 (EET DST)


/*
 * This code is NOT tested. I just wrote it to propose
 * some ideas for further optimisation of memset for the 486/Pentium.
 * I'll optimize it further some day...
 * (because it is a generally useful routine, for gfx programming for example :)
 * but I would like to hear comments on this!
 * Remember: consider this as pseudocode;
 * I don't know whether gas can assemble it :)
 * by George (tmL) Moshovitis / ETD
 *
*/

	.file "memset.s"

/*
 * void *memset(void *dest, int c, size_t n)
 *
 * Syntax: GNU as, AT&T operand order.
 * ABI:    i386 cdecl, all arguments on the stack, `_`-prefixed C symbol.
 * In:     dest, c and n at 4/8/12(%esp) on entry (only the low byte
 *         of c is used).
 * Out:    eax = dest
 * Clobb:  ecx, edx, flags.  edi is saved and restored; ebx is not touched.
 *
 * Strategy: fills shorter than 16 bytes use a plain byte loop.  Longer
 * fills first store up to 3 bytes to bring edi to a 4-byte boundary, then
 * store 16 bytes per iteration with four dword moves, then finish the
 * 0..15 leftover bytes with the byte loop.
 */
	.text
	.globl _memset
	.p2align 4			# 2^4 = 16 bytes, stated explicitly
					# (a 486 cache line is 16 bytes)
_memset:
	pushl	%edi
					# esp is used as the frame pointer; the
					# push above shifts every argument by 4.
	movl	8(%esp),%edi		# edi = dest
	movl	12(%esp),%eax		# al  = fill byte
	movl	16(%esp),%ecx		# ecx = n

	cmpl	$15,%ecx
	jbe	L3			# unsigned compare: n is a size

	# Replicate the fill byte into all four bytes of eax.
	movb	%al,%ah
	movl	%eax,%edx
	sall	$16,%eax
	movw	%dx,%ax

	# Align edi to 4 bytes.  At most 3 bytes are needed, so the test is
	# unrolled three times to save jumps.  n >= 16 here, so the count
	# cannot underflow.
	.rept 3
	testl	$3,%edi
	jz	L0
	movb	%al,(%edi)
	incl	%edi
	decl	%ecx
	.endr
L0:
	# Split the remaining count only now, after the alignment bytes
	# have been subtracted from it.
	movl	%ecx,%edx
	andl	$15,%edx		# edx = leftover bytes (0..15)
	shrl	$4,%ecx			# ecx = number of 16-byte blocks
	jz	L2			# 13..15 bytes left: no full block

	.p2align 4
L1:
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	addl	$16,%edi
	movl	%eax,-8(%edi)
	decl	%ecx
	movl	%eax,-4(%edi)		# mov leaves the flags from decl intact
	jnz	L1

L2:
	movl	%edx,%ecx		# ecx = leftover byte count
L3:
	testl	%ecx,%ecx
	jz	L4			# nothing (left) to store
L5:					# "RISCified" rep stosb
	movb	%al,(%edi)
	incl	%edi
	decl	%ecx			# must be the last flag-setting insn:
	jnz	L5			# incl/decl both rewrite ZF

L4:
	movl	8(%esp),%eax		# return the original dest
	popl	%edi
	ret