I came across the problem today when using optimisations with the following
C code with inline asm.
void tilescreen(byte *screen,byte *tile,lword x, lword y, byte rot, byte xscale, byte yscale)
{
lword ddx, ddy, d2x, d2y, i, j;
ddx = (CosTable[rot] * xscale) >>5;
ddy = (SinTable[rot] * yscale) >>8;
rot+=64;
d2x = (CosTable[rot] * xscale) >>5;
d2y = (SinTable[rot] * yscale) >>8;
i = x - ddx * 160 - d2x * 100;
j = y*32 - ddy * 160 - d2y * 100;
__asm__ __volatile__ (" cld
movl %7,%%edi
movl $200,%%ecx
0: /* vertical loop */
push %%ecx
movl %0,%%eax
movl %1,%%edx
movl $320,%%ecx
1: /* horizontal loop */
addl %2,%%eax
addl %3,%%edx
movb %%ah,%%bl
movb %%dh,%%bh
shrl $3,%%bx
andl $0x03ff,%%ebx
addl %4,%%ebx
movl %%ebx,%%esi
movsb
decl %%ecx
jnz 1b
movl %5,%%eax
addl %%eax,%0
movl %6,%%eax
addl %%eax,%1
pop %%ecx
decl %%ecx
jnz 0b
"
/*Outputs*/ :
/*Inputs*/ : "g" (i), "g" (j), "g" (ddx), "g" (ddy), "g" (tile),
"g" (d2x), "g" (d2y), "g" (screen)
/*Reg's*/ : "eax","ebx","ecx","edx","esi","edi"
);
}
------------------------------------------------------------------------------
Here's a dump of the assembler output compiled without optimisations.
It works as expected.
cld
movl 8(%ebp),%edi
movl $200,%ecx
0: /* vertical loop */
push %ecx
movl -24(%ebp),%eax
movl -28(%ebp),%edx
movl $320,%ecx
1: /* horizontal loop */
addl -8(%ebp),%eax
addl -12(%ebp),%edx
movb %ah,%bl
movb %dh,%bh
shrl $3,%bx
andl $0x03ff,%ebx
addl 12(%ebp),%ebx
movl %ebx,%esi
movsb
decl %ecx
jnz 1b
movl -16(%ebp),%eax
addl %eax,-24(%ebp)
movl -20(%ebp),%eax
addl %eax,-28(%ebp)
pop %ecx
decl %ecx
jnz 0b
------------------------------------------------------------------------------
Here's the same code compiled with -O3 (-O1 & -O2 also produce incorrect code).
It clobbers eax after each horizontal loop.
cld
movl 8(%ebp),%edi
movl $200,%ecx
0: /* vertical loop */
push %ecx
movl -20(%ebp),%eax
movl -24(%ebp),%edx
movl $320,%ecx
1: /* horizontal loop */
addl -4(%ebp),%eax
addl -8(%ebp),%edx
movb %ah,%bl
movb %dh,%bh
shrl $3,%bx
andl $0x03ff,%ebx
addl 12(%ebp),%ebx
movl %ebx,%esi
movsb
decl %ecx
jnz 1b
movl -12(%ebp),%eax
addl %eax,-20(%ebp)
movl %esi,%eax <--- clobbers eax
addl %eax,-24(%ebp)
pop %ecx
decl %ecx
jnz 0b