Mail Archives: djgpp-workers/1999/08/09/11:26:43

Date: Mon, 9 Aug 1999 11:21:27 +0300 (IDT)
From: Eli Zaretskii <eliz AT is DOT elta DOT co DOT il>
X-Sender: eliz AT is
To: djgpp-workers AT delorie DOT com
cc: Andris Pavenis <pavenis AT lanet DOT lv>
Subject: Performance regression of 2.95 vs. 2.7, x86, loop-related (fwd)
Message-ID: <Pine.SUN.3.91.990809112043.19120J-100000@is>
MIME-Version: 1.0
Reply-To: djgpp-workers AT delorie DOT com

Somebody posted the message below to the gcc-bugs list.  It seems
this platform had ".p2align 4,,7" even in v2.7.2.3.  Andris, is this
something that is configured in a platform-dependent manner in GCC?
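For context, ".p2align 4,,7" asks gas to align to 16 bytes (2^4), but
only if at most 7 bytes of padding are needed.  On i386 ELF targets
that use GNU as, GCC emits the directive through a per-target macro
roughly like the following.  This is a sketch, not the exact text of
the GCC sources, so the 2.7-era definition may differ:

/* Sketch of the i386 target macro behind ".p2align LOG,,MAX_SKIP":
   align to 2^LOG bytes, but skip at most MAX_SKIP padding bytes.
   Guarded on configure-time detection of gas support.  */
#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE, LOG, MAX_SKIP) \
  do { \
    if ((LOG) != 0) { \
      if ((MAX_SKIP) == 0) \
        fprintf ((FILE), "\t.p2align %d\n", (LOG)); \
      else \
        fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \
    } \
  } while (0)
#endif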

---------- Forwarded message ----------
Date: Sun, 08 Aug 1999 17:55:40 -0700
From: Zack Weinberg <zack AT bitmover DOT com>
To: gcc-bugs AT gcc DOT gnu DOT org, amylaar AT cygnus DOT co DOT uk
Cc: lm AT bitmover DOT com
Subject: Performance regression of 2.95 vs. 2.7, x86, loop-related


Consider this fragment:

#include <stdio.h>

typedef struct
{
   unsigned short cksum;
   unsigned short encoding;
} sccs;

#define E_GZIP 0x4

extern int zputs(void *, FILE *, unsigned char *, int, void (*)());
extern void gzip_sum();

unsigned short
fpd(sccs *s, unsigned char *buf, FILE *out)
{
	unsigned short	sum = 0;
	unsigned char	*p, *q, c;
	static	unsigned char block[8192];
	static	unsigned char *next = block;

	/* Checksum up to and including the first newline
	 * or the end of the string.
	 */
	p = buf;
	q = next;
	for (;;) {
		for (;;) {
			c = *p++;
			if (c == '\0') goto done;
			sum += c;
			*q++ = c;
			if (q == &block[8192]) break;
			if (c == '\n') goto done;
		}
		if (s->encoding & E_GZIP) {
			zputs((void *)s, out, block, 8192, gzip_sum);
		} else {
			fwrite(block, 8192, 1, out);
		}
		q = block;
		if (c == '\n') break;
	}
done:	next = q;
	if (!(s->encoding & E_GZIP)) s->cksum += sum;
	return (sum);
}

The structure of the loops is calculated to get near-optimal code
generation out of gcc 2.7.x.  gcc 2.95, however, does a transformation
on it that nearly doubles the execution time of the function, which
leads to a 30% slowdown of the program this is taken from.  I'm not
sure if it's loop or jump optimization.  It is NOT global CSE, which
was my first hypothesis.

Here is a side-by-side diff of assembly output for the function
(munged slightly to eliminate spurious differences).  2.7.2.3 is on
the left, 2.95 on the right.  -O2 -fomit-frame-pointer was used both
times.  The interesting bits are marked with stars at the left margin.

 fpd:					fpd:
 				      >		subl $12,%esp
 	pushl %ebp				pushl %ebp
 	pushl %edi				pushl %edi
 	pushl %esi				pushl %esi
 	pushl %ebx				pushl %ebx
 	movl 20(%esp),%ebp	      |		movl 32(%esp),%ebp
 	xorl %edi,%edi				xorl %edi,%edi
 	movl 24(%esp),%esi	      |		movl 36(%esp),%esi
 	movl next.23,%edx			movl next.23,%edx
 	.p2align 4,,7				.p2align 4,,7
 .L20:					.L20:
 	movb (%esi),%bl				movb (%esi),%bl
 	incl %esi				incl %esi
 	testb %bl,%bl				testb %bl,%bl
 	je .L24					je .L24
 	movzbw %bl,%ax				movzbw %bl,%ax
!	addw %ax,%di		      |		addl %eax,%edi
 	movb %bl,(%edx)				movb %bl,(%edx)
 	incl %edx				incl %edx
 	cmpl $block.22+8192,%edx		cmpl $block.22+8192,%edx
*	je .L21			      |		jne .L35
*	cmpb $10,%bl		      <
*	je .L24			      <
*	jmp .L20		      <
*	.p2align 4,,7		      <
*.L21:				      <
 	testb $4,2(%ebp)			testb $4,2(%ebp)
 	je .L27					je .L27
 				      >		addl $-12,%esp
 	pushl $gzip_sum				pushl $gzip_sum
 	pushl $8192				pushl $8192
 	pushl $block.22				pushl $block.22
 	movl 40(%esp),%ecx	      |		movl 64(%esp),%eax
 	pushl %ecx		      |		pushl %eax
 	pushl %ebp				pushl %ebp
 	call zputs				call zputs
 	addl $20,%esp		      |		addl $32,%esp
 	jmp .L28				jmp .L28
 	.p2align 4,,7				.p2align 4,,7
 .L27:					.L27:
 	movl 28(%esp),%ecx	      |		movl 40(%esp),%eax
 	pushl %ecx		      |		pushl %eax
 	pushl $1				pushl $1
 	pushl $8192				pushl $8192
 	pushl $block.22				pushl $block.22
 	call fwrite				call fwrite
 	addl $16,%esp				addl $16,%esp
 .L28:					.L28:
 	movl $block.22,%edx			movl $block.22,%edx
*				      >	.L35:
 	cmpb $10,%bl				cmpb $10,%bl
 	jne .L20				jne .L20
 .L24:					.L24:
 	movl %edx,next.23			movl %edx,next.23
 	testb $4,2(%ebp)			testb $4,2(%ebp)
 	jne .L30				jne .L30
 	addw %di,(%ebp)				addw %di,(%ebp)
 .L30:					.L30:
 	movzwl %di,%eax				movzwl %di,%eax
 	popl %ebx				popl %ebx
 	popl %esi				popl %esi
 	popl %edi				popl %edi
 	popl %ebp				popl %ebp
 				      >		addl $12,%esp
 	ret					ret

The change looks trivial but it isn't.  The two places where c is
compared with '\n' have been merged into one, and that one is placed
AFTER the big block of code that is executed only rarely.  The inner
loop therefore contains a jump to another cache line.  It will almost
always be taken, but (if I understand x86 branch prediction correctly)
it will be mispredicted on the first few iterations - and the input
data to this function is such that the inner loop may only execute a
few times per call.

It would be fine to merge the two comparisons if the sole instance
were contiguous with the inner loop and the outer loop code jumped
back up.
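In C-with-gotos terms, the layout I would accept looks roughly like
this (hypothetical labels, just to illustrate the control flow; this
is not anything a compiler emitted):

inner:	c = *p++;
	if (c == '\0') goto done;
	sum += c;
	*q++ = c;
	if (q == &block[8192]) goto flush;	/* rare */
test:	if (c != '\n') goto inner;	/* hot path falls through to the test */
	goto done;
flush:	if (s->encoding & E_GZIP)		/* outer loop body */
		zputs((void *)s, out, block, 8192, gzip_sum);
	else
		fwrite(block, 8192, 1, out);
	q = block;
	goto test;				/* jumps back up to the test */
done:	next = q;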

It would be nice if I could get the code that gcc 2.7.x generated
without having to structure the source that way; it's far more natural
to write this with the "outer loop" in an if block inside the inner
loop.
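That is, a sketch along these lines, with the same semantics as the
fragment above:

	p = buf;
	q = next;
	for (;;) {
		c = *p++;
		if (c == '\0') break;
		sum += c;
		*q++ = c;
		if (q == &block[8192]) {	/* rare: flush the block */
			if (s->encoding & E_GZIP)
				zputs((void *)s, out, block, 8192, gzip_sum);
			else
				fwrite(block, 8192, 1, out);
			q = block;
		}
		if (c == '\n') break;
	}
	next = q;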

I'd also like to point out the line marked with an exclamation point.
Unless movzbw clears the high 16 bits of the destination register
(does it?), we are adding garbage to edi.

zw
