diff options
| -rw-r--r-- | libc/string/i386/memset.c | 65 | 
1 files changed, 56 insertions, 9 deletions
| diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index cfc16983c..9f51f3c60 100644 --- a/libc/string/i386/memset.c +++ b/libc/string/i386/memset.c @@ -28,21 +28,68 @@   * More importantly, these should provide a good example for   * others to follow when adding arch specific optimizations.   *  -Erik + * + * 2009-04: modified by Denys Vlasenko <vda.linux@googlemail.com> + * Fill byte-by-byte is a bit too slow. I prefer 46 byte function + * which fills x4 faster than 21 bytes one.   */  #include <string.h> -/* Experimentally off - libc_hidden_proto(memset) */  #undef memset  void *memset(void *s, int c, size_t count)  { -    int d0, d1; -    __asm__ __volatile__( -	    "rep\n\t" -	    "stosb" -	    : "=&c" (d0), "=&D" (d1) -	    :"a" (c),"1" (s),"0" (count) -	    :"memory"); -    return s; +	int reg, edi; +	__asm__ __volatile__( + +		/* Most of the time, count is divisible by 4 and nonzero */ +		/* It's better to make this case faster */ +	/*	"	jecxz	9f\n" - (optional) count == 0: goto ret */ +		"	mov	%%ecx, %1\n" +		"	shr	$2, %%ecx\n" +		"	jz	1f\n" /* zero words: goto fill_bytes */ +		/* extend 8-bit fill to 32 bits */ +		"	movzx	%%al, %%eax\n" /* 3 bytes */ +	/* or:	"	and	$0xff, %%eax\n" - 5 bytes */ +		"	imul	$0x01010101, %%eax\n" /* 6 bytes */ +		/* fill full words */ +		"	rep; stosl\n" +		/* fill 0-3 bytes */ +		"1:	and	$3, %1\n" +		"	jz	9f\n" /* (count & 3) == 0: goto end */ +		"2:	stosb\n" +		"	dec	%1\n" +		"	jnz	2b\n" +		/* end */ +		"9:\n" + +		: "=&D" (edi), "=&r" (reg) +		: "0" (s), "a" (c), "c" (count) +		: "memory" +	); +	return s;  }  libc_hidden_def(memset) + +/* +gcc 4.3.1 +========= +57                     push   %edi +8b 7c 24 08            mov    0x8(%esp),%edi +8b 4c 24 10            mov    0x10(%esp),%ecx +8b 44 24 0c            mov    0xc(%esp),%eax +89 ca                  mov    %ecx,%edx +c1 e9 02               shr    $0x2,%ecx +74 0b                  je     1f <__GI_memset+0x1f> +0f b6 c0               movzbl %al,%eax +69 c0 01 01 01 01      imul   $0x1010101,%eax,%eax +f3 ab                  rep stos %eax,%es:(%edi) +83 e2 03               and    $0x3,%edx +74 04                  je     28 <__GI_memset+0x28> +aa                     stos   %al,%es:(%edi) +4a                     dec    %edx +75 fc                  jne    24 <__GI_memset+0x24> +8b 44 24 08            mov    0x8(%esp),%eax +5f                     pop    %edi +c3                     ret +*/ | 
