-rw-r--r--  libc/string/x86_64/memcpy.S  |  4
-rw-r--r--  libc/string/x86_64/memset.S  | 20
-rw-r--r--  libc/string/x86_64/strcat.S  | 19
-rw-r--r--  libc/string/x86_64/strchr.S  | 12
-rw-r--r--  libc/string/x86_64/strcpy.S  |  8
-rw-r--r--  libc/string/x86_64/strcspn.S | 12
-rw-r--r--  libc/string/x86_64/strlen.S  | 14
-rw-r--r--  libc/string/x86_64/strspn.S  | 16
8 files changed, 75 insertions, 30 deletions
diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S
index b3bb0f96c..697b992d0 100644
--- a/libc/string/x86_64/memcpy.S
+++ b/libc/string/x86_64/memcpy.S
@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
 	subq	$32, %rcx
 	js	2f
 
-	.p2align 4
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
 3:
-
 	/* Now correct the loop counter.  Please note that in the following
 	   code the flags are not changed anymore.  */
 	subq	$32, %rcx
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index 3092e81eb..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -55,8 +55,10 @@ ENTRY (memset)
 	test	$0x7,%edi	/* Check for alignment.  */
 	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Fill 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -114,9 +118,11 @@ ENTRY (memset)
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti  %r8,0x8(%rcx)
diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S
index 9e2da50f2..23d068fea 100644
--- a/libc/string/x86_64/strcat.S
+++ b/libc/string/x86_64/strcat.S
@@ -45,7 +45,9 @@ ENTRY (BP_SYM (strcat))
 
 
 	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
@@ -103,8 +105,11 @@ ENTRY (BP_SYM (strcat))
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
 
@@ -160,7 +165,9 @@ ENTRY (BP_SYM (strcat))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 22:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -237,7 +244,9 @@ ENTRY (BP_SYM (strcat))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 23:
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */
diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S
index c357bfd03..9ef46b7f2 100644
--- a/libc/string/x86_64/strchr.S
+++ b/libc/string/x86_64/strchr.S
@@ -92,7 +92,8 @@ ENTRY (BP_SYM (strchr))
 	 each of whose bytes is C.  This turns each byte that is C
 	 into a zero.  */
 
-	.p2align 4
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
@@ -230,8 +231,11 @@ ENTRY (BP_SYM (strchr))
 
 	   reversed.  */
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
+	/* Align, it's a jump target.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+3:
+	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
 	subq	$8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte C? */
 	jz 6f			/* yes => return pointer */
@@ -281,7 +285,7 @@ ENTRY (BP_SYM (strchr))
 	incq %rax
 
 6:
-	nop
+	/* nop - huh?? */
 	retq
 
 END (BP_SYM (strchr))
diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S
index 920d0543a..612a30d1a 100644
--- a/libc/string/x86_64/strcpy.S
+++ b/libc/string/x86_64/strcpy.S
@@ -53,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 1:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -130,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */
diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S
index 9563de496..fd9b09c48 100644
--- a/libc/string/x86_64/strcspn.S
+++ b/libc/string/x86_64/strcspn.S
@@ -55,7 +55,9 @@ ENTRY (strcspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+
 L(2):	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
@@ -88,7 +90,13 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S
index 3b6f2ac86..4213f0ab6 100644
--- a/libc/string/x86_64/strlen.S
+++ b/libc/string/x86_64/strlen.S
@@ -40,8 +40,11 @@ ENTRY (strlen)
 
 1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
 
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
+	/* Align loop.  */
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
+4:
+	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
@@ -98,8 +101,11 @@ ENTRY (strlen)
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
 
diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S
index 416424565..41cff0490 100644
--- a/libc/string/x86_64/strspn.S
+++ b/libc/string/x86_64/strspn.S
@@ -50,8 +50,10 @@ ENTRY (strspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+L(2):
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
@@ -83,8 +85,14 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+L(3):
+	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
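
A note on the directive this patch relies on (not part of the commit itself): in GNU as, .p2align P,F,M aligns to a 2^P-byte boundary using fill F (NOP instructions in .text when the fill is omitted), but skips the alignment entirely whenever more than M padding bytes would be required. So .p2align 4,,11 puts a short hot loop at a 16-byte fetch/decode boundary only when that costs at most 11 bytes of NOPs, and the strcspn/strspn hunks use .p2align 3,,8 because, as their comments say, a 16-byte request would also raise the whole function's alignment and can create padding at link time. A minimal standalone sketch, with a made-up function name and toy loop purely for illustration:

/* p2align-demo.S: standalone illustration only, not from this patch.
   Build: gcc -c p2align-demo.S && objdump -d p2align-demo.o
   sum(n) returns n + (n-1) + ... + 1; n arrives in %edi (SysV ABI).  */
	.text
	.globl	sum
sum:
	xorl	%eax, %eax	/* result = 0 */
	testl	%edi, %edi	/* n == 0? */
	jz	2f		/* yes => nothing to add */

	/* Pad to a 16-byte boundary only if at most 11 padding bytes are
	   needed; otherwise emit no padding at all.  The loop body below
	   is 6 bytes (2+2+2), so whenever the alignment is applied it
	   decodes in a single 16-byte fetch window.  */
	.p2align 4,,11
1:
	addl	%edi, %eax	/* result += n  (2 bytes: 01 f8) */
	decl	%edi		/* --n          (2 bytes: ff cf) */
	jnz	1b		/* loop         (2 bytes: 75 xx) */
2:
	retq

Compared with a bare .p2align 4, which pads unconditionally with up to 15 bytes of NOPs, the capped form trades an occasionally unaligned loop for a bounded (and usually zero) amount of padding, which is the trade-off the commit comments describe.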
