/* memset/bzero -- set memory area to CH/0
   Optimized version for x86-64.
   Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Andreas Jaeger <aj@suse.de>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include "_glibc_inc.h"

/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero'.

   BZERO_P is pinned to 0 or 1 here instead of being defined as
   `(defined memset)': a `defined' operator that only appears through
   macro expansion has undefined behaviour inside #if, and GCC reports
   it with -Wexpansion-to-defined.  */
#ifdef memset
# define BZERO_P 1
#else
# define BZERO_P 0
#endif

/* Byte count (measured after the destination has been aligned to 8
   bytes) at or above which the 64-byte blocks are written with
   non-temporal stores.  This is somewhat experimental and could be
   made dependent on the cache size.  */
#define LARGE $120000

/* Syntax: GNU as, AT&T operand order, fed through the C preprocessor.
   ENTRY, END and libc_hidden_def are not defined in this file
   (presumably they come from _glibc_inc.h).

   memset build:  void *memset (void *s, int c, size_t n)
     In:   %rdi = s, %sil = c, %rdx = n
     Out:  %rax = s
   bzero build (the including file has #defined memset):
     In:   %rdi = s, %rsi = n
     Out:  no result is loaded into %rax

   Register roles inside the routine:
     %rcx  write cursor
     %rdx  bytes still to be written
     %r8   fill byte replicated into all eight byte lanes
     %rax  scratch / loop counter
   Written: %rax, %rcx, %rdx, %r8, flags (plus %rsi in the bzero build).
   %rdi is never modified.  The stack is not touched.  */

        .text
ENTRY (memset)
#if BZERO_P
        mov     %rsi,%rdx       /* Adjust parameter: length arrives in %rsi.  */
        xorl    %esi,%esi       /* Fill with 0s.  */
#endif
        cmp     $0x7,%rdx       /* Check for small length.  */
        mov     %rdi,%rcx       /* %rcx = write cursor; %rdi is kept intact
                                   so it can be returned.  */
        jbe     7f              /* Uses the flags of the cmp above (mov does
                                   not alter them): n <= 7 -> byte loop.  */

#if BZERO_P
        mov     %rsi,%r8        /* Just copy 0.  */
#else
        /* Populate 8 bit data to full 64-bit.  */
        movabs  $0x0101010101010101,%r8
        movzbl  %sil,%eax       /* Zero-extends the fill byte into %rax.  */
        imul    %rax,%r8        /* %r8 = fill byte in every byte lane.  */
#endif
        test    $0x7,%edi       /* Check for alignment.  */
        jz      2f

        /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
        .p2align 4,,9
1:      /* Align ptr to 8 byte.  At most 7 iterations, and n >= 8 on
           entry, so %rdx cannot wrap below zero here.  */
        mov     %sil,(%rcx)
        dec     %rdx
        inc     %rcx
        test    $0x7,%cl
        jnz     1b

2:      /* Check for really large regions.  */
        mov     %rdx,%rax
        shr     $0x6,%rax       /* %rax = number of whole 64-byte blocks.  */
        je      4f              /* Less than one block left.  */
        cmp     LARGE, %rdx
        jae     11f             /* Unsigned compare: big fill, bypass cache.  */

        /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
        .p2align 4,,11
3:      /* Fill 64 bytes.  */
        mov     %r8,(%rcx)
        mov     %r8,0x8(%rcx)
        mov     %r8,0x10(%rcx)
        mov     %r8,0x18(%rcx)
        mov     %r8,0x20(%rcx)
        mov     %r8,0x28(%rcx)
        mov     %r8,0x30(%rcx)
        mov     %r8,0x38(%rcx)
        add     $0x40,%rcx
        dec     %rax
        jne     3b

4:      /* Fill final bytes.  */
        and     $0x3f,%edx      /* Bytes left after the 64-byte blocks; the
                                   32-bit write also clears the top of %rdx.  */
        mov     %rdx,%rax
        shr     $0x3,%rax       /* %rax = number of whole 8-byte words.  */
        je      6f
5:      /* First in chunks of 8 bytes.  */
        mov     %r8,(%rcx)
        add     $0x8,%rcx
        dec     %rax
        jne     5b
6:
        and     $0x7,%edx       /* 0..7 bytes remain.  */
7:      /* Also the landing point for n <= 7 straight from the entry check.  */
        test    %rdx,%rdx
        je      9f
8:      /* And finally as bytes (up to 7).  */
        mov     %sil,(%rcx)
        inc     %rcx
        dec     %rdx
        jne     8b
9:
#if BZERO_P
        /* nothing */
#else
        /* Load result (only if used as memset).  */
        mov     %rdi,%rax       /* start address of destination is result */
#endif
        retq

        /* Next 3 insns are 14 bytes total, make sure we decode them in one go */
        .p2align 4,,14
11:     /* Fill 64 bytes without polluting the cache.  */
        /* We could use movntdq %xmm0,(%rcx) here to further
           speed up for large cases but let's not use XMM registers.  */
        movnti  %r8,(%rcx)
        movnti  %r8,0x8(%rcx)
        movnti  %r8,0x10(%rcx)
        movnti  %r8,0x18(%rcx)
        movnti  %r8,0x20(%rcx)
        movnti  %r8,0x28(%rcx)
        movnti  %r8,0x30(%rcx)
        movnti  %r8,0x38(%rcx)
        add     $0x40,%rcx
        dec     %rax
        jne     11b
        /* Non-temporal stores are weakly ordered.  Fence them so they
           become globally visible before any store that follows (the
           tail below and whatever the caller writes after we return).  */
        sfence
        jmp     4b              /* Finish the sub-64-byte tail normally.  */

END (memset)
#if !BZERO_P
libc_hidden_def(memset)
#endif