#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */

extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest	Destination address
 * @v src	Source address
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0] = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2] = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4] = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
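	 *
	 * For example, a constant 20-byte copy compiles to five
	 * "movsl" instructions (5 bytes of opcode), against the fixed
	 * 8 bytes of the load-ecx/"rep movsb" sequence.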
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len              16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)      8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)   4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */
	esi = src;
	edi = dest;
	if ( len >= 26 )
		return __memcpy ( dest, src, len );
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	return dest;
}

/**
 * Copy memory area
 *
 * @v dest	Destination address
 * @v src	Source address
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	if ( __builtin_constant_p ( len ) ) {
		return __constant_memcpy ( dest, src, len );
	} else {
		return __memcpy ( dest, src, len );
	}
}

extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest	Destination address
 * @v src	Source address
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	if ( __builtin_constant_p ( offset ) ) {
		if ( offset <= 0 ) {
			/* Destination lies at or before the source; a
			 * forward copy never overwrites unread bytes.
			 */
			return memcpy ( dest, src, len );
		} else {
			/* Destination lies after the source; copy
			 * backwards to avoid clobbering unread bytes.
			 */
			return __memcpy_reverse ( dest, src, len );
		}
	} else {
		return __memmove ( dest, src, len );
	}
}

/**
 * Fill memory region
 *
 * @v dest	Destination address
 * @v fill	Fill pattern
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}

/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest	Destination address
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
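	 *
	 * (Illustrative only: given a hypothetical caller such as
	 *
	 *     struct foo bar;
	 *     memset ( &bar, 0, sizeof ( bar ) );
	 *     bar.first_field = 1;
	 *
	 * the compiler can fold the zero store to first_field into
	 * the assignment that immediately follows it.)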
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2 : /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4 : /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;
	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2] = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4] = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */
	edi = dest;
	eax = 0;
	if ( len >= 26 )
		return __memset ( dest, 0, len );
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	return dest;
}

/**
 * Fill memory region
 *
 * @v dest	Destination address
 * @v fill	Fill pattern
 * @v len	Length
 * @ret dest	Destination address
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {
	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
	     __builtin_constant_p ( len ) ) {
		return __constant_memset_zero ( dest, len );
	} else {
		return __memset ( dest, fill, len );
	}
}

#endif /* X86_BITS_STRING_H */