diff -ruN a/sysdeps/arm/arm-features.h b/sysdeps/arm/arm-features.h --- a/sysdeps/arm/arm-features.h 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/arm-features.h 2021-07-14 13:31:23.173127069 +0200 @@ -56,4 +56,6 @@ /* An OS-specific arm-features.h file may define ARM_NO_INDEX_REGISTER to indicate that the two-register addressing modes must never be used. */ +#define NO_THUMB + #endif /* arm-features.h */ diff -ruN a/sysdeps/arm/armv6t2/memchr.S b/sysdeps/arm/armv6t2/memchr.S --- a/sysdeps/arm/armv6t2/memchr.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv6t2/memchr.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,161 +0,0 @@ -/* Copyright (C) 2011-2019 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Code contributed by Dave Gilbert - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#include - -@ This memchr routine is optimised on a Cortex-A9 and should work on all ARMv7 -@ and ARMv6T2 processors. It has a fast path for short sizes, and has an -@ optimised path for large data sets; the worst case is finding the match early -@ in a large data set. -@ Note: The use of cbz/cbnz means it's Thumb only - -@ 2011-07-15 david.gilbert@linaro.org -@ Copy from Cortex strings release 21 and change license -@ http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/view/head:/src/linaro-a9/memchr.S -@ Change function declarations/entry/exit -@ 2011-12-01 david.gilbert@linaro.org -@ Add some fixes from comments received (including use of ldrd instead ldm) -@ 2011-12-07 david.gilbert@linaro.org -@ Removed cbz from align loop - can't be taken - -@ this lets us check a flag in a 00/ff byte easily in either endianness -#ifdef __ARMEB__ -#define CHARTSTMASK(c) 1<<(31-(c*8)) -#else -#define CHARTSTMASK(c) 1<<(c*8) -#endif - .syntax unified - - .text - .thumb - .thumb_func - .global memchr - .type memchr,%function -ENTRY(memchr) - @ r0 = start of memory to scan - @ r1 = character to look for - @ r2 = length - @ returns r0 = pointer to character or NULL if not found - and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char - - cmp r2,#16 @ If it's short don't bother with anything clever - blt 20f - - tst r0, #7 @ If it's already aligned skip the next bit - beq 10f - - @ Work up to an aligned point -5: - ldrb r3, [r0],#1 - subs r2, r2, #1 - cmp r3, r1 - beq 50f @ If it matches exit found - tst r0, #7 - bne 5b @ If not aligned yet then do next byte - -10: - @ At this point, we are aligned, we know we have at least 8 bytes to work with - push {r4,r5,r6,r7} - cfi_adjust_cfa_offset (16) - cfi_rel_offset (r4, 0) - cfi_rel_offset (r5, 4) - cfi_rel_offset (r6, 8) - cfi_rel_offset (r7, 12) - - cfi_remember_state - - orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes - orr r1, r1, r1, lsl #16 - bic r6, r2, #7 @ Number of double words to work with * 8 - mvns r7, #0 @ all F's - movs r3, #0 - -15: - ldrd r4,r5, [r0],#8 - subs r6, r6, #8 - eor 
r4,r4, r1 @ Get it so that r4,r5 have 00's where the bytes match the target - eor r5,r5, r1 - uadd8 r4, r4, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 - sel r4, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION - uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 - sel r5, r4, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION - cbnz r5, 60f - bne 15b @ (Flags from the subs above) If not run out of bytes then go around again - - pop {r4,r5,r6,r7} - cfi_adjust_cfa_offset (-16) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - - and r1,r1,#0xff @ Get r1 back to a single character from the expansion above - and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done - -20: - cbz r2, 40f @ 0 length or hit the end already then not found - -21: @ Post aligned section, or just a short call - ldrb r3,[r0],#1 - subs r2,r2,#1 - eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub - cbz r3, 50f - bne 21b @ on r2 flags - -40: - movs r0,#0 @ not found - DO_RET(lr) - -50: - subs r0,r0,#1 @ found - DO_RET(lr) - -60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was - @ r0 points to the start of the double word after the one that was tested - @ r4 has the 00/ff pattern for the first word, r5 has the chained value - cfi_restore_state - cmp r4, #0 - itte eq - moveq r4, r5 @ the end is in the 2nd word - subeq r0,r0,#3 @ Points to 2nd byte of 2nd word - subne r0,r0,#7 @ or 2nd byte of 1st word - - @ r0 currently points to the 2nd byte of the word containing the hit - tst r4, # CHARTSTMASK(0) @ 1st character - bne 61f - adds r0,r0,#1 - tst r4, # CHARTSTMASK(1) @ 2nd character - ittt eq - addeq r0,r0,#1 - tsteq r4, # (3<<15) @ 2nd & 3rd character - @ If not the 3rd must be the last one - addeq r0,r0,#1 - -61: - pop {r4,r5,r6,r7} - cfi_adjust_cfa_offset (-16) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - - subs r0,r0,#1 - DO_RET(lr) - -END(memchr) -libc_hidden_builtin_def (memchr) diff -ruN a/sysdeps/arm/armv6t2/strlen.S b/sysdeps/arm/armv6t2/strlen.S --- a/sysdeps/arm/armv6t2/strlen.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv6t2/strlen.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,141 +0,0 @@ -/* Copyright (C) 2010-2019 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -/* - Assumes: - ARMv6T2, AArch32 - - */ - -#include -#include - -#ifdef __ARMEB__ -#define S2LO lsl -#define S2HI lsr -#else -#define S2LO lsr -#define S2HI lsl -#endif - -/* This code is best on Thumb. */ - .thumb - -/* Parameters and result. */ -#define srcin r0 -#define result r0 - -/* Internal variables. 
*/ -#define src r1 -#define data1a r2 -#define data1b r3 -#define const_m1 r12 -#define const_0 r4 -#define tmp1 r4 /* Overlaps const_0 */ -#define tmp2 r5 - - .text - .p2align 6 -ENTRY(strlen) - pld [srcin, #0] - strd r4, r5, [sp, #-8]! - cfi_adjust_cfa_offset (8) - cfi_rel_offset (r4, 0) - cfi_rel_offset (r5, 4) - cfi_remember_state - bic src, srcin, #7 - mvn const_m1, #0 - ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ - pld [src, #32] - bne.w .Lmisaligned8 - mov const_0, #0 - mov result, #-8 -.Lloop_aligned: - /* Bytes 0-7. */ - ldrd data1a, data1b, [src] - pld [src, #64] - add result, result, #8 -.Lstart_realigned: - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 8-15. */ - ldrd data1a, data1b, [src, #8] - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 16-23. */ - ldrd data1a, data1b, [src, #16] - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found - - /* Bytes 24-31. */ - ldrd data1a, data1b, [src, #24] - add src, src, #32 - uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ - add result, result, #8 - sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ - uadd8 data1b, data1b, const_m1 - sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cmp data1b, #0 - beq .Lloop_aligned - -.Lnull_found: - cmp data1a, #0 - itt eq - addeq result, result, #4 - moveq data1a, data1b -#ifndef __ARMEB__ - rev data1a, data1a -#endif - clz data1a, data1a - ldrd r4, r5, [sp], #8 - cfi_adjust_cfa_offset (-8) - cfi_restore (r4) - cfi_restore (r5) - add result, result, data1a, lsr #3 /* Bits -> Bytes. */ - DO_RET(lr) - -.Lmisaligned8: - cfi_restore_state - ldrd data1a, data1b, [src] - and tmp2, tmp1, #3 - rsb result, tmp1, #0 - lsl tmp2, tmp2, #3 /* Bytes -> bits. 
*/ - tst tmp1, #4 - pld [src, #64] - S2HI tmp2, const_m1, tmp2 - orn data1a, data1a, tmp2 - itt ne - ornne data1b, data1b, tmp2 - movne data1a, const_m1 - mov const_0, #0 - b .Lstart_realigned - -END(strlen) -libc_hidden_builtin_def (strlen) diff -ruN a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c 2021-07-14 13:33:48.092401122 +0200 @@ -34,7 +34,6 @@ bool use_neon = true; #ifdef __ARM_NEON__ # define __memcpy_neon memcpy -# define __memchr_neon memchr #else use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; #endif @@ -53,9 +52,5 @@ #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); - IFUNC_IMPL (i, name, memchr, - IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon)); - return i; } diff -ruN a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile --- a/sysdeps/arm/armv7/multiarch/Makefile 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/Makefile 2021-07-14 13:33:03.224624420 +0200 @@ -1,4 +1,3 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp memchr_neon memcpy_arm \ - memchr_noneon +sysdep_routines += memcpy_neon memcpy_vfp memcpy_arm endif diff -ruN a/sysdeps/arm/armv7/multiarch/memchr.c b/sysdeps/arm/armv7/multiarch/memchr.c --- a/sysdeps/arm/armv7/multiarch/memchr.c 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/memchr.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,35 +0,0 @@ -/* Multiple versions of memchr. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2017-2019 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -/* For __ARM_NEON__ memchr_neon.S defines memchr directly and ifunc - is not used. */ -#if IS_IN (libc) && !defined (__ARM_NEON__) -# define memchr __redirect_memchr -# include -# undef memchr - -# include - -# define SYMBOL_NAME memchr -# include "ifunc-memchr.h" - -arm_libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR); - -arm_libc_ifunc_hidden_def (__redirect_memchr, memchr); -#endif diff -ruN a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S --- a/sysdeps/arm/armv7/multiarch/memchr_neon.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,202 +0,0 @@ -/* memchr implemented using NEON. - Copyright (C) 2011-2019 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#include - -/* For __ARM_NEON__ this file defines memchr. */ -#ifndef __ARM_NEON__ -# define memchr __memchr_neon -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(a) -#endif - - .arch armv7-a - .fpu neon - - -/* Arguments */ -#define srcin r0 -#define chrin r1 -#define cntin r2 - -/* Retval */ -#define result r0 /* Live range does not overlap with srcin */ - -/* Working registers */ -#define src r1 /* Live range does not overlap with chrin */ -#define tmp r3 -#define synd r0 /* No overlap with srcin or result */ -#define soff r12 - -/* Working NEON registers */ -#define vrepchr q0 -#define vdata0 q1 -#define vdata0_0 d2 /* Lower half of vdata0 */ -#define vdata0_1 d3 /* Upper half of vdata0 */ -#define vdata1 q2 -#define vdata1_0 d4 /* Lower half of vhas_chr0 */ -#define vdata1_1 d5 /* Upper half of vhas_chr0 */ -#define vrepmask q3 -#define vrepmask0 d6 -#define vrepmask1 d7 -#define vend q4 -#define vend0 d8 -#define vend1 d9 - -/* - * Core algorithm: - * - * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per - * byte. Each bit is set if the relevant byte matched the requested character - * and cleared otherwise. Since the bits in the syndrome reflect exactly the - * order in which things occur in the original string, counting trailing zeros - * allows to identify exactly which byte has matched. - */ - - .thumb_func - .p2align 4,,15 - -ENTRY(memchr) - /* Use a simple loop if there are less than 8 bytes to search. */ - cmp cntin, #7 - bhi .Llargestr - and chrin, chrin, #0xff - -.Lsmallstr: - subs cntin, cntin, #1 - blo .Lnotfound /* Return not found if reached end. */ - ldrb tmp, [srcin], #1 - cmp tmp, chrin - bne .Lsmallstr /* Loop again if not found. */ - /* Otherwise fixup address and return. */ - sub result, srcin, #1 - bx lr - - -.Llargestr: - vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ - /* - * Magic constant 0x8040201008040201 allows us to identify which lane - * matches the requested byte. - */ - movw tmp, #0x0201 - movt tmp, #0x0804 - lsl soff, tmp, #4 - vmov vrepmask0, tmp, soff - vmov vrepmask1, tmp, soff - /* Work with aligned 32-byte chunks */ - bic src, srcin, #31 - ands soff, srcin, #31 - beq .Lloopintro /* Go straight to main loop if it's aligned. */ - - /* - * Input string is not 32-byte aligned. We calculate the syndrome - * value for the aligned 32 bytes block containing the first bytes - * and mask the irrelevant part. - */ - vld1.8 {vdata0, vdata1}, [src:256]! 
- sub tmp, soff, #32 - adds cntin, cntin, tmp - vceq.i8 vdata0, vdata0, vrepchr - vceq.i8 vdata1, vdata1, vrepchr - vand vdata0, vdata0, vrepmask - vand vdata1, vdata1, vrepmask - vpadd.i8 vdata0_0, vdata0_0, vdata0_1 - vpadd.i8 vdata1_0, vdata1_0, vdata1_1 - vpadd.i8 vdata0_0, vdata0_0, vdata1_0 - vpadd.i8 vdata0_0, vdata0_0, vdata0_0 - vmov synd, vdata0_0[0] - - /* Clear the soff lower bits */ - lsr synd, synd, soff - lsl synd, synd, soff - /* The first block can also be the last */ - bls .Lmasklast - /* Have we found something already? */ - cbnz synd, .Ltail - - -.Lloopintro: - vpush {vend} - /* 264/265 correspond to d8/d9 for q4 */ - cfi_adjust_cfa_offset (16) - cfi_rel_offset (264, 0) - cfi_rel_offset (265, 8) - .p2align 3,,7 -.Lloop: - vld1.8 {vdata0, vdata1}, [src:256]! - subs cntin, cntin, #32 - vceq.i8 vdata0, vdata0, vrepchr - vceq.i8 vdata1, vdata1, vrepchr - /* If we're out of data we finish regardless of the result. */ - bls .Lend - /* Use a fast check for the termination condition. */ - vorr vend, vdata0, vdata1 - vorr vend0, vend0, vend1 - vmov synd, tmp, vend0 - orrs synd, synd, tmp - /* We're not out of data, loop if we haven't found the character. */ - beq .Lloop - -.Lend: - vpop {vend} - cfi_adjust_cfa_offset (-16) - cfi_restore (264) - cfi_restore (265) - - /* Termination condition found, let's calculate the syndrome value. */ - vand vdata0, vdata0, vrepmask - vand vdata1, vdata1, vrepmask - vpadd.i8 vdata0_0, vdata0_0, vdata0_1 - vpadd.i8 vdata1_0, vdata1_0, vdata1_1 - vpadd.i8 vdata0_0, vdata0_0, vdata1_0 - vpadd.i8 vdata0_0, vdata0_0, vdata0_0 - vmov synd, vdata0_0[0] - cbz synd, .Lnotfound - bhi .Ltail /* Uses the condition code from - subs cntin, cntin, #32 above. */ - - -.Lmasklast: - /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ - neg cntin, cntin - lsl synd, synd, cntin - lsrs synd, synd, cntin - it eq - moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ - - -.Ltail: - /* Count the trailing zeros using bit reversing */ - rbit synd, synd - /* Compensate the last post-increment */ - sub src, src, #32 - /* Count the leading zeros */ - clz synd, synd - /* Compute the potential result and return */ - add result, src, synd - bx lr - - -.Lnotfound: - /* Set result to NULL if not found and return */ - mov result, #0 - bx lr - -END(memchr) -libc_hidden_builtin_def (memchr) diff -ruN a/sysdeps/arm/armv7/multiarch/memchr_noneon.S b/sysdeps/arm/armv7/multiarch/memchr_noneon.S --- a/sysdeps/arm/armv7/multiarch/memchr_noneon.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/memchr_noneon.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,5 +0,0 @@ -#define memchr __memchr_noneon -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) - -#include diff -ruN a/sysdeps/arm/armv7/multiarch/rtld-memchr.S b/sysdeps/arm/armv7/multiarch/rtld-memchr.S --- a/sysdeps/arm/armv7/multiarch/rtld-memchr.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/multiarch/rtld-memchr.S 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -#include diff -ruN a/sysdeps/arm/armv7/strcmp.S b/sysdeps/arm/armv7/strcmp.S --- a/sysdeps/arm/armv7/strcmp.S 2021-07-14 13:30:27.757425373 +0200 +++ b/sysdeps/arm/armv7/strcmp.S 1970-01-01 01:00:00.000000000 +0100 @@ -1,496 +0,0 @@ -/* strcmp implementation for ARMv7-A, optimized for Cortex-A15. - Copyright (C) 2012-2019 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#include -#include - -/* Implementation of strcmp for ARMv7 when DSP instructions are - available. Use ldrd to support wider loads, provided the data - is sufficiently aligned. Use saturating arithmetic to optimize - the compares. */ - -/* Build Options: - STRCMP_PRECHECK: Run a quick pre-check of the first byte in the - string. If comparing completely random strings the pre-check will - save time, since there is a very high probability of a mismatch in - the first character: we save significant overhead if this is the - common case. However, if strings are likely to be identical (e.g. - because we're verifying a hit in a hash table), then this check - is largely redundant. */ - -#define STRCMP_PRECHECK 1 - - .syntax unified - -#ifdef __ARM_BIG_ENDIAN -# define S2LO lsl -# define S2LOEQ lsleq -# define S2HI lsr -# define MSB 0x000000ff -# define LSB 0xff000000 -# define BYTE0_OFFSET 24 -# define BYTE1_OFFSET 16 -# define BYTE2_OFFSET 8 -# define BYTE3_OFFSET 0 -#else /* not __ARM_BIG_ENDIAN */ -# define S2LO lsr -# define S2LOEQ lsreq -# define S2HI lsl -# define BYTE0_OFFSET 0 -# define BYTE1_OFFSET 8 -# define BYTE2_OFFSET 16 -# define BYTE3_OFFSET 24 -# define MSB 0xff000000 -# define LSB 0x000000ff -#endif /* not __ARM_BIG_ENDIAN */ - -/* Parameters and result. */ -#define src1 r0 -#define src2 r1 -#define result r0 /* Overlaps src1. */ - -/* Internal variables. */ -#define tmp1 r4 -#define tmp2 r5 -#define const_m1 r12 - -/* Additional internal variables for 64-bit aligned data. */ -#define data1a r2 -#define data1b r3 -#define data2a r6 -#define data2b r7 -#define syndrome_a tmp1 -#define syndrome_b tmp2 - -/* Additional internal variables for 32-bit aligned data. */ -#define data1 r2 -#define data2 r3 -#define syndrome tmp2 - - - .thumb - -/* In Thumb code we can't use MVN with a register shift, but we do have ORN. */ -.macro prepare_mask mask_reg, nbits_reg - S2HI \mask_reg, const_m1, \nbits_reg -.endm -.macro apply_mask data_reg, mask_reg - orn \data_reg, \data_reg, \mask_reg -.endm - - /* Macro to compute and return the result value for word-aligned - cases. */ - .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 -#ifdef __ARM_BIG_ENDIAN - /* If data1 contains a zero byte, then syndrome will contain a 1 in - bit 7 of that byte. Otherwise, the highest set bit in the - syndrome will highlight the first different bit. It is therefore - sufficient to extract the eight bits starting with the syndrome - bit. */ - clz tmp1, \synd - lsl r1, \d2, tmp1 - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - lsl \d1, \d1, tmp1 - lsr result, \d1, #24 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - sub result, result, r1, lsr #24 - bx lr -#else - /* To use the big-endian trick we'd have to reverse all three words. - that's slower than this approach. 
*/ - rev \synd, \synd - clz tmp1, \synd - bic tmp1, tmp1, #7 - lsr r1, \d2, tmp1 - .if \restore_r6 - ldrd r6, r7, [sp, #8] - .endif - lsr \d1, \d1, tmp1 - and result, \d1, #255 - and r1, r1, #255 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - sub result, result, r1 - - bx lr -#endif - .endm - - .text - .p2align 5 -.Lstrcmp_start_addr: -#if STRCMP_PRECHECK == 1 -.Lfastpath_exit: - sub r0, r2, r3 - bx lr - nop -#endif -ENTRY (strcmp) -#if STRCMP_PRECHECK == 1 - ldrb r2, [src1] - ldrb r3, [src2] - cmp r2, #1 - it cs - cmpcs r2, r3 - bne .Lfastpath_exit -#endif - strd r4, r5, [sp, #-16]! - cfi_def_cfa_offset (16) - cfi_offset (r4, -16) - cfi_offset (r5, -12) - orr tmp1, src1, src2 - strd r6, r7, [sp, #8] - cfi_offset (r6, -8) - cfi_offset (r7, -4) - mvn const_m1, #0 - lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 - -.Lnot_aligned: - eor tmp1, src1, src2 - tst tmp1, #7 - bne .Lmisaligned8 - - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - and tmp1, src1, #7 - bic src1, src1, #7 - and tmp2, tmp1, #3 - bic src2, src2, #7 - lsl tmp2, tmp2, #3 /* Bytes -> bits. */ - ldrd data1a, data1b, [src1], #16 - tst tmp1, #4 - ldrd data2a, data2b, [src2], #16 - prepare_mask tmp1, tmp2 - apply_mask data1a, tmp1 - apply_mask data2a, tmp1 - beq .Lstart_realigned8 - apply_mask data1b, tmp1 - mov data1a, const_m1 - apply_mask data2b, tmp1 - mov data2a, const_m1 - b .Lstart_realigned8 - - /* Unwind the inner loop by a factor of 2, giving 16 bytes per - pass. */ - .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ - .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: - ldrd data1a, data1b, [src1], #16 - ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b - - ldrd data1a, data1b, [src1, #-8] - ldrd data2a, data2b, [src2, #-8] - uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ - eor syndrome_a, data1a, data2a - sel syndrome_a, syndrome_a, const_m1 - uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ - eor syndrome_b, data1b, data2b - sel syndrome_b, syndrome_b, const_m1 - /* Can't use CBZ for backwards branch. */ - orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 - -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a - -.Ldiff_in_b: - strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 - -.Ldiff_in_a: - cfi_restore_state - strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 - - cfi_restore_state -.Lmisaligned8: - tst tmp1, #3 - bne .Lmisaligned4 - ands tmp1, src1, #3 - bne .Lmutual_align4 - - /* Unrolled by a factor of 2, to reduce the number of post-increment - operations. */ -.Lloop_aligned4: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned4: - uadd8 syndrome, data1, const_m1 /* Only need GE bits. 
*/ - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done - ldr data1, [src1, #-4] - ldr data2, [src2, #-4] - uadd8 syndrome, data1, const_m1 - eor syndrome, data1, data2 - sel syndrome, syndrome, const_m1 - cmp syndrome, #0 - beq .Lloop_aligned4 - -.Laligned4_done: - strcmp_epilogue_aligned syndrome, data1, data2, 0 - -.Lmutual_align4: - cfi_restore_state - /* Deal with mutual misalignment by aligning downwards and then - masking off the unwanted loaded data to prevent a difference. */ - lsl tmp1, tmp1, #3 /* Bytes -> bits. */ - bic src1, src1, #3 - ldr data1, [src1], #8 - bic src2, src2, #3 - ldr data2, [src2], #8 - - prepare_mask tmp1, tmp1 - apply_mask data1, tmp1 - apply_mask data2, tmp1 - b .Lstart_realigned4 - -.Lmisaligned4: - ands tmp1, src1, #3 - beq .Lsrc1_aligned - sub src2, src2, tmp1 - bic src1, src1, #3 - lsls tmp1, tmp1, #31 - ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 - -#if STRCMP_PRECHECK == 0 - ldrb data2, [src2, #1] - uxtb tmp1, data1, ror #BYTE1_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m1: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - add src2, src2, #4 - cbnz data2, .Lsrc1_aligned -#else /* STRCMP_PRECHECK */ - /* If we've done the pre-check, then we don't need to check the - first byte again here. */ - ldrb data2, [src2, #2] - uxtb tmp1, data1, ror #BYTE2_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit - -.Laligned_m2: - ldrb data2, [src2, #3] - uxtb tmp1, data1, ror #BYTE3_OFFSET - subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 -#endif - -.Lmisaligned_exit: - mov result, tmp1 - ldr r4, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - cfi_restore (r6) - cfi_restore (r7) - bx lr - -#if STRCMP_PRECHECK == 1 -.Laligned_m1: - add src2, src2, #4 -#endif -.Lsrc1_aligned: - cfi_restore_state - /* src1 is word aligned, but src2 has no common alignment - with it. */ - ldr data1, [src1], #4 - lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ - - bic src2, src2, #3 - ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ - - /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: - bic tmp1, data1, #MSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #8 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #24 - bne 6f - ldr data1, [src1], #4 - b .Loverlap3 -4: - S2LO data2, data2, #8 - b .Lstrcmp_tail - -5: - bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal - - /* We can only get here if the MSB of data1 contains 0, so - fast-path the exit. */ - ldrb result, [src2] - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 Not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - neg result, result - bx lr - -6: - cfi_restore_state - S2LO data1, data1, #24 - and data2, data2, #LSB - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap2: - and tmp1, data1, const_m1, S2LO #16 - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #16 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #16 - bne 6f - ldr data1, [src1], #4 - b .Loverlap2 -4: - S2LO data2, data2, #16 - b .Lstrcmp_tail -5: - ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal - - ldrh data2, [src2] - S2LO data1, data1, #16 -#ifdef __ARM_BIG_ENDIAN - lsl data2, data2, #16 -#endif - b .Lstrcmp_tail - -6: - S2LO data1, data1, #16 - and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail - - .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap1: - and tmp1, data1, #LSB - uadd8 syndrome, data1, const_m1 - eors syndrome, tmp1, data2, S2LO #24 - sel syndrome, syndrome, const_m1 - bne 4f - cbnz syndrome, 5f - ldr data2, [src2], #4 - eor tmp1, tmp1, data1 - cmp tmp1, data2, S2HI #8 - bne 6f - ldr data1, [src1], #4 - b .Loverlap1 -4: - S2LO data2, data2, #24 - b .Lstrcmp_tail -5: - tst syndrome, #LSB - bne .Lstrcmp_done_equal - ldr data2, [src2] -6: - S2LO data1, data1, #8 - bic data2, data2, #MSB - b .Lstrcmp_tail - -.Lstrcmp_done_equal: - mov result, #0 - ldrd r4, r5, [sp], #16 - cfi_remember_state - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - bx lr - -.Lstrcmp_tail: - cfi_restore_state -#ifndef __ARM_BIG_ENDIAN - rev data1, data1 - rev data2, data2 - /* Now everything looks big-endian... */ -#endif - uadd8 tmp1, data1, const_m1 - eor tmp1, data1, data2 - sel syndrome, tmp1, const_m1 - clz tmp1, syndrome - lsl data1, data1, tmp1 - lsl data2, data2, tmp1 - lsr result, data1, #24 - ldrd r4, r5, [sp], #16 - cfi_def_cfa_offset (0) - cfi_restore (r4) - cfi_restore (r5) - /* R6/7 not used in this sequence. */ - cfi_restore (r6) - cfi_restore (r7) - sub result, result, data2, lsr #24 - bx lr -END (strcmp) -libc_hidden_builtin_def (strcmp)
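
The assembly files deleted above are the Thumb-only string routines (their own comments note that the use of cbz/cbnz and the .thumb directives tie them to Thumb), and the NO_THUMB define added to arm-features.h keeps the remaining ARM sysdeps sources out of Thumb encodings, so after the patch memchr, strlen and strcmp are left to ARM-mode or generic C fallbacks. As a rough, hypothetical sketch of what the generic C path boils down to -- simple_memchr below is an illustration written for this note, not glibc's actual fallback -- a byte-at-a-time memchr looks like this:

#include <stddef.h>

/* Illustration only: a plain byte-wise memchr of the kind the generic
   C fallback reduces to once the Thumb-only assembly version is gone.
   Not glibc's actual implementation.  */
void *
simple_memchr (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  /* Only the low byte of C matters, as the removed assembly also
     assumed ("and r1,r1,#0xff").  */
  unsigned char ch = (unsigned char) c;

  while (n-- != 0)
    {
      if (*p == ch)
        return (void *) p;
      ++p;
    }
  return NULL;
}

The deleted armv6t2/armv7 versions sped this loop up by scanning 8 or 32 bytes per iteration with uadd8/sel or NEON compares, as described in their own comments; that is the performance this patch trades away in exchange for not requiring Thumb.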