diff --git a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
new file mode 100644
index 0000000000..db61807bff
--- /dev/null
+++ b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
@@ -0,0 +1,3779 @@
+From 013b505284379453df6637f009a224f6d5c6f3bd Mon Sep 17 00:00:00 2001
+From: "Reddy, Alavala Srinivasa"
+Date: Wed, 13 Sep 2023 18:36:21 +0530
+Subject: [PATCH 3/5] Optimize bionic memory functions with avx2 instructions
+
+The following memory-related functions are optimized with
+an AVX2 implementation ported from glibc 2.20
+(64-bit only):
+ - memchr
+ - memcmp
+ - memrchr
+
+Test done: Build and boot are fine; ran the benchmarks suite.
+
+Change-Id: I956773c79b9bcebee69726820eaa74c709df7081
+Signed-off-by: ahs
+Signed-off-by: Ravi Kumar Soni
+---
+ libc/Android.bp | 36 +-
+ .../kabylake/string/avx2-memcpy-kbl.S | 2052 +++++++++++++++++
+ .../arch-x86_64/dynamic_function_dispatch.cpp | 38 +
+ libc/arch-x86_64/generic/string/memchr.c | 20 +
+ libc/arch-x86_64/generic/string/memrchr.c | 20 +
+ libc/arch-x86_64/generic/string/wmemset.c | 20 +
+ libc/arch-x86_64/{string => include}/cache.h | 0
+ .../kabylake/string/avx2-memchr-kbl.S | 371 +++
+ .../kabylake/string/avx2-memcmp-kbl.S | 428 ++++
+ .../kabylake/string/avx2-memrchr-kbl.S | 408 ++++
+ .../kabylake/string/avx2-wmemset-kbl.S | 140 ++
+ .../string/sse2-memmove-slm.S | 4 +-
+ .../{ => silvermont}/string/sse2-memset-slm.S | 0
+ .../{ => silvermont}/string/sse2-stpcpy-slm.S | 0
+ .../string/sse2-stpncpy-slm.S | 0
+ .../{ => silvermont}/string/sse2-strcat-slm.S | 0
+ .../{ => silvermont}/string/sse2-strcpy-slm.S | 0
+ .../{ => silvermont}/string/sse2-strlen-slm.S | 0
+ .../string/sse2-strncat-slm.S | 0
+ .../string/sse2-strncpy-slm.S | 0
+ .../{ => silvermont}/string/sse4-memcmp-slm.S | 2 +-
+ .../string/ssse3-strcmp-slm.S | 0
+ .../string/ssse3-strncmp-slm.S | 0
+ libc/arch-x86_64/static_function_dispatch.S | 6 +
+ 24 files changed, 3528 insertions(+), 17 deletions(-)
+ create mode 100644 libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S
+ create mode 100644 libc/arch-x86_64/generic/string/memchr.c
+ create mode 100644 libc/arch-x86_64/generic/string/memrchr.c
+ create mode 100644 libc/arch-x86_64/generic/string/wmemset.c
+ rename libc/arch-x86_64/{string => include}/cache.h (100%)
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-memmove-slm.S (99%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-memset-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpcpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpncpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcat-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strlen-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncat-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse4-memcmp-slm.S (99%)
+ rename libc/arch-x86_64/{ => 
silvermont}/string/ssse3-strcmp-slm.S (100%) + rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strncmp-slm.S (100%) + +diff --git a/libc/Android.bp b/libc/Android.bp +index 943d41fba..530ce9111 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -617,8 +617,6 @@ cc_library_static { + }, + x86_64: { + srcs: [ +- "upstream-openbsd/lib/libc/string/memchr.c", +- "upstream-openbsd/lib/libc/string/memrchr.c", + "upstream-openbsd/lib/libc/string/strlcat.c", + "upstream-openbsd/lib/libc/string/strlcpy.c", + ], +@@ -1187,6 +1185,7 @@ cc_library_static { + ], + }, + x86_64: { ++ include_dirs: ["bionic/libc/arch-x86_64/include"], + srcs: [ + "arch-x86_64/bionic/__bionic_clone.S", + "arch-x86_64/bionic/_exit_with_stack_teardown.S", +@@ -1196,18 +1195,27 @@ cc_library_static { + "arch-x86_64/bionic/vfork.S", + + "arch-x86_64/string/avx2-memset-kbl.S", +- "arch-x86_64/string/sse2-memmove-slm.S", +- "arch-x86_64/string/sse2-memset-slm.S", +- "arch-x86_64/string/sse2-stpcpy-slm.S", +- "arch-x86_64/string/sse2-stpncpy-slm.S", +- "arch-x86_64/string/sse2-strcat-slm.S", +- "arch-x86_64/string/sse2-strcpy-slm.S", +- "arch-x86_64/string/sse2-strlen-slm.S", +- "arch-x86_64/string/sse2-strncat-slm.S", +- "arch-x86_64/string/sse2-strncpy-slm.S", +- "arch-x86_64/string/sse4-memcmp-slm.S", +- "arch-x86_64/string/ssse3-strcmp-slm.S", +- "arch-x86_64/string/ssse3-strncmp-slm.S", ++ "arch-x86_64/silvermont/string/sse2-memmove-slm.S", ++ "arch-x86_64/silvermont/string/sse2-memset-slm.S", ++ "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-stpncpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strcat-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strcpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strlen-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strncat-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strncpy-slm.S", ++ "arch-x86_64/silvermont/string/sse4-memcmp-slm.S", ++ "arch-x86_64/silvermont/string/ssse3-strcmp-slm.S", ++ "arch-x86_64/silvermont/string/ssse3-strncmp-slm.S", ++ ++ //"arch-x86_64/generic/string/wmemset.c" ++ "arch-x86_64/generic/string/memchr.c", ++ "arch-x86_64/generic/string/memrchr.c", ++ ++ //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" ++ "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", + + "bionic/strchr.cpp", + "bionic/strchrnul.cpp", +diff --git a/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S +new file mode 100644 +index 000000000..69fca7cf1 +--- /dev/null ++++ b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S +@@ -0,0 +1,2052 @@ ++#define ENTRY(f) \ ++ .text; \ ++ .globl f; \ ++ .p2align 4, 0x90; \ ++ .type f,@function; \ ++ f: \ ++ ++#define END(f) ++ .size f, .-f; \ ++ .section .rodata,"a",@progbits; \ ++ .p2align 2 \ ++ ++ENTRY(memcpy_avx2) ++# %bb.0: ++ pushl %ebp ++ pushl %ebx ++ pushl %edi ++ pushl %esi ++ movl 28(%esp), %ebx ++ movl 24(%esp), %ecx ++ movl 20(%esp), %eax ++ calll .L0$pb ++.L0$pb: ++ popl %esi ++.Ltmp0: ++ addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %esi ++ cmpl $256, %ebx # imm = 0x100 ++ ja .LBB0_251 ++# %bb.1: ++ leal -1(%ebx), %edi ++ cmpl $255, %edi ++ ja .LBB0_270 ++# %bb.2: ++ addl .LJTI0_1@GOTOFF(%esi,%edi,4), %esi ++ leal (%eax,%ebx), %edx ++ addl %ebx, %ecx ++ jmpl *%esi ++.LBB0_251: ++ movl %eax, %ebp ++ vmovups (%ecx), %ymm0 ++ movl %ebx, %edi ++ negl %ebp ++ andl $31, %ebp ++ subl %ebp, %edi ++ addl %ebp, %ecx ++ leal (%eax,%ebp), %edx ++ 
cmpl $2097152, %edi # imm = 0x200000 ++ vmovups %ymm0, (%eax) ++ ja .LBB0_256 ++# %bb.252: ++ cmpl $256, %edi # imm = 0x100 ++ jb .LBB0_260 ++# %bb.253: ++ subl %ebp, %ebx ++ .p2align 4, 0x90 ++.LBB0_254: # =>This Inner Loop Header: Depth=1 ++ vmovups (%ecx), %ymm0 ++ vmovups 32(%ecx), %ymm1 ++ vmovups 64(%ecx), %ymm2 ++ vmovups 96(%ecx), %ymm3 ++ vmovups 128(%ecx), %ymm4 ++ vmovups 160(%ecx), %ymm5 ++ vmovups 192(%ecx), %ymm6 ++ vmovups 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovups %ymm0, (%edx) ++ vmovups %ymm1, 32(%edx) ++ vmovups %ymm2, 64(%edx) ++ vmovups %ymm3, 96(%edx) ++ vmovups %ymm4, 128(%edx) ++ vmovups %ymm5, 160(%edx) ++ vmovups %ymm6, 192(%edx) ++ vmovups %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_254 ++# %bb.255: ++ movzbl %bl, %edi ++ leal -1(%edi), %ebx ++ cmpl $255, %ebx ++ jbe .LBB0_261 ++ jmp .LBB0_270 ++.LBB0_256: ++ prefetchnta (%ecx) ++ subl %ebp, %ebx ++ testb $31, %cl ++ je .LBB0_257 ++ .p2align 4, 0x90 ++.LBB0_258: # =>This Inner Loop Header: Depth=1 ++ vmovups (%ecx), %ymm0 ++ vmovups 32(%ecx), %ymm1 ++ vmovups 64(%ecx), %ymm2 ++ vmovups 96(%ecx), %ymm3 ++ vmovups 128(%ecx), %ymm4 ++ vmovups 160(%ecx), %ymm5 ++ vmovups 192(%ecx), %ymm6 ++ vmovups 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovntps %ymm0, (%edx) ++ vmovntps %ymm1, 32(%edx) ++ vmovntps %ymm2, 64(%edx) ++ vmovntps %ymm3, 96(%edx) ++ vmovntps %ymm4, 128(%edx) ++ vmovntps %ymm5, 160(%edx) ++ vmovntps %ymm6, 192(%edx) ++ vmovntps %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_258 ++ jmp .LBB0_259 ++ .p2align 4, 0x90 ++.LBB0_257: # =>This Inner Loop Header: Depth=1 ++ vmovaps (%ecx), %ymm0 ++ vmovaps 32(%ecx), %ymm1 ++ vmovaps 64(%ecx), %ymm2 ++ vmovaps 96(%ecx), %ymm3 ++ vmovaps 128(%ecx), %ymm4 ++ vmovaps 160(%ecx), %ymm5 ++ vmovaps 192(%ecx), %ymm6 ++ vmovaps 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovntps %ymm0, (%edx) ++ vmovntps %ymm1, 32(%edx) ++ vmovntps %ymm2, 64(%edx) ++ vmovntps %ymm3, 96(%edx) ++ vmovntps %ymm4, 128(%edx) ++ vmovntps %ymm5, 160(%edx) ++ vmovntps %ymm6, 192(%edx) ++ vmovntps %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_257 ++.LBB0_259: ++ sfence ++ movzbl %bl, %edi ++.LBB0_260: ++ leal -1(%edi), %ebx ++ cmpl $255, %ebx ++ ja .LBB0_270 ++.LBB0_261: ++ addl .LJTI0_0@GOTOFF(%esi,%ebx,4), %esi ++ addl %edi, %edx ++ addl %edi, %ecx ++ jmpl *%esi ++.LBB0_11: ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++ vmovups -35(%ecx), %ymm0 ++ vmovups %ymm0, -35(%edx) ++.LBB0_12: ++ movzwl -3(%ecx), %esi ++ movw %si, -3(%edx) ++ jmp .LBB0_6 ++.LBB0_17: ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++ vmovups -37(%ecx), %ymm0 ++ vmovups %ymm0, -37(%edx) ++.LBB0_18: ++ movl -5(%ecx), %esi ++ movl %esi, -5(%edx) ++ jmp .LBB0_6 ++.LBB0_19: ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++ vmovups -38(%ecx), %ymm0 ++ vmovups %ymm0, -38(%edx) ++.LBB0_20: ++ movl -6(%ecx), %esi ++ movl %esi, -6(%edx) ++ jmp .LBB0_10 ++.LBB0_21: ++ vmovups -135(%ecx), 
%ymm0 ++ vmovups %ymm0, -135(%edx) ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++ vmovups -39(%ecx), %ymm0 ++ vmovups %ymm0, -39(%edx) ++.LBB0_22: ++ movl -7(%ecx), %esi ++ movl %esi, -7(%edx) ++ jmp .LBB0_16 ++.LBB0_27: ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++ vmovups -41(%ecx), %ymm0 ++ vmovups %ymm0, -41(%edx) ++.LBB0_28: ++ vmovsd -9(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -9(%edx) ++ jmp .LBB0_6 ++.LBB0_29: ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++ vmovups -42(%ecx), %ymm0 ++ vmovups %ymm0, -42(%edx) ++.LBB0_30: ++ vmovsd -10(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -10(%edx) ++ jmp .LBB0_10 ++.LBB0_31: ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) ++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++ vmovups -43(%ecx), %ymm0 ++ vmovups %ymm0, -43(%edx) ++.LBB0_32: ++ vmovsd -11(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -11(%edx) ++ jmp .LBB0_16 ++.LBB0_33: ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++ vmovups -44(%ecx), %ymm0 ++ vmovups %ymm0, -44(%edx) ++.LBB0_34: ++ vmovsd -12(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -12(%edx) ++ jmp .LBB0_16 ++.LBB0_35: ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++ vmovups -45(%ecx), %ymm0 ++ vmovups %ymm0, -45(%edx) ++.LBB0_36: ++ vmovsd -13(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -13(%edx) ++ jmp .LBB0_26 ++.LBB0_37: ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++ vmovups -46(%ecx), %ymm0 ++ vmovups %ymm0, -46(%edx) ++.LBB0_38: ++ vmovsd -14(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -14(%edx) ++ jmp .LBB0_26 ++.LBB0_39: ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++ vmovups -47(%ecx), %ymm0 ++ vmovups %ymm0, -47(%edx) ++.LBB0_40: ++ vmovsd -15(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -15(%edx) ++ jmp .LBB0_26 ++.LBB0_45: ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++ vmovups -49(%ecx), %ymm0 ++ vmovups %ymm0, -49(%edx) ++.LBB0_46: ++ vmovups -17(%ecx), %xmm0 ++ vmovups %xmm0, -17(%edx) ++ jmp .LBB0_6 ++.LBB0_47: ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++ vmovups -50(%ecx), %ymm0 ++ vmovups %ymm0, -50(%edx) ++.LBB0_48: ++ vmovups -18(%ecx), %xmm0 ++ vmovups %xmm0, -18(%edx) ++ jmp .LBB0_10 ++.LBB0_49: ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++ vmovups -51(%ecx), %ymm0 ++ vmovups %ymm0, 
-51(%edx) ++.LBB0_50: ++ vmovups -19(%ecx), %xmm0 ++ vmovups %xmm0, -19(%edx) ++ jmp .LBB0_16 ++.LBB0_51: ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, -84(%edx) ++ vmovups -52(%ecx), %ymm0 ++ vmovups %ymm0, -52(%edx) ++.LBB0_52: ++ vmovups -20(%ecx), %xmm0 ++ vmovups %xmm0, -20(%edx) ++ jmp .LBB0_16 ++.LBB0_53: ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++ vmovups -53(%ecx), %ymm0 ++ vmovups %ymm0, -53(%edx) ++.LBB0_54: ++ vmovups -21(%ecx), %xmm0 ++ vmovups %xmm0, -21(%edx) ++ jmp .LBB0_26 ++.LBB0_55: ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++ vmovups -54(%ecx), %ymm0 ++ vmovups %ymm0, -54(%edx) ++.LBB0_56: ++ vmovups -22(%ecx), %xmm0 ++ vmovups %xmm0, -22(%edx) ++ jmp .LBB0_26 ++.LBB0_57: ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++ vmovups -55(%ecx), %ymm0 ++ vmovups %ymm0, -55(%edx) ++.LBB0_58: ++ vmovups -23(%ecx), %xmm0 ++ vmovups %xmm0, -23(%edx) ++ jmp .LBB0_26 ++.LBB0_59: ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++ vmovups -56(%ecx), %ymm0 ++ vmovups %ymm0, -56(%edx) ++.LBB0_60: ++ vmovups -24(%ecx), %xmm0 ++ vmovups %xmm0, -24(%edx) ++ jmp .LBB0_26 ++.LBB0_61: ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++ vmovups -57(%ecx), %ymm0 ++ vmovups %ymm0, -57(%edx) ++.LBB0_62: ++ vmovups -25(%ecx), %xmm0 ++ vmovups %xmm0, -25(%edx) ++ jmp .LBB0_44 ++.LBB0_63: ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++ vmovups -58(%ecx), %ymm0 ++ vmovups %ymm0, -58(%edx) ++.LBB0_64: ++ vmovups -26(%ecx), %xmm0 ++ vmovups %xmm0, -26(%edx) ++ jmp .LBB0_44 ++.LBB0_65: ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++ vmovups -59(%ecx), %ymm0 ++ vmovups %ymm0, -59(%edx) ++.LBB0_66: ++ vmovups -27(%ecx), %xmm0 ++ vmovups %xmm0, -27(%edx) ++ jmp .LBB0_44 ++.LBB0_67: ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++ vmovups -60(%ecx), %ymm0 ++ vmovups %ymm0, -60(%edx) ++.LBB0_68: ++ vmovups -28(%ecx), %xmm0 ++ vmovups %xmm0, -28(%edx) ++ jmp .LBB0_44 ++.LBB0_69: ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++ vmovups -61(%ecx), %ymm0 ++ vmovups %ymm0, -61(%edx) ++.LBB0_70: ++ vmovups -29(%ecx), %xmm0 ++ vmovups %xmm0, -29(%edx) ++ jmp .LBB0_44 ++.LBB0_71: ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++ vmovups -62(%ecx), %ymm0 ++ 
vmovups %ymm0, -62(%edx) ++.LBB0_72: ++ vmovups -30(%ecx), %xmm0 ++ vmovups %xmm0, -30(%edx) ++ jmp .LBB0_44 ++.LBB0_73: ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++ vmovups -63(%ecx), %ymm0 ++ vmovups %ymm0, -63(%edx) ++.LBB0_74: ++ vmovups -31(%ecx), %xmm0 ++ vmovups %xmm0, -31(%edx) ++ jmp .LBB0_44 ++.LBB0_75: ++ vmovups -193(%ecx), %ymm0 ++ vmovups %ymm0, -193(%edx) ++.LBB0_76: ++ vmovups -161(%ecx), %ymm0 ++ vmovups %ymm0, -161(%edx) ++.LBB0_3: ++ vmovups -129(%ecx), %ymm0 ++ vmovups %ymm0, -129(%edx) ++ vmovups -97(%ecx), %ymm0 ++ vmovups %ymm0, -97(%edx) ++.LBB0_4: ++ vmovups -65(%ecx), %ymm0 ++ vmovups %ymm0, -65(%edx) ++.LBB0_5: ++ vmovups -33(%ecx), %ymm0 ++ vmovups %ymm0, -33(%edx) ++.LBB0_6: ++ movb -1(%ecx), %cl ++ movb %cl, -1(%edx) ++ jmp .LBB0_270 ++.LBB0_77: ++ vmovups -194(%ecx), %ymm0 ++ vmovups %ymm0, -194(%edx) ++.LBB0_78: ++ vmovups -162(%ecx), %ymm0 ++ vmovups %ymm0, -162(%edx) ++.LBB0_7: ++ vmovups -130(%ecx), %ymm0 ++ vmovups %ymm0, -130(%edx) ++ vmovups -98(%ecx), %ymm0 ++ vmovups %ymm0, -98(%edx) ++.LBB0_8: ++ vmovups -66(%ecx), %ymm0 ++ vmovups %ymm0, -66(%edx) ++.LBB0_9: ++ vmovups -34(%ecx), %ymm0 ++ vmovups %ymm0, -34(%edx) ++.LBB0_10: ++ movzwl -2(%ecx), %ecx ++ movw %cx, -2(%edx) ++ jmp .LBB0_270 ++.LBB0_79: ++ vmovups -195(%ecx), %ymm0 ++ vmovups %ymm0, -195(%edx) ++.LBB0_80: ++ vmovups -163(%ecx), %ymm0 ++ vmovups %ymm0, -163(%edx) ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++.LBB0_81: ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++.LBB0_82: ++ vmovups -35(%ecx), %ymm0 ++ vmovups %ymm0, -35(%edx) ++ jmp .LBB0_16 ++.LBB0_83: ++ vmovups -196(%ecx), %ymm0 ++ vmovups %ymm0, -196(%edx) ++.LBB0_84: ++ vmovups -164(%ecx), %ymm0 ++ vmovups %ymm0, -164(%edx) ++.LBB0_13: ++ vmovups -132(%ecx), %ymm0 ++ vmovups %ymm0, -132(%edx) ++ vmovups -100(%ecx), %ymm0 ++ vmovups %ymm0, -100(%edx) ++.LBB0_14: ++ vmovups -68(%ecx), %ymm0 ++ vmovups %ymm0, -68(%edx) ++.LBB0_15: ++ vmovups -36(%ecx), %ymm0 ++ vmovups %ymm0, -36(%edx) ++.LBB0_16: ++ movl -4(%ecx), %ecx ++ movl %ecx, -4(%edx) ++ jmp .LBB0_270 ++.LBB0_85: ++ vmovups -197(%ecx), %ymm0 ++ vmovups %ymm0, -197(%edx) ++.LBB0_86: ++ vmovups -165(%ecx), %ymm0 ++ vmovups %ymm0, -165(%edx) ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++.LBB0_87: ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++.LBB0_88: ++ vmovups -37(%ecx), %ymm0 ++ vmovups %ymm0, -37(%edx) ++ jmp .LBB0_26 ++.LBB0_89: ++ vmovups -198(%ecx), %ymm0 ++ vmovups %ymm0, -198(%edx) ++.LBB0_90: ++ vmovups -166(%ecx), %ymm0 ++ vmovups %ymm0, -166(%edx) ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++.LBB0_91: ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++.LBB0_92: ++ vmovups -38(%ecx), %ymm0 ++ vmovups %ymm0, -38(%edx) ++ jmp .LBB0_26 ++.LBB0_93: ++ vmovups -199(%ecx), %ymm0 ++ vmovups %ymm0, -199(%edx) ++.LBB0_94: ++ vmovups -167(%ecx), %ymm0 ++ vmovups %ymm0, -167(%edx) ++ vmovups -135(%ecx), %ymm0 ++ vmovups %ymm0, -135(%edx) ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++.LBB0_95: ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++.LBB0_96: ++ vmovups -39(%ecx), %ymm0 ++ vmovups %ymm0, -39(%edx) ++ jmp .LBB0_26 ++.LBB0_97: ++ vmovups -200(%ecx), %ymm0 ++ vmovups %ymm0, 
-200(%edx) ++.LBB0_98: ++ vmovups -168(%ecx), %ymm0 ++ vmovups %ymm0, -168(%edx) ++.LBB0_23: ++ vmovups -136(%ecx), %ymm0 ++ vmovups %ymm0, -136(%edx) ++ vmovups -104(%ecx), %ymm0 ++ vmovups %ymm0, -104(%edx) ++.LBB0_24: ++ vmovups -72(%ecx), %ymm0 ++ vmovups %ymm0, -72(%edx) ++.LBB0_25: ++ vmovups -40(%ecx), %ymm0 ++ vmovups %ymm0, -40(%edx) ++.LBB0_26: ++ vmovsd -8(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -8(%edx) ++ jmp .LBB0_270 ++.LBB0_99: ++ vmovups -201(%ecx), %ymm0 ++ vmovups %ymm0, -201(%edx) ++.LBB0_100: ++ vmovups -169(%ecx), %ymm0 ++ vmovups %ymm0, -169(%edx) ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++.LBB0_101: ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++.LBB0_102: ++ vmovups -41(%ecx), %ymm0 ++ vmovups %ymm0, -41(%edx) ++ jmp .LBB0_44 ++.LBB0_103: ++ vmovups -202(%ecx), %ymm0 ++ vmovups %ymm0, -202(%edx) ++.LBB0_104: ++ vmovups -170(%ecx), %ymm0 ++ vmovups %ymm0, -170(%edx) ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++.LBB0_105: ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++.LBB0_106: ++ vmovups -42(%ecx), %ymm0 ++ vmovups %ymm0, -42(%edx) ++ jmp .LBB0_44 ++.LBB0_107: ++ vmovups -203(%ecx), %ymm0 ++ vmovups %ymm0, -203(%edx) ++.LBB0_108: ++ vmovups -171(%ecx), %ymm0 ++ vmovups %ymm0, -171(%edx) ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) ++.LBB0_109: ++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++.LBB0_110: ++ vmovups -43(%ecx), %ymm0 ++ vmovups %ymm0, -43(%edx) ++ jmp .LBB0_44 ++.LBB0_111: ++ vmovups -204(%ecx), %ymm0 ++ vmovups %ymm0, -204(%edx) ++.LBB0_112: ++ vmovups -172(%ecx), %ymm0 ++ vmovups %ymm0, -172(%edx) ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++.LBB0_113: ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++.LBB0_114: ++ vmovups -44(%ecx), %ymm0 ++ vmovups %ymm0, -44(%edx) ++ jmp .LBB0_44 ++.LBB0_115: ++ vmovups -205(%ecx), %ymm0 ++ vmovups %ymm0, -205(%edx) ++.LBB0_116: ++ vmovups -173(%ecx), %ymm0 ++ vmovups %ymm0, -173(%edx) ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++.LBB0_117: ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++.LBB0_118: ++ vmovups -45(%ecx), %ymm0 ++ vmovups %ymm0, -45(%edx) ++ jmp .LBB0_44 ++.LBB0_119: ++ vmovups -206(%ecx), %ymm0 ++ vmovups %ymm0, -206(%edx) ++.LBB0_120: ++ vmovups -174(%ecx), %ymm0 ++ vmovups %ymm0, -174(%edx) ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++.LBB0_121: ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++.LBB0_122: ++ vmovups -46(%ecx), %ymm0 ++ vmovups %ymm0, -46(%edx) ++ jmp .LBB0_44 ++.LBB0_123: ++ vmovups -207(%ecx), %ymm0 ++ vmovups %ymm0, -207(%edx) ++.LBB0_124: ++ vmovups -175(%ecx), %ymm0 ++ vmovups %ymm0, -175(%edx) ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++.LBB0_125: ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++.LBB0_126: ++ vmovups -47(%ecx), %ymm0 ++ vmovups %ymm0, -47(%edx) ++ jmp .LBB0_44 ++.LBB0_127: ++ vmovups -208(%ecx), %ymm0 ++ vmovups %ymm0, -208(%edx) ++.LBB0_128: ++ vmovups -176(%ecx), %ymm0 ++ vmovups %ymm0, -176(%edx) ++.LBB0_41: ++ vmovups -144(%ecx), %ymm0 ++ vmovups %ymm0, -144(%edx) ++ vmovups 
-112(%ecx), %ymm0 ++ vmovups %ymm0, -112(%edx) ++.LBB0_42: ++ vmovups -80(%ecx), %ymm0 ++ vmovups %ymm0, -80(%edx) ++.LBB0_43: ++ vmovups -48(%ecx), %ymm0 ++ vmovups %ymm0, -48(%edx) ++.LBB0_44: ++ vmovups -16(%ecx), %xmm0 ++ vmovups %xmm0, -16(%edx) ++ jmp .LBB0_270 ++.LBB0_129: ++ vmovups -209(%ecx), %ymm0 ++ vmovups %ymm0, -209(%edx) ++.LBB0_130: ++ vmovups -177(%ecx), %ymm0 ++ vmovups %ymm0, -177(%edx) ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++.LBB0_131: ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++.LBB0_132: ++ vmovups -49(%ecx), %ymm0 ++ vmovups %ymm0, -49(%edx) ++ jmp .LBB0_269 ++.LBB0_133: ++ vmovups -210(%ecx), %ymm0 ++ vmovups %ymm0, -210(%edx) ++.LBB0_134: ++ vmovups -178(%ecx), %ymm0 ++ vmovups %ymm0, -178(%edx) ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++.LBB0_135: ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++.LBB0_136: ++ vmovups -50(%ecx), %ymm0 ++ vmovups %ymm0, -50(%edx) ++ jmp .LBB0_269 ++.LBB0_137: ++ vmovups -211(%ecx), %ymm0 ++ vmovups %ymm0, -211(%edx) ++.LBB0_138: ++ vmovups -179(%ecx), %ymm0 ++ vmovups %ymm0, -179(%edx) ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++.LBB0_139: ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++.LBB0_140: ++ vmovups -51(%ecx), %ymm0 ++ vmovups %ymm0, -51(%edx) ++ jmp .LBB0_269 ++.LBB0_141: ++ vmovups -212(%ecx), %ymm0 ++ vmovups %ymm0, -212(%edx) ++.LBB0_142: ++ vmovups -180(%ecx), %ymm0 ++ vmovups %ymm0, -180(%edx) ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++.LBB0_143: ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, -84(%edx) ++.LBB0_144: ++ vmovups -52(%ecx), %ymm0 ++ vmovups %ymm0, -52(%edx) ++ jmp .LBB0_269 ++.LBB0_145: ++ vmovups -213(%ecx), %ymm0 ++ vmovups %ymm0, -213(%edx) ++.LBB0_146: ++ vmovups -181(%ecx), %ymm0 ++ vmovups %ymm0, -181(%edx) ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++.LBB0_147: ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++.LBB0_148: ++ vmovups -53(%ecx), %ymm0 ++ vmovups %ymm0, -53(%edx) ++ jmp .LBB0_269 ++.LBB0_149: ++ vmovups -214(%ecx), %ymm0 ++ vmovups %ymm0, -214(%edx) ++.LBB0_150: ++ vmovups -182(%ecx), %ymm0 ++ vmovups %ymm0, -182(%edx) ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++.LBB0_151: ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++.LBB0_152: ++ vmovups -54(%ecx), %ymm0 ++ vmovups %ymm0, -54(%edx) ++ jmp .LBB0_269 ++.LBB0_153: ++ vmovups -215(%ecx), %ymm0 ++ vmovups %ymm0, -215(%edx) ++.LBB0_154: ++ vmovups -183(%ecx), %ymm0 ++ vmovups %ymm0, -183(%edx) ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++.LBB0_155: ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++.LBB0_156: ++ vmovups -55(%ecx), %ymm0 ++ vmovups %ymm0, -55(%edx) ++ jmp .LBB0_269 ++.LBB0_157: ++ vmovups -216(%ecx), %ymm0 ++ vmovups %ymm0, -216(%edx) ++.LBB0_158: ++ vmovups -184(%ecx), %ymm0 ++ vmovups %ymm0, -184(%edx) ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++.LBB0_159: ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++.LBB0_160: ++ vmovups -56(%ecx), %ymm0 ++ vmovups %ymm0, -56(%edx) 
++ jmp .LBB0_269 ++.LBB0_161: ++ vmovups -217(%ecx), %ymm0 ++ vmovups %ymm0, -217(%edx) ++.LBB0_162: ++ vmovups -185(%ecx), %ymm0 ++ vmovups %ymm0, -185(%edx) ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++.LBB0_163: ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++.LBB0_164: ++ vmovups -57(%ecx), %ymm0 ++ vmovups %ymm0, -57(%edx) ++ jmp .LBB0_269 ++.LBB0_165: ++ vmovups -218(%ecx), %ymm0 ++ vmovups %ymm0, -218(%edx) ++.LBB0_166: ++ vmovups -186(%ecx), %ymm0 ++ vmovups %ymm0, -186(%edx) ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++.LBB0_167: ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++.LBB0_168: ++ vmovups -58(%ecx), %ymm0 ++ vmovups %ymm0, -58(%edx) ++ jmp .LBB0_269 ++.LBB0_169: ++ vmovups -219(%ecx), %ymm0 ++ vmovups %ymm0, -219(%edx) ++.LBB0_170: ++ vmovups -187(%ecx), %ymm0 ++ vmovups %ymm0, -187(%edx) ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++.LBB0_171: ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++.LBB0_172: ++ vmovups -59(%ecx), %ymm0 ++ vmovups %ymm0, -59(%edx) ++ jmp .LBB0_269 ++.LBB0_173: ++ vmovups -220(%ecx), %ymm0 ++ vmovups %ymm0, -220(%edx) ++.LBB0_174: ++ vmovups -188(%ecx), %ymm0 ++ vmovups %ymm0, -188(%edx) ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++.LBB0_175: ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++.LBB0_176: ++ vmovups -60(%ecx), %ymm0 ++ vmovups %ymm0, -60(%edx) ++ jmp .LBB0_269 ++.LBB0_177: ++ vmovups -221(%ecx), %ymm0 ++ vmovups %ymm0, -221(%edx) ++.LBB0_178: ++ vmovups -189(%ecx), %ymm0 ++ vmovups %ymm0, -189(%edx) ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++.LBB0_179: ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++.LBB0_180: ++ vmovups -61(%ecx), %ymm0 ++ vmovups %ymm0, -61(%edx) ++ jmp .LBB0_269 ++.LBB0_181: ++ vmovups -222(%ecx), %ymm0 ++ vmovups %ymm0, -222(%edx) ++.LBB0_182: ++ vmovups -190(%ecx), %ymm0 ++ vmovups %ymm0, -190(%edx) ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++.LBB0_183: ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++.LBB0_184: ++ vmovups -62(%ecx), %ymm0 ++ vmovups %ymm0, -62(%edx) ++ jmp .LBB0_269 ++.LBB0_185: ++ vmovups -223(%ecx), %ymm0 ++ vmovups %ymm0, -223(%edx) ++.LBB0_186: ++ vmovups -191(%ecx), %ymm0 ++ vmovups %ymm0, -191(%edx) ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++.LBB0_187: ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++.LBB0_188: ++ vmovups -63(%ecx), %ymm0 ++ vmovups %ymm0, -63(%edx) ++ jmp .LBB0_269 ++.LBB0_189: ++ vmovups -225(%ecx), %ymm0 ++ vmovups %ymm0, -225(%edx) ++ vmovups -193(%ecx), %ymm0 ++ vmovups %ymm0, -193(%edx) ++ vmovups -161(%ecx), %ymm0 ++ vmovups %ymm0, -161(%edx) ++ vmovups -129(%ecx), %ymm0 ++ vmovups %ymm0, -129(%edx) ++.LBB0_190: ++ vmovups -97(%ecx), %ymm0 ++ vmovups %ymm0, -97(%edx) ++ vmovups -65(%ecx), %ymm0 ++ vmovups %ymm0, -65(%edx) ++ jmp .LBB0_268 ++.LBB0_191: ++ vmovups -226(%ecx), %ymm0 ++ vmovups %ymm0, -226(%edx) ++ vmovups -194(%ecx), %ymm0 ++ vmovups %ymm0, -194(%edx) ++ vmovups -162(%ecx), %ymm0 ++ vmovups %ymm0, -162(%edx) ++ vmovups -130(%ecx), %ymm0 ++ vmovups %ymm0, -130(%edx) ++.LBB0_192: 
++ vmovups -98(%ecx), %ymm0 ++ vmovups %ymm0, -98(%edx) ++ vmovups -66(%ecx), %ymm0 ++ vmovups %ymm0, -66(%edx) ++ jmp .LBB0_268 ++.LBB0_193: ++ vmovups -227(%ecx), %ymm0 ++ vmovups %ymm0, -227(%edx) ++ vmovups -195(%ecx), %ymm0 ++ vmovups %ymm0, -195(%edx) ++ vmovups -163(%ecx), %ymm0 ++ vmovups %ymm0, -163(%edx) ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++.LBB0_194: ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++ jmp .LBB0_268 ++.LBB0_195: ++ vmovups -228(%ecx), %ymm0 ++ vmovups %ymm0, -228(%edx) ++ vmovups -196(%ecx), %ymm0 ++ vmovups %ymm0, -196(%edx) ++ vmovups -164(%ecx), %ymm0 ++ vmovups %ymm0, -164(%edx) ++ vmovups -132(%ecx), %ymm0 ++ vmovups %ymm0, -132(%edx) ++.LBB0_196: ++ vmovups -100(%ecx), %ymm0 ++ vmovups %ymm0, -100(%edx) ++ vmovups -68(%ecx), %ymm0 ++ vmovups %ymm0, -68(%edx) ++ jmp .LBB0_268 ++.LBB0_197: ++ vmovups -229(%ecx), %ymm0 ++ vmovups %ymm0, -229(%edx) ++ vmovups -197(%ecx), %ymm0 ++ vmovups %ymm0, -197(%edx) ++ vmovups -165(%ecx), %ymm0 ++ vmovups %ymm0, -165(%edx) ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++.LBB0_198: ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++ jmp .LBB0_268 ++.LBB0_199: ++ vmovups -230(%ecx), %ymm0 ++ vmovups %ymm0, -230(%edx) ++ vmovups -198(%ecx), %ymm0 ++ vmovups %ymm0, -198(%edx) ++ vmovups -166(%ecx), %ymm0 ++ vmovups %ymm0, -166(%edx) ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++.LBB0_200: ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++ jmp .LBB0_268 ++.LBB0_201: ++ vmovups -231(%ecx), %ymm0 ++ vmovups %ymm0, -231(%edx) ++ vmovups -199(%ecx), %ymm0 ++ vmovups %ymm0, -199(%edx) ++ vmovups -167(%ecx), %ymm0 ++ vmovups %ymm0, -167(%edx) ++ vmovups -135(%ecx), %ymm0 ++ vmovups %ymm0, -135(%edx) ++.LBB0_202: ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++ jmp .LBB0_268 ++.LBB0_203: ++ vmovups -232(%ecx), %ymm0 ++ vmovups %ymm0, -232(%edx) ++ vmovups -200(%ecx), %ymm0 ++ vmovups %ymm0, -200(%edx) ++ vmovups -168(%ecx), %ymm0 ++ vmovups %ymm0, -168(%edx) ++ vmovups -136(%ecx), %ymm0 ++ vmovups %ymm0, -136(%edx) ++.LBB0_204: ++ vmovups -104(%ecx), %ymm0 ++ vmovups %ymm0, -104(%edx) ++ vmovups -72(%ecx), %ymm0 ++ vmovups %ymm0, -72(%edx) ++ jmp .LBB0_268 ++.LBB0_205: ++ vmovups -233(%ecx), %ymm0 ++ vmovups %ymm0, -233(%edx) ++ vmovups -201(%ecx), %ymm0 ++ vmovups %ymm0, -201(%edx) ++ vmovups -169(%ecx), %ymm0 ++ vmovups %ymm0, -169(%edx) ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++.LBB0_206: ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++ jmp .LBB0_268 ++.LBB0_207: ++ vmovups -234(%ecx), %ymm0 ++ vmovups %ymm0, -234(%edx) ++ vmovups -202(%ecx), %ymm0 ++ vmovups %ymm0, -202(%edx) ++ vmovups -170(%ecx), %ymm0 ++ vmovups %ymm0, -170(%edx) ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++.LBB0_208: ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++ jmp .LBB0_268 ++.LBB0_209: ++ vmovups -235(%ecx), %ymm0 ++ vmovups %ymm0, -235(%edx) ++ vmovups -203(%ecx), %ymm0 ++ vmovups %ymm0, -203(%edx) ++ vmovups -171(%ecx), %ymm0 ++ vmovups %ymm0, -171(%edx) ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++.LBB0_210: ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) 
++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++ jmp .LBB0_268 ++.LBB0_211: ++ vmovups -236(%ecx), %ymm0 ++ vmovups %ymm0, -236(%edx) ++ vmovups -204(%ecx), %ymm0 ++ vmovups %ymm0, -204(%edx) ++ vmovups -172(%ecx), %ymm0 ++ vmovups %ymm0, -172(%edx) ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++.LBB0_212: ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++ jmp .LBB0_268 ++.LBB0_213: ++ vmovups -237(%ecx), %ymm0 ++ vmovups %ymm0, -237(%edx) ++ vmovups -205(%ecx), %ymm0 ++ vmovups %ymm0, -205(%edx) ++ vmovups -173(%ecx), %ymm0 ++ vmovups %ymm0, -173(%edx) ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++.LBB0_214: ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++ jmp .LBB0_268 ++.LBB0_215: ++ vmovups -238(%ecx), %ymm0 ++ vmovups %ymm0, -238(%edx) ++ vmovups -206(%ecx), %ymm0 ++ vmovups %ymm0, -206(%edx) ++ vmovups -174(%ecx), %ymm0 ++ vmovups %ymm0, -174(%edx) ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++.LBB0_216: ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++ jmp .LBB0_268 ++.LBB0_217: ++ vmovups -239(%ecx), %ymm0 ++ vmovups %ymm0, -239(%edx) ++ vmovups -207(%ecx), %ymm0 ++ vmovups %ymm0, -207(%edx) ++ vmovups -175(%ecx), %ymm0 ++ vmovups %ymm0, -175(%edx) ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++.LBB0_218: ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++ jmp .LBB0_268 ++.LBB0_219: ++ vmovups -240(%ecx), %ymm0 ++ vmovups %ymm0, -240(%edx) ++ vmovups -208(%ecx), %ymm0 ++ vmovups %ymm0, -208(%edx) ++ vmovups -176(%ecx), %ymm0 ++ vmovups %ymm0, -176(%edx) ++ vmovups -144(%ecx), %ymm0 ++ vmovups %ymm0, -144(%edx) ++.LBB0_220: ++ vmovups -112(%ecx), %ymm0 ++ vmovups %ymm0, -112(%edx) ++ vmovups -80(%ecx), %ymm0 ++ vmovups %ymm0, -80(%edx) ++ jmp .LBB0_268 ++.LBB0_221: ++ vmovups -241(%ecx), %ymm0 ++ vmovups %ymm0, -241(%edx) ++ vmovups -209(%ecx), %ymm0 ++ vmovups %ymm0, -209(%edx) ++ vmovups -177(%ecx), %ymm0 ++ vmovups %ymm0, -177(%edx) ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++.LBB0_222: ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++ jmp .LBB0_268 ++.LBB0_223: ++ vmovups -242(%ecx), %ymm0 ++ vmovups %ymm0, -242(%edx) ++ vmovups -210(%ecx), %ymm0 ++ vmovups %ymm0, -210(%edx) ++ vmovups -178(%ecx), %ymm0 ++ vmovups %ymm0, -178(%edx) ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++.LBB0_224: ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++ jmp .LBB0_268 ++.LBB0_225: ++ vmovups -243(%ecx), %ymm0 ++ vmovups %ymm0, -243(%edx) ++ vmovups -211(%ecx), %ymm0 ++ vmovups %ymm0, -211(%edx) ++ vmovups -179(%ecx), %ymm0 ++ vmovups %ymm0, -179(%edx) ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++.LBB0_226: ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++ jmp .LBB0_268 ++.LBB0_227: ++ vmovups -244(%ecx), %ymm0 ++ vmovups %ymm0, -244(%edx) ++ vmovups -212(%ecx), %ymm0 ++ vmovups %ymm0, -212(%edx) ++ vmovups -180(%ecx), %ymm0 ++ vmovups %ymm0, -180(%edx) ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++.LBB0_228: ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, 
-84(%edx) ++ jmp .LBB0_268 ++.LBB0_229: ++ vmovups -245(%ecx), %ymm0 ++ vmovups %ymm0, -245(%edx) ++ vmovups -213(%ecx), %ymm0 ++ vmovups %ymm0, -213(%edx) ++ vmovups -181(%ecx), %ymm0 ++ vmovups %ymm0, -181(%edx) ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++.LBB0_230: ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++ jmp .LBB0_268 ++.LBB0_231: ++ vmovups -246(%ecx), %ymm0 ++ vmovups %ymm0, -246(%edx) ++ vmovups -214(%ecx), %ymm0 ++ vmovups %ymm0, -214(%edx) ++ vmovups -182(%ecx), %ymm0 ++ vmovups %ymm0, -182(%edx) ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++.LBB0_232: ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++ jmp .LBB0_268 ++.LBB0_233: ++ vmovups -247(%ecx), %ymm0 ++ vmovups %ymm0, -247(%edx) ++ vmovups -215(%ecx), %ymm0 ++ vmovups %ymm0, -215(%edx) ++ vmovups -183(%ecx), %ymm0 ++ vmovups %ymm0, -183(%edx) ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++.LBB0_234: ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++ jmp .LBB0_268 ++.LBB0_235: ++ vmovups -248(%ecx), %ymm0 ++ vmovups %ymm0, -248(%edx) ++ vmovups -216(%ecx), %ymm0 ++ vmovups %ymm0, -216(%edx) ++ vmovups -184(%ecx), %ymm0 ++ vmovups %ymm0, -184(%edx) ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++.LBB0_236: ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++ jmp .LBB0_268 ++.LBB0_237: ++ vmovups -249(%ecx), %ymm0 ++ vmovups %ymm0, -249(%edx) ++ vmovups -217(%ecx), %ymm0 ++ vmovups %ymm0, -217(%edx) ++ vmovups -185(%ecx), %ymm0 ++ vmovups %ymm0, -185(%edx) ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++.LBB0_238: ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++ jmp .LBB0_268 ++.LBB0_239: ++ vmovups -250(%ecx), %ymm0 ++ vmovups %ymm0, -250(%edx) ++ vmovups -218(%ecx), %ymm0 ++ vmovups %ymm0, -218(%edx) ++ vmovups -186(%ecx), %ymm0 ++ vmovups %ymm0, -186(%edx) ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++.LBB0_240: ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++ jmp .LBB0_268 ++.LBB0_241: ++ vmovups -251(%ecx), %ymm0 ++ vmovups %ymm0, -251(%edx) ++ vmovups -219(%ecx), %ymm0 ++ vmovups %ymm0, -219(%edx) ++ vmovups -187(%ecx), %ymm0 ++ vmovups %ymm0, -187(%edx) ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++.LBB0_242: ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++ jmp .LBB0_268 ++.LBB0_243: ++ vmovups -252(%ecx), %ymm0 ++ vmovups %ymm0, -252(%edx) ++ vmovups -220(%ecx), %ymm0 ++ vmovups %ymm0, -220(%edx) ++ vmovups -188(%ecx), %ymm0 ++ vmovups %ymm0, -188(%edx) ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++.LBB0_244: ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++ jmp .LBB0_268 ++.LBB0_245: ++ vmovups -253(%ecx), %ymm0 ++ vmovups %ymm0, -253(%edx) ++ vmovups -221(%ecx), %ymm0 ++ vmovups %ymm0, -221(%edx) ++ vmovups -189(%ecx), %ymm0 ++ vmovups %ymm0, -189(%edx) ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++.LBB0_246: ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++ jmp .LBB0_268 ++.LBB0_247: ++ vmovups 
-254(%ecx), %ymm0 ++ vmovups %ymm0, -254(%edx) ++ vmovups -222(%ecx), %ymm0 ++ vmovups %ymm0, -222(%edx) ++ vmovups -190(%ecx), %ymm0 ++ vmovups %ymm0, -190(%edx) ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++.LBB0_248: ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++ jmp .LBB0_268 ++.LBB0_249: ++ vmovups -255(%ecx), %ymm0 ++ vmovups %ymm0, -255(%edx) ++ vmovups -223(%ecx), %ymm0 ++ vmovups %ymm0, -223(%edx) ++ vmovups -191(%ecx), %ymm0 ++ vmovups %ymm0, -191(%edx) ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++.LBB0_250: ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++ jmp .LBB0_268 ++.LBB0_262: ++ vmovups -256(%ecx), %ymm0 ++ vmovups %ymm0, -256(%edx) ++.LBB0_263: ++ vmovups -224(%ecx), %ymm0 ++ vmovups %ymm0, -224(%edx) ++.LBB0_264: ++ vmovups -192(%ecx), %ymm0 ++ vmovups %ymm0, -192(%edx) ++.LBB0_265: ++ vmovups -160(%ecx), %ymm0 ++ vmovups %ymm0, -160(%edx) ++.LBB0_266: ++ vmovups -128(%ecx), %ymm0 ++ vmovups %ymm0, -128(%edx) ++.LBB0_267: ++ vmovups -96(%ecx), %ymm0 ++ vmovups %ymm0, -96(%edx) ++.LBB0_268: ++ vmovups -64(%ecx), %ymm0 ++ vmovups %ymm0, -64(%edx) ++.LBB0_269: ++ vmovups -32(%ecx), %ymm0 ++ vmovups %ymm0, -32(%edx) ++.LBB0_270: ++ vzeroupper ++ popl %esi ++ popl %edi ++ popl %ebx ++ popl %ebp ++ retl ++END(memcpy_avx2) ++ ++/*.Lfunc_end0: ++ .size memcpy_avx2, .Lfunc_end0-memcpy_avx2 ++ .section .rodata,"a",@progbits ++ .p2align 2*/ ++.LJTI0_0: ++ .long .LBB0_6@GOTOFF ++ .long .LBB0_10@GOTOFF ++ .long .LBB0_12@GOTOFF ++ .long .LBB0_16@GOTOFF ++ .long .LBB0_18@GOTOFF ++ .long .LBB0_20@GOTOFF ++ .long .LBB0_22@GOTOFF ++ .long .LBB0_26@GOTOFF ++ .long .LBB0_28@GOTOFF ++ .long .LBB0_30@GOTOFF ++ .long .LBB0_32@GOTOFF ++ .long .LBB0_34@GOTOFF ++ .long .LBB0_36@GOTOFF ++ .long .LBB0_38@GOTOFF ++ .long .LBB0_40@GOTOFF ++ .long .LBB0_44@GOTOFF ++ .long .LBB0_46@GOTOFF ++ .long .LBB0_48@GOTOFF ++ .long .LBB0_50@GOTOFF ++ .long .LBB0_52@GOTOFF ++ .long .LBB0_54@GOTOFF ++ .long .LBB0_56@GOTOFF ++ .long .LBB0_58@GOTOFF ++ .long .LBB0_60@GOTOFF ++ .long .LBB0_62@GOTOFF ++ .long .LBB0_64@GOTOFF ++ .long .LBB0_66@GOTOFF ++ .long .LBB0_68@GOTOFF ++ .long .LBB0_70@GOTOFF ++ .long .LBB0_72@GOTOFF ++ .long .LBB0_74@GOTOFF ++ .long .LBB0_269@GOTOFF ++ .long .LBB0_5@GOTOFF ++ .long .LBB0_9@GOTOFF ++ .long .LBB0_82@GOTOFF ++ .long .LBB0_15@GOTOFF ++ .long .LBB0_88@GOTOFF ++ .long .LBB0_92@GOTOFF ++ .long .LBB0_96@GOTOFF ++ .long .LBB0_25@GOTOFF ++ .long .LBB0_102@GOTOFF ++ .long .LBB0_106@GOTOFF ++ .long .LBB0_110@GOTOFF ++ .long .LBB0_114@GOTOFF ++ .long .LBB0_118@GOTOFF ++ .long .LBB0_122@GOTOFF ++ .long .LBB0_126@GOTOFF ++ .long .LBB0_43@GOTOFF ++ .long .LBB0_132@GOTOFF ++ .long .LBB0_136@GOTOFF ++ .long .LBB0_140@GOTOFF ++ .long .LBB0_144@GOTOFF ++ .long .LBB0_148@GOTOFF ++ .long .LBB0_152@GOTOFF ++ .long .LBB0_156@GOTOFF ++ .long .LBB0_160@GOTOFF ++ .long .LBB0_164@GOTOFF ++ .long .LBB0_168@GOTOFF ++ .long .LBB0_172@GOTOFF ++ .long .LBB0_176@GOTOFF ++ .long .LBB0_180@GOTOFF ++ .long .LBB0_184@GOTOFF ++ .long .LBB0_188@GOTOFF ++ .long .LBB0_268@GOTOFF ++ .long .LBB0_4@GOTOFF ++ .long .LBB0_8@GOTOFF ++ .long .LBB0_81@GOTOFF ++ .long .LBB0_14@GOTOFF ++ .long .LBB0_87@GOTOFF ++ .long .LBB0_91@GOTOFF ++ .long .LBB0_95@GOTOFF ++ .long .LBB0_24@GOTOFF ++ .long .LBB0_101@GOTOFF ++ .long .LBB0_105@GOTOFF ++ .long .LBB0_109@GOTOFF ++ .long .LBB0_113@GOTOFF ++ .long .LBB0_117@GOTOFF ++ .long .LBB0_121@GOTOFF ++ .long .LBB0_125@GOTOFF ++ 
.long .LBB0_42@GOTOFF ++ .long .LBB0_131@GOTOFF ++ .long .LBB0_135@GOTOFF ++ .long .LBB0_139@GOTOFF ++ .long .LBB0_143@GOTOFF ++ .long .LBB0_147@GOTOFF ++ .long .LBB0_151@GOTOFF ++ .long .LBB0_155@GOTOFF ++ .long .LBB0_159@GOTOFF ++ .long .LBB0_163@GOTOFF ++ .long .LBB0_167@GOTOFF ++ .long .LBB0_171@GOTOFF ++ .long .LBB0_175@GOTOFF ++ .long .LBB0_179@GOTOFF ++ .long .LBB0_183@GOTOFF ++ .long .LBB0_187@GOTOFF ++ .long .LBB0_267@GOTOFF ++ .long .LBB0_190@GOTOFF ++ .long .LBB0_192@GOTOFF ++ .long .LBB0_194@GOTOFF ++ .long .LBB0_196@GOTOFF ++ .long .LBB0_198@GOTOFF ++ .long .LBB0_200@GOTOFF ++ .long .LBB0_202@GOTOFF ++ .long .LBB0_204@GOTOFF ++ .long .LBB0_206@GOTOFF ++ .long .LBB0_208@GOTOFF ++ .long .LBB0_210@GOTOFF ++ .long .LBB0_212@GOTOFF ++ .long .LBB0_214@GOTOFF ++ .long .LBB0_216@GOTOFF ++ .long .LBB0_218@GOTOFF ++ .long .LBB0_220@GOTOFF ++ .long .LBB0_222@GOTOFF ++ .long .LBB0_224@GOTOFF ++ .long .LBB0_226@GOTOFF ++ .long .LBB0_228@GOTOFF ++ .long .LBB0_230@GOTOFF ++ .long .LBB0_232@GOTOFF ++ .long .LBB0_234@GOTOFF ++ .long .LBB0_236@GOTOFF ++ .long .LBB0_238@GOTOFF ++ .long .LBB0_240@GOTOFF ++ .long .LBB0_242@GOTOFF ++ .long .LBB0_244@GOTOFF ++ .long .LBB0_246@GOTOFF ++ .long .LBB0_248@GOTOFF ++ .long .LBB0_250@GOTOFF ++ .long .LBB0_266@GOTOFF ++ .long .LBB0_3@GOTOFF ++ .long .LBB0_7@GOTOFF ++ .long .LBB0_11@GOTOFF ++ .long .LBB0_13@GOTOFF ++ .long .LBB0_17@GOTOFF ++ .long .LBB0_19@GOTOFF ++ .long .LBB0_21@GOTOFF ++ .long .LBB0_23@GOTOFF ++ .long .LBB0_27@GOTOFF ++ .long .LBB0_29@GOTOFF ++ .long .LBB0_31@GOTOFF ++ .long .LBB0_33@GOTOFF ++ .long .LBB0_35@GOTOFF ++ .long .LBB0_37@GOTOFF ++ .long .LBB0_39@GOTOFF ++ .long .LBB0_41@GOTOFF ++ .long .LBB0_45@GOTOFF ++ .long .LBB0_47@GOTOFF ++ .long .LBB0_49@GOTOFF ++ .long .LBB0_51@GOTOFF ++ .long .LBB0_53@GOTOFF ++ .long .LBB0_55@GOTOFF ++ .long .LBB0_57@GOTOFF ++ .long .LBB0_59@GOTOFF ++ .long .LBB0_61@GOTOFF ++ .long .LBB0_63@GOTOFF ++ .long .LBB0_65@GOTOFF ++ .long .LBB0_67@GOTOFF ++ .long .LBB0_69@GOTOFF ++ .long .LBB0_71@GOTOFF ++ .long .LBB0_73@GOTOFF ++ .long .LBB0_265@GOTOFF ++ .long .LBB0_76@GOTOFF ++ .long .LBB0_78@GOTOFF ++ .long .LBB0_80@GOTOFF ++ .long .LBB0_84@GOTOFF ++ .long .LBB0_86@GOTOFF ++ .long .LBB0_90@GOTOFF ++ .long .LBB0_94@GOTOFF ++ .long .LBB0_98@GOTOFF ++ .long .LBB0_100@GOTOFF ++ .long .LBB0_104@GOTOFF ++ .long .LBB0_108@GOTOFF ++ .long .LBB0_112@GOTOFF ++ .long .LBB0_116@GOTOFF ++ .long .LBB0_120@GOTOFF ++ .long .LBB0_124@GOTOFF ++ .long .LBB0_128@GOTOFF ++ .long .LBB0_130@GOTOFF ++ .long .LBB0_134@GOTOFF ++ .long .LBB0_138@GOTOFF ++ .long .LBB0_142@GOTOFF ++ .long .LBB0_146@GOTOFF ++ .long .LBB0_150@GOTOFF ++ .long .LBB0_154@GOTOFF ++ .long .LBB0_158@GOTOFF ++ .long .LBB0_162@GOTOFF ++ .long .LBB0_166@GOTOFF ++ .long .LBB0_170@GOTOFF ++ .long .LBB0_174@GOTOFF ++ .long .LBB0_178@GOTOFF ++ .long .LBB0_182@GOTOFF ++ .long .LBB0_186@GOTOFF ++ .long .LBB0_264@GOTOFF ++ .long .LBB0_75@GOTOFF ++ .long .LBB0_77@GOTOFF ++ .long .LBB0_79@GOTOFF ++ .long .LBB0_83@GOTOFF ++ .long .LBB0_85@GOTOFF ++ .long .LBB0_89@GOTOFF ++ .long .LBB0_93@GOTOFF ++ .long .LBB0_97@GOTOFF ++ .long .LBB0_99@GOTOFF ++ .long .LBB0_103@GOTOFF ++ .long .LBB0_107@GOTOFF ++ .long .LBB0_111@GOTOFF ++ .long .LBB0_115@GOTOFF ++ .long .LBB0_119@GOTOFF ++ .long .LBB0_123@GOTOFF ++ .long .LBB0_127@GOTOFF ++ .long .LBB0_129@GOTOFF ++ .long .LBB0_133@GOTOFF ++ .long .LBB0_137@GOTOFF ++ .long .LBB0_141@GOTOFF ++ .long .LBB0_145@GOTOFF ++ .long .LBB0_149@GOTOFF ++ .long .LBB0_153@GOTOFF ++ .long .LBB0_157@GOTOFF ++ .long .LBB0_161@GOTOFF ++ .long 
.LBB0_165@GOTOFF ++ .long .LBB0_169@GOTOFF ++ .long .LBB0_173@GOTOFF ++ .long .LBB0_177@GOTOFF ++ .long .LBB0_181@GOTOFF ++ .long .LBB0_185@GOTOFF ++ .long .LBB0_263@GOTOFF ++ .long .LBB0_189@GOTOFF ++ .long .LBB0_191@GOTOFF ++ .long .LBB0_193@GOTOFF ++ .long .LBB0_195@GOTOFF ++ .long .LBB0_197@GOTOFF ++ .long .LBB0_199@GOTOFF ++ .long .LBB0_201@GOTOFF ++ .long .LBB0_203@GOTOFF ++ .long .LBB0_205@GOTOFF ++ .long .LBB0_207@GOTOFF ++ .long .LBB0_209@GOTOFF ++ .long .LBB0_211@GOTOFF ++ .long .LBB0_213@GOTOFF ++ .long .LBB0_215@GOTOFF ++ .long .LBB0_217@GOTOFF ++ .long .LBB0_219@GOTOFF ++ .long .LBB0_221@GOTOFF ++ .long .LBB0_223@GOTOFF ++ .long .LBB0_225@GOTOFF ++ .long .LBB0_227@GOTOFF ++ .long .LBB0_229@GOTOFF ++ .long .LBB0_231@GOTOFF ++ .long .LBB0_233@GOTOFF ++ .long .LBB0_235@GOTOFF ++ .long .LBB0_237@GOTOFF ++ .long .LBB0_239@GOTOFF ++ .long .LBB0_241@GOTOFF ++ .long .LBB0_243@GOTOFF ++ .long .LBB0_245@GOTOFF ++ .long .LBB0_247@GOTOFF ++ .long .LBB0_249@GOTOFF ++ .long .LBB0_262@GOTOFF ++.LJTI0_1: ++ .long .LBB0_6@GOTOFF ++ .long .LBB0_10@GOTOFF ++ .long .LBB0_12@GOTOFF ++ .long .LBB0_16@GOTOFF ++ .long .LBB0_18@GOTOFF ++ .long .LBB0_20@GOTOFF ++ .long .LBB0_22@GOTOFF ++ .long .LBB0_26@GOTOFF ++ .long .LBB0_28@GOTOFF ++ .long .LBB0_30@GOTOFF ++ .long .LBB0_32@GOTOFF ++ .long .LBB0_34@GOTOFF ++ .long .LBB0_36@GOTOFF ++ .long .LBB0_38@GOTOFF ++ .long .LBB0_40@GOTOFF ++ .long .LBB0_44@GOTOFF ++ .long .LBB0_46@GOTOFF ++ .long .LBB0_48@GOTOFF ++ .long .LBB0_50@GOTOFF ++ .long .LBB0_52@GOTOFF ++ .long .LBB0_54@GOTOFF ++ .long .LBB0_56@GOTOFF ++ .long .LBB0_58@GOTOFF ++ .long .LBB0_60@GOTOFF ++ .long .LBB0_62@GOTOFF ++ .long .LBB0_64@GOTOFF ++ .long .LBB0_66@GOTOFF ++ .long .LBB0_68@GOTOFF ++ .long .LBB0_70@GOTOFF ++ .long .LBB0_72@GOTOFF ++ .long .LBB0_74@GOTOFF ++ .long .LBB0_269@GOTOFF ++ .long .LBB0_5@GOTOFF ++ .long .LBB0_9@GOTOFF ++ .long .LBB0_82@GOTOFF ++ .long .LBB0_15@GOTOFF ++ .long .LBB0_88@GOTOFF ++ .long .LBB0_92@GOTOFF ++ .long .LBB0_96@GOTOFF ++ .long .LBB0_25@GOTOFF ++ .long .LBB0_102@GOTOFF ++ .long .LBB0_106@GOTOFF ++ .long .LBB0_110@GOTOFF ++ .long .LBB0_114@GOTOFF ++ .long .LBB0_118@GOTOFF ++ .long .LBB0_122@GOTOFF ++ .long .LBB0_126@GOTOFF ++ .long .LBB0_43@GOTOFF ++ .long .LBB0_132@GOTOFF ++ .long .LBB0_136@GOTOFF ++ .long .LBB0_140@GOTOFF ++ .long .LBB0_144@GOTOFF ++ .long .LBB0_148@GOTOFF ++ .long .LBB0_152@GOTOFF ++ .long .LBB0_156@GOTOFF ++ .long .LBB0_160@GOTOFF ++ .long .LBB0_164@GOTOFF ++ .long .LBB0_168@GOTOFF ++ .long .LBB0_172@GOTOFF ++ .long .LBB0_176@GOTOFF ++ .long .LBB0_180@GOTOFF ++ .long .LBB0_184@GOTOFF ++ .long .LBB0_188@GOTOFF ++ .long .LBB0_268@GOTOFF ++ .long .LBB0_4@GOTOFF ++ .long .LBB0_8@GOTOFF ++ .long .LBB0_81@GOTOFF ++ .long .LBB0_14@GOTOFF ++ .long .LBB0_87@GOTOFF ++ .long .LBB0_91@GOTOFF ++ .long .LBB0_95@GOTOFF ++ .long .LBB0_24@GOTOFF ++ .long .LBB0_101@GOTOFF ++ .long .LBB0_105@GOTOFF ++ .long .LBB0_109@GOTOFF ++ .long .LBB0_113@GOTOFF ++ .long .LBB0_117@GOTOFF ++ .long .LBB0_121@GOTOFF ++ .long .LBB0_125@GOTOFF ++ .long .LBB0_42@GOTOFF ++ .long .LBB0_131@GOTOFF ++ .long .LBB0_135@GOTOFF ++ .long .LBB0_139@GOTOFF ++ .long .LBB0_143@GOTOFF ++ .long .LBB0_147@GOTOFF ++ .long .LBB0_151@GOTOFF ++ .long .LBB0_155@GOTOFF ++ .long .LBB0_159@GOTOFF ++ .long .LBB0_163@GOTOFF ++ .long .LBB0_167@GOTOFF ++ .long .LBB0_171@GOTOFF ++ .long .LBB0_175@GOTOFF ++ .long .LBB0_179@GOTOFF ++ .long .LBB0_183@GOTOFF ++ .long .LBB0_187@GOTOFF ++ .long .LBB0_267@GOTOFF ++ .long .LBB0_190@GOTOFF ++ .long .LBB0_192@GOTOFF ++ .long .LBB0_194@GOTOFF ++ .long 
.LBB0_196@GOTOFF ++ .long .LBB0_198@GOTOFF ++ .long .LBB0_200@GOTOFF ++ .long .LBB0_202@GOTOFF ++ .long .LBB0_204@GOTOFF ++ .long .LBB0_206@GOTOFF ++ .long .LBB0_208@GOTOFF ++ .long .LBB0_210@GOTOFF ++ .long .LBB0_212@GOTOFF ++ .long .LBB0_214@GOTOFF ++ .long .LBB0_216@GOTOFF ++ .long .LBB0_218@GOTOFF ++ .long .LBB0_220@GOTOFF ++ .long .LBB0_222@GOTOFF ++ .long .LBB0_224@GOTOFF ++ .long .LBB0_226@GOTOFF ++ .long .LBB0_228@GOTOFF ++ .long .LBB0_230@GOTOFF ++ .long .LBB0_232@GOTOFF ++ .long .LBB0_234@GOTOFF ++ .long .LBB0_236@GOTOFF ++ .long .LBB0_238@GOTOFF ++ .long .LBB0_240@GOTOFF ++ .long .LBB0_242@GOTOFF ++ .long .LBB0_244@GOTOFF ++ .long .LBB0_246@GOTOFF ++ .long .LBB0_248@GOTOFF ++ .long .LBB0_250@GOTOFF ++ .long .LBB0_266@GOTOFF ++ .long .LBB0_3@GOTOFF ++ .long .LBB0_7@GOTOFF ++ .long .LBB0_11@GOTOFF ++ .long .LBB0_13@GOTOFF ++ .long .LBB0_17@GOTOFF ++ .long .LBB0_19@GOTOFF ++ .long .LBB0_21@GOTOFF ++ .long .LBB0_23@GOTOFF ++ .long .LBB0_27@GOTOFF ++ .long .LBB0_29@GOTOFF ++ .long .LBB0_31@GOTOFF ++ .long .LBB0_33@GOTOFF ++ .long .LBB0_35@GOTOFF ++ .long .LBB0_37@GOTOFF ++ .long .LBB0_39@GOTOFF ++ .long .LBB0_41@GOTOFF ++ .long .LBB0_45@GOTOFF ++ .long .LBB0_47@GOTOFF ++ .long .LBB0_49@GOTOFF ++ .long .LBB0_51@GOTOFF ++ .long .LBB0_53@GOTOFF ++ .long .LBB0_55@GOTOFF ++ .long .LBB0_57@GOTOFF ++ .long .LBB0_59@GOTOFF ++ .long .LBB0_61@GOTOFF ++ .long .LBB0_63@GOTOFF ++ .long .LBB0_65@GOTOFF ++ .long .LBB0_67@GOTOFF ++ .long .LBB0_69@GOTOFF ++ .long .LBB0_71@GOTOFF ++ .long .LBB0_73@GOTOFF ++ .long .LBB0_265@GOTOFF ++ .long .LBB0_76@GOTOFF ++ .long .LBB0_78@GOTOFF ++ .long .LBB0_80@GOTOFF ++ .long .LBB0_84@GOTOFF ++ .long .LBB0_86@GOTOFF ++ .long .LBB0_90@GOTOFF ++ .long .LBB0_94@GOTOFF ++ .long .LBB0_98@GOTOFF ++ .long .LBB0_100@GOTOFF ++ .long .LBB0_104@GOTOFF ++ .long .LBB0_108@GOTOFF ++ .long .LBB0_112@GOTOFF ++ .long .LBB0_116@GOTOFF ++ .long .LBB0_120@GOTOFF ++ .long .LBB0_124@GOTOFF ++ .long .LBB0_128@GOTOFF ++ .long .LBB0_130@GOTOFF ++ .long .LBB0_134@GOTOFF ++ .long .LBB0_138@GOTOFF ++ .long .LBB0_142@GOTOFF ++ .long .LBB0_146@GOTOFF ++ .long .LBB0_150@GOTOFF ++ .long .LBB0_154@GOTOFF ++ .long .LBB0_158@GOTOFF ++ .long .LBB0_162@GOTOFF ++ .long .LBB0_166@GOTOFF ++ .long .LBB0_170@GOTOFF ++ .long .LBB0_174@GOTOFF ++ .long .LBB0_178@GOTOFF ++ .long .LBB0_182@GOTOFF ++ .long .LBB0_186@GOTOFF ++ .long .LBB0_264@GOTOFF ++ .long .LBB0_75@GOTOFF ++ .long .LBB0_77@GOTOFF ++ .long .LBB0_79@GOTOFF ++ .long .LBB0_83@GOTOFF ++ .long .LBB0_85@GOTOFF ++ .long .LBB0_89@GOTOFF ++ .long .LBB0_93@GOTOFF ++ .long .LBB0_97@GOTOFF ++ .long .LBB0_99@GOTOFF ++ .long .LBB0_103@GOTOFF ++ .long .LBB0_107@GOTOFF ++ .long .LBB0_111@GOTOFF ++ .long .LBB0_115@GOTOFF ++ .long .LBB0_119@GOTOFF ++ .long .LBB0_123@GOTOFF ++ .long .LBB0_127@GOTOFF ++ .long .LBB0_129@GOTOFF ++ .long .LBB0_133@GOTOFF ++ .long .LBB0_137@GOTOFF ++ .long .LBB0_141@GOTOFF ++ .long .LBB0_145@GOTOFF ++ .long .LBB0_149@GOTOFF ++ .long .LBB0_153@GOTOFF ++ .long .LBB0_157@GOTOFF ++ .long .LBB0_161@GOTOFF ++ .long .LBB0_165@GOTOFF ++ .long .LBB0_169@GOTOFF ++ .long .LBB0_173@GOTOFF ++ .long .LBB0_177@GOTOFF ++ .long .LBB0_181@GOTOFF ++ .long .LBB0_185@GOTOFF ++ .long .LBB0_263@GOTOFF ++ .long .LBB0_189@GOTOFF ++ .long .LBB0_191@GOTOFF ++ .long .LBB0_193@GOTOFF ++ .long .LBB0_195@GOTOFF ++ .long .LBB0_197@GOTOFF ++ .long .LBB0_199@GOTOFF ++ .long .LBB0_201@GOTOFF ++ .long .LBB0_203@GOTOFF ++ .long .LBB0_205@GOTOFF ++ .long .LBB0_207@GOTOFF ++ .long .LBB0_209@GOTOFF ++ .long .LBB0_211@GOTOFF ++ .long .LBB0_213@GOTOFF ++ .long .LBB0_215@GOTOFF 
++ .long .LBB0_217@GOTOFF ++ .long .LBB0_219@GOTOFF ++ .long .LBB0_221@GOTOFF ++ .long .LBB0_223@GOTOFF ++ .long .LBB0_225@GOTOFF ++ .long .LBB0_227@GOTOFF ++ .long .LBB0_229@GOTOFF ++ .long .LBB0_231@GOTOFF ++ .long .LBB0_233@GOTOFF ++ .long .LBB0_235@GOTOFF ++ .long .LBB0_237@GOTOFF ++ .long .LBB0_239@GOTOFF ++ .long .LBB0_241@GOTOFF ++ .long .LBB0_243@GOTOFF ++ .long .LBB0_245@GOTOFF ++ .long .LBB0_247@GOTOFF ++ .long .LBB0_249@GOTOFF ++ .long .LBB0_262@GOTOFF ++ # -- End function +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index c846ded45..43aaebb54 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -46,4 +46,42 @@ DEFINE_IFUNC_FOR(__memset_chk) { + RETURN_FUNC(__memset_chk_func, __memset_chk_generic); + } + ++typedef int memcmp_func(const void* __lhs, const void* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(memcmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memcmp_func, memcmp_avx2) ++ RETURN_FUNC(memcmp_func, memcmp_generic); ++} ++ ++typedef void* memmove_func(void* __dst, const void* __src, size_t __n); ++DEFINE_IFUNC_FOR(memmove) { ++ RETURN_FUNC(memmove_func, memmove_generic); ++} ++ ++typedef void* memcpy_func(void* __dst, const void* __src, size_t __n); ++DEFINE_IFUNC_FOR(memcpy) { ++ return memmove_resolver(); ++} ++ ++typedef void* memchr_func(const void* __s, int __ch, size_t __n); ++DEFINE_IFUNC_FOR(memchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); ++ RETURN_FUNC(memchr_func, memchr_openbsd); ++} ++ ++typedef void* memrchr_func(const void* __s, int __ch, size_t __n); ++DEFINE_IFUNC_FOR(memrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); ++ RETURN_FUNC(memrchr_func, memrchr_openbsd); ++} ++ ++// typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); ++// DEFINE_IFUNC_FOR(wmemset) { ++// __builtin_cpu_init(); ++// if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); ++// RETURN_FUNC(wmemset_func, wmemset_freebsd); ++// } ++ + } // extern "C" +diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c +new file mode 100644 +index 000000000..86ee02e0b +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/memchr.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#include ++#define memchr memchr_openbsd ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c +new file mode 100644 +index 000000000..c803009f5 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/memrchr.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#include ++#define memrchr memrchr_openbsd ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c +new file mode 100644 +index 000000000..ac6bd7ec4 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wmemset.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#include ++#define wmemset wmemset_freebsd ++ ++#include +diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/include/cache.h +similarity index 100% +rename from libc/arch-x86_64/string/cache.h +rename to libc/arch-x86_64/include/cache.h +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S +new file mode 100644 +index 000000000..da667c9b3 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S +@@ -0,0 +1,371 @@ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) push REG; ++#define POP(REG) pop REG; ++ ++#define ENTRANCE PUSH (%rbx); ++#define RETURN_END POP (%rbx); ret ++#define RETURN RETURN_END; ++ ++# ifndef MEMCHR ++# define MEMCHR memchr_avx2 ++# endif ++ ++# ifdef USE_AS_WMEMCHR ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMCHR) ++# ifndef USE_AS_RAWMEMCHR ++ /* Check for zero length. */ ++ testq %rdx, %rdx ++ jz L(null) ++# endif ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++# ifdef USE_AS_WMEMCHR ++ shl $2, %rdx ++ vpbroadcastd %xmm0, %ymm0 ++# else ++ vpbroadcastb %xmm0, %ymm0 ++# endif ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++# ifndef USE_AS_RAWMEMCHR ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rdx ++ jbe L(zero) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. */ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" ++ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition ++ overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rdx ++ jbe L(zero) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_RAWMEMCHR ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) ++ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %edx ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ VZEROUPPER ++L(null): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++END (MEMCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S +new file mode 100644 +index 000000000..e9778ca5a +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S +@@ -0,0 +1,428 @@ ++/* Copyright (C) 2017-2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* memcmp/wmemcmp is implemented as: ++ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap ++ to avoid branches. ++ 2. Use overlapping compare to avoid branch. ++ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 ++ bytes for wmemcmp. ++ 4. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ area. ++ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. 
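++
++   As an illustration of points 1 and 2 above -- an editorial sketch in C++,
++   not part of this patch -- the 4..7-byte case can be served by two
++   overlapping big-endian loads, so every byte is compared without a
++   length-dependent branch. The helper names below are hypothetical:
++
++     #include <cstddef>
++     #include <cstdint>
++
++     // Hypothetical helper: load 4 bytes as a big-endian word (what movbe gives us).
++     static inline uint32_t load_be32(const unsigned char* p) {
++       return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
++              ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
++     }
++
++     // Sketch of the 4..7-byte path: for n in [4, 7] the first and last four
++     // bytes overlap, so the two loads cover the whole buffer branch-free.
++     static inline int memcmp_4_7_sketch(const unsigned char* a,
++                                         const unsigned char* b, size_t n) {
++       uint64_t va = ((uint64_t)load_be32(a) << 32) | load_be32(a + n - 4);
++       uint64_t vb = ((uint64_t)load_be32(b) << 32) | load_be32(b + n - 4);
++       if (va == vb) return 0;      // equal across all n bytes
++       return va < vb ? -1 : 1;     // big-endian packing preserves byte order
++     }
++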
*/ ++ ++ ++#ifndef MEMCMP ++# define MEMCMP memcmp_avx2 ++#endif ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef ALIGN ++# define ALIGN(n) .p2align n ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#ifndef ALIGN ++# define ALIGN(n) .p2align n ++#endif ++ ++# ifdef USE_AS_WMEMCMP ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++# define VEC_SIZE 32 ++# define VEC_MASK ((1 << VEC_SIZE) - 1) ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMCMP) ++# ifdef USE_AS_WMEMCMP ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %rdx ++ jb L(less_vec) ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_vec) ++ VPCMPEQ %ymm0, %ymm0, %ymm0 ++ /* More than 2 * VEC. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) ++ cmpq $(VEC_SIZE * 4), %rdx ++ jb L(last_4x_vec) ++ /* From 4 * VEC to 8 * VEC, inclusively. */ ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpand %ymm2, %ymm1, %ymm5 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpand %ymm3, %ymm5, %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm4, %ymm5, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(last_2x_vec): ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++L(last_vec): ++ /* Use overlapping loads to avoid branches. */ ++ leaq -VEC_SIZE(%rdi, %rdx), %rdi ++ leaq -VEC_SIZE(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec): ++ /* A byte or int32 is different within 16 or 32 bytes. 
*/ ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi, %rcx), %edx ++ cmpl (%rsi, %rcx), %edx ++L(wmemcmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++# ifdef USE_AS_WMEMCMP ++ .p2align 4 ++L(4): ++ xorl %eax, %eax ++ movl (%rdi), %edx ++ cmpl (%rsi), %edx ++ jne L(wmemcmp_return) ++ ret ++# else ++ ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ je L(exit) ++ sbbl %eax, %eax ++ orl $1, %eax ++ ret ++ .p2align 4 ++/*L(8): ++ giving two failures ++ movl (%rdi), %eax ++ subl (%rsi), %eax ++ je L(between_4_7) ++ retq */ ++ ++L(exit): ++ ret ++ .p2align 4 ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movb -1(%rdi, %rdx), %al ++ movb -1(%rsi, %rdx), %cl ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax ++ ret ++ .p2align 4 ++L(1): ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ sub %ecx, %eax ++ ret ++# endif ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ .p2align 4 ++L(less_vec): ++# ifdef USE_AS_WMEMCMP ++ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ ++ cmpb $4, %dl ++ je L(4) ++ jb L(zero) ++# else ++/* cmpb $8, %dl ++ jne L(tmp) ++ movl (%rdi), %eax ++ subl (%rsi), %eax ++ jne L(exit) ++L(temp): ++ movl %edx, %edx ++ //jmp L(tmp) ++L(tmp):*/ ++ ++ cmpb $1, %dl ++ je L(1) ++ jb L(zero) ++ ++ cmpb $4, %dl ++ jb L(between_2_3) ++ cmpb $8, %dl ++ //je L(8) ++ jb L(between_4_7) ++# endif ++ cmpb $16, %dl ++ jae L(between_16_31) ++ /* It is between 8 and 15 bytes. */ ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -8(%rdi, %rdx), %rdi ++ leaq -8(%rsi, %rdx), %rsi ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ ret ++ .p2align 4 ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ vmovdqu (%rsi), %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -16(%rdi, %rdx), %rdi ++ leaq -16(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ ret ++ .p2align 4 ++L(more_8x_vec): ++ /* More than 8 * VEC. Check the first VEC. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ /* Align the first memory area for aligned loads in the loop. ++ Compute how much the first memory area is misaligned. */ ++ movq %rdi, %rcx ++ andl $(VEC_SIZE - 1), %ecx ++ /* Get the negative of offset for alignment. */ ++ subq $VEC_SIZE, %rcx ++ /* Adjust the second memory area. */ ++ subq %rcx, %rsi ++ /* Adjust the first memory area which should be aligned now. */ ++ subq %rcx, %rdi ++ /* Adjust length. */ ++ addq %rcx, %rdx ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. 
*/ ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpand %ymm2, %ymm1, %ymm5 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpand %ymm3, %ymm5, %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm4, %ymm5, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rsi ++ subq $(VEC_SIZE * 4), %rdx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jae L(loop_4x_vec) ++ /* Less than 4 * VEC. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(last_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_2x_vec) ++L(last_4x_vec): ++ /* From 2 * VEC to 4 * VEC. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ subl $VEC_MASK, %eax ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rcx), %edx ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++END (MEMCMP) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S +new file mode 100644 +index 000000000..a958fb56d +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S +@@ -0,0 +1,408 @@ ++/* memrchr optimized with AVX2. ++ Copyright (C) 2017-2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) pushl REG; CFI_PUSH (REG) ++#define POP(REG) popl REG; CFI_POP (REG) ++ ++# ifndef MEMRCHR ++# define MEMRCHR memrchr_avx2 ++# endif ++ ++#ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMRCHR) ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++ vpbroadcastb %xmm0, %ymm0 ++ ++ sub $VEC_SIZE, %rdx ++ jbe L(last_vec_or_less) ++ ++ add %rdx, %rdi ++ ++ /* Check the last VEC_SIZE bytes. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ subq $(VEC_SIZE * 4), %rdi ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(aligned_more) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rdx ++ andq $-VEC_SIZE, %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(aligned_more): ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ /* Align data to 4 * VEC_SIZE for loop with fewer branches. ++ There are some overlaps with above if data isn't aligned ++ to 4 * VEC_SIZE. */ ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ jz L(loop_4x_vec) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rdx ++ andq $-(VEC_SIZE * 4), %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. 
*/ ++ subq $(VEC_SIZE * 4), %rdi ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ vmovdqa (%rdi), %ymm1 ++ vmovdqa VEC_SIZE(%rdi), %ymm2 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 ++ ++ vpcmpeqb %ymm1, %ymm0, %ymm1 ++ vpcmpeqb %ymm2, %ymm0, %ymm2 ++ vpcmpeqb %ymm3, %ymm0, %ymm3 ++ vpcmpeqb %ymm4, %ymm0, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jz L(loop_4x_vec) ++ ++ /* There is a match. */ ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpmovmskb %ymm1, %eax ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_4x_vec_or_less): ++ addl $(VEC_SIZE * 4), %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1_check) ++ cmpl $(VEC_SIZE * 3), %edx ++ jbe L(zero) ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3_check) ++ cmpl $VEC_SIZE, %edx ++ jbe L(zero) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 2), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x0): ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ bsrl %eax, %eax ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x3): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1_check): ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x3_check): ++ bsrl %eax, %eax ++ subq $VEC_SIZE, %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ VZEROUPPER ++L(null): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less_aligned): ++ movl %edx, %ecx ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ ++ movl $1, %edx ++ /* Support rdx << 32. */ ++ salq %cl, %rdx ++ subq $1, %rdx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less): ++ addl $VEC_SIZE, %edx ++ ++ /* Check for zero length. 
*/ ++ testl %edx, %edx ++ jz L(null) ++ ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(last_vec_or_less_aligned) ++ ++ movl %ecx, %esi ++ movl %ecx, %r8d ++ addl %edx, %esi ++ andq $-VEC_SIZE, %rdi ++ ++ subl $VEC_SIZE, %esi ++ ja L(last_vec_2x_aligned) ++ ++ /* Check the last VEC. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the leading and trailing bytes. */ ++ sarl %cl, %eax ++ movl %edx, %ecx ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_2x_aligned): ++ movl %esi, %ecx ++ ++ /* Check the last VEC. */ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ /* Check the second last VEC. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ ++ movl %r8d, %ecx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the leading bytes. Must use unsigned right shift for ++ bsrl below. */ ++ shrl %cl, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ VZEROUPPER ++ ret ++END (MEMRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S +new file mode 100644 +index 000000000..7c485cf70 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S +@@ -0,0 +1,140 @@ ++/* ++Copyright (C) 2019 The Android Open Source Project ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions ++are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ++FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ++COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ++INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ++BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS ++OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED ++AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++SUCH DAMAGE. 
++*/ ++ ++#include ++ ++#ifndef WMEMSET ++ #define WMEMSET wmemset_avx2 ++#endif ++ ++ .section .text.avx2,"ax",@progbits ++ ++ENTRY (WMEMSET) ++# BB#0: ++ testq %rdx, %rdx ++ je .LBB0_14 ++# BB#1: ++ cmpq $32, %rdx ++ jae .LBB0_3 ++# BB#2: ++ xorl %r8d, %r8d ++ movq %rdi, %rax ++ jmp .LBB0_12 ++.LBB0_3: ++ movq %rdx, %r8 ++ andq $-32, %r8 ++ vmovd %esi, %xmm0 ++ vpbroadcastd %xmm0, %ymm0 ++ leaq -32(%r8), %rcx ++ movq %rcx, %rax ++ shrq $5, %rax ++ leal 1(%rax), %r9d ++ andl $7, %r9d ++ cmpq $224, %rcx ++ jae .LBB0_5 ++# BB#4: ++ xorl %eax, %eax ++ testq %r9, %r9 ++ jne .LBB0_8 ++ jmp .LBB0_10 ++.LBB0_5: ++ leaq 992(%rdi), %rcx ++ leaq -1(%r9), %r10 ++ subq %rax, %r10 ++ xorl %eax, %eax ++ .p2align 4, 0x90 ++.LBB0_6: # =>This Inner Loop Header: Depth=1 ++ vmovdqu %ymm0, -992(%rcx,%rax,4) ++ vmovdqu %ymm0, -960(%rcx,%rax,4) ++ vmovdqu %ymm0, -928(%rcx,%rax,4) ++ vmovdqu %ymm0, -896(%rcx,%rax,4) ++ vmovdqu %ymm0, -864(%rcx,%rax,4) ++ vmovdqu %ymm0, -832(%rcx,%rax,4) ++ vmovdqu %ymm0, -800(%rcx,%rax,4) ++ vmovdqu %ymm0, -768(%rcx,%rax,4) ++ vmovdqu %ymm0, -736(%rcx,%rax,4) ++ vmovdqu %ymm0, -704(%rcx,%rax,4) ++ vmovdqu %ymm0, -672(%rcx,%rax,4) ++ vmovdqu %ymm0, -640(%rcx,%rax,4) ++ vmovdqu %ymm0, -608(%rcx,%rax,4) ++ vmovdqu %ymm0, -576(%rcx,%rax,4) ++ vmovdqu %ymm0, -544(%rcx,%rax,4) ++ vmovdqu %ymm0, -512(%rcx,%rax,4) ++ vmovdqu %ymm0, -480(%rcx,%rax,4) ++ vmovdqu %ymm0, -448(%rcx,%rax,4) ++ vmovdqu %ymm0, -416(%rcx,%rax,4) ++ vmovdqu %ymm0, -384(%rcx,%rax,4) ++ vmovdqu %ymm0, -352(%rcx,%rax,4) ++ vmovdqu %ymm0, -320(%rcx,%rax,4) ++ vmovdqu %ymm0, -288(%rcx,%rax,4) ++ vmovdqu %ymm0, -256(%rcx,%rax,4) ++ vmovdqu %ymm0, -224(%rcx,%rax,4) ++ vmovdqu %ymm0, -192(%rcx,%rax,4) ++ vmovdqu %ymm0, -160(%rcx,%rax,4) ++ vmovdqu %ymm0, -128(%rcx,%rax,4) ++ vmovdqu %ymm0, -96(%rcx,%rax,4) ++ vmovdqu %ymm0, -64(%rcx,%rax,4) ++ vmovdqu %ymm0, -32(%rcx,%rax,4) ++ vmovdqu %ymm0, (%rcx,%rax,4) ++ addq $256, %rax # imm = 0x100 ++ addq $8, %r10 ++ jne .LBB0_6 ++# BB#7: ++ testq %r9, %r9 ++ je .LBB0_10 ++.LBB0_8: ++ leaq (%rdi,%rax,4), %rax ++ addq $96, %rax ++ negq %r9 ++ .p2align 4, 0x90 ++.LBB0_9: # =>This Inner Loop Header: Depth=1 ++ vmovdqu %ymm0, -96(%rax) ++ vmovdqu %ymm0, -64(%rax) ++ vmovdqu %ymm0, -32(%rax) ++ vmovdqu %ymm0, (%rax) ++ subq $-128, %rax ++ addq $1, %r9 ++ jne .LBB0_9 ++.LBB0_10: ++ cmpq %rdx, %r8 ++ je .LBB0_14 ++# BB#11: ++ leaq (%rdi,%r8,4), %rax ++.LBB0_12: ++ subq %r8, %rdx ++ .p2align 4, 0x90 ++.LBB0_13: # =>This Inner Loop Header: Depth=1 ++ movl %esi, (%rax) ++ addq $4, %rax ++ addq $-1, %rdx ++ jne .LBB0_13 ++.LBB0_14: ++ movq %rdi, %rax ++ vzeroupper ++ retq ++END(WMEMSET) +diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +similarity index 99% +rename from libc/arch-x86_64/string/sse2-memmove-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +index 739502888..7024f4950 100644 +--- a/libc/arch-x86_64/string/sse2-memmove-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #include "cache.h" + + #ifndef MEMMOVE +-# define MEMMOVE memmove ++# define MEMMOVE memmove_generic + #endif + + #ifndef L +@@ -515,4 +515,4 @@ L(mm_large_page_loop_backward): + + END (MEMMOVE) + +-ALIAS_SYMBOL(memcpy, MEMMOVE) ++//ALIAS_SYMBOL(memcpy, MEMMOVE) +diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-memset-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-memset-slm.S +diff --git a/libc/arch-x86_64/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-stpcpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-stpncpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strcat-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strcpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strlen-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strncat-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strncpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +similarity index 99% +rename from libc/arch-x86_64/string/sse4-memcmp-slm.S +rename to libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +index 8a8b180a2..6cfcd767f 100644 +--- a/libc/arch-x86_64/string/sse4-memcmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #include "cache.h" + + #ifndef MEMCMP +-# define MEMCMP memcmp ++# define MEMCMP memcmp_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/ssse3-strcmp-slm.S +rename to libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +diff --git a/libc/arch-x86_64/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/ssse3-strncmp-slm.S +rename to libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S +index 93ff5f2fc..979ce4f18 100644 +--- a/libc/arch-x86_64/static_function_dispatch.S ++++ b/libc/arch-x86_64/static_function_dispatch.S +@@ -35,3 +35,9 @@ END(name) + + FUNCTION_DELEGATE(memset, memset_generic) + FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) ++FUNCTION_DELEGATE(memcmp, memcmp_generic) ++FUNCTION_DELEGATE(memcpy, memmove_generic) ++FUNCTION_DELEGATE(memmove, memmove_generic) ++FUNCTION_DELEGATE(memchr, memchr_openbsd) ++FUNCTION_DELEGATE(memrchr, memrchr_openbsd) ++//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) +-- +2.25.1 + diff --git a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch new file mode 100644 index 0000000000..0432f627fd --- /dev/null +++ b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch @@ -0,0 +1,4169 @@ +From b6a7f45aa68426f4e32a4bf51e71ec5453f25f8d Mon Sep 17 00:00:00 2001 +From: Ravi Kumar Soni +Date: Mon, 28 Oct 2024 15:08:14 +0530 +Subject: [PATCH 4/5] Optimize bionic string functions with avx implementation + +Following are the string functions that has been +optimized with avx2 implementation from glibc 2.32 version. + - strcmp, strncmp + - strlen, strnlen + - strchr, strrchr + - strcpy, strncpy + - stpcpy, stpncpy + - strcat, strncat + - wcscmp, wcsncmp + - wcslen, wcsnlen + - wcschr, wcsrchr + +Test done: Build and boot is fine, Run the benchmarks suite. 
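+
+For context -- an editorial sketch, not part of this patch -- every routine
+listed above is wired up through the same resolver pattern that
+dynamic_function_dispatch.cpp uses: probe the CPU once, then hand back either
+the AVX2 entry point or the generic fallback. Shown here for strcmp, with
+plain externs standing in for the assembly implementations; the real code
+goes through bionic's DEFINE_IFUNC_FOR/RETURN_FUNC macros.
+
+  extern "C" int strcmp_avx2(const char* lhs, const char* rhs);     // from avx2-strcmp-kbl.S
+  extern "C" int strcmp_generic(const char* lhs, const char* rhs);  // SSE/portable fallback
+
+  using strcmp_func = int(const char*, const char*);
+
+  // Resolver: runs once per process at symbol-binding time.
+  extern "C" strcmp_func* strcmp_resolver() {
+    __builtin_cpu_init();                  // populate the compiler's CPU feature cache
+    if (__builtin_cpu_supports("avx2")) {
+      return strcmp_avx2;                  // Kaby Lake-class parts and newer take this path
+    }
+    return strcmp_generic;                 // older cores keep the existing implementation
+  }
+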
+ +Change-Id: I7f08a7507d25447ce886e9fde0482527c3f7a178 +Signed-off-by: ahs +Signed-off-by: Ravi Kumar Soni +--- + libc/Android.bp | 45 +- + .../arch-x86_64/dynamic_function_dispatch.cpp | 133 ++- + libc/arch-x86_64/generic/string/memchr.c | 2 +- + libc/arch-x86_64/generic/string/memrchr.c | 2 +- + libc/arch-x86_64/generic/string/strchr.cpp | 19 + + libc/arch-x86_64/generic/string/strnlen.cpp | 19 + + libc/arch-x86_64/generic/string/strrchr.cpp | 19 + + libc/arch-x86_64/generic/string/wcschr.c | 19 + + libc/arch-x86_64/generic/string/wcscmp.c | 19 + + libc/arch-x86_64/generic/string/wcslen.c | 19 + + libc/arch-x86_64/generic/string/wcsncmp.c | 19 + + libc/arch-x86_64/generic/string/wcsnlen.c | 19 + + libc/arch-x86_64/generic/string/wcsrchr.c | 19 + + libc/arch-x86_64/generic/string/wmemset.c | 2 +- + .../{ => kabylake}/string/avx2-memset-kbl.S | 0 + .../kabylake/string/avx2-stpcpy-kbl.S | 3 + + .../kabylake/string/avx2-stpncpy-kbl.S | 5 + + .../kabylake/string/avx2-strcat-kbl.S | 299 +++++ + .../kabylake/string/avx2-strchr-kbl.S | 277 +++++ + .../kabylake/string/avx2-strcmp-kbl.S | 885 ++++++++++++++ + .../kabylake/string/avx2-strcpy-kbl.S | 1046 +++++++++++++++++ + .../kabylake/string/avx2-strlen-kbl.S | 418 +++++++ + .../kabylake/string/avx2-strncat-kbl.S | 3 + + .../kabylake/string/avx2-strncmp-kbl.S | 4 + + .../kabylake/string/avx2-strncpy-kbl.S | 4 + + .../kabylake/string/avx2-strnlen-kbl.S | 4 + + .../kabylake/string/avx2-strrchr-kbl.S | 258 ++++ + .../kabylake/string/avx2-wcschr-kbl.S | 3 + + .../kabylake/string/avx2-wcscmp-kbl.S | 4 + + .../kabylake/string/avx2-wcslen-kbl.S | 4 + + .../kabylake/string/avx2-wcsncmp-kbl.S | 6 + + .../kabylake/string/avx2-wcsnlen-kbl.S | 6 + + .../kabylake/string/avx2-wcsrchr-kbl.S | 3 + + libc/arch-x86_64/kabylake/string/avx_regs.h | 26 + + .../{include => kabylake/string}/cache.h | 0 + libc/arch-x86_64/silvermont/string/cache.h | 36 + + .../silvermont/string/sse2-stpcpy-slm.S | 2 +- + .../silvermont/string/sse2-stpncpy-slm.S | 2 +- + .../silvermont/string/sse2-strcat-slm.S | 2 +- + .../silvermont/string/sse2-strcpy-slm.S | 2 +- + .../silvermont/string/sse2-strlen-slm.S | 2 +- + .../silvermont/string/sse2-strncat-slm.S | 2 +- + .../silvermont/string/sse2-strncpy-slm.S | 2 +- + .../silvermont/string/ssse3-strcmp-slm.S | 2 +- + .../silvermont/string/ssse3-strncmp-slm.S | 2 +- + libc/arch-x86_64/static_function_dispatch.S | 25 +- + 46 files changed, 3669 insertions(+), 23 deletions(-) + create mode 100644 libc/arch-x86_64/generic/string/strchr.cpp + create mode 100644 libc/arch-x86_64/generic/string/strnlen.cpp + create mode 100644 libc/arch-x86_64/generic/string/strrchr.cpp + create mode 100644 libc/arch-x86_64/generic/string/wcschr.c + create mode 100644 libc/arch-x86_64/generic/string/wcscmp.c + create mode 100644 libc/arch-x86_64/generic/string/wcslen.c + create mode 100644 libc/arch-x86_64/generic/string/wcsncmp.c + create mode 100644 libc/arch-x86_64/generic/string/wcsnlen.c + create mode 100644 libc/arch-x86_64/generic/string/wcsrchr.c + rename libc/arch-x86_64/{ => kabylake}/string/avx2-memset-kbl.S (100%) + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S + create mode 
100644 libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx_regs.h + rename libc/arch-x86_64/{include => kabylake/string}/cache.h (100%) + create mode 100644 libc/arch-x86_64/silvermont/string/cache.h + +diff --git a/libc/Android.bp b/libc/Android.bp +index 530ce9111..92483e833 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -377,6 +377,17 @@ cc_library_static { + "upstream-freebsd/lib/libc/string/wmemcmp.c", + ], + }, ++ x86_64: { ++ exclude_srcs: [ ++ "upstream-freebsd/lib/libc/string/wcscmp.c", ++ "upstream-freebsd/lib/libc/string/wcsncmp.c", ++ "upstream-freebsd/lib/libc/string/wcslen.c", ++ "upstream-freebsd/lib/libc/string/wcsnlen.c", ++ "upstream-freebsd/lib/libc/string/wcschr.c", ++ "upstream-freebsd/lib/libc/string/wcsrchr.c", ++ ++ ], ++ }, + }, + + cflags: [ +@@ -1185,7 +1196,6 @@ cc_library_static { + ], + }, + x86_64: { +- include_dirs: ["bionic/libc/arch-x86_64/include"], + srcs: [ + "arch-x86_64/bionic/__bionic_clone.S", + "arch-x86_64/bionic/_exit_with_stack_teardown.S", +@@ -1194,7 +1204,7 @@ cc_library_static { + "arch-x86_64/bionic/syscall.S", + "arch-x86_64/bionic/vfork.S", + +- "arch-x86_64/string/avx2-memset-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memset-kbl.S", + "arch-x86_64/silvermont/string/sse2-memmove-slm.S", + "arch-x86_64/silvermont/string/sse2-memset-slm.S", + "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", +@@ -1211,17 +1221,42 @@ cc_library_static { + //"arch-x86_64/generic/string/wmemset.c" + "arch-x86_64/generic/string/memchr.c", + "arch-x86_64/generic/string/memrchr.c", ++ "arch-x86_64/generic/string/strchr.cpp", ++ "arch-x86_64/generic/string/strrchr.cpp", ++ "arch-x86_64/generic/string/strnlen.cpp", ++ "arch-x86_64/generic/string/wcscmp.c", ++ "arch-x86_64/generic/string/wcsncmp.c", ++ "arch-x86_64/generic/string/wcslen.c", ++ "arch-x86_64/generic/string/wcsnlen.c", ++ "arch-x86_64/generic/string/wcschr.c", ++ "arch-x86_64/generic/string/wcsrchr.c", + + //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" + "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", + "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strnlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strcpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S", ++ 
"arch-x86_64/kabylake/string/avx2-strcat-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncat-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcslen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcschr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S", + +- "bionic/strchr.cpp", + "bionic/strchrnul.cpp", +- "bionic/strnlen.cpp", +- "bionic/strrchr.cpp", + ], ++ + }, + }, + +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index 43aaebb54..182eb4200 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -67,21 +67,148 @@ typedef void* memchr_func(const void* __s, int __ch, size_t __n); + DEFINE_IFUNC_FOR(memchr) { + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); +- RETURN_FUNC(memchr_func, memchr_openbsd); ++ RETURN_FUNC(memchr_func, memchr_generic); + } + + typedef void* memrchr_func(const void* __s, int __ch, size_t __n); + DEFINE_IFUNC_FOR(memrchr) { + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); +- RETURN_FUNC(memrchr_func, memrchr_openbsd); ++ RETURN_FUNC(memrchr_func, memrchr_generic); + } + + // typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); + // DEFINE_IFUNC_FOR(wmemset) { + // __builtin_cpu_init(); + // if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); +-// RETURN_FUNC(wmemset_func, wmemset_freebsd); ++// RETURN_FUNC(wmemset_func, wmemset_generic); + // } + ++typedef int strcmp_func(const char* __lhs, const char* __rhs); ++DEFINE_IFUNC_FOR(strcmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcmp_func, strcmp_avx2); ++ RETURN_FUNC(strcmp_func, strcmp_generic); ++} ++ ++typedef int strncmp_func(const char* __lhs, const char* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(strncmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncmp_func, strncmp_avx2); ++ RETURN_FUNC(strncmp_func, strncmp_generic); ++} ++ ++typedef char* strcpy_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(strcpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcpy_func, strcpy_avx2); ++ RETURN_FUNC(strcpy_func, strcpy_generic); ++} ++ ++typedef char* strncpy_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(strncpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncpy_func, strncpy_avx2); ++ RETURN_FUNC(strncpy_func, strncpy_generic); ++} ++ ++typedef char* stpcpy_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(stpcpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpcpy_func, stpcpy_avx2); ++ RETURN_FUNC(stpcpy_func, stpcpy_generic); ++} ++ ++typedef char* stpncpy_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(stpncpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpncpy_func, stpncpy_avx2); ++ RETURN_FUNC(stpncpy_func, stpncpy_generic); ++} ++ ++typedef size_t strlen_func(const char* __s); ++DEFINE_IFUNC_FOR(strlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strlen_func, strlen_avx2); ++ RETURN_FUNC(strlen_func, strlen_generic); ++} ++ ++ ++typedef size_t strnlen_func(const char* 
__s, size_t __n); ++DEFINE_IFUNC_FOR(strnlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strnlen_func, strnlen_avx2); ++ RETURN_FUNC(strnlen_func, strnlen_generic); ++} ++ ++typedef char* strchr_func(const char* __s, int __ch); ++DEFINE_IFUNC_FOR(strchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strchr_func, strchr_avx2); ++ RETURN_FUNC(strchr_func, strchr_generic); ++} ++ ++typedef char* strrchr_func(const char* __s, int __ch); ++DEFINE_IFUNC_FOR(strrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strrchr_func, strrchr_avx2); ++ RETURN_FUNC(strrchr_func, strrchr_generic); ++} ++ ++typedef char* strcat_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(strcat) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcat_func, strcat_avx2); ++ RETURN_FUNC(strcat_func, strcat_generic); ++} ++ ++typedef char* strncat_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(strncat) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncat_func, strncat_avx2); ++ RETURN_FUNC(strncat_func, strncat_generic); ++} ++ ++typedef int wcscmp_func(const wchar_t* __lhs, const wchar_t* __rhs); ++DEFINE_IFUNC_FOR(wcscmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcscmp_func, wcscmp_avx2); ++ RETURN_FUNC(wcscmp_func, wcscmp_generic); ++} ++ ++typedef int wcsncmp_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(wcsncmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsncmp_func, wcsncmp_avx2); ++ RETURN_FUNC(wcsncmp_func, wcsncmp_generic); ++} ++ ++typedef size_t wcslen_func(const wchar_t* __s); ++DEFINE_IFUNC_FOR(wcslen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcslen_func, wcslen_avx2); ++ RETURN_FUNC(wcslen_func, wcslen_generic); ++} ++ ++typedef size_t wcsnlen_func(const wchar_t* __s, size_t __n); ++DEFINE_IFUNC_FOR(wcsnlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsnlen_func, wcsnlen_avx2); ++ RETURN_FUNC(wcsnlen_func, wcsnlen_generic); ++} ++ ++typedef wchar_t* wcschr_func(const wchar_t* __s, wchar_t __wc); ++DEFINE_IFUNC_FOR(wcschr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcschr_func, wcschr_avx2); ++ RETURN_FUNC(wcschr_func, wcschr_generic); ++} ++ ++typedef wchar_t* wcsrchr_func(const wchar_t* __s, wchar_t __wc); ++DEFINE_IFUNC_FOR(wcsrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsrchr_func, wcsrchr_avx2); ++ RETURN_FUNC(wcsrchr_func, wcsrchr_generic); ++} ++ + } // extern "C" +diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c +index 86ee02e0b..e6fc3eb84 100644 +--- a/libc/arch-x86_64/generic/string/memchr.c ++++ b/libc/arch-x86_64/generic/string/memchr.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define memchr memchr_openbsd ++#define memchr memchr_generic + + #include +diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c +index c803009f5..ee085e384 100644 +--- a/libc/arch-x86_64/generic/string/memrchr.c ++++ b/libc/arch-x86_64/generic/string/memrchr.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define memrchr memrchr_openbsd ++#define memrchr memrchr_generic + + #include +diff --git a/libc/arch-x86_64/generic/string/strchr.cpp b/libc/arch-x86_64/generic/string/strchr.cpp +new file mode 
100644 +index 000000000..8a3d6d619 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strchr.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strchr strchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/strnlen.cpp b/libc/arch-x86_64/generic/string/strnlen.cpp +new file mode 100644 +index 000000000..f60348656 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strnlen.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strnlen strnlen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/strrchr.cpp b/libc/arch-x86_64/generic/string/strrchr.cpp +new file mode 100644 +index 000000000..9f0f33fd2 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strrchr.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strrchr strrchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcschr.c b/libc/arch-x86_64/generic/string/wcschr.c +new file mode 100644 +index 000000000..d45e45d20 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcschr.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#define wcschr wcschr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcscmp.c b/libc/arch-x86_64/generic/string/wcscmp.c +new file mode 100644 +index 000000000..e55bab549 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcscmp.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcscmp wcscmp_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcslen.c b/libc/arch-x86_64/generic/string/wcslen.c +new file mode 100644 +index 000000000..5b873fc30 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcslen.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcslen wcslen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsncmp.c b/libc/arch-x86_64/generic/string/wcsncmp.c +new file mode 100644 +index 000000000..40b2ca2f3 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsncmp.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsncmp wcsncmp_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsnlen.c b/libc/arch-x86_64/generic/string/wcsnlen.c +new file mode 100644 +index 000000000..91051cea7 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsnlen.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsnlen wcsnlen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsrchr.c b/libc/arch-x86_64/generic/string/wcsrchr.c +new file mode 100644 +index 000000000..73e8c25bc +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsrchr.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsrchr wcsrchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c +index ac6bd7ec4..9675fe91f 100644 +--- a/libc/arch-x86_64/generic/string/wmemset.c ++++ b/libc/arch-x86_64/generic/string/wmemset.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define wmemset wmemset_freebsd ++#define wmemset wmemset_generic + + #include +diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S +similarity index 100% +rename from libc/arch-x86_64/string/avx2-memset-kbl.S +rename to libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S +diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S +new file mode 100644 +index 000000000..63f9ba25b +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY stpcpy_avx2 ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S +new file mode 100644 +index 000000000..c1bbdb29e +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S +@@ -0,0 +1,5 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY stpncpy_avx2 ++#include "avx_regs.h" ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S +new file mode 100644 +index 000000000..d1e9b4b38 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S +@@ -0,0 +1,299 @@ ++/* strcat with AVX2 ++ Copyright (C) 2011-2020 Free Software Foundation, Inc. ++ Contributed by Intel Corporation. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++ ++# ifndef STRCAT ++# define STRCAT strcat_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# define USE_AS_STRCAT ++ ++/* Number of bytes in a vector register */ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCAT) ++ mov %rdi, %r9 ++# ifdef USE_AS_STRNCAT ++ mov %rdx, %r8 ++# endif ++ ++ xor %eax, %eax ++ mov %edi, %ecx ++ and $((VEC_SIZE * 4) - 1), %ecx ++ vpxor %xmm6, %xmm6, %xmm6 ++ cmp $(VEC_SIZE * 3), %ecx ++ ja L(fourth_vector_boundary) ++ vpcmpeqb (%rdi), %ymm6, %ymm0 ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_first_vector) ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ jmp L(align_vec_size_start) ++L(fourth_vector_boundary): ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ vpcmpeqb (%rax), %ymm6, %ymm0 ++ mov $-1, %r10d ++ sub %rax, %rcx ++ shl %cl, %r10d ++ vpmovmskb %ymm0, %edx ++ and %r10d, %edx ++ jnz L(exit) ++ ++L(align_vec_size_start): ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ 
vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 5), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ add $VEC_SIZE, %rax ++ ++ .p2align 4 ++L(align_four_vec_loop): ++ vmovaps (%rax), %ymm4 ++ vpminub VEC_SIZE(%rax), %ymm4, %ymm4 ++ vmovaps (VEC_SIZE * 2)(%rax), %ymm5 ++ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 ++ add $(VEC_SIZE * 4), %rax ++ vpminub %ymm4, %ymm5, %ymm5 ++ vpcmpeqb %ymm5, %ymm6, %ymm5 ++ vpmovmskb %ymm5, %edx ++ test %edx, %edx ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 ++ sub $(VEC_SIZE * 5), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit): ++ sub %rdi, %rax ++L(exit_null_on_first_vector): ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_second_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $VEC_SIZE, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_third_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 2), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fourth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 3), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fifth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ ++ .p2align 4 ++L(StartStrcpyPart): ++ lea (%r9, %rax), %rdi ++ mov %rsi, %rcx ++ mov %r9, %rax /* save result */ ++ ++# ifdef USE_AS_STRNCAT ++ test %r8, %r8 ++ jz L(ExitZero) ++# define USE_AS_STRNCPY ++# endif ++ ++# include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S +new file mode 100644 +index 000000000..7d8a44c81 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S +@@ -0,0 +1,277 @@ ++/* strchr/strchrnul optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRCHR ++# define STRCHR strchr_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMPEQ vpcmpeqd ++# define CHAR_REG esi ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMPEQ vpcmpeqb ++# define CHAR_REG sil ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCHR) ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++ vpxor %xmm9, %xmm9, %xmm9 ++ VPBROADCAST %xmm0, %ymm0 ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null byte. */ ++ vmovdqu (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ vmovdqu (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. */ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ /* Found CHAR or the null byte. */ ++ tzcntl %eax, %eax ++ addq %rcx, %rax ++# ifdef USE_AS_STRCHRNUL ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ addq $VEC_SIZE, %rdi ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vmovdqa (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ vmovdqa VEC_SIZE(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (%rdi), %ymm5 ++ vmovdqa VEC_SIZE(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ ++ VPCMPEQ %ymm5, %ymm0, %ymm1 ++ VPCMPEQ %ymm6, %ymm0, %ymm2 ++ VPCMPEQ %ymm7, %ymm0, %ymm3 ++ VPCMPEQ %ymm8, %ymm0, %ymm4 ++ ++ VPCMPEQ %ymm5, %ymm9, %ymm5 ++ VPCMPEQ %ymm6, %ymm9, %ymm6 ++ VPCMPEQ %ymm7, %ymm9, %ymm7 ++ VPCMPEQ %ymm8, %ymm9, %ymm8 ++ ++ vpor %ymm1, %ymm5, %ymm1 ++ vpor %ymm2, %ymm6, %ymm2 ++ vpor %ymm3, %ymm7, %ymm3 ++ vpor %ymm4, %ymm8, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ jmp L(loop_4x_vec) ++ ++ .p2align 4 ++L(first_vec_x0): ++ /* Found CHAR or the null byte. */ ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq VEC_SIZE(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++END (STRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S +new file mode 100644 +index 000000000..b241812d8 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S +@@ -0,0 +1,885 @@ ++/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. ++ Copyright (C) 2018-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRCMP ++# define STRCMP strcmp_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# define PAGE_SIZE 4096 ++ ++/* VEC_SIZE = Number of bytes in a ymm register */ ++# define VEC_SIZE 32 ++ ++/* Shift for dividing by (VEC_SIZE * 4). */ ++# define DIVIDE_BY_VEC_4_SHIFT 7 ++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++/* Compare packed dwords. */ ++# define VPCMPEQ vpcmpeqd ++/* Compare packed dwords and store minimum. */ ++# define VPMINU vpminud ++/* 1 dword char == 4 bytes. */ ++# define SIZE_OF_CHAR 4 ++# else ++/* Compare packed bytes. */ ++# define VPCMPEQ vpcmpeqb ++/* Compare packed bytes and store minimum. */ ++# define VPMINU vpminub ++/* 1 byte char == 1 byte. */ ++# define SIZE_OF_CHAR 1 ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++/* Warning! ++ wcscmp/wcsncmp have to use SIGNED comparison for elements. ++ strcmp/strncmp have to use UNSIGNED comparison for elements. ++*/ ++ ++/* The main idea of the string comparison (byte or dword) using AVX2 ++ consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on ++ either packed bytes or dwords depending on USE_AS_WCSCMP. In order ++ to check the null char, the algorithm keeps the matched bytes/dwords, ++ requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, ++ the costs of comparing VEC_SIZE bytes (32 bytes) are two VPCMPEQ and ++ one VPMINU instructions, together with movdqu and testl instructions. ++ The main loop (away from the page boundary) compares 4 vectors at a time, ++ effectively comparing 4 x VEC_SIZE bytes (128 bytes) in each iteration. ++ ++ The strncmp/wcsncmp routine (enabled by defining USE_AS_STRNCMP) uses ++ the same logic as strcmp, except that a maximum offset is tracked. If ++ the maximum offset is reached before a difference is found, zero is ++ returned. */ ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCMP) ++# ifdef USE_AS_STRNCMP ++ /* Check for simple cases (0 or 1) in offset. */ ++ cmp $1, %RDX_LP ++ je L(char0) ++ jb L(zero) ++# ifdef USE_AS_WCSCMP ++ /* Convert units: from wide to byte char. */ ++ shl $2, %RDX_LP ++# endif ++ /* Register %r11 tracks the maximum offset. */ ++ mov %RDX_LP, %R11_LP ++# endif ++ movl %edi, %eax ++ xorl %edx, %edx ++ /* Make %xmm7 (%ymm7) all zeros in this function.
*/ ++ vpxor %xmm7, %xmm7, %xmm7 ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax ++ jg L(cross_page) ++ /* Start comparing 4 vectors. */ ++ vmovdqu (%rdi), %ymm1 ++ VPCMPEQ (%rsi), %ymm1, %ymm0 ++ VPMINU %ymm1, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ je L(next_3_vectors) ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx) is after the maximum ++ offset (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ je L(return) ++L(wcscmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++L(return): ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after ++ the maximum offset (%r11). */ ++ addq $VEC_SIZE, %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rdx), %ecx ++ cmpl VEC_SIZE(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rdx), %eax ++ movzbl VEC_SIZE(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_2_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 2), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_3_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is ++ after the maximum offset (%r11). 
*/ ++ addq $(VEC_SIZE * 3), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(next_3_vectors): ++ vmovdqu VEC_SIZE(%rdi), %ymm6 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 ++ VPMINU %ymm6, %ymm3, %ymm3 ++ VPCMPEQ %ymm7, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jne L(return_vec_size) ++ vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 ++ VPMINU %ymm5, %ymm2, %ymm2 ++ VPCMPEQ %ymm4, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm2, %ymm2 ++ vpmovmskb %ymm2, %ecx ++ testl %ecx, %ecx ++ jne L(return_2_vec_size) ++ VPMINU %ymm4, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ jne L(return_3_vec_size) ++L(main_loop_header): ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ movl $PAGE_SIZE, %ecx ++ /* Align load via RAX. */ ++ andq $-(VEC_SIZE * 4), %rdx ++ subq %rdi, %rdx ++ leaq (%rdi, %rdx), %rax ++# ifdef USE_AS_STRNCMP ++ /* Starting from this point, the maximum offset, or simply the ++ 'offset', DECREASES by the same amount when base pointers are ++ moved forward. Return 0 when: ++ 1) On match: offset <= the matched vector index. ++ 2) On mismatch, offset is before the mismatched index. ++ */ ++ subq %rdx, %r11 ++ jbe L(zero) ++# endif ++ addq %rsi, %rdx ++ movq %rdx, %rsi ++ andl $(PAGE_SIZE - 1), %esi ++ /* Number of bytes before page crossing. */ ++ subq %rsi, %rcx ++ /* Number of VEC_SIZE * 4 blocks before page crossing. */ ++ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx ++ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ ++ movl %ecx, %esi ++ jmp L(loop_start) ++ ++ .p2align 4 ++L(loop): ++# ifdef USE_AS_STRNCMP ++ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease ++ the maximum offset (%r11) by the same amount. */ ++ subq $(VEC_SIZE * 4), %r11 ++ jbe L(zero) ++# endif ++ addq $(VEC_SIZE * 4), %rax ++ addq $(VEC_SIZE * 4), %rdx ++L(loop_start): ++ testl %esi, %esi ++ leal -1(%esi), %esi ++ je L(loop_cross_page) ++L(back_to_loop): ++ /* Main loop, comparing 4 vectors at a time. */ ++ vmovdqa (%rax), %ymm0 ++ vmovdqa VEC_SIZE(%rax), %ymm3 ++ VPCMPEQ (%rdx), %ymm0, %ymm4 ++ VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 ++ VPMINU %ymm0, %ymm4, %ymm4 ++ VPMINU %ymm3, %ymm1, %ymm1 ++ vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 ++ VPMINU %ymm1, %ymm4, %ymm0 ++ vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 ++ VPMINU %ymm2, %ymm5, %ymm5 ++ VPMINU %ymm3, %ymm6, %ymm6 ++ VPMINU %ymm5, %ymm0, %ymm0 ++ VPMINU %ymm6, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ ++ /* Test each mask (32 bits) individually because for VEC_SIZE ++ == 32 it is not possible to OR the four masks and keep all bits ++ in a 64-bit integer register, differing from SSE2 strcmp ++ where ORing is possible.
*/ ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ je L(loop) ++ VPCMPEQ %ymm7, %ymm4, %ymm0 ++ vpmovmskb %ymm0, %edi ++ testl %edi, %edi ++ je L(test_vec) ++ tzcntl %edi, %ecx ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first vector matched. Return 0 if the maximum offset ++ (%r11) <= VEC_SIZE. */ ++ cmpq $VEC_SIZE, %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm1, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ je L(test_2_vec) ++ tzcntl %ecx, %edi ++# ifdef USE_AS_STRNCMP ++ addq $VEC_SIZE, %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl VEC_SIZE(%rsi, %rdi), %ecx ++ cmpl VEC_SIZE(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rax, %rdi), %eax ++ movzbl VEC_SIZE(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_2_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 2 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 2 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 2), %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm5, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ je L(test_3_vec) ++ tzcntl %ecx, %edi ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx ++ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_3_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 3 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 3 * VEC_SIZE. 
*/ ++ cmpq $(VEC_SIZE * 3), %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm6, %ymm6 ++ vpmovmskb %ymm6, %esi ++ tzcntl %esi, %ecx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 3), %rcx ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %esi ++ cmpl (%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi ++ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(loop_cross_page): ++ xorl %r10d, %r10d ++ movq %rdx, %rcx ++ /* Align load via RDX. We load the extra ECX bytes which should ++ be ignored. */ ++ andl $((VEC_SIZE * 4) - 1), %ecx ++ /* R10 is -RCX. */ ++ subq %rcx, %r10 ++ ++ /* This works only if VEC_SIZE * 2 == 64. */ ++# if (VEC_SIZE * 2) != 64 ++# error (VEC_SIZE * 2) != 64 ++# endif ++ ++ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ ++ cmpl $(VEC_SIZE * 2), %ecx ++ jge L(loop_cross_page_2_vec) ++ ++ vmovdqu (%rax, %r10), %ymm2 ++ vmovdqu VEC_SIZE(%rax, %r10), %ymm3 ++ VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 ++ VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 ++ VPMINU %ymm2, %ymm0, %ymm0 ++ VPMINU %ymm3, %ymm1, %ymm1 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm1, %ymm1 ++ ++ vpmovmskb %ymm0, %edi ++ vpmovmskb %ymm1, %esi ++ ++ salq $32, %rsi ++ xorq %rsi, %rdi ++ ++ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ ++ shrq %cl, %rdi ++ ++ testq %rdi, %rdi ++ je L(loop_cross_page_2_vec) ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(loop_cross_page_2_vec): ++ /* The first VEC_SIZE * 2 bytes match or are ignored. */ ++ vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 ++ vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 ++ VPMINU %ymm2, %ymm5, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 ++ VPCMPEQ %ymm7, %ymm5, %ymm5 ++ VPMINU %ymm3, %ymm6, %ymm6 ++ VPCMPEQ %ymm7, %ymm6, %ymm6 ++ ++ vpmovmskb %ymm5, %edi ++ vpmovmskb %ymm6, %esi ++ ++ salq $32, %rsi ++ xorq %rsi, %rdi ++ ++ xorl %r8d, %r8d ++ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ ++ subl $(VEC_SIZE * 2), %ecx ++ jle 1f ++ /* Skip ECX bytes. */ ++ shrq %cl, %rdi ++ /* R8 has number of bytes skipped. */ ++ movl %ecx, %r8d ++1: ++ /* Before jumping back to the loop, set ESI to the number of ++ VEC_SIZE * 4 blocks before page crossing. */ ++ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi ++ ++ testq %rdi, %rdi ++# ifdef USE_AS_STRNCMP ++ /* At this point, if %rdi value is 0, it already tested ++ VEC_SIZE*4+%r10 byte starting from %rax. This label ++ checks whether strncmp maximum offset reached or not. 
*/ ++ je L(string_nbyte_offset_check) ++# else ++ je L(back_to_loop) ++# endif ++ tzcntq %rdi, %rcx ++ addq %r10, %rcx ++ /* Adjust for number of bytes skipped. */ ++ addq %r8, %rcx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rcx ++ subq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi ++ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_STRNCMP ++L(string_nbyte_offset_check): ++ leaq (VEC_SIZE * 4)(%r10), %r10 ++ cmpq %r10, %r11 ++ jbe L(zero) ++ jmp L(back_to_loop) ++# endif ++ ++ .p2align 4 ++L(cross_page_loop): ++ /* Check one byte/dword at a time. */ ++# ifdef USE_AS_WCSCMP ++ cmpl %ecx, %eax ++# else ++ subl %ecx, %eax ++# endif ++ jne L(different) ++ addl $SIZE_OF_CHAR, %edx ++ cmpl $(VEC_SIZE * 4), %edx ++ je L(main_loop_header) ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ /* Check null char. */ ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED ++ comparisons. */ ++ subl %ecx, %eax ++# ifndef USE_AS_WCSCMP ++L(different): ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_WCSCMP ++ .p2align 4 ++L(different): ++ /* Use movl to avoid modifying EFLAGS. */ ++ movl $0, %eax ++ setl %al ++ negl %eax ++ orl $1, %eax ++ VZEROUPPER ++ ret ++# endif ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(char0): ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER ++ ret ++# endif ++ ++ .p2align 4 ++L(last_vector): ++ addq %rdx, %rdi ++ addq %rdx, %rsi ++# ifdef USE_AS_STRNCMP ++ subq %rdx, %r11 ++# endif ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ ++ /* Comparing on page boundary region requires special treatment: ++ It must done one vector at the time, starting with the wider ++ ymm vector if possible, if not, with xmm. If fetching 16 bytes ++ (xmm) still passes the boundary, byte comparison must be done. ++ */ ++ .p2align 4 ++L(cross_page): ++ /* Try one ymm vector at a time. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_1_vector) ++L(loop_1_vector): ++ vmovdqu (%rdi, %rdx), %ymm1 ++ VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 ++ VPMINU %ymm1, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $VEC_SIZE, %edx ++ ++ addl $VEC_SIZE, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). 
*/ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jle L(loop_1_vector) ++L(cross_page_1_vector): ++ /* Less than 32 bytes to check, try one xmm vector. */ ++ cmpl $(PAGE_SIZE - 16), %eax ++ jg L(cross_page_1_xmm) ++ vmovdqu (%rdi, %rdx), %xmm1 ++ VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $16, %edx ++# ifndef USE_AS_WCSCMP ++ addl $16, %eax ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_1_xmm): ++# ifndef USE_AS_WCSCMP ++ /* Less than 16 bytes to check, try 8 byte vector. NB: No need ++ for wcscmp nor wcsncmp since wide char is 4 bytes. */ ++ cmpl $(PAGE_SIZE - 8), %eax ++ jg L(cross_page_8bytes) ++ vmovq (%rdi, %rdx), %xmm1 ++ vmovq (%rsi, %rdx), %xmm0 ++ VPCMPEQ %xmm0, %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $8, %edx ++ addl $8, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_8bytes): ++ /* Less than 8 bytes to check, try 4 byte vector. */ ++ cmpl $(PAGE_SIZE - 4), %eax ++ jg L(cross_page_4bytes) ++ vmovd (%rdi, %rdx), %xmm1 ++ vmovd (%rsi, %rdx), %xmm0 ++ VPCMPEQ %xmm0, %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ /* Only last 4 bits are valid. */ ++ andl $0xf, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $4, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_4bytes): ++# endif ++ /* Less than 4 bytes to check, try one byte/dword at a time. */ ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ subl %ecx, %eax ++ VZEROUPPER ++ ret ++END (STRCMP) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S +new file mode 100644 +index 000000000..809a9ac00 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S +@@ -0,0 +1,1046 @@ ++/* strcpy with AVX2 ++ Copyright (C) 2011-2020 Free Software Foundation, Inc. ++ Contributed by Intel Corporation. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++ ++# ifndef USE_AS_STRCAT ++ ++# ifndef STRCPY ++# define STRCPY strcpy_avx2 ++# endif ++ ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++/* Number of bytes in a vector register */ ++# ifndef VEC_SIZE ++# define VEC_SIZE 32 ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++/* zero register */ ++#define xmmZ xmm0 ++#define ymmZ ymm0 ++ ++/* mask register */ ++#define ymmM ymm1 ++ ++# ifndef USE_AS_STRCAT ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCPY) ++# ifdef USE_AS_STRNCPY ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP ++ jz L(ExitZero) ++# endif ++ mov %rsi, %rcx ++# ifndef USE_AS_STPCPY ++ mov %rdi, %rax /* save result */ ++# endif ++ ++# endif ++ ++ vpxor %xmmZ, %xmmZ, %xmmZ ++ ++ and $((VEC_SIZE * 4) - 1), %ecx ++ cmp $(VEC_SIZE * 2), %ecx ++ jbe L(SourceStringAlignmentLessTwoVecSize) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ ++ vpcmpeqb (%rsi), %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ shr %cl, %rdx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ mov $VEC_SIZE, %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# else ++ mov $(VEC_SIZE + 1), %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# endif ++ jbe L(CopyVecSizeTailCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail) ++ ++ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 ++ vpmovmskb %ymm2, %edx ++ ++# ifdef USE_AS_STRNCPY ++ add $VEC_SIZE, %r10 ++ cmp %r10, %r8 ++ jbe L(CopyTwoVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize) ++ ++ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ ++ vmovdqu %ymm2, (%rdi) ++ ++/* If source address alignment != destination address alignment */ ++ .p2align 4 ++L(UnalignVecSizeBoth): ++ sub %rcx, %rdi ++# ifdef USE_AS_STRNCPY ++ add %rcx, %r8 ++ sbb %rcx, %rcx ++ or %rcx, %r8 ++# endif ++ mov $VEC_SIZE, %rcx ++ vmovdqa (%rsi, %rcx), %ymm2 ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 3), %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm3, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm4, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 
++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm3, (%rdi, %rcx) ++ mov %rsi, %rdx ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ and $-(VEC_SIZE * 4), %rsi ++ sub %rsi, %rdx ++ sub %rdx, %rdi ++# ifdef USE_AS_STRNCPY ++ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 ++# endif ++L(UnalignedFourVecSizeLoop): ++ vmovdqa (%rsi), %ymm4 ++ vmovdqa VEC_SIZE(%rsi), %ymm5 ++ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 ++ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 ++ vpminub %ymm5, %ymm4, %ymm2 ++ vpminub %ymm7, %ymm6, %ymm3 ++ vpminub %ymm2, %ymm3, %ymm3 ++ vpcmpeqb %ymmM, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(UnalignedFourVecSizeLeave) ++ ++L(UnalignedFourVecSizeLoop_start): ++ add $(VEC_SIZE * 4), %rdi ++ add $(VEC_SIZE * 4), %rsi ++ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) ++ vmovdqa (%rsi), %ymm4 ++ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) ++ vmovdqa VEC_SIZE(%rsi), %ymm5 ++ vpminub %ymm5, %ymm4, %ymm2 ++ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) ++ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 ++ vmovdqu %ymm7, -VEC_SIZE(%rdi) ++ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 ++ vpminub %ymm7, %ymm6, %ymm3 ++ vpminub %ymm2, %ymm3, %ymm3 ++ vpcmpeqb %ymmM, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jz L(UnalignedFourVecSizeLoop_start) ++ ++L(UnalignedFourVecSizeLeave): ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_0) ++ ++ vpcmpeqb %ymm5, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %ecx ++ test %ecx, %ecx ++ jnz L(CopyVecSizeUnaligned_16) ++ ++ vpcmpeqb %ymm6, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_32) ++ ++ vpcmpeqb %ymm7, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %ecx ++ bsf %ecx, %edx ++ vmovdqu %ymm4, (%rdi) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 3), %rsi ++ add $(VEC_SIZE * 3), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++/* If source address alignment == destination address alignment */ ++ 
++L(SourceStringAlignmentLessTwoVecSize): ++ vmovdqu (%rsi), %ymm3 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $VEC_SIZE, %r8 ++# else ++ cmp $(VEC_SIZE + 1), %r8 ++# endif ++ jbe L(CopyVecSizeTail1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail1) ++ ++ vmovdqu %ymm3, (%rdi) ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $(VEC_SIZE * 2), %r8 ++# else ++ cmp $((VEC_SIZE * 2) + 1), %r8 ++# endif ++ jbe L(CopyTwoVecSize1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize1) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ jmp L(UnalignVecSizeBoth) ++ ++/*------End of main part with loops---------------------*/ ++ ++/* Case1 */ ++ ++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) ++ .p2align 4 ++L(CopyVecSize): ++ add %rcx, %rdi ++# endif ++L(CopyVecSizeTail): ++ add %rcx, %rsi ++L(CopyVecSizeTail1): ++ bsf %edx, %edx ++L(CopyVecSizeExit): ++ cmp $32, %edx ++ jae L(Exit32_63) ++ cmp $16, %edx ++ jae L(Exit16_31) ++ cmp $8, %edx ++ jae L(Exit8_15) ++ cmp $4, %edx ++ jae L(Exit4_7) ++ cmp $3, %edx ++ je L(Exit3) ++ cmp $1, %edx ++ ja L(Exit2) ++ je L(Exit1) ++ movb $0, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea (%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $1, %r8 ++ lea 1(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(CopyTwoVecSize1): ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $VEC_SIZE, %r8 ++# endif ++ jmp L(CopyVecSizeTail1) ++ ++ .p2align 4 ++L(CopyTwoVecSize): ++ bsf %edx, %edx ++ add %rcx, %rsi ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ jmp L(CopyVecSizeExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_0): ++ bsf %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm4, (%rdi) ++ add $((VEC_SIZE * 4) - 1), %r8 ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_16): ++ bsf %ecx, %edx ++ vmovdqu %ymm4, (%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea VEC_SIZE(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ add $((VEC_SIZE * 3) - 1), %r8 ++ sub %rdx, %r8 ++ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_32): ++ bsf %edx, %edx ++ vmovdqu %ymm4, (%rdi) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ add $((VEC_SIZE * 2) - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 2), %rsi ++ add $(VEC_SIZE * 2), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++# ifdef USE_AS_STRNCPY ++# ifndef USE_AS_STRCAT ++ .p2align 4 ++L(CopyVecSizeUnalignedVec6): ++ vmovdqu %ymm6, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec5): ++ vmovdqu %ymm5, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ 
.p2align 4 ++L(CopyVecSizeUnalignedVec4): ++ vmovdqu %ymm4, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec3): ++ vmovdqu %ymm3, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++# endif ++ ++/* Case2 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTailCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTail1Case2): ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++/* Case2 or Case3, Case3 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeCase2) ++L(CopyVecSizeCase3): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyTwoVecSizeCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyVecSizeTailCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTailCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSize1Case2OrCase3): ++ add $VEC_SIZE, %rdi ++ add $VEC_SIZE, %rsi ++ sub $VEC_SIZE, %r8 ++L(CopyVecSizeTail1Case2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTail1Case2) ++ jmp L(StrncpyExit) ++# endif ++ ++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ ++ ++ .p2align 4 ++L(Exit1): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $2, %r8 ++ lea 2(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit2): ++ movzwl (%rsi), %ecx ++ mov %cx, (%rdi) ++ movb $0, 2(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $3, %r8 ++ lea 3(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit3): ++ mov (%rsi), %edx ++ mov %edx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 3(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $4, %r8 ++ lea 4(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit4_7): ++ mov (%rsi), %ecx ++ mov %ecx, (%rdi) ++ mov -3(%rsi, %rdx), %ecx ++ mov %ecx, -3(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit8_15): ++ mov (%rsi), %rcx ++ mov -7(%rsi, %rdx), %r9 ++ mov %rcx, (%rdi) ++ mov %r9, -7(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit16_31): ++ vmovdqu (%rsi), %xmm2 ++ vmovdqu -15(%rsi, %rdx), %xmm3 ++ vmovdqu %xmm2, (%rdi) ++ vmovdqu %xmm3, -15(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit32_63): ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu -31(%rsi, %rdx), %ymm3 ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, -31(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_STRNCPY ++ ++ .p2align 4 ++L(StrncpyExit1): ++ movzbl (%rsi), %edx ++ mov %dl, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 1(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit2): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 2(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit3_4): ++ movzwl (%rsi), %ecx ++ movzwl -2(%rsi, %r8), %edx ++ mov %cx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit5_8): ++ mov (%rsi), %ecx ++ mov -4(%rsi, %r8), %edx ++ mov %ecx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit9_16): ++ mov (%rsi), %rcx ++ mov -8(%rsi, %r8), %rdx ++ mov %rcx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit17_32): ++ vmovdqu (%rsi), %xmm2 ++ vmovdqu -16(%rsi, %r8), %xmm3 ++ vmovdqu %xmm2, (%rdi) ++ vmovdqu %xmm3, -16(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit33_64): ++ /* 0/32, 31/16 */ ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit65): ++ /* 0/32, 32/32, 64/1 */ ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu 32(%rsi), %ymm3 ++ mov 64(%rsi), %cl ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, 32(%rdi) ++ mov %cl, 64(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 65(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 65(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++# ifndef USE_AS_STRCAT ++ ++ .p2align 4 ++L(Fill1): ++ mov %dl, (%rdi) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill2): ++ mov %dx, (%rdi) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill3_4): ++ mov %dx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill5_8): ++ mov %edx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill9_16): ++ mov %rdx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill17_32): ++ vmovdqu %xmmZ, (%rdi) ++ vmovdqu %xmmZ, -16(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec2): ++ vmovdqu %ymm2, (%rdi, %rcx) ++ ++ .p2align 4 ++L(CopyVecSizeVecExit): ++ bsf %edx, %edx ++ add $(VEC_SIZE - 1), %r8 ++ add %rcx, %rdi ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), 
%rax ++# endif ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ ++ .p2align 4 ++L(StrncpyFillTailWithZero): ++ xor %edx, %edx ++ sub $VEC_SIZE, %r8 ++ jbe L(StrncpyFillExit) ++ ++ vmovdqu %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ ++ mov %rdi, %rsi ++ and $(VEC_SIZE - 1), %esi ++ sub %rsi, %rdi ++ add %rsi, %r8 ++ sub $(VEC_SIZE * 4), %r8 ++ jb L(StrncpyFillLessFourVecSize) ++ ++L(StrncpyFillLoopVmovdqa): ++ vmovdqa %ymmZ, (%rdi) ++ vmovdqa %ymmZ, VEC_SIZE(%rdi) ++ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) ++ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE * 4), %rdi ++ sub $(VEC_SIZE * 4), %r8 ++ jae L(StrncpyFillLoopVmovdqa) ++ ++L(StrncpyFillLessFourVecSize): ++ add $(VEC_SIZE * 2), %r8 ++ jl L(StrncpyFillLessTwoVecSize) ++ vmovdqa %ymmZ, (%rdi) ++ vmovdqa %ymmZ, VEC_SIZE(%rdi) ++ add $(VEC_SIZE * 2), %rdi ++ sub $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ vmovdqa %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillLessTwoVecSize): ++ add $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ vmovdqa %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillExit): ++ add $VEC_SIZE, %r8 ++L(Fill): ++ cmp $17, %r8d ++ jae L(Fill17_32) ++ cmp $9, %r8d ++ jae L(Fill9_16) ++ cmp $5, %r8d ++ jae L(Fill5_8) ++ cmp $3, %r8d ++ jae L(Fill3_4) ++ cmp $1, %r8d ++ ja L(Fill2) ++ je L(Fill1) ++ VZEROUPPER ++ ret ++ ++/* end of ifndef USE_AS_STRCAT */ ++# endif ++ ++ .p2align 4 ++L(UnalignedLeaveCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(UnalignedFourVecSizeLeaveCase2) ++L(UnalignedFourVecSizeLeaveCase3): ++ lea (VEC_SIZE * 4)(%r8), %rcx ++ and $-VEC_SIZE, %rcx ++ add $(VEC_SIZE * 3), %r8 ++ jl L(CopyVecSizeCase3) ++ vmovdqu %ymm4, (%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 4)(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (VEC_SIZE * 4)(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(UnalignedFourVecSizeLeaveCase2): ++ xor %ecx, %ecx ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $(VEC_SIZE * 3), %r8 ++ jle L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ vpcmpeqb %ymm5, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm4, (%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec5) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpeqb %ymm6, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec6) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpeqb %ymm7, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ lea VEC_SIZE(%rdi, %rcx), %rdi ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++L(StrncpyExit): ++ cmp $65, %r8d ++ je L(StrncpyExit65) ++ cmp $33, %r8d ++ jae L(StrncpyExit33_64) ++ cmp $17, %r8d ++ jae L(StrncpyExit17_32) ++ cmp $9, %r8d ++ jae L(StrncpyExit9_16) ++ cmp $5, %r8d ++ jae L(StrncpyExit5_8) ++ cmp $3, %r8d ++ jae L(StrncpyExit3_4) ++ cmp $1, %r8d ++ ja L(StrncpyExit2) ++ je L(StrncpyExit1) ++# 
ifdef USE_AS_STPCPY ++ mov %rdi, %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(ExitZero): ++# ifndef USE_AS_STRCAT ++ mov %rdi, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++# endif ++ ++# ifndef USE_AS_STRCAT ++END (STRCPY) ++# else ++END (STRCAT) ++# endif +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S +new file mode 100644 +index 000000000..912d771b4 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S +@@ -0,0 +1,418 @@ ++/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++# ifndef STRLEN ++# define STRLEN strlen_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMPEQ vpcmpeqd ++# define VPMINU vpminud ++# else ++# define VPCMPEQ vpcmpeqb ++# define VPMINU vpminub ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRLEN) ++# ifdef USE_AS_STRNLEN ++ /* Check for zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(zero) ++# ifdef USE_AS_WCSLEN ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++ mov %RSI_LP, %R8_LP ++# endif ++ movl %edi, %ecx ++ movq %rdi, %rdx ++ vpxor %xmm0, %xmm0, %xmm0 ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_STRNLEN ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rsi ++ jbe L(max) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++ ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. 
*/ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRNLEN ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifdef USE_AS_STRNLEN ++ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" ++ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" ++ to void possible addition overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rsi ++ jbe L(max) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (%rdi), %ymm1 ++ vmovdqa VEC_SIZE(%rdi), %ymm2 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 ++ VPMINU %ymm1, %ymm2, %ymm5 ++ VPMINU %ymm3, %ymm4, %ymm6 ++ VPMINU %ymm5, %ymm6, %ymm5 ++ ++ VPCMPEQ %ymm5, %ymm0, %ymm5 ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_STRNLEN ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rsi ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %esi ++ jle L(last_2x_vec) ++ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %esi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(max): ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ VPCMPEQ %ymm2, %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ VPCMPEQ %ymm3, %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++END (STRLEN) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S +new file mode 100644 +index 000000000..71e1a46c2 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT strncat_avx2 ++#include "avx2-strcat-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S +new file mode 100644 +index 000000000..b21a19134 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S +@@ -0,0 +1,4 @@ ++#define STRCMP strncmp_avx2 ++#define USE_AS_STRNCMP 1 ++#include "avx_regs.h" ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S +new file mode 100644 +index 000000000..7ad840667 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STRNCPY ++#define STRCPY strncpy_avx2 ++#include "avx_regs.h" ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S +new file mode 100644 +index 000000000..22cc5c527 +--- /dev/null ++++ 
b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S +@@ -0,0 +1,4 @@ ++#define STRLEN strnlen_avx2 ++#define USE_AS_STRNLEN 1 ++#include "avx_regs.h" ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S +new file mode 100644 +index 000000000..b3a65fbc6 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S +@@ -0,0 +1,258 @@ ++/* strrchr/wcsrchr optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRRCHR ++# define STRRCHR strrchr_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSRCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRRCHR) ++ movd %esi, %xmm4 ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM4. */ ++ VPBROADCAST %xmm4, %ymm4 ++ vpxor %xmm0, %xmm0, %xmm0 ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ vmovdqu (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ addq $VEC_SIZE, %rdi ++ ++ testl %eax, %eax ++ jnz L(first_vec) ++ ++ testl %ecx, %ecx ++ jnz L(return_null) ++ ++ andq $-VEC_SIZE, %rdi ++ xorl %edx, %edx ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(first_vec): ++ /* Check if there is a nul CHAR. */ ++ testl %ecx, %ecx ++ jnz L(char_and_nul_in_first_vec) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %edx ++ vpmovmskb %ymm3, %eax ++ shrl %cl, %edx ++ shrl %cl, %eax ++ addq $VEC_SIZE, %rdi ++ ++ /* Check if there is a CHAR. */ ++ testl %eax, %eax ++ jnz L(found_char) ++ ++ testl %edx, %edx ++ jnz L(return_null) ++ ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(found_char): ++ testl %edx, %edx ++ jnz L(char_and_nul) ++ ++ /* Remember the match and keep searching. 
*/ ++ movl %eax, %edx ++ leaq (%rdi, %rcx), %rsi ++ ++ .p2align 4 ++L(aligned_loop): ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ add $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jz L(aligned_loop) ++ ++ .p2align 4 ++L(char_nor_null): ++ /* Find a CHAR or a nul CHAR in a loop. */ ++ testl %eax, %eax ++ jnz L(match) ++L(return_value): ++ testl %edx, %edx ++ jz L(return_null) ++ movl %edx, %eax ++ movq %rsi, %rdi ++ ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %eax ++# endif ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(match): ++ /* Find a CHAR. Check if there is a nul CHAR. */ ++ vpmovmskb %ymm2, %ecx ++ testl %ecx, %ecx ++ jnz L(find_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(find_nul): ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %ecx ++ andl $0x11111111, %eax ++# endif ++ /* Mask out any matching bits after the nul CHAR. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* If there is no CHAR here, return the remembered one. */ ++ jz L(return_value) ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(char_and_nul): ++ /* Find both a CHAR and a nul CHAR. */ ++ addq %rcx, %rdi ++ movl %edx, %ecx ++L(char_and_nul_in_first_vec): ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %ecx ++ andl $0x11111111, %eax ++# endif ++ /* Mask out any matching bits after the nul CHAR. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* Return null pointer if the nul CHAR comes first. 
*/ ++ jz L(return_null) ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_null): ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++END (STRRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S +new file mode 100644 +index 000000000..b03124767 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S +@@ -0,0 +1,3 @@ ++#define STRCHR wcschr_avx2 ++#define USE_AS_WCSCHR 1 ++#include "avx2-strchr-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S +new file mode 100644 +index 000000000..bcbcd4ce7 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S +@@ -0,0 +1,4 @@ ++#define STRCMP wcscmp_avx2 ++#define USE_AS_WCSCMP 1 ++ ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S +new file mode 100644 +index 000000000..f1b973572 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S +@@ -0,0 +1,4 @@ ++#define STRLEN wcslen_avx2 ++#define USE_AS_WCSLEN 1 ++ ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S +new file mode 100644 +index 000000000..7603169c1 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S +@@ -0,0 +1,6 @@ ++#define STRCMP wcsncmp_avx2 ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "avx_regs.h" ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S +new file mode 100644 +index 000000000..2095cd8e0 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S +@@ -0,0 +1,6 @@ ++#define STRLEN wcsnlen_avx2 ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "avx_regs.h" ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S +new file mode 100644 +index 000000000..fbec1286c +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S +@@ -0,0 +1,3 @@ ++#define STRRCHR wcsrchr_avx2 ++#define USE_AS_WCSRCHR 1 ++#include "avx2-strrchr-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx_regs.h b/libc/arch-x86_64/kabylake/string/avx_regs.h +new file mode 100644 +index 000000000..223d97e3e +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx_regs.h +@@ -0,0 +1,26 @@ ++/* Long and pointer size in bytes. */ ++#define LP_SIZE 8 ++ ++/* Instruction to operate on long and pointer. */ ++#define LP_OP(insn) insn##q ++ ++/* Assembler address directive. */ ++#define ASM_ADDR .quad ++ ++/* Registers to hold long and pointer. 
*/ ++#define RAX_LP rax ++#define RBP_LP rbp ++#define RBX_LP rbx ++#define RCX_LP rcx ++#define RDI_LP rdi ++#define RDX_LP rdx ++#define RSI_LP rsi ++#define RSP_LP rsp ++#define R8_LP r8 ++#define R9_LP r9 ++#define R10_LP r10 ++#define R11_LP r11 ++#define R12_LP r12 ++#define R13_LP r13 ++#define R14_LP r14 ++#define R15_LP r15 +diff --git a/libc/arch-x86_64/include/cache.h b/libc/arch-x86_64/kabylake/string/cache.h +similarity index 100% +rename from libc/arch-x86_64/include/cache.h +rename to libc/arch-x86_64/kabylake/string/cache.h +diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h +new file mode 100644 +index 000000000..3606d2a1a +--- /dev/null ++++ b/libc/arch-x86_64/silvermont/string/cache.h +@@ -0,0 +1,36 @@ ++/* ++Copyright (c) 2014, Intel Corporation ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ ++ * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ ++ * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++/* Values are optimized for Silvermont */ ++#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ ++#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ ++ ++#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) ++#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) +diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +index 0ad2d44cf..ce15cdf1c 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STPCPY +-#define STRCPY stpcpy ++#define STRCPY stpcpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +index 30666850b..02b4df02d 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +@@ -30,5 +30,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + #define USE_AS_STRNCPY + #define USE_AS_STPCPY +-#define STRCPY stpncpy ++#define STRCPY stpncpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +index dd8207ff5..007adfe95 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +@@ -29,7 +29,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #ifndef STRCAT +-# define STRCAT strcat ++# define STRCAT strcat_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +index 3e146bfbc..ade9eac4f 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #ifndef USE_AS_STRCAT + + # ifndef STRCPY +-# define STRCPY strcpy ++# define STRCPY strcpy_generic + # endif + + # ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +index 3772fe770..df24f9de2 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #ifndef USE_AS_STRCAT + + #ifndef STRLEN +-# define STRLEN strlen ++# define STRLEN strlen_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +index 6b4a43084..c5394f9d5 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STRNCAT +-#define STRCAT strncat ++#define STRCAT strncat_generic + #include "sse2-strcat-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +index 594e78f74..2e8d68d12 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STRNCPY +-#define STRCPY strncpy ++#define STRCPY strncpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +index e8acd5ba4..fa2542f00 100644 +--- a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +@@ -43,7 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #else + #define UPDATE_STRNCMP_COUNTER + #ifndef STRCMP +-#define STRCMP strcmp ++#define STRCMP strcmp_generic + #endif + #endif + +diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +index 0e4077517..5d20a483f 100644 +--- a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + #define USE_AS_STRNCMP +-#define STRCMP strncmp ++#define STRCMP strncmp_generic + #include "ssse3-strcmp-slm.S" +diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S +index 979ce4f18..5c0f1f2ba 100644 +--- a/libc/arch-x86_64/static_function_dispatch.S ++++ b/libc/arch-x86_64/static_function_dispatch.S +@@ -38,6 +38,25 @@ FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) + FUNCTION_DELEGATE(memcmp, memcmp_generic) + FUNCTION_DELEGATE(memcpy, memmove_generic) + FUNCTION_DELEGATE(memmove, memmove_generic) +-FUNCTION_DELEGATE(memchr, memchr_openbsd) +-FUNCTION_DELEGATE(memrchr, memrchr_openbsd) +-//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) ++FUNCTION_DELEGATE(memchr, memchr_generic) ++FUNCTION_DELEGATE(memrchr, memrchr_generic) ++//FUNCTION_DELEGATE(wmemset, wmemset_generic) ++FUNCTION_DELEGATE(strcmp, strcmp_generic) ++FUNCTION_DELEGATE(strncmp, strncmp_generic) ++FUNCTION_DELEGATE(strcpy, strcpy_generic) ++FUNCTION_DELEGATE(strncpy, strncpy_generic) ++FUNCTION_DELEGATE(stpcpy, stpcpy_generic) ++FUNCTION_DELEGATE(stpncpy, stpncpy_generic) ++FUNCTION_DELEGATE(strlen, strlen_generic) ++FUNCTION_DELEGATE(strnlen, strnlen_generic) ++FUNCTION_DELEGATE(strchr, strchr_generic) ++FUNCTION_DELEGATE(strrchr, strrchr_generic) ++FUNCTION_DELEGATE(strcat, strcat_generic) ++FUNCTION_DELEGATE(strncat, strncat_generic) ++FUNCTION_DELEGATE(wcscmp, wcscmp_generic) ++FUNCTION_DELEGATE(wcsncmp, wcsncmp_generic) ++FUNCTION_DELEGATE(wcslen, wcslen_generic) ++FUNCTION_DELEGATE(wcsnlen, wcsnlen_generic) ++FUNCTION_DELEGATE(wcschr, wcschr_generic) ++FUNCTION_DELEGATE(wcsrchr, wcsrchr_generic) ++ +-- +2.25.1 + diff --git a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch new file mode 100644 index 0000000000..6f47b3414b --- /dev/null +++ b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch @@ -0,0 +1,645 @@ +From 05ace70e6407263d0bef91800005942a079058d6 Mon Sep 17 00:00:00 2001 +From: "Reddy, Alavala Srinivasa" +Date: Wed, 1 Nov 2023 18:43:18 +0530 +Subject: [PATCH 5/5] avx2 implementation for memmove api + +This patch includes handwritten avx2 assembly +implementation for memmove 64-bit. + +Test done: Build and boot is fine, Run the benchmarks suite. 
+ +Signed-off-by: ahs +--- + libc/Android.bp | 1 + + .../arch-x86_64/dynamic_function_dispatch.cpp | 2 + + .../kabylake/string/avx2-memmove-kbl.S | 593 ++++++++++++++++++ + 3 files changed, 596 insertions(+) + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S + +diff --git a/libc/Android.bp b/libc/Android.bp +index 92483e833..5deb88b48 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -1235,6 +1235,7 @@ cc_library_static { + "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", + "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memmove-kbl.S", + "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index 182eb4200..5bcf63e4c 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -55,6 +55,8 @@ DEFINE_IFUNC_FOR(memcmp) { + + typedef void* memmove_func(void* __dst, const void* __src, size_t __n); + DEFINE_IFUNC_FOR(memmove) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memmove_func, memmove_avx2); + RETURN_FUNC(memmove_func, memmove_generic); + } + +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S +new file mode 100644 +index 000000000..02e9ec1d2 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S +@@ -0,0 +1,593 @@ ++/* ++Copyright (c) 2014, Intel Corporation ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ ++ * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ ++ * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#include "cache.h" ++ ++#ifndef MEMMOVE ++# define MEMMOVE memmove_avx2 ++#endif ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef ALIAS_SYMBOL ++# define ALIAS_SYMBOL(alias, original) \ ++ .globl alias; \ ++ .equ alias, original ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) push REG; ++#define POP(REG) pop REG; ++ ++#define ENTRANCE PUSH (%rbx); ++#define RETURN_END POP (%rbx); ret ++#define RETURN RETURN_END; ++ ++ .section .text.avx2,"ax",@progbits ++ENTRY (MEMMOVE) ++ ENTRANCE ++ mov %rdi, %rax ++ ++/* Check whether we should copy backward or forward. */ ++ cmp %rsi, %rdi ++ je L(mm_return) ++ jg L(mm_len_0_or_more_backward) ++ ++/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] ++ separately. */ ++ cmp $16, %rdx ++ jbe L(mm_len_0_16_bytes_forward) ++ ++ cmp $32, %rdx ++ ja L(mm_len_32_or_more_forward) ++ ++/* Copy [0..32] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_32_or_more_forward): ++ cmp $64, %rdx ++ ja L(mm_len_64_or_more_forward) ++ ++/* Copy [0..64] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu -16(%rsi, %rdx), %xmm2 ++ movdqu -32(%rsi, %rdx), %xmm3 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, -16(%rdi, %rdx) ++ movdqu %xmm3, -32(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_64_or_more_forward): ++ cmp $128, %rdx ++ ja L(mm_len_128_or_more_forward) ++ ++/* Copy [0..128] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu -64(%rsi, %rdx), %xmm4 ++ movdqu -48(%rsi, %rdx), %xmm5 ++ movdqu -32(%rsi, %rdx), %xmm6 ++ movdqu -16(%rsi, %rdx), %xmm7 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, -64(%rdi, %rdx) ++ movdqu %xmm5, -48(%rdi, %rdx) ++ movdqu %xmm6, -32(%rdi, %rdx) ++ movdqu %xmm7, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_128_or_more_forward): ++ cmp $256, %rdx ++ ja L(mm_len_256_or_more_forward) ++ ++/* Copy [0..256] and return. 
*/ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu 64(%rsi), %xmm4 ++ movdqu 80(%rsi), %xmm5 ++ movdqu 96(%rsi), %xmm6 ++ movdqu 112(%rsi), %xmm7 ++ movdqu -128(%rsi, %rdx), %xmm8 ++ movdqu -112(%rsi, %rdx), %xmm9 ++ movdqu -96(%rsi, %rdx), %xmm10 ++ movdqu -80(%rsi, %rdx), %xmm11 ++ movdqu -64(%rsi, %rdx), %xmm12 ++ movdqu -48(%rsi, %rdx), %xmm13 ++ movdqu -32(%rsi, %rdx), %xmm14 ++ movdqu -16(%rsi, %rdx), %xmm15 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, 64(%rdi) ++ movdqu %xmm5, 80(%rdi) ++ movdqu %xmm6, 96(%rdi) ++ movdqu %xmm7, 112(%rdi) ++ movdqu %xmm8, -128(%rdi, %rdx) ++ movdqu %xmm9, -112(%rdi, %rdx) ++ movdqu %xmm10, -96(%rdi, %rdx) ++ movdqu %xmm11, -80(%rdi, %rdx) ++ movdqu %xmm12, -64(%rdi, %rdx) ++ movdqu %xmm13, -48(%rdi, %rdx) ++ movdqu %xmm14, -32(%rdi, %rdx) ++ movdqu %xmm15, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_256_or_more_forward): ++/* Aligning the address of destination. */ ++/* save first unaligned 128 bytes */ ++ vmovdqu (%rsi), %ymm0 ++ vmovdqu 32(%rsi), %ymm1 ++ vmovdqu 64(%rsi), %ymm2 ++ vmovdqu 96(%rsi), %ymm3 ++ ++ lea 128(%rdi), %r8 ++ and $-128, %r8 /* r8 now aligned to next 128 byte boundary */ ++ sub %rdi, %rsi /* rsi = src - dst = diff */ ++ ++ vmovdqu (%r8, %rsi), %ymm4 ++ vmovdqu 32(%r8, %rsi), %ymm5 ++ vmovdqu 64(%r8, %rsi), %ymm6 ++ vmovdqu 96(%r8, %rsi), %ymm7 ++ ++ vmovdqu %ymm0, (%rdi) ++ vmovdqu %ymm1, 32(%rdi) ++ vmovdqu %ymm2, 64(%rdi) ++ vmovdqu %ymm3, 96(%rdi) ++ vmovdqa %ymm4, (%r8) ++ vmovaps %ymm5, 32(%r8) ++ vmovaps %ymm6, 64(%r8) ++ vmovaps %ymm7, 96(%r8) ++ add $128, %r8 ++ ++ lea (%rdi, %rdx), %rbx ++ and $-128, %rbx ++ cmp %r8, %rbx ++ jbe L(mm_copy_remaining_forward) ++ ++ cmp $SHARED_CACHE_SIZE_HALF, %rdx ++ jae L(mm_large_page_loop_forward) ++ ++ .p2align 4 ++L(mm_main_loop_forward): ++ prefetcht0 128(%r8, %rsi) ++ vmovdqu (%r8, %rsi), %ymm0 ++ vmovdqu 32(%r8, %rsi), %ymm1 ++ vmovdqa %ymm0, (%r8) ++ vmovaps %ymm1, 32(%r8) ++ lea 64(%r8), %r8 ++ cmp %r8, %rbx ++ ja L(mm_main_loop_forward) ++ ++L(mm_copy_remaining_forward): ++ add %rdi, %rdx ++ sub %r8, %rdx ++/* We copied all up till %rdi position in the dst. ++ In %rdx now is how many bytes are left to copy. ++ Now we need to advance %r8. 
*/ ++ lea (%r8, %rsi), %r9 ++ ++L(mm_remaining_0_128_bytes_forward): ++ cmp $64, %rdx ++ ja L(mm_remaining_65_128_bytes_forward) ++ cmp $32, %rdx ++ ja L(mm_remaining_33_64_bytes_forward) ++ vzeroupper ++ cmp $16, %rdx ++ ja L(mm_remaining_17_32_bytes_forward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ ++ cmpb $8, %dl ++ ja L(mm_remaining_9_16_bytes_forward) ++ cmpb $4, %dl ++ .p2align 4,,5 ++ ja L(mm_remaining_5_8_bytes_forward) ++ cmpb $2, %dl ++ .p2align 4,,1 ++ ja L(mm_remaining_3_4_bytes_forward) ++ movzbl -1(%r9,%rdx), %esi ++ movzbl (%r9), %ebx ++ movb %sil, -1(%r8,%rdx) ++ movb %bl, (%r8) ++ jmp L(mm_return) ++ ++L(mm_remaining_65_128_bytes_forward): ++ vmovdqu (%r9), %ymm0 ++ vmovdqu 32(%r9), %ymm1 ++ vmovdqu -64(%r9, %rdx), %ymm2 ++ vmovdqu -32(%r9, %rdx), %ymm3 ++ vmovdqu %ymm0, (%r8) ++ vmovdqu %ymm1, 32(%r8) ++ vmovdqu %ymm2, -64(%r8, %rdx) ++ vmovdqu %ymm3, -32(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_33_64_bytes_forward): ++ vmovdqu (%r9), %ymm0 ++ vmovdqu -32(%r9, %rdx), %ymm1 ++ vmovdqu %ymm0, (%r8) ++ vmovdqu %ymm1, -32(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_17_32_bytes_forward): ++ movdqu (%r9), %xmm0 ++ movdqu -16(%r9, %rdx), %xmm1 ++ movdqu %xmm0, (%r8) ++ movdqu %xmm1, -16(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_5_8_bytes_forward): ++ movl (%r9), %esi ++ movl -4(%r9,%rdx), %ebx ++ movl %esi, (%r8) ++ movl %ebx, -4(%r8,%rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_9_16_bytes_forward): ++ mov (%r9), %rsi ++ mov -8(%r9, %rdx), %rbx ++ mov %rsi, (%r8) ++ mov %rbx, -8(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_3_4_bytes_forward): ++ movzwl -2(%r9,%rdx), %esi ++ movzwl (%r9), %ebx ++ movw %si, -2(%r8,%rdx) ++ movw %bx, (%r8) ++ jmp L(mm_return) ++ ++L(mm_len_0_16_bytes_forward): ++ testb $24, %dl ++ jne L(mm_len_9_16_bytes_forward) ++ testb $4, %dl ++ .p2align 4,,5 ++ jne L(mm_len_5_8_bytes_forward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ testb $2, %dl ++ .p2align 4,,1 ++ jne L(mm_len_2_4_bytes_forward) ++ movzbl -1(%rsi,%rdx), %ebx ++ movzbl (%rsi), %esi ++ movb %bl, -1(%rdi,%rdx) ++ movb %sil, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_2_4_bytes_forward): ++ movzwl -2(%rsi,%rdx), %ebx ++ movzwl (%rsi), %esi ++ movw %bx, -2(%rdi,%rdx) ++ movw %si, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_5_8_bytes_forward): ++ movl (%rsi), %ebx ++ movl -4(%rsi,%rdx), %esi ++ movl %ebx, (%rdi) ++ movl %esi, -4(%rdi,%rdx) ++ jmp L(mm_return) ++ ++L(mm_len_9_16_bytes_forward): ++ mov (%rsi), %rbx ++ mov -8(%rsi, %rdx), %rsi ++ mov %rbx, (%rdi) ++ mov %rsi, -8(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_recalc_len): ++/* Compute in %rdx how many bytes are left to copy after ++ the main loop stops. */ ++ vzeroupper ++ mov %rbx, %rdx ++ sub %rdi, %rdx ++/* The code for copying backwards. */ ++L(mm_len_0_or_more_backward): ++ ++/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] ++ separately. */ ++ cmp $16, %rdx ++ jbe L(mm_len_0_16_bytes_backward) ++ ++ cmp $32, %rdx ++ ja L(mm_len_32_or_more_backward) ++ ++/* Copy [0..32] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_32_or_more_backward): ++ cmp $64, %rdx ++ ja L(mm_len_64_or_more_backward) ++ ++/* Copy [0..64] and return. 
*/ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu -16(%rsi, %rdx), %xmm2 ++ movdqu -32(%rsi, %rdx), %xmm3 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, -16(%rdi, %rdx) ++ movdqu %xmm3, -32(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_64_or_more_backward): ++ cmp $128, %rdx ++ ja L(mm_len_128_or_more_backward) ++ ++/* Copy [0..128] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu -64(%rsi, %rdx), %xmm4 ++ movdqu -48(%rsi, %rdx), %xmm5 ++ movdqu -32(%rsi, %rdx), %xmm6 ++ movdqu -16(%rsi, %rdx), %xmm7 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, -64(%rdi, %rdx) ++ movdqu %xmm5, -48(%rdi, %rdx) ++ movdqu %xmm6, -32(%rdi, %rdx) ++ movdqu %xmm7, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_128_or_more_backward): ++ cmp $256, %rdx ++ ja L(mm_len_256_or_more_backward) ++ ++/* Copy [0..256] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu 64(%rsi), %xmm4 ++ movdqu 80(%rsi), %xmm5 ++ movdqu 96(%rsi), %xmm6 ++ movdqu 112(%rsi), %xmm7 ++ movdqu -128(%rsi, %rdx), %xmm8 ++ movdqu -112(%rsi, %rdx), %xmm9 ++ movdqu -96(%rsi, %rdx), %xmm10 ++ movdqu -80(%rsi, %rdx), %xmm11 ++ movdqu -64(%rsi, %rdx), %xmm12 ++ movdqu -48(%rsi, %rdx), %xmm13 ++ movdqu -32(%rsi, %rdx), %xmm14 ++ movdqu -16(%rsi, %rdx), %xmm15 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, 64(%rdi) ++ movdqu %xmm5, 80(%rdi) ++ movdqu %xmm6, 96(%rdi) ++ movdqu %xmm7, 112(%rdi) ++ movdqu %xmm8, -128(%rdi, %rdx) ++ movdqu %xmm9, -112(%rdi, %rdx) ++ movdqu %xmm10, -96(%rdi, %rdx) ++ movdqu %xmm11, -80(%rdi, %rdx) ++ movdqu %xmm12, -64(%rdi, %rdx) ++ movdqu %xmm13, -48(%rdi, %rdx) ++ movdqu %xmm14, -32(%rdi, %rdx) ++ movdqu %xmm15, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_256_or_more_backward): ++/* Aligning the address of destination. We need to save ++ 128 bytes from the source in order not to overwrite them. */ ++ vmovdqu -32(%rsi, %rdx), %ymm0 ++ vmovdqu -64(%rsi, %rdx), %ymm1 ++ vmovdqu -96(%rsi, %rdx), %ymm2 ++ vmovdqu -128(%rsi, %rdx), %ymm3 ++ ++ lea (%rdi, %rdx), %r9 ++ and $-128, %r9 /* r9 = aligned dst */ ++ ++ mov %rsi, %r8 ++ sub %rdi, %r8 /* r8 = src - dst, diff */ ++ ++ vmovdqu -32(%r9, %r8), %ymm4 ++ vmovdqu -64(%r9, %r8), %ymm5 ++ vmovdqu -96(%r9, %r8), %ymm6 ++ vmovdqu -128(%r9, %r8), %ymm7 ++ ++ vmovdqu %ymm0, -32(%rdi, %rdx) ++ vmovdqu %ymm1, -64(%rdi, %rdx) ++ vmovdqu %ymm2, -96(%rdi, %rdx) ++ vmovdqu %ymm3, -128(%rdi, %rdx) ++ vmovdqa %ymm4, -32(%r9) ++ vmovdqa %ymm5, -64(%r9) ++ vmovdqa %ymm6, -96(%r9) ++ vmovdqa %ymm7, -128(%r9) ++ lea -128(%r9), %r9 ++ ++ lea 128(%rdi), %rbx ++ and $-128, %rbx ++ ++ cmp %r9, %rbx ++ jae L(mm_recalc_len) ++ ++ cmp $SHARED_CACHE_SIZE_HALF, %rdx ++ jae L(mm_large_page_loop_backward) ++ ++ .p2align 4 ++L(mm_main_loop_backward): ++ prefetcht0 -128(%r9, %r8) ++ ++ vmovdqu -64(%r9, %r8), %ymm0 ++ vmovdqu -32(%r9, %r8), %ymm1 ++ vmovdqa %ymm0, -64(%r9) ++ vmovaps %ymm1, -32(%r9) ++ lea -64(%r9), %r9 ++ cmp %r9, %rbx ++ jb L(mm_main_loop_backward) ++ jmp L(mm_recalc_len) ++ ++/* Copy [0..16] and return. 
*/ ++L(mm_len_0_16_bytes_backward): ++ testb $24, %dl ++ jnz L(mm_len_9_16_bytes_backward) ++ testb $4, %dl ++ .p2align 4,,5 ++ jnz L(mm_len_5_8_bytes_backward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ testb $2, %dl ++ .p2align 4,,1 ++ jne L(mm_len_3_4_bytes_backward) ++ movzbl -1(%rsi,%rdx), %ebx ++ movzbl (%rsi), %ecx ++ movb %bl, -1(%rdi,%rdx) ++ movb %cl, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_3_4_bytes_backward): ++ movzwl -2(%rsi,%rdx), %ebx ++ movzwl (%rsi), %ecx ++ movw %bx, -2(%rdi,%rdx) ++ movw %cx, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_9_16_bytes_backward): ++ movl -4(%rsi,%rdx), %ebx ++ movl -8(%rsi,%rdx), %ecx ++ movl %ebx, -4(%rdi,%rdx) ++ movl %ecx, -8(%rdi,%rdx) ++ sub $8, %rdx ++ jmp L(mm_len_0_16_bytes_backward) ++ ++L(mm_len_5_8_bytes_backward): ++ movl (%rsi), %ebx ++ movl -4(%rsi,%rdx), %ecx ++ movl %ebx, (%rdi) ++ movl %ecx, -4(%rdi,%rdx) ++ ++L(mm_return): ++ vzeroupper ++ RETURN ++ ++/* Big length copy forward part. */ ++ ++ .p2align 4 ++L(mm_large_page_loop_forward): ++ vmovdqu (%r8, %rsi), %ymm0 ++ vmovdqu 32(%r8, %rsi), %ymm1 ++ vmovdqu 64(%r8, %rsi), %ymm2 ++ vmovdqu 96(%r8, %rsi), %ymm3 ++ vmovntdq %ymm0, (%r8) ++ vmovntdq %ymm1, 32(%r8) ++ vmovntdq %ymm2, 64(%r8) ++ vmovntdq %ymm3, 96(%r8) ++ lea 128(%r8), %r8 ++ cmp %r8, %rbx ++ ja L(mm_large_page_loop_forward) ++ sfence ++ jmp L(mm_copy_remaining_forward) ++ ++/* Big length copy backward part. */ ++ .p2align 4 ++L(mm_large_page_loop_backward): ++ vmovdqu -64(%r9, %r8), %ymm0 ++ vmovdqu -32(%r9, %r8), %ymm1 ++ vmovntdq %ymm0, -64(%r9) ++ vmovntdq %ymm1, -32(%r9) ++ lea -64(%r9), %r9 ++ cmp %r9, %rbx ++ jb L(mm_large_page_loop_backward) ++ sfence ++ jmp L(mm_recalc_len) ++ ++END (MEMMOVE) ++ ++//ALIAS_SYMBOL(memcpy, MEMMOVE) +-- +2.25.1 +
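
The ifunc hook patched into dynamic_function_dispatch.cpp above resolves memmove to memmove_avx2 only when __builtin_cpu_supports("avx2") reports true, and the assembly itself splits the work into an early return for dst == src, a forward path for dst < src, a backward path for dst > src, small-size buckets up to 256 bytes, and a non-temporal loop once the length reaches SHARED_CACHE_SIZE_HALF. The commit message only notes that a benchmark suite was run, so as a quick correctness sketch (not part of the patch, independent of bionic's own test suite, and valid whether the binary ends up in memmove_avx2 or memmove_generic) a standalone C program like the following exercises each of those paths through the platform's memmove and compares the result against a byte-wise reference:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Byte-wise reference with the overlap semantics memmove guarantees. */
static void ref_move(unsigned char *dst, const unsigned char *src, size_t n) {
  if (dst < src) {
    for (size_t i = 0; i < n; i++) dst[i] = src[i];   /* copy forward */
  } else {
    for (size_t i = n; i-- > 0;) dst[i] = src[i];     /* copy backward */
  }
}

static void check(size_t n, ptrdiff_t shift) {
  size_t len = n + 512;                /* slack so shifted copies stay in bounds */
  unsigned char *a = malloc(len);
  unsigned char *b = malloc(len);
  assert(a && b);
  for (size_t i = 0; i < len; i++) a[i] = b[i] = (unsigned char)(i * 131u + 7u);

  /* shift > 0: dst > src, backward path; shift < 0: dst < src, forward path;
     shift == 0: dst == src, early-return path. */
  memmove(a + 256 + shift, a + 256, n);
  ref_move(b + 256 + shift, b + 256, n);

  /* Comparing the whole buffer also catches stray writes outside [dst, dst+n). */
  assert(memcmp(a, b, len) == 0);
  free(a);
  free(b);
}

int main(void) {
  /* Sizes straddle the [0..16] through [0..256] buckets plus the large-copy loop. */
  const size_t sizes[] = {0, 1, 5, 16, 17, 32, 33, 64, 65,
                          128, 129, 256, 257, 4096, 1u << 20};
  for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
    check(sizes[i], 3);
    check(sizes[i], -3);
    check(sizes[i], 0);
  }
  printf("memmove overlap checks passed\n");
  return 0;
}

Run on AVX2-capable hardware with the patched bionic, the calls go through memmove_avx2; on older CPUs the dispatcher falls back to memmove_generic, so the same check doubles as a regression guard for both variants.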