diff --git a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
new file mode 100644
index 0000000000..db61807bff
--- /dev/null
+++ b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
@@ -0,0 +1,3779 @@
+From 013b505284379453df6637f009a224f6d5c6f3bd Mon Sep 17 00:00:00 2001
+From: "Reddy, Alavala Srinivasa"
+Date: Wed, 13 Sep 2023 18:36:21 +0530
+Subject: [PATCH 3/5] Optimize bionic memory functions with avx2 instructions
+
+The following memory-related functions are optimized with
+an AVX2 implementation ported from glibc 2.20
+(64-bit only):
+ - memchr
+ - memcmp
+ - memrchr
+
+Test done: Build and boot are fine; ran the benchmarks suite.
+
+Change-Id: I956773c79b9bcebee69726820eaa74c709df7081
+Signed-off-by: ahs
+Signed-off-by: Ravi Kumar Soni
+---
+ libc/Android.bp | 36 +-
+ .../kabylake/string/avx2-memcpy-kbl.S | 2052 +++++++++++++++++
+ .../arch-x86_64/dynamic_function_dispatch.cpp | 38 +
+ libc/arch-x86_64/generic/string/memchr.c | 20 +
+ libc/arch-x86_64/generic/string/memrchr.c | 20 +
+ libc/arch-x86_64/generic/string/wmemset.c | 20 +
+ libc/arch-x86_64/{string => include}/cache.h | 0
+ .../kabylake/string/avx2-memchr-kbl.S | 371 +++
+ .../kabylake/string/avx2-memcmp-kbl.S | 428 ++++
+ .../kabylake/string/avx2-memrchr-kbl.S | 408 ++++
+ .../kabylake/string/avx2-wmemset-kbl.S | 140 ++
+ .../string/sse2-memmove-slm.S | 4 +-
+ .../{ => silvermont}/string/sse2-memset-slm.S | 0
+ .../{ => silvermont}/string/sse2-stpcpy-slm.S | 0
+ .../string/sse2-stpncpy-slm.S | 0
+ .../{ => silvermont}/string/sse2-strcat-slm.S | 0
+ .../{ => silvermont}/string/sse2-strcpy-slm.S | 0
+ .../{ => silvermont}/string/sse2-strlen-slm.S | 0
+ .../string/sse2-strncat-slm.S | 0
+ .../string/sse2-strncpy-slm.S | 0
+ .../{ => silvermont}/string/sse4-memcmp-slm.S | 2 +-
+ .../string/ssse3-strcmp-slm.S | 0
+ .../string/ssse3-strncmp-slm.S | 0
+ libc/arch-x86_64/static_function_dispatch.S | 6 +
+ 24 files changed, 3528 insertions(+), 17 deletions(-)
+ create mode 100644 libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S
+ create mode 100644 libc/arch-x86_64/generic/string/memchr.c
+ create mode 100644 libc/arch-x86_64/generic/string/memrchr.c
+ create mode 100644 libc/arch-x86_64/generic/string/wmemset.c
+ rename libc/arch-x86_64/{string => include}/cache.h (100%)
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S
+ create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-memmove-slm.S (99%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-memset-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpcpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpncpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcat-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strlen-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncat-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncpy-slm.S (100%)
+ rename libc/arch-x86_64/{ => silvermont}/string/sse4-memcmp-slm.S (99%)
+ rename libc/arch-x86_64/{ => 
silvermont}/string/ssse3-strcmp-slm.S (100%) + rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strncmp-slm.S (100%) + +diff --git a/libc/Android.bp b/libc/Android.bp +index 943d41fba..530ce9111 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -617,8 +617,6 @@ cc_library_static { + }, + x86_64: { + srcs: [ +- "upstream-openbsd/lib/libc/string/memchr.c", +- "upstream-openbsd/lib/libc/string/memrchr.c", + "upstream-openbsd/lib/libc/string/strlcat.c", + "upstream-openbsd/lib/libc/string/strlcpy.c", + ], +@@ -1187,6 +1185,7 @@ cc_library_static { + ], + }, + x86_64: { ++ include_dirs: ["bionic/libc/arch-x86_64/include"], + srcs: [ + "arch-x86_64/bionic/__bionic_clone.S", + "arch-x86_64/bionic/_exit_with_stack_teardown.S", +@@ -1196,18 +1195,27 @@ cc_library_static { + "arch-x86_64/bionic/vfork.S", + + "arch-x86_64/string/avx2-memset-kbl.S", +- "arch-x86_64/string/sse2-memmove-slm.S", +- "arch-x86_64/string/sse2-memset-slm.S", +- "arch-x86_64/string/sse2-stpcpy-slm.S", +- "arch-x86_64/string/sse2-stpncpy-slm.S", +- "arch-x86_64/string/sse2-strcat-slm.S", +- "arch-x86_64/string/sse2-strcpy-slm.S", +- "arch-x86_64/string/sse2-strlen-slm.S", +- "arch-x86_64/string/sse2-strncat-slm.S", +- "arch-x86_64/string/sse2-strncpy-slm.S", +- "arch-x86_64/string/sse4-memcmp-slm.S", +- "arch-x86_64/string/ssse3-strcmp-slm.S", +- "arch-x86_64/string/ssse3-strncmp-slm.S", ++ "arch-x86_64/silvermont/string/sse2-memmove-slm.S", ++ "arch-x86_64/silvermont/string/sse2-memset-slm.S", ++ "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-stpncpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strcat-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strcpy-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strlen-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strncat-slm.S", ++ "arch-x86_64/silvermont/string/sse2-strncpy-slm.S", ++ "arch-x86_64/silvermont/string/sse4-memcmp-slm.S", ++ "arch-x86_64/silvermont/string/ssse3-strcmp-slm.S", ++ "arch-x86_64/silvermont/string/ssse3-strncmp-slm.S", ++ ++ //"arch-x86_64/generic/string/wmemset.c" ++ "arch-x86_64/generic/string/memchr.c", ++ "arch-x86_64/generic/string/memrchr.c", ++ ++ //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" ++ "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", + + "bionic/strchr.cpp", + "bionic/strchrnul.cpp", +diff --git a/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S +new file mode 100644 +index 000000000..69fca7cf1 +--- /dev/null ++++ b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S +@@ -0,0 +1,2052 @@ ++#define ENTRY(f) \ ++ .text; \ ++ .globl f; \ ++ .p2align 4, 0x90; \ ++ .type f,@function; \ ++ f: \ ++ ++#define END(f) ++ .size f, .-f; \ ++ .section .rodata,"a",@progbits; \ ++ .p2align 2 \ ++ ++ENTRY(memcpy_avx2) ++# %bb.0: ++ pushl %ebp ++ pushl %ebx ++ pushl %edi ++ pushl %esi ++ movl 28(%esp), %ebx ++ movl 24(%esp), %ecx ++ movl 20(%esp), %eax ++ calll .L0$pb ++.L0$pb: ++ popl %esi ++.Ltmp0: ++ addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %esi ++ cmpl $256, %ebx # imm = 0x100 ++ ja .LBB0_251 ++# %bb.1: ++ leal -1(%ebx), %edi ++ cmpl $255, %edi ++ ja .LBB0_270 ++# %bb.2: ++ addl .LJTI0_1@GOTOFF(%esi,%edi,4), %esi ++ leal (%eax,%ebx), %edx ++ addl %ebx, %ecx ++ jmpl *%esi ++.LBB0_251: ++ movl %eax, %ebp ++ vmovups (%ecx), %ymm0 ++ movl %ebx, %edi ++ negl %ebp ++ andl $31, %ebp ++ subl %ebp, %edi ++ addl %ebp, %ecx ++ leal (%eax,%ebp), %edx ++ 
cmpl $2097152, %edi # imm = 0x200000 ++ vmovups %ymm0, (%eax) ++ ja .LBB0_256 ++# %bb.252: ++ cmpl $256, %edi # imm = 0x100 ++ jb .LBB0_260 ++# %bb.253: ++ subl %ebp, %ebx ++ .p2align 4, 0x90 ++.LBB0_254: # =>This Inner Loop Header: Depth=1 ++ vmovups (%ecx), %ymm0 ++ vmovups 32(%ecx), %ymm1 ++ vmovups 64(%ecx), %ymm2 ++ vmovups 96(%ecx), %ymm3 ++ vmovups 128(%ecx), %ymm4 ++ vmovups 160(%ecx), %ymm5 ++ vmovups 192(%ecx), %ymm6 ++ vmovups 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovups %ymm0, (%edx) ++ vmovups %ymm1, 32(%edx) ++ vmovups %ymm2, 64(%edx) ++ vmovups %ymm3, 96(%edx) ++ vmovups %ymm4, 128(%edx) ++ vmovups %ymm5, 160(%edx) ++ vmovups %ymm6, 192(%edx) ++ vmovups %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_254 ++# %bb.255: ++ movzbl %bl, %edi ++ leal -1(%edi), %ebx ++ cmpl $255, %ebx ++ jbe .LBB0_261 ++ jmp .LBB0_270 ++.LBB0_256: ++ prefetchnta (%ecx) ++ subl %ebp, %ebx ++ testb $31, %cl ++ je .LBB0_257 ++ .p2align 4, 0x90 ++.LBB0_258: # =>This Inner Loop Header: Depth=1 ++ vmovups (%ecx), %ymm0 ++ vmovups 32(%ecx), %ymm1 ++ vmovups 64(%ecx), %ymm2 ++ vmovups 96(%ecx), %ymm3 ++ vmovups 128(%ecx), %ymm4 ++ vmovups 160(%ecx), %ymm5 ++ vmovups 192(%ecx), %ymm6 ++ vmovups 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovntps %ymm0, (%edx) ++ vmovntps %ymm1, 32(%edx) ++ vmovntps %ymm2, 64(%edx) ++ vmovntps %ymm3, 96(%edx) ++ vmovntps %ymm4, 128(%edx) ++ vmovntps %ymm5, 160(%edx) ++ vmovntps %ymm6, 192(%edx) ++ vmovntps %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_258 ++ jmp .LBB0_259 ++ .p2align 4, 0x90 ++.LBB0_257: # =>This Inner Loop Header: Depth=1 ++ vmovaps (%ecx), %ymm0 ++ vmovaps 32(%ecx), %ymm1 ++ vmovaps 64(%ecx), %ymm2 ++ vmovaps 96(%ecx), %ymm3 ++ vmovaps 128(%ecx), %ymm4 ++ vmovaps 160(%ecx), %ymm5 ++ vmovaps 192(%ecx), %ymm6 ++ vmovaps 224(%ecx), %ymm7 ++ prefetchnta 512(%ecx) ++ addl $-256, %edi ++ addl $256, %ecx # imm = 0x100 ++ vmovntps %ymm0, (%edx) ++ vmovntps %ymm1, 32(%edx) ++ vmovntps %ymm2, 64(%edx) ++ vmovntps %ymm3, 96(%edx) ++ vmovntps %ymm4, 128(%edx) ++ vmovntps %ymm5, 160(%edx) ++ vmovntps %ymm6, 192(%edx) ++ vmovntps %ymm7, 224(%edx) ++ addl $256, %edx # imm = 0x100 ++ cmpl $255, %edi ++ ja .LBB0_257 ++.LBB0_259: ++ sfence ++ movzbl %bl, %edi ++.LBB0_260: ++ leal -1(%edi), %ebx ++ cmpl $255, %ebx ++ ja .LBB0_270 ++.LBB0_261: ++ addl .LJTI0_0@GOTOFF(%esi,%ebx,4), %esi ++ addl %edi, %edx ++ addl %edi, %ecx ++ jmpl *%esi ++.LBB0_11: ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++ vmovups -35(%ecx), %ymm0 ++ vmovups %ymm0, -35(%edx) ++.LBB0_12: ++ movzwl -3(%ecx), %esi ++ movw %si, -3(%edx) ++ jmp .LBB0_6 ++.LBB0_17: ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++ vmovups -37(%ecx), %ymm0 ++ vmovups %ymm0, -37(%edx) ++.LBB0_18: ++ movl -5(%ecx), %esi ++ movl %esi, -5(%edx) ++ jmp .LBB0_6 ++.LBB0_19: ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++ vmovups -38(%ecx), %ymm0 ++ vmovups %ymm0, -38(%edx) ++.LBB0_20: ++ movl -6(%ecx), %esi ++ movl %esi, -6(%edx) ++ jmp .LBB0_10 ++.LBB0_21: ++ vmovups -135(%ecx), 
%ymm0 ++ vmovups %ymm0, -135(%edx) ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++ vmovups -39(%ecx), %ymm0 ++ vmovups %ymm0, -39(%edx) ++.LBB0_22: ++ movl -7(%ecx), %esi ++ movl %esi, -7(%edx) ++ jmp .LBB0_16 ++.LBB0_27: ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++ vmovups -41(%ecx), %ymm0 ++ vmovups %ymm0, -41(%edx) ++.LBB0_28: ++ vmovsd -9(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -9(%edx) ++ jmp .LBB0_6 ++.LBB0_29: ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++ vmovups -42(%ecx), %ymm0 ++ vmovups %ymm0, -42(%edx) ++.LBB0_30: ++ vmovsd -10(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -10(%edx) ++ jmp .LBB0_10 ++.LBB0_31: ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) ++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++ vmovups -43(%ecx), %ymm0 ++ vmovups %ymm0, -43(%edx) ++.LBB0_32: ++ vmovsd -11(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -11(%edx) ++ jmp .LBB0_16 ++.LBB0_33: ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++ vmovups -44(%ecx), %ymm0 ++ vmovups %ymm0, -44(%edx) ++.LBB0_34: ++ vmovsd -12(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -12(%edx) ++ jmp .LBB0_16 ++.LBB0_35: ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++ vmovups -45(%ecx), %ymm0 ++ vmovups %ymm0, -45(%edx) ++.LBB0_36: ++ vmovsd -13(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -13(%edx) ++ jmp .LBB0_26 ++.LBB0_37: ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++ vmovups -46(%ecx), %ymm0 ++ vmovups %ymm0, -46(%edx) ++.LBB0_38: ++ vmovsd -14(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -14(%edx) ++ jmp .LBB0_26 ++.LBB0_39: ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++ vmovups -47(%ecx), %ymm0 ++ vmovups %ymm0, -47(%edx) ++.LBB0_40: ++ vmovsd -15(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -15(%edx) ++ jmp .LBB0_26 ++.LBB0_45: ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++ vmovups -49(%ecx), %ymm0 ++ vmovups %ymm0, -49(%edx) ++.LBB0_46: ++ vmovups -17(%ecx), %xmm0 ++ vmovups %xmm0, -17(%edx) ++ jmp .LBB0_6 ++.LBB0_47: ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++ vmovups -50(%ecx), %ymm0 ++ vmovups %ymm0, -50(%edx) ++.LBB0_48: ++ vmovups -18(%ecx), %xmm0 ++ vmovups %xmm0, -18(%edx) ++ jmp .LBB0_10 ++.LBB0_49: ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++ vmovups -51(%ecx), %ymm0 ++ vmovups %ymm0, 
-51(%edx) ++.LBB0_50: ++ vmovups -19(%ecx), %xmm0 ++ vmovups %xmm0, -19(%edx) ++ jmp .LBB0_16 ++.LBB0_51: ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, -84(%edx) ++ vmovups -52(%ecx), %ymm0 ++ vmovups %ymm0, -52(%edx) ++.LBB0_52: ++ vmovups -20(%ecx), %xmm0 ++ vmovups %xmm0, -20(%edx) ++ jmp .LBB0_16 ++.LBB0_53: ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++ vmovups -53(%ecx), %ymm0 ++ vmovups %ymm0, -53(%edx) ++.LBB0_54: ++ vmovups -21(%ecx), %xmm0 ++ vmovups %xmm0, -21(%edx) ++ jmp .LBB0_26 ++.LBB0_55: ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++ vmovups -54(%ecx), %ymm0 ++ vmovups %ymm0, -54(%edx) ++.LBB0_56: ++ vmovups -22(%ecx), %xmm0 ++ vmovups %xmm0, -22(%edx) ++ jmp .LBB0_26 ++.LBB0_57: ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++ vmovups -55(%ecx), %ymm0 ++ vmovups %ymm0, -55(%edx) ++.LBB0_58: ++ vmovups -23(%ecx), %xmm0 ++ vmovups %xmm0, -23(%edx) ++ jmp .LBB0_26 ++.LBB0_59: ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++ vmovups -56(%ecx), %ymm0 ++ vmovups %ymm0, -56(%edx) ++.LBB0_60: ++ vmovups -24(%ecx), %xmm0 ++ vmovups %xmm0, -24(%edx) ++ jmp .LBB0_26 ++.LBB0_61: ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++ vmovups -57(%ecx), %ymm0 ++ vmovups %ymm0, -57(%edx) ++.LBB0_62: ++ vmovups -25(%ecx), %xmm0 ++ vmovups %xmm0, -25(%edx) ++ jmp .LBB0_44 ++.LBB0_63: ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++ vmovups -58(%ecx), %ymm0 ++ vmovups %ymm0, -58(%edx) ++.LBB0_64: ++ vmovups -26(%ecx), %xmm0 ++ vmovups %xmm0, -26(%edx) ++ jmp .LBB0_44 ++.LBB0_65: ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++ vmovups -59(%ecx), %ymm0 ++ vmovups %ymm0, -59(%edx) ++.LBB0_66: ++ vmovups -27(%ecx), %xmm0 ++ vmovups %xmm0, -27(%edx) ++ jmp .LBB0_44 ++.LBB0_67: ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++ vmovups -60(%ecx), %ymm0 ++ vmovups %ymm0, -60(%edx) ++.LBB0_68: ++ vmovups -28(%ecx), %xmm0 ++ vmovups %xmm0, -28(%edx) ++ jmp .LBB0_44 ++.LBB0_69: ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++ vmovups -61(%ecx), %ymm0 ++ vmovups %ymm0, -61(%edx) ++.LBB0_70: ++ vmovups -29(%ecx), %xmm0 ++ vmovups %xmm0, -29(%edx) ++ jmp .LBB0_44 ++.LBB0_71: ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++ vmovups -62(%ecx), %ymm0 ++ 
vmovups %ymm0, -62(%edx) ++.LBB0_72: ++ vmovups -30(%ecx), %xmm0 ++ vmovups %xmm0, -30(%edx) ++ jmp .LBB0_44 ++.LBB0_73: ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++ vmovups -63(%ecx), %ymm0 ++ vmovups %ymm0, -63(%edx) ++.LBB0_74: ++ vmovups -31(%ecx), %xmm0 ++ vmovups %xmm0, -31(%edx) ++ jmp .LBB0_44 ++.LBB0_75: ++ vmovups -193(%ecx), %ymm0 ++ vmovups %ymm0, -193(%edx) ++.LBB0_76: ++ vmovups -161(%ecx), %ymm0 ++ vmovups %ymm0, -161(%edx) ++.LBB0_3: ++ vmovups -129(%ecx), %ymm0 ++ vmovups %ymm0, -129(%edx) ++ vmovups -97(%ecx), %ymm0 ++ vmovups %ymm0, -97(%edx) ++.LBB0_4: ++ vmovups -65(%ecx), %ymm0 ++ vmovups %ymm0, -65(%edx) ++.LBB0_5: ++ vmovups -33(%ecx), %ymm0 ++ vmovups %ymm0, -33(%edx) ++.LBB0_6: ++ movb -1(%ecx), %cl ++ movb %cl, -1(%edx) ++ jmp .LBB0_270 ++.LBB0_77: ++ vmovups -194(%ecx), %ymm0 ++ vmovups %ymm0, -194(%edx) ++.LBB0_78: ++ vmovups -162(%ecx), %ymm0 ++ vmovups %ymm0, -162(%edx) ++.LBB0_7: ++ vmovups -130(%ecx), %ymm0 ++ vmovups %ymm0, -130(%edx) ++ vmovups -98(%ecx), %ymm0 ++ vmovups %ymm0, -98(%edx) ++.LBB0_8: ++ vmovups -66(%ecx), %ymm0 ++ vmovups %ymm0, -66(%edx) ++.LBB0_9: ++ vmovups -34(%ecx), %ymm0 ++ vmovups %ymm0, -34(%edx) ++.LBB0_10: ++ movzwl -2(%ecx), %ecx ++ movw %cx, -2(%edx) ++ jmp .LBB0_270 ++.LBB0_79: ++ vmovups -195(%ecx), %ymm0 ++ vmovups %ymm0, -195(%edx) ++.LBB0_80: ++ vmovups -163(%ecx), %ymm0 ++ vmovups %ymm0, -163(%edx) ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++.LBB0_81: ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++.LBB0_82: ++ vmovups -35(%ecx), %ymm0 ++ vmovups %ymm0, -35(%edx) ++ jmp .LBB0_16 ++.LBB0_83: ++ vmovups -196(%ecx), %ymm0 ++ vmovups %ymm0, -196(%edx) ++.LBB0_84: ++ vmovups -164(%ecx), %ymm0 ++ vmovups %ymm0, -164(%edx) ++.LBB0_13: ++ vmovups -132(%ecx), %ymm0 ++ vmovups %ymm0, -132(%edx) ++ vmovups -100(%ecx), %ymm0 ++ vmovups %ymm0, -100(%edx) ++.LBB0_14: ++ vmovups -68(%ecx), %ymm0 ++ vmovups %ymm0, -68(%edx) ++.LBB0_15: ++ vmovups -36(%ecx), %ymm0 ++ vmovups %ymm0, -36(%edx) ++.LBB0_16: ++ movl -4(%ecx), %ecx ++ movl %ecx, -4(%edx) ++ jmp .LBB0_270 ++.LBB0_85: ++ vmovups -197(%ecx), %ymm0 ++ vmovups %ymm0, -197(%edx) ++.LBB0_86: ++ vmovups -165(%ecx), %ymm0 ++ vmovups %ymm0, -165(%edx) ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++.LBB0_87: ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++.LBB0_88: ++ vmovups -37(%ecx), %ymm0 ++ vmovups %ymm0, -37(%edx) ++ jmp .LBB0_26 ++.LBB0_89: ++ vmovups -198(%ecx), %ymm0 ++ vmovups %ymm0, -198(%edx) ++.LBB0_90: ++ vmovups -166(%ecx), %ymm0 ++ vmovups %ymm0, -166(%edx) ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++.LBB0_91: ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++.LBB0_92: ++ vmovups -38(%ecx), %ymm0 ++ vmovups %ymm0, -38(%edx) ++ jmp .LBB0_26 ++.LBB0_93: ++ vmovups -199(%ecx), %ymm0 ++ vmovups %ymm0, -199(%edx) ++.LBB0_94: ++ vmovups -167(%ecx), %ymm0 ++ vmovups %ymm0, -167(%edx) ++ vmovups -135(%ecx), %ymm0 ++ vmovups %ymm0, -135(%edx) ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++.LBB0_95: ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++.LBB0_96: ++ vmovups -39(%ecx), %ymm0 ++ vmovups %ymm0, -39(%edx) ++ jmp .LBB0_26 ++.LBB0_97: ++ vmovups -200(%ecx), %ymm0 ++ vmovups %ymm0, 
-200(%edx) ++.LBB0_98: ++ vmovups -168(%ecx), %ymm0 ++ vmovups %ymm0, -168(%edx) ++.LBB0_23: ++ vmovups -136(%ecx), %ymm0 ++ vmovups %ymm0, -136(%edx) ++ vmovups -104(%ecx), %ymm0 ++ vmovups %ymm0, -104(%edx) ++.LBB0_24: ++ vmovups -72(%ecx), %ymm0 ++ vmovups %ymm0, -72(%edx) ++.LBB0_25: ++ vmovups -40(%ecx), %ymm0 ++ vmovups %ymm0, -40(%edx) ++.LBB0_26: ++ vmovsd -8(%ecx), %xmm0 # xmm0 = mem[0],zero ++ vmovsd %xmm0, -8(%edx) ++ jmp .LBB0_270 ++.LBB0_99: ++ vmovups -201(%ecx), %ymm0 ++ vmovups %ymm0, -201(%edx) ++.LBB0_100: ++ vmovups -169(%ecx), %ymm0 ++ vmovups %ymm0, -169(%edx) ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++.LBB0_101: ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++.LBB0_102: ++ vmovups -41(%ecx), %ymm0 ++ vmovups %ymm0, -41(%edx) ++ jmp .LBB0_44 ++.LBB0_103: ++ vmovups -202(%ecx), %ymm0 ++ vmovups %ymm0, -202(%edx) ++.LBB0_104: ++ vmovups -170(%ecx), %ymm0 ++ vmovups %ymm0, -170(%edx) ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++.LBB0_105: ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++.LBB0_106: ++ vmovups -42(%ecx), %ymm0 ++ vmovups %ymm0, -42(%edx) ++ jmp .LBB0_44 ++.LBB0_107: ++ vmovups -203(%ecx), %ymm0 ++ vmovups %ymm0, -203(%edx) ++.LBB0_108: ++ vmovups -171(%ecx), %ymm0 ++ vmovups %ymm0, -171(%edx) ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) ++.LBB0_109: ++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++.LBB0_110: ++ vmovups -43(%ecx), %ymm0 ++ vmovups %ymm0, -43(%edx) ++ jmp .LBB0_44 ++.LBB0_111: ++ vmovups -204(%ecx), %ymm0 ++ vmovups %ymm0, -204(%edx) ++.LBB0_112: ++ vmovups -172(%ecx), %ymm0 ++ vmovups %ymm0, -172(%edx) ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++.LBB0_113: ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++.LBB0_114: ++ vmovups -44(%ecx), %ymm0 ++ vmovups %ymm0, -44(%edx) ++ jmp .LBB0_44 ++.LBB0_115: ++ vmovups -205(%ecx), %ymm0 ++ vmovups %ymm0, -205(%edx) ++.LBB0_116: ++ vmovups -173(%ecx), %ymm0 ++ vmovups %ymm0, -173(%edx) ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++.LBB0_117: ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++.LBB0_118: ++ vmovups -45(%ecx), %ymm0 ++ vmovups %ymm0, -45(%edx) ++ jmp .LBB0_44 ++.LBB0_119: ++ vmovups -206(%ecx), %ymm0 ++ vmovups %ymm0, -206(%edx) ++.LBB0_120: ++ vmovups -174(%ecx), %ymm0 ++ vmovups %ymm0, -174(%edx) ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++.LBB0_121: ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++.LBB0_122: ++ vmovups -46(%ecx), %ymm0 ++ vmovups %ymm0, -46(%edx) ++ jmp .LBB0_44 ++.LBB0_123: ++ vmovups -207(%ecx), %ymm0 ++ vmovups %ymm0, -207(%edx) ++.LBB0_124: ++ vmovups -175(%ecx), %ymm0 ++ vmovups %ymm0, -175(%edx) ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++.LBB0_125: ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++.LBB0_126: ++ vmovups -47(%ecx), %ymm0 ++ vmovups %ymm0, -47(%edx) ++ jmp .LBB0_44 ++.LBB0_127: ++ vmovups -208(%ecx), %ymm0 ++ vmovups %ymm0, -208(%edx) ++.LBB0_128: ++ vmovups -176(%ecx), %ymm0 ++ vmovups %ymm0, -176(%edx) ++.LBB0_41: ++ vmovups -144(%ecx), %ymm0 ++ vmovups %ymm0, -144(%edx) ++ vmovups 
-112(%ecx), %ymm0 ++ vmovups %ymm0, -112(%edx) ++.LBB0_42: ++ vmovups -80(%ecx), %ymm0 ++ vmovups %ymm0, -80(%edx) ++.LBB0_43: ++ vmovups -48(%ecx), %ymm0 ++ vmovups %ymm0, -48(%edx) ++.LBB0_44: ++ vmovups -16(%ecx), %xmm0 ++ vmovups %xmm0, -16(%edx) ++ jmp .LBB0_270 ++.LBB0_129: ++ vmovups -209(%ecx), %ymm0 ++ vmovups %ymm0, -209(%edx) ++.LBB0_130: ++ vmovups -177(%ecx), %ymm0 ++ vmovups %ymm0, -177(%edx) ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++.LBB0_131: ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++.LBB0_132: ++ vmovups -49(%ecx), %ymm0 ++ vmovups %ymm0, -49(%edx) ++ jmp .LBB0_269 ++.LBB0_133: ++ vmovups -210(%ecx), %ymm0 ++ vmovups %ymm0, -210(%edx) ++.LBB0_134: ++ vmovups -178(%ecx), %ymm0 ++ vmovups %ymm0, -178(%edx) ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++.LBB0_135: ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++.LBB0_136: ++ vmovups -50(%ecx), %ymm0 ++ vmovups %ymm0, -50(%edx) ++ jmp .LBB0_269 ++.LBB0_137: ++ vmovups -211(%ecx), %ymm0 ++ vmovups %ymm0, -211(%edx) ++.LBB0_138: ++ vmovups -179(%ecx), %ymm0 ++ vmovups %ymm0, -179(%edx) ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++.LBB0_139: ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++.LBB0_140: ++ vmovups -51(%ecx), %ymm0 ++ vmovups %ymm0, -51(%edx) ++ jmp .LBB0_269 ++.LBB0_141: ++ vmovups -212(%ecx), %ymm0 ++ vmovups %ymm0, -212(%edx) ++.LBB0_142: ++ vmovups -180(%ecx), %ymm0 ++ vmovups %ymm0, -180(%edx) ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++.LBB0_143: ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, -84(%edx) ++.LBB0_144: ++ vmovups -52(%ecx), %ymm0 ++ vmovups %ymm0, -52(%edx) ++ jmp .LBB0_269 ++.LBB0_145: ++ vmovups -213(%ecx), %ymm0 ++ vmovups %ymm0, -213(%edx) ++.LBB0_146: ++ vmovups -181(%ecx), %ymm0 ++ vmovups %ymm0, -181(%edx) ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++.LBB0_147: ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++.LBB0_148: ++ vmovups -53(%ecx), %ymm0 ++ vmovups %ymm0, -53(%edx) ++ jmp .LBB0_269 ++.LBB0_149: ++ vmovups -214(%ecx), %ymm0 ++ vmovups %ymm0, -214(%edx) ++.LBB0_150: ++ vmovups -182(%ecx), %ymm0 ++ vmovups %ymm0, -182(%edx) ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++.LBB0_151: ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++.LBB0_152: ++ vmovups -54(%ecx), %ymm0 ++ vmovups %ymm0, -54(%edx) ++ jmp .LBB0_269 ++.LBB0_153: ++ vmovups -215(%ecx), %ymm0 ++ vmovups %ymm0, -215(%edx) ++.LBB0_154: ++ vmovups -183(%ecx), %ymm0 ++ vmovups %ymm0, -183(%edx) ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++.LBB0_155: ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++.LBB0_156: ++ vmovups -55(%ecx), %ymm0 ++ vmovups %ymm0, -55(%edx) ++ jmp .LBB0_269 ++.LBB0_157: ++ vmovups -216(%ecx), %ymm0 ++ vmovups %ymm0, -216(%edx) ++.LBB0_158: ++ vmovups -184(%ecx), %ymm0 ++ vmovups %ymm0, -184(%edx) ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++.LBB0_159: ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++.LBB0_160: ++ vmovups -56(%ecx), %ymm0 ++ vmovups %ymm0, -56(%edx) 
++ jmp .LBB0_269 ++.LBB0_161: ++ vmovups -217(%ecx), %ymm0 ++ vmovups %ymm0, -217(%edx) ++.LBB0_162: ++ vmovups -185(%ecx), %ymm0 ++ vmovups %ymm0, -185(%edx) ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++.LBB0_163: ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++.LBB0_164: ++ vmovups -57(%ecx), %ymm0 ++ vmovups %ymm0, -57(%edx) ++ jmp .LBB0_269 ++.LBB0_165: ++ vmovups -218(%ecx), %ymm0 ++ vmovups %ymm0, -218(%edx) ++.LBB0_166: ++ vmovups -186(%ecx), %ymm0 ++ vmovups %ymm0, -186(%edx) ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++.LBB0_167: ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++.LBB0_168: ++ vmovups -58(%ecx), %ymm0 ++ vmovups %ymm0, -58(%edx) ++ jmp .LBB0_269 ++.LBB0_169: ++ vmovups -219(%ecx), %ymm0 ++ vmovups %ymm0, -219(%edx) ++.LBB0_170: ++ vmovups -187(%ecx), %ymm0 ++ vmovups %ymm0, -187(%edx) ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++.LBB0_171: ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++.LBB0_172: ++ vmovups -59(%ecx), %ymm0 ++ vmovups %ymm0, -59(%edx) ++ jmp .LBB0_269 ++.LBB0_173: ++ vmovups -220(%ecx), %ymm0 ++ vmovups %ymm0, -220(%edx) ++.LBB0_174: ++ vmovups -188(%ecx), %ymm0 ++ vmovups %ymm0, -188(%edx) ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++.LBB0_175: ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++.LBB0_176: ++ vmovups -60(%ecx), %ymm0 ++ vmovups %ymm0, -60(%edx) ++ jmp .LBB0_269 ++.LBB0_177: ++ vmovups -221(%ecx), %ymm0 ++ vmovups %ymm0, -221(%edx) ++.LBB0_178: ++ vmovups -189(%ecx), %ymm0 ++ vmovups %ymm0, -189(%edx) ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++.LBB0_179: ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++.LBB0_180: ++ vmovups -61(%ecx), %ymm0 ++ vmovups %ymm0, -61(%edx) ++ jmp .LBB0_269 ++.LBB0_181: ++ vmovups -222(%ecx), %ymm0 ++ vmovups %ymm0, -222(%edx) ++.LBB0_182: ++ vmovups -190(%ecx), %ymm0 ++ vmovups %ymm0, -190(%edx) ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++.LBB0_183: ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++.LBB0_184: ++ vmovups -62(%ecx), %ymm0 ++ vmovups %ymm0, -62(%edx) ++ jmp .LBB0_269 ++.LBB0_185: ++ vmovups -223(%ecx), %ymm0 ++ vmovups %ymm0, -223(%edx) ++.LBB0_186: ++ vmovups -191(%ecx), %ymm0 ++ vmovups %ymm0, -191(%edx) ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++.LBB0_187: ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++.LBB0_188: ++ vmovups -63(%ecx), %ymm0 ++ vmovups %ymm0, -63(%edx) ++ jmp .LBB0_269 ++.LBB0_189: ++ vmovups -225(%ecx), %ymm0 ++ vmovups %ymm0, -225(%edx) ++ vmovups -193(%ecx), %ymm0 ++ vmovups %ymm0, -193(%edx) ++ vmovups -161(%ecx), %ymm0 ++ vmovups %ymm0, -161(%edx) ++ vmovups -129(%ecx), %ymm0 ++ vmovups %ymm0, -129(%edx) ++.LBB0_190: ++ vmovups -97(%ecx), %ymm0 ++ vmovups %ymm0, -97(%edx) ++ vmovups -65(%ecx), %ymm0 ++ vmovups %ymm0, -65(%edx) ++ jmp .LBB0_268 ++.LBB0_191: ++ vmovups -226(%ecx), %ymm0 ++ vmovups %ymm0, -226(%edx) ++ vmovups -194(%ecx), %ymm0 ++ vmovups %ymm0, -194(%edx) ++ vmovups -162(%ecx), %ymm0 ++ vmovups %ymm0, -162(%edx) ++ vmovups -130(%ecx), %ymm0 ++ vmovups %ymm0, -130(%edx) ++.LBB0_192: 
++ vmovups -98(%ecx), %ymm0 ++ vmovups %ymm0, -98(%edx) ++ vmovups -66(%ecx), %ymm0 ++ vmovups %ymm0, -66(%edx) ++ jmp .LBB0_268 ++.LBB0_193: ++ vmovups -227(%ecx), %ymm0 ++ vmovups %ymm0, -227(%edx) ++ vmovups -195(%ecx), %ymm0 ++ vmovups %ymm0, -195(%edx) ++ vmovups -163(%ecx), %ymm0 ++ vmovups %ymm0, -163(%edx) ++ vmovups -131(%ecx), %ymm0 ++ vmovups %ymm0, -131(%edx) ++.LBB0_194: ++ vmovups -99(%ecx), %ymm0 ++ vmovups %ymm0, -99(%edx) ++ vmovups -67(%ecx), %ymm0 ++ vmovups %ymm0, -67(%edx) ++ jmp .LBB0_268 ++.LBB0_195: ++ vmovups -228(%ecx), %ymm0 ++ vmovups %ymm0, -228(%edx) ++ vmovups -196(%ecx), %ymm0 ++ vmovups %ymm0, -196(%edx) ++ vmovups -164(%ecx), %ymm0 ++ vmovups %ymm0, -164(%edx) ++ vmovups -132(%ecx), %ymm0 ++ vmovups %ymm0, -132(%edx) ++.LBB0_196: ++ vmovups -100(%ecx), %ymm0 ++ vmovups %ymm0, -100(%edx) ++ vmovups -68(%ecx), %ymm0 ++ vmovups %ymm0, -68(%edx) ++ jmp .LBB0_268 ++.LBB0_197: ++ vmovups -229(%ecx), %ymm0 ++ vmovups %ymm0, -229(%edx) ++ vmovups -197(%ecx), %ymm0 ++ vmovups %ymm0, -197(%edx) ++ vmovups -165(%ecx), %ymm0 ++ vmovups %ymm0, -165(%edx) ++ vmovups -133(%ecx), %ymm0 ++ vmovups %ymm0, -133(%edx) ++.LBB0_198: ++ vmovups -101(%ecx), %ymm0 ++ vmovups %ymm0, -101(%edx) ++ vmovups -69(%ecx), %ymm0 ++ vmovups %ymm0, -69(%edx) ++ jmp .LBB0_268 ++.LBB0_199: ++ vmovups -230(%ecx), %ymm0 ++ vmovups %ymm0, -230(%edx) ++ vmovups -198(%ecx), %ymm0 ++ vmovups %ymm0, -198(%edx) ++ vmovups -166(%ecx), %ymm0 ++ vmovups %ymm0, -166(%edx) ++ vmovups -134(%ecx), %ymm0 ++ vmovups %ymm0, -134(%edx) ++.LBB0_200: ++ vmovups -102(%ecx), %ymm0 ++ vmovups %ymm0, -102(%edx) ++ vmovups -70(%ecx), %ymm0 ++ vmovups %ymm0, -70(%edx) ++ jmp .LBB0_268 ++.LBB0_201: ++ vmovups -231(%ecx), %ymm0 ++ vmovups %ymm0, -231(%edx) ++ vmovups -199(%ecx), %ymm0 ++ vmovups %ymm0, -199(%edx) ++ vmovups -167(%ecx), %ymm0 ++ vmovups %ymm0, -167(%edx) ++ vmovups -135(%ecx), %ymm0 ++ vmovups %ymm0, -135(%edx) ++.LBB0_202: ++ vmovups -103(%ecx), %ymm0 ++ vmovups %ymm0, -103(%edx) ++ vmovups -71(%ecx), %ymm0 ++ vmovups %ymm0, -71(%edx) ++ jmp .LBB0_268 ++.LBB0_203: ++ vmovups -232(%ecx), %ymm0 ++ vmovups %ymm0, -232(%edx) ++ vmovups -200(%ecx), %ymm0 ++ vmovups %ymm0, -200(%edx) ++ vmovups -168(%ecx), %ymm0 ++ vmovups %ymm0, -168(%edx) ++ vmovups -136(%ecx), %ymm0 ++ vmovups %ymm0, -136(%edx) ++.LBB0_204: ++ vmovups -104(%ecx), %ymm0 ++ vmovups %ymm0, -104(%edx) ++ vmovups -72(%ecx), %ymm0 ++ vmovups %ymm0, -72(%edx) ++ jmp .LBB0_268 ++.LBB0_205: ++ vmovups -233(%ecx), %ymm0 ++ vmovups %ymm0, -233(%edx) ++ vmovups -201(%ecx), %ymm0 ++ vmovups %ymm0, -201(%edx) ++ vmovups -169(%ecx), %ymm0 ++ vmovups %ymm0, -169(%edx) ++ vmovups -137(%ecx), %ymm0 ++ vmovups %ymm0, -137(%edx) ++.LBB0_206: ++ vmovups -105(%ecx), %ymm0 ++ vmovups %ymm0, -105(%edx) ++ vmovups -73(%ecx), %ymm0 ++ vmovups %ymm0, -73(%edx) ++ jmp .LBB0_268 ++.LBB0_207: ++ vmovups -234(%ecx), %ymm0 ++ vmovups %ymm0, -234(%edx) ++ vmovups -202(%ecx), %ymm0 ++ vmovups %ymm0, -202(%edx) ++ vmovups -170(%ecx), %ymm0 ++ vmovups %ymm0, -170(%edx) ++ vmovups -138(%ecx), %ymm0 ++ vmovups %ymm0, -138(%edx) ++.LBB0_208: ++ vmovups -106(%ecx), %ymm0 ++ vmovups %ymm0, -106(%edx) ++ vmovups -74(%ecx), %ymm0 ++ vmovups %ymm0, -74(%edx) ++ jmp .LBB0_268 ++.LBB0_209: ++ vmovups -235(%ecx), %ymm0 ++ vmovups %ymm0, -235(%edx) ++ vmovups -203(%ecx), %ymm0 ++ vmovups %ymm0, -203(%edx) ++ vmovups -171(%ecx), %ymm0 ++ vmovups %ymm0, -171(%edx) ++ vmovups -139(%ecx), %ymm0 ++ vmovups %ymm0, -139(%edx) ++.LBB0_210: ++ vmovups -107(%ecx), %ymm0 ++ vmovups %ymm0, -107(%edx) 
++ vmovups -75(%ecx), %ymm0 ++ vmovups %ymm0, -75(%edx) ++ jmp .LBB0_268 ++.LBB0_211: ++ vmovups -236(%ecx), %ymm0 ++ vmovups %ymm0, -236(%edx) ++ vmovups -204(%ecx), %ymm0 ++ vmovups %ymm0, -204(%edx) ++ vmovups -172(%ecx), %ymm0 ++ vmovups %ymm0, -172(%edx) ++ vmovups -140(%ecx), %ymm0 ++ vmovups %ymm0, -140(%edx) ++.LBB0_212: ++ vmovups -108(%ecx), %ymm0 ++ vmovups %ymm0, -108(%edx) ++ vmovups -76(%ecx), %ymm0 ++ vmovups %ymm0, -76(%edx) ++ jmp .LBB0_268 ++.LBB0_213: ++ vmovups -237(%ecx), %ymm0 ++ vmovups %ymm0, -237(%edx) ++ vmovups -205(%ecx), %ymm0 ++ vmovups %ymm0, -205(%edx) ++ vmovups -173(%ecx), %ymm0 ++ vmovups %ymm0, -173(%edx) ++ vmovups -141(%ecx), %ymm0 ++ vmovups %ymm0, -141(%edx) ++.LBB0_214: ++ vmovups -109(%ecx), %ymm0 ++ vmovups %ymm0, -109(%edx) ++ vmovups -77(%ecx), %ymm0 ++ vmovups %ymm0, -77(%edx) ++ jmp .LBB0_268 ++.LBB0_215: ++ vmovups -238(%ecx), %ymm0 ++ vmovups %ymm0, -238(%edx) ++ vmovups -206(%ecx), %ymm0 ++ vmovups %ymm0, -206(%edx) ++ vmovups -174(%ecx), %ymm0 ++ vmovups %ymm0, -174(%edx) ++ vmovups -142(%ecx), %ymm0 ++ vmovups %ymm0, -142(%edx) ++.LBB0_216: ++ vmovups -110(%ecx), %ymm0 ++ vmovups %ymm0, -110(%edx) ++ vmovups -78(%ecx), %ymm0 ++ vmovups %ymm0, -78(%edx) ++ jmp .LBB0_268 ++.LBB0_217: ++ vmovups -239(%ecx), %ymm0 ++ vmovups %ymm0, -239(%edx) ++ vmovups -207(%ecx), %ymm0 ++ vmovups %ymm0, -207(%edx) ++ vmovups -175(%ecx), %ymm0 ++ vmovups %ymm0, -175(%edx) ++ vmovups -143(%ecx), %ymm0 ++ vmovups %ymm0, -143(%edx) ++.LBB0_218: ++ vmovups -111(%ecx), %ymm0 ++ vmovups %ymm0, -111(%edx) ++ vmovups -79(%ecx), %ymm0 ++ vmovups %ymm0, -79(%edx) ++ jmp .LBB0_268 ++.LBB0_219: ++ vmovups -240(%ecx), %ymm0 ++ vmovups %ymm0, -240(%edx) ++ vmovups -208(%ecx), %ymm0 ++ vmovups %ymm0, -208(%edx) ++ vmovups -176(%ecx), %ymm0 ++ vmovups %ymm0, -176(%edx) ++ vmovups -144(%ecx), %ymm0 ++ vmovups %ymm0, -144(%edx) ++.LBB0_220: ++ vmovups -112(%ecx), %ymm0 ++ vmovups %ymm0, -112(%edx) ++ vmovups -80(%ecx), %ymm0 ++ vmovups %ymm0, -80(%edx) ++ jmp .LBB0_268 ++.LBB0_221: ++ vmovups -241(%ecx), %ymm0 ++ vmovups %ymm0, -241(%edx) ++ vmovups -209(%ecx), %ymm0 ++ vmovups %ymm0, -209(%edx) ++ vmovups -177(%ecx), %ymm0 ++ vmovups %ymm0, -177(%edx) ++ vmovups -145(%ecx), %ymm0 ++ vmovups %ymm0, -145(%edx) ++.LBB0_222: ++ vmovups -113(%ecx), %ymm0 ++ vmovups %ymm0, -113(%edx) ++ vmovups -81(%ecx), %ymm0 ++ vmovups %ymm0, -81(%edx) ++ jmp .LBB0_268 ++.LBB0_223: ++ vmovups -242(%ecx), %ymm0 ++ vmovups %ymm0, -242(%edx) ++ vmovups -210(%ecx), %ymm0 ++ vmovups %ymm0, -210(%edx) ++ vmovups -178(%ecx), %ymm0 ++ vmovups %ymm0, -178(%edx) ++ vmovups -146(%ecx), %ymm0 ++ vmovups %ymm0, -146(%edx) ++.LBB0_224: ++ vmovups -114(%ecx), %ymm0 ++ vmovups %ymm0, -114(%edx) ++ vmovups -82(%ecx), %ymm0 ++ vmovups %ymm0, -82(%edx) ++ jmp .LBB0_268 ++.LBB0_225: ++ vmovups -243(%ecx), %ymm0 ++ vmovups %ymm0, -243(%edx) ++ vmovups -211(%ecx), %ymm0 ++ vmovups %ymm0, -211(%edx) ++ vmovups -179(%ecx), %ymm0 ++ vmovups %ymm0, -179(%edx) ++ vmovups -147(%ecx), %ymm0 ++ vmovups %ymm0, -147(%edx) ++.LBB0_226: ++ vmovups -115(%ecx), %ymm0 ++ vmovups %ymm0, -115(%edx) ++ vmovups -83(%ecx), %ymm0 ++ vmovups %ymm0, -83(%edx) ++ jmp .LBB0_268 ++.LBB0_227: ++ vmovups -244(%ecx), %ymm0 ++ vmovups %ymm0, -244(%edx) ++ vmovups -212(%ecx), %ymm0 ++ vmovups %ymm0, -212(%edx) ++ vmovups -180(%ecx), %ymm0 ++ vmovups %ymm0, -180(%edx) ++ vmovups -148(%ecx), %ymm0 ++ vmovups %ymm0, -148(%edx) ++.LBB0_228: ++ vmovups -116(%ecx), %ymm0 ++ vmovups %ymm0, -116(%edx) ++ vmovups -84(%ecx), %ymm0 ++ vmovups %ymm0, 
-84(%edx) ++ jmp .LBB0_268 ++.LBB0_229: ++ vmovups -245(%ecx), %ymm0 ++ vmovups %ymm0, -245(%edx) ++ vmovups -213(%ecx), %ymm0 ++ vmovups %ymm0, -213(%edx) ++ vmovups -181(%ecx), %ymm0 ++ vmovups %ymm0, -181(%edx) ++ vmovups -149(%ecx), %ymm0 ++ vmovups %ymm0, -149(%edx) ++.LBB0_230: ++ vmovups -117(%ecx), %ymm0 ++ vmovups %ymm0, -117(%edx) ++ vmovups -85(%ecx), %ymm0 ++ vmovups %ymm0, -85(%edx) ++ jmp .LBB0_268 ++.LBB0_231: ++ vmovups -246(%ecx), %ymm0 ++ vmovups %ymm0, -246(%edx) ++ vmovups -214(%ecx), %ymm0 ++ vmovups %ymm0, -214(%edx) ++ vmovups -182(%ecx), %ymm0 ++ vmovups %ymm0, -182(%edx) ++ vmovups -150(%ecx), %ymm0 ++ vmovups %ymm0, -150(%edx) ++.LBB0_232: ++ vmovups -118(%ecx), %ymm0 ++ vmovups %ymm0, -118(%edx) ++ vmovups -86(%ecx), %ymm0 ++ vmovups %ymm0, -86(%edx) ++ jmp .LBB0_268 ++.LBB0_233: ++ vmovups -247(%ecx), %ymm0 ++ vmovups %ymm0, -247(%edx) ++ vmovups -215(%ecx), %ymm0 ++ vmovups %ymm0, -215(%edx) ++ vmovups -183(%ecx), %ymm0 ++ vmovups %ymm0, -183(%edx) ++ vmovups -151(%ecx), %ymm0 ++ vmovups %ymm0, -151(%edx) ++.LBB0_234: ++ vmovups -119(%ecx), %ymm0 ++ vmovups %ymm0, -119(%edx) ++ vmovups -87(%ecx), %ymm0 ++ vmovups %ymm0, -87(%edx) ++ jmp .LBB0_268 ++.LBB0_235: ++ vmovups -248(%ecx), %ymm0 ++ vmovups %ymm0, -248(%edx) ++ vmovups -216(%ecx), %ymm0 ++ vmovups %ymm0, -216(%edx) ++ vmovups -184(%ecx), %ymm0 ++ vmovups %ymm0, -184(%edx) ++ vmovups -152(%ecx), %ymm0 ++ vmovups %ymm0, -152(%edx) ++.LBB0_236: ++ vmovups -120(%ecx), %ymm0 ++ vmovups %ymm0, -120(%edx) ++ vmovups -88(%ecx), %ymm0 ++ vmovups %ymm0, -88(%edx) ++ jmp .LBB0_268 ++.LBB0_237: ++ vmovups -249(%ecx), %ymm0 ++ vmovups %ymm0, -249(%edx) ++ vmovups -217(%ecx), %ymm0 ++ vmovups %ymm0, -217(%edx) ++ vmovups -185(%ecx), %ymm0 ++ vmovups %ymm0, -185(%edx) ++ vmovups -153(%ecx), %ymm0 ++ vmovups %ymm0, -153(%edx) ++.LBB0_238: ++ vmovups -121(%ecx), %ymm0 ++ vmovups %ymm0, -121(%edx) ++ vmovups -89(%ecx), %ymm0 ++ vmovups %ymm0, -89(%edx) ++ jmp .LBB0_268 ++.LBB0_239: ++ vmovups -250(%ecx), %ymm0 ++ vmovups %ymm0, -250(%edx) ++ vmovups -218(%ecx), %ymm0 ++ vmovups %ymm0, -218(%edx) ++ vmovups -186(%ecx), %ymm0 ++ vmovups %ymm0, -186(%edx) ++ vmovups -154(%ecx), %ymm0 ++ vmovups %ymm0, -154(%edx) ++.LBB0_240: ++ vmovups -122(%ecx), %ymm0 ++ vmovups %ymm0, -122(%edx) ++ vmovups -90(%ecx), %ymm0 ++ vmovups %ymm0, -90(%edx) ++ jmp .LBB0_268 ++.LBB0_241: ++ vmovups -251(%ecx), %ymm0 ++ vmovups %ymm0, -251(%edx) ++ vmovups -219(%ecx), %ymm0 ++ vmovups %ymm0, -219(%edx) ++ vmovups -187(%ecx), %ymm0 ++ vmovups %ymm0, -187(%edx) ++ vmovups -155(%ecx), %ymm0 ++ vmovups %ymm0, -155(%edx) ++.LBB0_242: ++ vmovups -123(%ecx), %ymm0 ++ vmovups %ymm0, -123(%edx) ++ vmovups -91(%ecx), %ymm0 ++ vmovups %ymm0, -91(%edx) ++ jmp .LBB0_268 ++.LBB0_243: ++ vmovups -252(%ecx), %ymm0 ++ vmovups %ymm0, -252(%edx) ++ vmovups -220(%ecx), %ymm0 ++ vmovups %ymm0, -220(%edx) ++ vmovups -188(%ecx), %ymm0 ++ vmovups %ymm0, -188(%edx) ++ vmovups -156(%ecx), %ymm0 ++ vmovups %ymm0, -156(%edx) ++.LBB0_244: ++ vmovups -124(%ecx), %ymm0 ++ vmovups %ymm0, -124(%edx) ++ vmovups -92(%ecx), %ymm0 ++ vmovups %ymm0, -92(%edx) ++ jmp .LBB0_268 ++.LBB0_245: ++ vmovups -253(%ecx), %ymm0 ++ vmovups %ymm0, -253(%edx) ++ vmovups -221(%ecx), %ymm0 ++ vmovups %ymm0, -221(%edx) ++ vmovups -189(%ecx), %ymm0 ++ vmovups %ymm0, -189(%edx) ++ vmovups -157(%ecx), %ymm0 ++ vmovups %ymm0, -157(%edx) ++.LBB0_246: ++ vmovups -125(%ecx), %ymm0 ++ vmovups %ymm0, -125(%edx) ++ vmovups -93(%ecx), %ymm0 ++ vmovups %ymm0, -93(%edx) ++ jmp .LBB0_268 ++.LBB0_247: ++ vmovups 
-254(%ecx), %ymm0 ++ vmovups %ymm0, -254(%edx) ++ vmovups -222(%ecx), %ymm0 ++ vmovups %ymm0, -222(%edx) ++ vmovups -190(%ecx), %ymm0 ++ vmovups %ymm0, -190(%edx) ++ vmovups -158(%ecx), %ymm0 ++ vmovups %ymm0, -158(%edx) ++.LBB0_248: ++ vmovups -126(%ecx), %ymm0 ++ vmovups %ymm0, -126(%edx) ++ vmovups -94(%ecx), %ymm0 ++ vmovups %ymm0, -94(%edx) ++ jmp .LBB0_268 ++.LBB0_249: ++ vmovups -255(%ecx), %ymm0 ++ vmovups %ymm0, -255(%edx) ++ vmovups -223(%ecx), %ymm0 ++ vmovups %ymm0, -223(%edx) ++ vmovups -191(%ecx), %ymm0 ++ vmovups %ymm0, -191(%edx) ++ vmovups -159(%ecx), %ymm0 ++ vmovups %ymm0, -159(%edx) ++.LBB0_250: ++ vmovups -127(%ecx), %ymm0 ++ vmovups %ymm0, -127(%edx) ++ vmovups -95(%ecx), %ymm0 ++ vmovups %ymm0, -95(%edx) ++ jmp .LBB0_268 ++.LBB0_262: ++ vmovups -256(%ecx), %ymm0 ++ vmovups %ymm0, -256(%edx) ++.LBB0_263: ++ vmovups -224(%ecx), %ymm0 ++ vmovups %ymm0, -224(%edx) ++.LBB0_264: ++ vmovups -192(%ecx), %ymm0 ++ vmovups %ymm0, -192(%edx) ++.LBB0_265: ++ vmovups -160(%ecx), %ymm0 ++ vmovups %ymm0, -160(%edx) ++.LBB0_266: ++ vmovups -128(%ecx), %ymm0 ++ vmovups %ymm0, -128(%edx) ++.LBB0_267: ++ vmovups -96(%ecx), %ymm0 ++ vmovups %ymm0, -96(%edx) ++.LBB0_268: ++ vmovups -64(%ecx), %ymm0 ++ vmovups %ymm0, -64(%edx) ++.LBB0_269: ++ vmovups -32(%ecx), %ymm0 ++ vmovups %ymm0, -32(%edx) ++.LBB0_270: ++ vzeroupper ++ popl %esi ++ popl %edi ++ popl %ebx ++ popl %ebp ++ retl ++END(memcpy_avx2) ++ ++/*.Lfunc_end0: ++ .size memcpy_avx2, .Lfunc_end0-memcpy_avx2 ++ .section .rodata,"a",@progbits ++ .p2align 2*/ ++.LJTI0_0: ++ .long .LBB0_6@GOTOFF ++ .long .LBB0_10@GOTOFF ++ .long .LBB0_12@GOTOFF ++ .long .LBB0_16@GOTOFF ++ .long .LBB0_18@GOTOFF ++ .long .LBB0_20@GOTOFF ++ .long .LBB0_22@GOTOFF ++ .long .LBB0_26@GOTOFF ++ .long .LBB0_28@GOTOFF ++ .long .LBB0_30@GOTOFF ++ .long .LBB0_32@GOTOFF ++ .long .LBB0_34@GOTOFF ++ .long .LBB0_36@GOTOFF ++ .long .LBB0_38@GOTOFF ++ .long .LBB0_40@GOTOFF ++ .long .LBB0_44@GOTOFF ++ .long .LBB0_46@GOTOFF ++ .long .LBB0_48@GOTOFF ++ .long .LBB0_50@GOTOFF ++ .long .LBB0_52@GOTOFF ++ .long .LBB0_54@GOTOFF ++ .long .LBB0_56@GOTOFF ++ .long .LBB0_58@GOTOFF ++ .long .LBB0_60@GOTOFF ++ .long .LBB0_62@GOTOFF ++ .long .LBB0_64@GOTOFF ++ .long .LBB0_66@GOTOFF ++ .long .LBB0_68@GOTOFF ++ .long .LBB0_70@GOTOFF ++ .long .LBB0_72@GOTOFF ++ .long .LBB0_74@GOTOFF ++ .long .LBB0_269@GOTOFF ++ .long .LBB0_5@GOTOFF ++ .long .LBB0_9@GOTOFF ++ .long .LBB0_82@GOTOFF ++ .long .LBB0_15@GOTOFF ++ .long .LBB0_88@GOTOFF ++ .long .LBB0_92@GOTOFF ++ .long .LBB0_96@GOTOFF ++ .long .LBB0_25@GOTOFF ++ .long .LBB0_102@GOTOFF ++ .long .LBB0_106@GOTOFF ++ .long .LBB0_110@GOTOFF ++ .long .LBB0_114@GOTOFF ++ .long .LBB0_118@GOTOFF ++ .long .LBB0_122@GOTOFF ++ .long .LBB0_126@GOTOFF ++ .long .LBB0_43@GOTOFF ++ .long .LBB0_132@GOTOFF ++ .long .LBB0_136@GOTOFF ++ .long .LBB0_140@GOTOFF ++ .long .LBB0_144@GOTOFF ++ .long .LBB0_148@GOTOFF ++ .long .LBB0_152@GOTOFF ++ .long .LBB0_156@GOTOFF ++ .long .LBB0_160@GOTOFF ++ .long .LBB0_164@GOTOFF ++ .long .LBB0_168@GOTOFF ++ .long .LBB0_172@GOTOFF ++ .long .LBB0_176@GOTOFF ++ .long .LBB0_180@GOTOFF ++ .long .LBB0_184@GOTOFF ++ .long .LBB0_188@GOTOFF ++ .long .LBB0_268@GOTOFF ++ .long .LBB0_4@GOTOFF ++ .long .LBB0_8@GOTOFF ++ .long .LBB0_81@GOTOFF ++ .long .LBB0_14@GOTOFF ++ .long .LBB0_87@GOTOFF ++ .long .LBB0_91@GOTOFF ++ .long .LBB0_95@GOTOFF ++ .long .LBB0_24@GOTOFF ++ .long .LBB0_101@GOTOFF ++ .long .LBB0_105@GOTOFF ++ .long .LBB0_109@GOTOFF ++ .long .LBB0_113@GOTOFF ++ .long .LBB0_117@GOTOFF ++ .long .LBB0_121@GOTOFF ++ .long .LBB0_125@GOTOFF ++ 
.long .LBB0_42@GOTOFF ++ .long .LBB0_131@GOTOFF ++ .long .LBB0_135@GOTOFF ++ .long .LBB0_139@GOTOFF ++ .long .LBB0_143@GOTOFF ++ .long .LBB0_147@GOTOFF ++ .long .LBB0_151@GOTOFF ++ .long .LBB0_155@GOTOFF ++ .long .LBB0_159@GOTOFF ++ .long .LBB0_163@GOTOFF ++ .long .LBB0_167@GOTOFF ++ .long .LBB0_171@GOTOFF ++ .long .LBB0_175@GOTOFF ++ .long .LBB0_179@GOTOFF ++ .long .LBB0_183@GOTOFF ++ .long .LBB0_187@GOTOFF ++ .long .LBB0_267@GOTOFF ++ .long .LBB0_190@GOTOFF ++ .long .LBB0_192@GOTOFF ++ .long .LBB0_194@GOTOFF ++ .long .LBB0_196@GOTOFF ++ .long .LBB0_198@GOTOFF ++ .long .LBB0_200@GOTOFF ++ .long .LBB0_202@GOTOFF ++ .long .LBB0_204@GOTOFF ++ .long .LBB0_206@GOTOFF ++ .long .LBB0_208@GOTOFF ++ .long .LBB0_210@GOTOFF ++ .long .LBB0_212@GOTOFF ++ .long .LBB0_214@GOTOFF ++ .long .LBB0_216@GOTOFF ++ .long .LBB0_218@GOTOFF ++ .long .LBB0_220@GOTOFF ++ .long .LBB0_222@GOTOFF ++ .long .LBB0_224@GOTOFF ++ .long .LBB0_226@GOTOFF ++ .long .LBB0_228@GOTOFF ++ .long .LBB0_230@GOTOFF ++ .long .LBB0_232@GOTOFF ++ .long .LBB0_234@GOTOFF ++ .long .LBB0_236@GOTOFF ++ .long .LBB0_238@GOTOFF ++ .long .LBB0_240@GOTOFF ++ .long .LBB0_242@GOTOFF ++ .long .LBB0_244@GOTOFF ++ .long .LBB0_246@GOTOFF ++ .long .LBB0_248@GOTOFF ++ .long .LBB0_250@GOTOFF ++ .long .LBB0_266@GOTOFF ++ .long .LBB0_3@GOTOFF ++ .long .LBB0_7@GOTOFF ++ .long .LBB0_11@GOTOFF ++ .long .LBB0_13@GOTOFF ++ .long .LBB0_17@GOTOFF ++ .long .LBB0_19@GOTOFF ++ .long .LBB0_21@GOTOFF ++ .long .LBB0_23@GOTOFF ++ .long .LBB0_27@GOTOFF ++ .long .LBB0_29@GOTOFF ++ .long .LBB0_31@GOTOFF ++ .long .LBB0_33@GOTOFF ++ .long .LBB0_35@GOTOFF ++ .long .LBB0_37@GOTOFF ++ .long .LBB0_39@GOTOFF ++ .long .LBB0_41@GOTOFF ++ .long .LBB0_45@GOTOFF ++ .long .LBB0_47@GOTOFF ++ .long .LBB0_49@GOTOFF ++ .long .LBB0_51@GOTOFF ++ .long .LBB0_53@GOTOFF ++ .long .LBB0_55@GOTOFF ++ .long .LBB0_57@GOTOFF ++ .long .LBB0_59@GOTOFF ++ .long .LBB0_61@GOTOFF ++ .long .LBB0_63@GOTOFF ++ .long .LBB0_65@GOTOFF ++ .long .LBB0_67@GOTOFF ++ .long .LBB0_69@GOTOFF ++ .long .LBB0_71@GOTOFF ++ .long .LBB0_73@GOTOFF ++ .long .LBB0_265@GOTOFF ++ .long .LBB0_76@GOTOFF ++ .long .LBB0_78@GOTOFF ++ .long .LBB0_80@GOTOFF ++ .long .LBB0_84@GOTOFF ++ .long .LBB0_86@GOTOFF ++ .long .LBB0_90@GOTOFF ++ .long .LBB0_94@GOTOFF ++ .long .LBB0_98@GOTOFF ++ .long .LBB0_100@GOTOFF ++ .long .LBB0_104@GOTOFF ++ .long .LBB0_108@GOTOFF ++ .long .LBB0_112@GOTOFF ++ .long .LBB0_116@GOTOFF ++ .long .LBB0_120@GOTOFF ++ .long .LBB0_124@GOTOFF ++ .long .LBB0_128@GOTOFF ++ .long .LBB0_130@GOTOFF ++ .long .LBB0_134@GOTOFF ++ .long .LBB0_138@GOTOFF ++ .long .LBB0_142@GOTOFF ++ .long .LBB0_146@GOTOFF ++ .long .LBB0_150@GOTOFF ++ .long .LBB0_154@GOTOFF ++ .long .LBB0_158@GOTOFF ++ .long .LBB0_162@GOTOFF ++ .long .LBB0_166@GOTOFF ++ .long .LBB0_170@GOTOFF ++ .long .LBB0_174@GOTOFF ++ .long .LBB0_178@GOTOFF ++ .long .LBB0_182@GOTOFF ++ .long .LBB0_186@GOTOFF ++ .long .LBB0_264@GOTOFF ++ .long .LBB0_75@GOTOFF ++ .long .LBB0_77@GOTOFF ++ .long .LBB0_79@GOTOFF ++ .long .LBB0_83@GOTOFF ++ .long .LBB0_85@GOTOFF ++ .long .LBB0_89@GOTOFF ++ .long .LBB0_93@GOTOFF ++ .long .LBB0_97@GOTOFF ++ .long .LBB0_99@GOTOFF ++ .long .LBB0_103@GOTOFF ++ .long .LBB0_107@GOTOFF ++ .long .LBB0_111@GOTOFF ++ .long .LBB0_115@GOTOFF ++ .long .LBB0_119@GOTOFF ++ .long .LBB0_123@GOTOFF ++ .long .LBB0_127@GOTOFF ++ .long .LBB0_129@GOTOFF ++ .long .LBB0_133@GOTOFF ++ .long .LBB0_137@GOTOFF ++ .long .LBB0_141@GOTOFF ++ .long .LBB0_145@GOTOFF ++ .long .LBB0_149@GOTOFF ++ .long .LBB0_153@GOTOFF ++ .long .LBB0_157@GOTOFF ++ .long .LBB0_161@GOTOFF ++ .long 
.LBB0_165@GOTOFF ++ .long .LBB0_169@GOTOFF ++ .long .LBB0_173@GOTOFF ++ .long .LBB0_177@GOTOFF ++ .long .LBB0_181@GOTOFF ++ .long .LBB0_185@GOTOFF ++ .long .LBB0_263@GOTOFF ++ .long .LBB0_189@GOTOFF ++ .long .LBB0_191@GOTOFF ++ .long .LBB0_193@GOTOFF ++ .long .LBB0_195@GOTOFF ++ .long .LBB0_197@GOTOFF ++ .long .LBB0_199@GOTOFF ++ .long .LBB0_201@GOTOFF ++ .long .LBB0_203@GOTOFF ++ .long .LBB0_205@GOTOFF ++ .long .LBB0_207@GOTOFF ++ .long .LBB0_209@GOTOFF ++ .long .LBB0_211@GOTOFF ++ .long .LBB0_213@GOTOFF ++ .long .LBB0_215@GOTOFF ++ .long .LBB0_217@GOTOFF ++ .long .LBB0_219@GOTOFF ++ .long .LBB0_221@GOTOFF ++ .long .LBB0_223@GOTOFF ++ .long .LBB0_225@GOTOFF ++ .long .LBB0_227@GOTOFF ++ .long .LBB0_229@GOTOFF ++ .long .LBB0_231@GOTOFF ++ .long .LBB0_233@GOTOFF ++ .long .LBB0_235@GOTOFF ++ .long .LBB0_237@GOTOFF ++ .long .LBB0_239@GOTOFF ++ .long .LBB0_241@GOTOFF ++ .long .LBB0_243@GOTOFF ++ .long .LBB0_245@GOTOFF ++ .long .LBB0_247@GOTOFF ++ .long .LBB0_249@GOTOFF ++ .long .LBB0_262@GOTOFF ++.LJTI0_1: ++ .long .LBB0_6@GOTOFF ++ .long .LBB0_10@GOTOFF ++ .long .LBB0_12@GOTOFF ++ .long .LBB0_16@GOTOFF ++ .long .LBB0_18@GOTOFF ++ .long .LBB0_20@GOTOFF ++ .long .LBB0_22@GOTOFF ++ .long .LBB0_26@GOTOFF ++ .long .LBB0_28@GOTOFF ++ .long .LBB0_30@GOTOFF ++ .long .LBB0_32@GOTOFF ++ .long .LBB0_34@GOTOFF ++ .long .LBB0_36@GOTOFF ++ .long .LBB0_38@GOTOFF ++ .long .LBB0_40@GOTOFF ++ .long .LBB0_44@GOTOFF ++ .long .LBB0_46@GOTOFF ++ .long .LBB0_48@GOTOFF ++ .long .LBB0_50@GOTOFF ++ .long .LBB0_52@GOTOFF ++ .long .LBB0_54@GOTOFF ++ .long .LBB0_56@GOTOFF ++ .long .LBB0_58@GOTOFF ++ .long .LBB0_60@GOTOFF ++ .long .LBB0_62@GOTOFF ++ .long .LBB0_64@GOTOFF ++ .long .LBB0_66@GOTOFF ++ .long .LBB0_68@GOTOFF ++ .long .LBB0_70@GOTOFF ++ .long .LBB0_72@GOTOFF ++ .long .LBB0_74@GOTOFF ++ .long .LBB0_269@GOTOFF ++ .long .LBB0_5@GOTOFF ++ .long .LBB0_9@GOTOFF ++ .long .LBB0_82@GOTOFF ++ .long .LBB0_15@GOTOFF ++ .long .LBB0_88@GOTOFF ++ .long .LBB0_92@GOTOFF ++ .long .LBB0_96@GOTOFF ++ .long .LBB0_25@GOTOFF ++ .long .LBB0_102@GOTOFF ++ .long .LBB0_106@GOTOFF ++ .long .LBB0_110@GOTOFF ++ .long .LBB0_114@GOTOFF ++ .long .LBB0_118@GOTOFF ++ .long .LBB0_122@GOTOFF ++ .long .LBB0_126@GOTOFF ++ .long .LBB0_43@GOTOFF ++ .long .LBB0_132@GOTOFF ++ .long .LBB0_136@GOTOFF ++ .long .LBB0_140@GOTOFF ++ .long .LBB0_144@GOTOFF ++ .long .LBB0_148@GOTOFF ++ .long .LBB0_152@GOTOFF ++ .long .LBB0_156@GOTOFF ++ .long .LBB0_160@GOTOFF ++ .long .LBB0_164@GOTOFF ++ .long .LBB0_168@GOTOFF ++ .long .LBB0_172@GOTOFF ++ .long .LBB0_176@GOTOFF ++ .long .LBB0_180@GOTOFF ++ .long .LBB0_184@GOTOFF ++ .long .LBB0_188@GOTOFF ++ .long .LBB0_268@GOTOFF ++ .long .LBB0_4@GOTOFF ++ .long .LBB0_8@GOTOFF ++ .long .LBB0_81@GOTOFF ++ .long .LBB0_14@GOTOFF ++ .long .LBB0_87@GOTOFF ++ .long .LBB0_91@GOTOFF ++ .long .LBB0_95@GOTOFF ++ .long .LBB0_24@GOTOFF ++ .long .LBB0_101@GOTOFF ++ .long .LBB0_105@GOTOFF ++ .long .LBB0_109@GOTOFF ++ .long .LBB0_113@GOTOFF ++ .long .LBB0_117@GOTOFF ++ .long .LBB0_121@GOTOFF ++ .long .LBB0_125@GOTOFF ++ .long .LBB0_42@GOTOFF ++ .long .LBB0_131@GOTOFF ++ .long .LBB0_135@GOTOFF ++ .long .LBB0_139@GOTOFF ++ .long .LBB0_143@GOTOFF ++ .long .LBB0_147@GOTOFF ++ .long .LBB0_151@GOTOFF ++ .long .LBB0_155@GOTOFF ++ .long .LBB0_159@GOTOFF ++ .long .LBB0_163@GOTOFF ++ .long .LBB0_167@GOTOFF ++ .long .LBB0_171@GOTOFF ++ .long .LBB0_175@GOTOFF ++ .long .LBB0_179@GOTOFF ++ .long .LBB0_183@GOTOFF ++ .long .LBB0_187@GOTOFF ++ .long .LBB0_267@GOTOFF ++ .long .LBB0_190@GOTOFF ++ .long .LBB0_192@GOTOFF ++ .long .LBB0_194@GOTOFF ++ .long 
.LBB0_196@GOTOFF ++ .long .LBB0_198@GOTOFF ++ .long .LBB0_200@GOTOFF ++ .long .LBB0_202@GOTOFF ++ .long .LBB0_204@GOTOFF ++ .long .LBB0_206@GOTOFF ++ .long .LBB0_208@GOTOFF ++ .long .LBB0_210@GOTOFF ++ .long .LBB0_212@GOTOFF ++ .long .LBB0_214@GOTOFF ++ .long .LBB0_216@GOTOFF ++ .long .LBB0_218@GOTOFF ++ .long .LBB0_220@GOTOFF ++ .long .LBB0_222@GOTOFF ++ .long .LBB0_224@GOTOFF ++ .long .LBB0_226@GOTOFF ++ .long .LBB0_228@GOTOFF ++ .long .LBB0_230@GOTOFF ++ .long .LBB0_232@GOTOFF ++ .long .LBB0_234@GOTOFF ++ .long .LBB0_236@GOTOFF ++ .long .LBB0_238@GOTOFF ++ .long .LBB0_240@GOTOFF ++ .long .LBB0_242@GOTOFF ++ .long .LBB0_244@GOTOFF ++ .long .LBB0_246@GOTOFF ++ .long .LBB0_248@GOTOFF ++ .long .LBB0_250@GOTOFF ++ .long .LBB0_266@GOTOFF ++ .long .LBB0_3@GOTOFF ++ .long .LBB0_7@GOTOFF ++ .long .LBB0_11@GOTOFF ++ .long .LBB0_13@GOTOFF ++ .long .LBB0_17@GOTOFF ++ .long .LBB0_19@GOTOFF ++ .long .LBB0_21@GOTOFF ++ .long .LBB0_23@GOTOFF ++ .long .LBB0_27@GOTOFF ++ .long .LBB0_29@GOTOFF ++ .long .LBB0_31@GOTOFF ++ .long .LBB0_33@GOTOFF ++ .long .LBB0_35@GOTOFF ++ .long .LBB0_37@GOTOFF ++ .long .LBB0_39@GOTOFF ++ .long .LBB0_41@GOTOFF ++ .long .LBB0_45@GOTOFF ++ .long .LBB0_47@GOTOFF ++ .long .LBB0_49@GOTOFF ++ .long .LBB0_51@GOTOFF ++ .long .LBB0_53@GOTOFF ++ .long .LBB0_55@GOTOFF ++ .long .LBB0_57@GOTOFF ++ .long .LBB0_59@GOTOFF ++ .long .LBB0_61@GOTOFF ++ .long .LBB0_63@GOTOFF ++ .long .LBB0_65@GOTOFF ++ .long .LBB0_67@GOTOFF ++ .long .LBB0_69@GOTOFF ++ .long .LBB0_71@GOTOFF ++ .long .LBB0_73@GOTOFF ++ .long .LBB0_265@GOTOFF ++ .long .LBB0_76@GOTOFF ++ .long .LBB0_78@GOTOFF ++ .long .LBB0_80@GOTOFF ++ .long .LBB0_84@GOTOFF ++ .long .LBB0_86@GOTOFF ++ .long .LBB0_90@GOTOFF ++ .long .LBB0_94@GOTOFF ++ .long .LBB0_98@GOTOFF ++ .long .LBB0_100@GOTOFF ++ .long .LBB0_104@GOTOFF ++ .long .LBB0_108@GOTOFF ++ .long .LBB0_112@GOTOFF ++ .long .LBB0_116@GOTOFF ++ .long .LBB0_120@GOTOFF ++ .long .LBB0_124@GOTOFF ++ .long .LBB0_128@GOTOFF ++ .long .LBB0_130@GOTOFF ++ .long .LBB0_134@GOTOFF ++ .long .LBB0_138@GOTOFF ++ .long .LBB0_142@GOTOFF ++ .long .LBB0_146@GOTOFF ++ .long .LBB0_150@GOTOFF ++ .long .LBB0_154@GOTOFF ++ .long .LBB0_158@GOTOFF ++ .long .LBB0_162@GOTOFF ++ .long .LBB0_166@GOTOFF ++ .long .LBB0_170@GOTOFF ++ .long .LBB0_174@GOTOFF ++ .long .LBB0_178@GOTOFF ++ .long .LBB0_182@GOTOFF ++ .long .LBB0_186@GOTOFF ++ .long .LBB0_264@GOTOFF ++ .long .LBB0_75@GOTOFF ++ .long .LBB0_77@GOTOFF ++ .long .LBB0_79@GOTOFF ++ .long .LBB0_83@GOTOFF ++ .long .LBB0_85@GOTOFF ++ .long .LBB0_89@GOTOFF ++ .long .LBB0_93@GOTOFF ++ .long .LBB0_97@GOTOFF ++ .long .LBB0_99@GOTOFF ++ .long .LBB0_103@GOTOFF ++ .long .LBB0_107@GOTOFF ++ .long .LBB0_111@GOTOFF ++ .long .LBB0_115@GOTOFF ++ .long .LBB0_119@GOTOFF ++ .long .LBB0_123@GOTOFF ++ .long .LBB0_127@GOTOFF ++ .long .LBB0_129@GOTOFF ++ .long .LBB0_133@GOTOFF ++ .long .LBB0_137@GOTOFF ++ .long .LBB0_141@GOTOFF ++ .long .LBB0_145@GOTOFF ++ .long .LBB0_149@GOTOFF ++ .long .LBB0_153@GOTOFF ++ .long .LBB0_157@GOTOFF ++ .long .LBB0_161@GOTOFF ++ .long .LBB0_165@GOTOFF ++ .long .LBB0_169@GOTOFF ++ .long .LBB0_173@GOTOFF ++ .long .LBB0_177@GOTOFF ++ .long .LBB0_181@GOTOFF ++ .long .LBB0_185@GOTOFF ++ .long .LBB0_263@GOTOFF ++ .long .LBB0_189@GOTOFF ++ .long .LBB0_191@GOTOFF ++ .long .LBB0_193@GOTOFF ++ .long .LBB0_195@GOTOFF ++ .long .LBB0_197@GOTOFF ++ .long .LBB0_199@GOTOFF ++ .long .LBB0_201@GOTOFF ++ .long .LBB0_203@GOTOFF ++ .long .LBB0_205@GOTOFF ++ .long .LBB0_207@GOTOFF ++ .long .LBB0_209@GOTOFF ++ .long .LBB0_211@GOTOFF ++ .long .LBB0_213@GOTOFF ++ .long .LBB0_215@GOTOFF 
++ .long .LBB0_217@GOTOFF ++ .long .LBB0_219@GOTOFF ++ .long .LBB0_221@GOTOFF ++ .long .LBB0_223@GOTOFF ++ .long .LBB0_225@GOTOFF ++ .long .LBB0_227@GOTOFF ++ .long .LBB0_229@GOTOFF ++ .long .LBB0_231@GOTOFF ++ .long .LBB0_233@GOTOFF ++ .long .LBB0_235@GOTOFF ++ .long .LBB0_237@GOTOFF ++ .long .LBB0_239@GOTOFF ++ .long .LBB0_241@GOTOFF ++ .long .LBB0_243@GOTOFF ++ .long .LBB0_245@GOTOFF ++ .long .LBB0_247@GOTOFF ++ .long .LBB0_249@GOTOFF ++ .long .LBB0_262@GOTOFF ++ # -- End function +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index c846ded45..43aaebb54 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -46,4 +46,42 @@ DEFINE_IFUNC_FOR(__memset_chk) { + RETURN_FUNC(__memset_chk_func, __memset_chk_generic); + } + ++typedef int memcmp_func(const void* __lhs, const void* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(memcmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memcmp_func, memcmp_avx2) ++ RETURN_FUNC(memcmp_func, memcmp_generic); ++} ++ ++typedef void* memmove_func(void* __dst, const void* __src, size_t __n); ++DEFINE_IFUNC_FOR(memmove) { ++ RETURN_FUNC(memmove_func, memmove_generic); ++} ++ ++typedef void* memcpy_func(void* __dst, const void* __src, size_t __n); ++DEFINE_IFUNC_FOR(memcpy) { ++ return memmove_resolver(); ++} ++ ++typedef void* memchr_func(const void* __s, int __ch, size_t __n); ++DEFINE_IFUNC_FOR(memchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); ++ RETURN_FUNC(memchr_func, memchr_openbsd); ++} ++ ++typedef void* memrchr_func(const void* __s, int __ch, size_t __n); ++DEFINE_IFUNC_FOR(memrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); ++ RETURN_FUNC(memrchr_func, memrchr_openbsd); ++} ++ ++// typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); ++// DEFINE_IFUNC_FOR(wmemset) { ++// __builtin_cpu_init(); ++// if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); ++// RETURN_FUNC(wmemset_func, wmemset_freebsd); ++// } ++ + } // extern "C" +diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c +new file mode 100644 +index 000000000..86ee02e0b +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/memchr.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#include ++#define memchr memchr_openbsd ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c +new file mode 100644 +index 000000000..c803009f5 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/memrchr.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#include ++#define memrchr memrchr_openbsd ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c +new file mode 100644 +index 000000000..ac6bd7ec4 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wmemset.c +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#include ++#define wmemset wmemset_freebsd ++ ++#include +diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/include/cache.h +similarity index 100% +rename from libc/arch-x86_64/string/cache.h +rename to libc/arch-x86_64/include/cache.h +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S +new file mode 100644 +index 000000000..da667c9b3 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S +@@ -0,0 +1,371 @@ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) push REG; ++#define POP(REG) pop REG; ++ ++#define ENTRANCE PUSH (%rbx); ++#define RETURN_END POP (%rbx); ret ++#define RETURN RETURN_END; ++ ++# ifndef MEMCHR ++# define MEMCHR memchr_avx2 ++# endif ++ ++# ifdef USE_AS_WMEMCHR ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMCHR) ++# ifndef USE_AS_RAWMEMCHR ++ /* Check for zero length. */ ++ testq %rdx, %rdx ++ jz L(null) ++# endif ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++# ifdef USE_AS_WMEMCHR ++ shl $2, %rdx ++ vpbroadcastd %xmm0, %ymm0 ++# else ++ vpbroadcastb %xmm0, %ymm0 ++# endif ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++# ifndef USE_AS_RAWMEMCHR ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rdx ++ jbe L(zero) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. */ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" ++ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition ++ overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rdx ++ jbe L(zero) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_RAWMEMCHR ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) ++ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %edx ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ VZEROUPPER ++L(null): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++END (MEMCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S +new file mode 100644 +index 000000000..e9778ca5a +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S +@@ -0,0 +1,428 @@ ++/* Copyright (C) 2017-2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* memcmp/wmemcmp is implemented as: ++ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap ++ to avoid branches. ++ 2. Use overlapping compare to avoid branch. ++ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 ++ bytes for wmemcmp. ++ 4. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ area. ++ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. 
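++
++   As an illustration of points 1 and 2 above -- an editorial sketch in C++,
++   not part of this patch -- the 4..7-byte case can be served by two
++   overlapping big-endian loads, so every byte is compared without a
++   length-dependent branch. The helper names below are hypothetical:
++
++     #include <cstddef>
++     #include <cstdint>
++
++     // Hypothetical helper: load 4 bytes as a big-endian word (what movbe gives us).
++     static inline uint32_t load_be32(const unsigned char* p) {
++       return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
++              ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
++     }
++
++     // Sketch of the 4..7-byte path: for n in [4, 7] the first and last four
++     // bytes overlap, so the two loads cover the whole buffer branch-free.
++     static inline int memcmp_4_7_sketch(const unsigned char* a,
++                                         const unsigned char* b, size_t n) {
++       uint64_t va = ((uint64_t)load_be32(a) << 32) | load_be32(a + n - 4);
++       uint64_t vb = ((uint64_t)load_be32(b) << 32) | load_be32(b + n - 4);
++       if (va == vb) return 0;      // equal across all n bytes
++       return va < vb ? -1 : 1;     // big-endian packing preserves byte order
++     }
++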
*/ ++ ++ ++#ifndef MEMCMP ++# define MEMCMP memcmp_avx2 ++#endif ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef ALIGN ++# define ALIGN(n) .p2align n ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#ifndef ALIGN ++# define ALIGN(n) .p2align n ++#endif ++ ++# ifdef USE_AS_WMEMCMP ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++# define VEC_SIZE 32 ++# define VEC_MASK ((1 << VEC_SIZE) - 1) ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMCMP) ++# ifdef USE_AS_WMEMCMP ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %rdx ++ jb L(less_vec) ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_vec) ++ VPCMPEQ %ymm0, %ymm0, %ymm0 ++ /* More than 2 * VEC. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) ++ cmpq $(VEC_SIZE * 4), %rdx ++ jb L(last_4x_vec) ++ /* From 4 * VEC to 8 * VEC, inclusively. */ ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpand %ymm2, %ymm1, %ymm5 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpand %ymm3, %ymm5, %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm4, %ymm5, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(last_2x_vec): ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++L(last_vec): ++ /* Use overlapping loads to avoid branches. */ ++ leaq -VEC_SIZE(%rdi, %rdx), %rdi ++ leaq -VEC_SIZE(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec): ++ /* A byte or int32 is different within 16 or 32 bytes. 
*/ ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi, %rcx), %edx ++ cmpl (%rsi, %rcx), %edx ++L(wmemcmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++# ifdef USE_AS_WMEMCMP ++ .p2align 4 ++L(4): ++ xorl %eax, %eax ++ movl (%rdi), %edx ++ cmpl (%rsi), %edx ++ jne L(wmemcmp_return) ++ ret ++# else ++ ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ je L(exit) ++ sbbl %eax, %eax ++ orl $1, %eax ++ ret ++ .p2align 4 ++/*L(8): ++ giving two failures ++ movl (%rdi), %eax ++ subl (%rsi), %eax ++ je L(between_4_7) ++ retq */ ++ ++L(exit): ++ ret ++ .p2align 4 ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movb -1(%rdi, %rdx), %al ++ movb -1(%rsi, %rdx), %cl ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax ++ ret ++ .p2align 4 ++L(1): ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ sub %ecx, %eax ++ ret ++# endif ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ .p2align 4 ++L(less_vec): ++# ifdef USE_AS_WMEMCMP ++ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ ++ cmpb $4, %dl ++ je L(4) ++ jb L(zero) ++# else ++/* cmpb $8, %dl ++ jne L(tmp) ++ movl (%rdi), %eax ++ subl (%rsi), %eax ++ jne L(exit) ++L(temp): ++ movl %edx, %edx ++ //jmp L(tmp) ++L(tmp):*/ ++ ++ cmpb $1, %dl ++ je L(1) ++ jb L(zero) ++ ++ cmpb $4, %dl ++ jb L(between_2_3) ++ cmpb $8, %dl ++ //je L(8) ++ jb L(between_4_7) ++# endif ++ cmpb $16, %dl ++ jae L(between_16_31) ++ /* It is between 8 and 15 bytes. */ ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -8(%rdi, %rdx), %rdi ++ leaq -8(%rsi, %rdx), %rsi ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ ret ++ .p2align 4 ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ vmovdqu (%rsi), %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -16(%rdi, %rdx), %rdi ++ leaq -16(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 ++ vpmovmskb %xmm2, %eax ++ subl $0xffff, %eax ++ jnz L(first_vec) ++ ret ++ .p2align 4 ++L(more_8x_vec): ++ /* More than 8 * VEC. Check the first VEC. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ /* Align the first memory area for aligned loads in the loop. ++ Compute how much the first memory area is misaligned. */ ++ movq %rdi, %rcx ++ andl $(VEC_SIZE - 1), %ecx ++ /* Get the negative of offset for alignment. */ ++ subq $VEC_SIZE, %rcx ++ /* Adjust the second memory area. */ ++ subq %rcx, %rsi ++ /* Adjust the first memory area which should be aligned now. */ ++ subq %rcx, %rdi ++ /* Adjust length. */ ++ addq %rcx, %rdx ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. 
*/ ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpand %ymm2, %ymm1, %ymm5 ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpand %ymm3, %ymm5, %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpand %ymm4, %ymm5, %ymm5 ++ vptest %ymm0, %ymm5 ++ jnc L(4x_vec_end) ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rsi ++ subq $(VEC_SIZE * 4), %rdx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jae L(loop_4x_vec) ++ /* Less than 4 * VEC. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(last_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_2x_vec) ++L(last_4x_vec): ++ /* From 2 * VEC to 4 * VEC. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. */ ++ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ vpmovmskb %ymm2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ subl $VEC_MASK, %eax ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rcx), %edx ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++END (MEMCMP) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S +new file mode 100644 +index 000000000..a958fb56d +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S +@@ -0,0 +1,408 @@ ++/* memrchr optimized with AVX2. ++ Copyright (C) 2017-2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) pushl REG; CFI_PUSH (REG) ++#define POP(REG) popl REG; CFI_POP (REG) ++ ++# ifndef MEMRCHR ++# define MEMRCHR memrchr_avx2 ++# endif ++ ++#ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (MEMRCHR) ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++ vpbroadcastb %xmm0, %ymm0 ++ ++ sub $VEC_SIZE, %rdx ++ jbe L(last_vec_or_less) ++ ++ add %rdx, %rdi ++ ++ /* Check the last VEC_SIZE bytes. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ subq $(VEC_SIZE * 4), %rdi ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(aligned_more) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rdx ++ andq $-VEC_SIZE, %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(aligned_more): ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ /* Align data to 4 * VEC_SIZE for loop with fewer branches. ++ There are some overlaps with above if data isn't aligned ++ to 4 * VEC_SIZE. */ ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ jz L(loop_4x_vec) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rdx ++ andq $-(VEC_SIZE * 4), %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. 
*/ ++ subq $(VEC_SIZE * 4), %rdi ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ vmovdqa (%rdi), %ymm1 ++ vmovdqa VEC_SIZE(%rdi), %ymm2 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 ++ ++ vpcmpeqb %ymm1, %ymm0, %ymm1 ++ vpcmpeqb %ymm2, %ymm0, %ymm2 ++ vpcmpeqb %ymm3, %ymm0, %ymm3 ++ vpcmpeqb %ymm4, %ymm0, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jz L(loop_4x_vec) ++ ++ /* There is a match. */ ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpmovmskb %ymm1, %eax ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_4x_vec_or_less): ++ addl $(VEC_SIZE * 4), %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1_check) ++ cmpl $(VEC_SIZE * 3), %edx ++ jbe L(zero) ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3_check) ++ cmpl $VEC_SIZE, %edx ++ jbe L(zero) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 2), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x0): ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ bsrl %eax, %eax ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x3): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1_check): ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_x3_check): ++ bsrl %eax, %eax ++ subq $VEC_SIZE, %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ VZEROUPPER ++L(null): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less_aligned): ++ movl %edx, %ecx ++ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ ++ movl $1, %edx ++ /* Support rdx << 32. */ ++ salq %cl, %rdx ++ subq $1, %rdx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less): ++ addl $VEC_SIZE, %edx ++ ++ /* Check for zero length. 
*/ ++ testl %edx, %edx ++ jz L(null) ++ ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(last_vec_or_less_aligned) ++ ++ movl %ecx, %esi ++ movl %ecx, %r8d ++ addl %edx, %esi ++ andq $-VEC_SIZE, %rdi ++ ++ subl $VEC_SIZE, %esi ++ ja L(last_vec_2x_aligned) ++ ++ /* Check the last VEC. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the leading and trailing bytes. */ ++ sarl %cl, %eax ++ movl %edx, %ecx ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_vec_2x_aligned): ++ movl %esi, %ecx ++ ++ /* Check the last VEC. */ ++ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ /* Check the second last VEC. */ ++ vpcmpeqb (%rdi), %ymm0, %ymm1 ++ ++ movl %r8d, %ecx ++ ++ vpmovmskb %ymm1, %eax ++ ++ /* Remove the leading bytes. Must use unsigned right shift for ++ bsrl below. */ ++ shrl %cl, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ VZEROUPPER ++ ret ++END (MEMRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S +new file mode 100644 +index 000000000..7c485cf70 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S +@@ -0,0 +1,140 @@ ++/* ++Copyright (C) 2019 The Android Open Source Project ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions ++are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ++FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ++COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ++INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ++BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS ++OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED ++AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++SUCH DAMAGE. 
++*/ ++ ++#include ++ ++#ifndef WMEMSET ++ #define WMEMSET wmemset_avx2 ++#endif ++ ++ .section .text.avx2,"ax",@progbits ++ ++ENTRY (WMEMSET) ++# BB#0: ++ testq %rdx, %rdx ++ je .LBB0_14 ++# BB#1: ++ cmpq $32, %rdx ++ jae .LBB0_3 ++# BB#2: ++ xorl %r8d, %r8d ++ movq %rdi, %rax ++ jmp .LBB0_12 ++.LBB0_3: ++ movq %rdx, %r8 ++ andq $-32, %r8 ++ vmovd %esi, %xmm0 ++ vpbroadcastd %xmm0, %ymm0 ++ leaq -32(%r8), %rcx ++ movq %rcx, %rax ++ shrq $5, %rax ++ leal 1(%rax), %r9d ++ andl $7, %r9d ++ cmpq $224, %rcx ++ jae .LBB0_5 ++# BB#4: ++ xorl %eax, %eax ++ testq %r9, %r9 ++ jne .LBB0_8 ++ jmp .LBB0_10 ++.LBB0_5: ++ leaq 992(%rdi), %rcx ++ leaq -1(%r9), %r10 ++ subq %rax, %r10 ++ xorl %eax, %eax ++ .p2align 4, 0x90 ++.LBB0_6: # =>This Inner Loop Header: Depth=1 ++ vmovdqu %ymm0, -992(%rcx,%rax,4) ++ vmovdqu %ymm0, -960(%rcx,%rax,4) ++ vmovdqu %ymm0, -928(%rcx,%rax,4) ++ vmovdqu %ymm0, -896(%rcx,%rax,4) ++ vmovdqu %ymm0, -864(%rcx,%rax,4) ++ vmovdqu %ymm0, -832(%rcx,%rax,4) ++ vmovdqu %ymm0, -800(%rcx,%rax,4) ++ vmovdqu %ymm0, -768(%rcx,%rax,4) ++ vmovdqu %ymm0, -736(%rcx,%rax,4) ++ vmovdqu %ymm0, -704(%rcx,%rax,4) ++ vmovdqu %ymm0, -672(%rcx,%rax,4) ++ vmovdqu %ymm0, -640(%rcx,%rax,4) ++ vmovdqu %ymm0, -608(%rcx,%rax,4) ++ vmovdqu %ymm0, -576(%rcx,%rax,4) ++ vmovdqu %ymm0, -544(%rcx,%rax,4) ++ vmovdqu %ymm0, -512(%rcx,%rax,4) ++ vmovdqu %ymm0, -480(%rcx,%rax,4) ++ vmovdqu %ymm0, -448(%rcx,%rax,4) ++ vmovdqu %ymm0, -416(%rcx,%rax,4) ++ vmovdqu %ymm0, -384(%rcx,%rax,4) ++ vmovdqu %ymm0, -352(%rcx,%rax,4) ++ vmovdqu %ymm0, -320(%rcx,%rax,4) ++ vmovdqu %ymm0, -288(%rcx,%rax,4) ++ vmovdqu %ymm0, -256(%rcx,%rax,4) ++ vmovdqu %ymm0, -224(%rcx,%rax,4) ++ vmovdqu %ymm0, -192(%rcx,%rax,4) ++ vmovdqu %ymm0, -160(%rcx,%rax,4) ++ vmovdqu %ymm0, -128(%rcx,%rax,4) ++ vmovdqu %ymm0, -96(%rcx,%rax,4) ++ vmovdqu %ymm0, -64(%rcx,%rax,4) ++ vmovdqu %ymm0, -32(%rcx,%rax,4) ++ vmovdqu %ymm0, (%rcx,%rax,4) ++ addq $256, %rax # imm = 0x100 ++ addq $8, %r10 ++ jne .LBB0_6 ++# BB#7: ++ testq %r9, %r9 ++ je .LBB0_10 ++.LBB0_8: ++ leaq (%rdi,%rax,4), %rax ++ addq $96, %rax ++ negq %r9 ++ .p2align 4, 0x90 ++.LBB0_9: # =>This Inner Loop Header: Depth=1 ++ vmovdqu %ymm0, -96(%rax) ++ vmovdqu %ymm0, -64(%rax) ++ vmovdqu %ymm0, -32(%rax) ++ vmovdqu %ymm0, (%rax) ++ subq $-128, %rax ++ addq $1, %r9 ++ jne .LBB0_9 ++.LBB0_10: ++ cmpq %rdx, %r8 ++ je .LBB0_14 ++# BB#11: ++ leaq (%rdi,%r8,4), %rax ++.LBB0_12: ++ subq %r8, %rdx ++ .p2align 4, 0x90 ++.LBB0_13: # =>This Inner Loop Header: Depth=1 ++ movl %esi, (%rax) ++ addq $4, %rax ++ addq $-1, %rdx ++ jne .LBB0_13 ++.LBB0_14: ++ movq %rdi, %rax ++ vzeroupper ++ retq ++END(WMEMSET) +diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +similarity index 99% +rename from libc/arch-x86_64/string/sse2-memmove-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +index 739502888..7024f4950 100644 +--- a/libc/arch-x86_64/string/sse2-memmove-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #include "cache.h" + + #ifndef MEMMOVE +-# define MEMMOVE memmove ++# define MEMMOVE memmove_generic + #endif + + #ifndef L +@@ -515,4 +515,4 @@ L(mm_large_page_loop_backward): + + END (MEMMOVE) + +-ALIAS_SYMBOL(memcpy, MEMMOVE) ++//ALIAS_SYMBOL(memcpy, MEMMOVE) +diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-memset-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-memset-slm.S +diff --git a/libc/arch-x86_64/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-stpcpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-stpncpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strcat-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strcpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strlen-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strncat-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +diff --git a/libc/arch-x86_64/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/sse2-strncpy-slm.S +rename to libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +similarity index 99% +rename from libc/arch-x86_64/string/sse4-memcmp-slm.S +rename to libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +index 8a8b180a2..6cfcd767f 100644 +--- a/libc/arch-x86_64/string/sse4-memcmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ #include "cache.h" + + #ifndef MEMCMP +-# define MEMCMP memcmp ++# define MEMCMP memcmp_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/ssse3-strcmp-slm.S +rename to libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +diff --git a/libc/arch-x86_64/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +similarity index 100% +rename from libc/arch-x86_64/string/ssse3-strncmp-slm.S +rename to libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S +index 93ff5f2fc..979ce4f18 100644 +--- a/libc/arch-x86_64/static_function_dispatch.S ++++ b/libc/arch-x86_64/static_function_dispatch.S +@@ -35,3 +35,9 @@ END(name) + + FUNCTION_DELEGATE(memset, memset_generic) + FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) ++FUNCTION_DELEGATE(memcmp, memcmp_generic) ++FUNCTION_DELEGATE(memcpy, memmove_generic) ++FUNCTION_DELEGATE(memmove, memmove_generic) ++FUNCTION_DELEGATE(memchr, memchr_openbsd) ++FUNCTION_DELEGATE(memrchr, memrchr_openbsd) ++//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) +-- +2.25.1 + diff --git a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch new file mode 100644 index 0000000000..0432f627fd --- /dev/null +++ b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch @@ -0,0 +1,4169 @@ +From b6a7f45aa68426f4e32a4bf51e71ec5453f25f8d Mon Sep 17 00:00:00 2001 +From: Ravi Kumar Soni +Date: Mon, 28 Oct 2024 15:08:14 +0530 +Subject: [PATCH 4/5] Optimize bionic string functions with avx implementation + +Following are the string functions that has been +optimized with avx2 implementation from glibc 2.32 version. + - strcmp, strncmp + - strlen, strnlen + - strchr, strrchr + - strcpy, strncpy + - stpcpy, stpncpy + - strcat, strncat + - wcscmp, wcsncmp + - wcslen, wcsnlen + - wcschr, wcsrchr + +Test done: Build and boot is fine, Run the benchmarks suite. 
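+
+For context -- an editorial sketch, not part of this patch -- every routine
+listed above is wired up through the same resolver pattern that
+dynamic_function_dispatch.cpp uses: probe the CPU once, then hand back either
+the AVX2 entry point or the generic fallback. Shown here for strcmp, with
+plain externs standing in for the assembly implementations; the real code
+goes through bionic's DEFINE_IFUNC_FOR/RETURN_FUNC macros.
+
+  extern "C" int strcmp_avx2(const char* lhs, const char* rhs);     // from avx2-strcmp-kbl.S
+  extern "C" int strcmp_generic(const char* lhs, const char* rhs);  // SSE/portable fallback
+
+  using strcmp_func = int(const char*, const char*);
+
+  // Resolver: runs once per process at symbol-binding time.
+  extern "C" strcmp_func* strcmp_resolver() {
+    __builtin_cpu_init();                  // populate the compiler's CPU feature cache
+    if (__builtin_cpu_supports("avx2")) {
+      return strcmp_avx2;                  // Kaby Lake-class parts and newer take this path
+    }
+    return strcmp_generic;                 // older cores keep the existing implementation
+  }
+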
+ +Change-Id: I7f08a7507d25447ce886e9fde0482527c3f7a178 +Signed-off-by: ahs +Signed-off-by: Ravi Kumar Soni +--- + libc/Android.bp | 45 +- + .../arch-x86_64/dynamic_function_dispatch.cpp | 133 ++- + libc/arch-x86_64/generic/string/memchr.c | 2 +- + libc/arch-x86_64/generic/string/memrchr.c | 2 +- + libc/arch-x86_64/generic/string/strchr.cpp | 19 + + libc/arch-x86_64/generic/string/strnlen.cpp | 19 + + libc/arch-x86_64/generic/string/strrchr.cpp | 19 + + libc/arch-x86_64/generic/string/wcschr.c | 19 + + libc/arch-x86_64/generic/string/wcscmp.c | 19 + + libc/arch-x86_64/generic/string/wcslen.c | 19 + + libc/arch-x86_64/generic/string/wcsncmp.c | 19 + + libc/arch-x86_64/generic/string/wcsnlen.c | 19 + + libc/arch-x86_64/generic/string/wcsrchr.c | 19 + + libc/arch-x86_64/generic/string/wmemset.c | 2 +- + .../{ => kabylake}/string/avx2-memset-kbl.S | 0 + .../kabylake/string/avx2-stpcpy-kbl.S | 3 + + .../kabylake/string/avx2-stpncpy-kbl.S | 5 + + .../kabylake/string/avx2-strcat-kbl.S | 299 +++++ + .../kabylake/string/avx2-strchr-kbl.S | 277 +++++ + .../kabylake/string/avx2-strcmp-kbl.S | 885 ++++++++++++++ + .../kabylake/string/avx2-strcpy-kbl.S | 1046 +++++++++++++++++ + .../kabylake/string/avx2-strlen-kbl.S | 418 +++++++ + .../kabylake/string/avx2-strncat-kbl.S | 3 + + .../kabylake/string/avx2-strncmp-kbl.S | 4 + + .../kabylake/string/avx2-strncpy-kbl.S | 4 + + .../kabylake/string/avx2-strnlen-kbl.S | 4 + + .../kabylake/string/avx2-strrchr-kbl.S | 258 ++++ + .../kabylake/string/avx2-wcschr-kbl.S | 3 + + .../kabylake/string/avx2-wcscmp-kbl.S | 4 + + .../kabylake/string/avx2-wcslen-kbl.S | 4 + + .../kabylake/string/avx2-wcsncmp-kbl.S | 6 + + .../kabylake/string/avx2-wcsnlen-kbl.S | 6 + + .../kabylake/string/avx2-wcsrchr-kbl.S | 3 + + libc/arch-x86_64/kabylake/string/avx_regs.h | 26 + + .../{include => kabylake/string}/cache.h | 0 + libc/arch-x86_64/silvermont/string/cache.h | 36 + + .../silvermont/string/sse2-stpcpy-slm.S | 2 +- + .../silvermont/string/sse2-stpncpy-slm.S | 2 +- + .../silvermont/string/sse2-strcat-slm.S | 2 +- + .../silvermont/string/sse2-strcpy-slm.S | 2 +- + .../silvermont/string/sse2-strlen-slm.S | 2 +- + .../silvermont/string/sse2-strncat-slm.S | 2 +- + .../silvermont/string/sse2-strncpy-slm.S | 2 +- + .../silvermont/string/ssse3-strcmp-slm.S | 2 +- + .../silvermont/string/ssse3-strncmp-slm.S | 2 +- + libc/arch-x86_64/static_function_dispatch.S | 25 +- + 46 files changed, 3669 insertions(+), 23 deletions(-) + create mode 100644 libc/arch-x86_64/generic/string/strchr.cpp + create mode 100644 libc/arch-x86_64/generic/string/strnlen.cpp + create mode 100644 libc/arch-x86_64/generic/string/strrchr.cpp + create mode 100644 libc/arch-x86_64/generic/string/wcschr.c + create mode 100644 libc/arch-x86_64/generic/string/wcscmp.c + create mode 100644 libc/arch-x86_64/generic/string/wcslen.c + create mode 100644 libc/arch-x86_64/generic/string/wcsncmp.c + create mode 100644 libc/arch-x86_64/generic/string/wcsnlen.c + create mode 100644 libc/arch-x86_64/generic/string/wcsrchr.c + rename libc/arch-x86_64/{ => kabylake}/string/avx2-memset-kbl.S (100%) + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S + create mode 
100644 libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S + create mode 100644 libc/arch-x86_64/kabylake/string/avx_regs.h + rename libc/arch-x86_64/{include => kabylake/string}/cache.h (100%) + create mode 100644 libc/arch-x86_64/silvermont/string/cache.h + +diff --git a/libc/Android.bp b/libc/Android.bp +index 530ce9111..92483e833 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -377,6 +377,17 @@ cc_library_static { + "upstream-freebsd/lib/libc/string/wmemcmp.c", + ], + }, ++ x86_64: { ++ exclude_srcs: [ ++ "upstream-freebsd/lib/libc/string/wcscmp.c", ++ "upstream-freebsd/lib/libc/string/wcsncmp.c", ++ "upstream-freebsd/lib/libc/string/wcslen.c", ++ "upstream-freebsd/lib/libc/string/wcsnlen.c", ++ "upstream-freebsd/lib/libc/string/wcschr.c", ++ "upstream-freebsd/lib/libc/string/wcsrchr.c", ++ ++ ], ++ }, + }, + + cflags: [ +@@ -1185,7 +1196,6 @@ cc_library_static { + ], + }, + x86_64: { +- include_dirs: ["bionic/libc/arch-x86_64/include"], + srcs: [ + "arch-x86_64/bionic/__bionic_clone.S", + "arch-x86_64/bionic/_exit_with_stack_teardown.S", +@@ -1194,7 +1204,7 @@ cc_library_static { + "arch-x86_64/bionic/syscall.S", + "arch-x86_64/bionic/vfork.S", + +- "arch-x86_64/string/avx2-memset-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memset-kbl.S", + "arch-x86_64/silvermont/string/sse2-memmove-slm.S", + "arch-x86_64/silvermont/string/sse2-memset-slm.S", + "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", +@@ -1211,17 +1221,42 @@ cc_library_static { + //"arch-x86_64/generic/string/wmemset.c" + "arch-x86_64/generic/string/memchr.c", + "arch-x86_64/generic/string/memrchr.c", ++ "arch-x86_64/generic/string/strchr.cpp", ++ "arch-x86_64/generic/string/strrchr.cpp", ++ "arch-x86_64/generic/string/strnlen.cpp", ++ "arch-x86_64/generic/string/wcscmp.c", ++ "arch-x86_64/generic/string/wcsncmp.c", ++ "arch-x86_64/generic/string/wcslen.c", ++ "arch-x86_64/generic/string/wcsnlen.c", ++ "arch-x86_64/generic/string/wcschr.c", ++ "arch-x86_64/generic/string/wcsrchr.c", + + //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" + "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", + "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strnlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strcpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S", ++ 
"arch-x86_64/kabylake/string/avx2-strcat-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-strncat-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcslen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcschr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S", + +- "bionic/strchr.cpp", + "bionic/strchrnul.cpp", +- "bionic/strnlen.cpp", +- "bionic/strrchr.cpp", + ], ++ + }, + }, + +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index 43aaebb54..182eb4200 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -67,21 +67,148 @@ typedef void* memchr_func(const void* __s, int __ch, size_t __n); + DEFINE_IFUNC_FOR(memchr) { + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); +- RETURN_FUNC(memchr_func, memchr_openbsd); ++ RETURN_FUNC(memchr_func, memchr_generic); + } + + typedef void* memrchr_func(const void* __s, int __ch, size_t __n); + DEFINE_IFUNC_FOR(memrchr) { + __builtin_cpu_init(); + if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); +- RETURN_FUNC(memrchr_func, memrchr_openbsd); ++ RETURN_FUNC(memrchr_func, memrchr_generic); + } + + // typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); + // DEFINE_IFUNC_FOR(wmemset) { + // __builtin_cpu_init(); + // if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); +-// RETURN_FUNC(wmemset_func, wmemset_freebsd); ++// RETURN_FUNC(wmemset_func, wmemset_generic); + // } + ++typedef int strcmp_func(const char* __lhs, const char* __rhs); ++DEFINE_IFUNC_FOR(strcmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcmp_func, strcmp_avx2); ++ RETURN_FUNC(strcmp_func, strcmp_generic); ++} ++ ++typedef int strncmp_func(const char* __lhs, const char* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(strncmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncmp_func, strncmp_avx2); ++ RETURN_FUNC(strncmp_func, strncmp_generic); ++} ++ ++typedef char* strcpy_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(strcpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcpy_func, strcpy_avx2); ++ RETURN_FUNC(strcpy_func, strcpy_generic); ++} ++ ++typedef char* strncpy_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(strncpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncpy_func, strncpy_avx2); ++ RETURN_FUNC(strncpy_func, strncpy_generic); ++} ++ ++typedef char* stpcpy_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(stpcpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpcpy_func, stpcpy_avx2); ++ RETURN_FUNC(stpcpy_func, stpcpy_generic); ++} ++ ++typedef char* stpncpy_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(stpncpy) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpncpy_func, stpncpy_avx2); ++ RETURN_FUNC(stpncpy_func, stpncpy_generic); ++} ++ ++typedef size_t strlen_func(const char* __s); ++DEFINE_IFUNC_FOR(strlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strlen_func, strlen_avx2); ++ RETURN_FUNC(strlen_func, strlen_generic); ++} ++ ++ ++typedef size_t strnlen_func(const char* 
__s, size_t __n); ++DEFINE_IFUNC_FOR(strnlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strnlen_func, strnlen_avx2); ++ RETURN_FUNC(strnlen_func, strnlen_generic); ++} ++ ++typedef char* strchr_func(const char* __s, int __ch); ++DEFINE_IFUNC_FOR(strchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strchr_func, strchr_avx2); ++ RETURN_FUNC(strchr_func, strchr_generic); ++} ++ ++typedef char* strrchr_func(const char* __s, int __ch); ++DEFINE_IFUNC_FOR(strrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strrchr_func, strrchr_avx2); ++ RETURN_FUNC(strrchr_func, strrchr_generic); ++} ++ ++typedef char* strcat_func(char* __dst, const char* __src); ++DEFINE_IFUNC_FOR(strcat) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcat_func, strcat_avx2); ++ RETURN_FUNC(strcat_func, strcat_generic); ++} ++ ++typedef char* strncat_func(char* __dst, const char* __src, size_t __n); ++DEFINE_IFUNC_FOR(strncat) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncat_func, strncat_avx2); ++ RETURN_FUNC(strncat_func, strncat_generic); ++} ++ ++typedef int wcscmp_func(const wchar_t* __lhs, const wchar_t* __rhs); ++DEFINE_IFUNC_FOR(wcscmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcscmp_func, wcscmp_avx2); ++ RETURN_FUNC(wcscmp_func, wcscmp_generic); ++} ++ ++typedef int wcsncmp_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); ++DEFINE_IFUNC_FOR(wcsncmp) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsncmp_func, wcsncmp_avx2); ++ RETURN_FUNC(wcsncmp_func, wcsncmp_generic); ++} ++ ++typedef size_t wcslen_func(const wchar_t* __s); ++DEFINE_IFUNC_FOR(wcslen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcslen_func, wcslen_avx2); ++ RETURN_FUNC(wcslen_func, wcslen_generic); ++} ++ ++typedef size_t wcsnlen_func(const wchar_t* __s, size_t __n); ++DEFINE_IFUNC_FOR(wcsnlen) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsnlen_func, wcsnlen_avx2); ++ RETURN_FUNC(wcsnlen_func, wcsnlen_generic); ++} ++ ++typedef wchar_t* wcschr_func(const wchar_t* __s, wchar_t __wc); ++DEFINE_IFUNC_FOR(wcschr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcschr_func, wcschr_avx2); ++ RETURN_FUNC(wcschr_func, wcschr_generic); ++} ++ ++typedef wchar_t* wcsrchr_func(const wchar_t* __s, wchar_t __wc); ++DEFINE_IFUNC_FOR(wcsrchr) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsrchr_func, wcsrchr_avx2); ++ RETURN_FUNC(wcsrchr_func, wcsrchr_generic); ++} ++ + } // extern "C" +diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c +index 86ee02e0b..e6fc3eb84 100644 +--- a/libc/arch-x86_64/generic/string/memchr.c ++++ b/libc/arch-x86_64/generic/string/memchr.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define memchr memchr_openbsd ++#define memchr memchr_generic + + #include +diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c +index c803009f5..ee085e384 100644 +--- a/libc/arch-x86_64/generic/string/memrchr.c ++++ b/libc/arch-x86_64/generic/string/memrchr.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define memrchr memrchr_openbsd ++#define memrchr memrchr_generic + + #include +diff --git a/libc/arch-x86_64/generic/string/strchr.cpp b/libc/arch-x86_64/generic/string/strchr.cpp +new file mode 
100644 +index 000000000..8a3d6d619 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strchr.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strchr strchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/strnlen.cpp b/libc/arch-x86_64/generic/string/strnlen.cpp +new file mode 100644 +index 000000000..f60348656 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strnlen.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strnlen strnlen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/strrchr.cpp b/libc/arch-x86_64/generic/string/strrchr.cpp +new file mode 100644 +index 000000000..9f0f33fd2 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/strrchr.cpp +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define strrchr strrchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcschr.c b/libc/arch-x86_64/generic/string/wcschr.c +new file mode 100644 +index 000000000..d45e45d20 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcschr.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++*/ ++ ++#define wcschr wcschr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcscmp.c b/libc/arch-x86_64/generic/string/wcscmp.c +new file mode 100644 +index 000000000..e55bab549 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcscmp.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcscmp wcscmp_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcslen.c b/libc/arch-x86_64/generic/string/wcslen.c +new file mode 100644 +index 000000000..5b873fc30 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcslen.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcslen wcslen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsncmp.c b/libc/arch-x86_64/generic/string/wcsncmp.c +new file mode 100644 +index 000000000..40b2ca2f3 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsncmp.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsncmp wcsncmp_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsnlen.c b/libc/arch-x86_64/generic/string/wcsnlen.c +new file mode 100644 +index 000000000..91051cea7 +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsnlen.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsnlen wcsnlen_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wcsrchr.c b/libc/arch-x86_64/generic/string/wcsrchr.c +new file mode 100644 +index 000000000..73e8c25bc +--- /dev/null ++++ b/libc/arch-x86_64/generic/string/wcsrchr.c +@@ -0,0 +1,19 @@ ++/* ++ * Copyright (C) 2019 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++*/ ++ ++#define wcsrchr wcsrchr_generic ++ ++#include +diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c +index ac6bd7ec4..9675fe91f 100644 +--- a/libc/arch-x86_64/generic/string/wmemset.c ++++ b/libc/arch-x86_64/generic/string/wmemset.c +@@ -15,6 +15,6 @@ + */ + + #include +-#define wmemset wmemset_freebsd ++#define wmemset wmemset_generic + + #include +diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S +similarity index 100% +rename from libc/arch-x86_64/string/avx2-memset-kbl.S +rename to libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S +diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S +new file mode 100644 +index 000000000..63f9ba25b +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY stpcpy_avx2 ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S +new file mode 100644 +index 000000000..c1bbdb29e +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S +@@ -0,0 +1,5 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY stpncpy_avx2 ++#include "avx_regs.h" ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S +new file mode 100644 +index 000000000..d1e9b4b38 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S +@@ -0,0 +1,299 @@ ++/* strcat with AVX2 ++ Copyright (C) 2011-2020 Free Software Foundation, Inc. ++ Contributed by Intel Corporation. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++ ++# ifndef STRCAT ++# define STRCAT strcat_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# define USE_AS_STRCAT ++ ++/* Number of bytes in a vector register */ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCAT) ++ mov %rdi, %r9 ++# ifdef USE_AS_STRNCAT ++ mov %rdx, %r8 ++# endif ++ ++ xor %eax, %eax ++ mov %edi, %ecx ++ and $((VEC_SIZE * 4) - 1), %ecx ++ vpxor %xmm6, %xmm6, %xmm6 ++ cmp $(VEC_SIZE * 3), %ecx ++ ja L(fourth_vector_boundary) ++ vpcmpeqb (%rdi), %ymm6, %ymm0 ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_first_vector) ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ jmp L(align_vec_size_start) ++L(fourth_vector_boundary): ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ vpcmpeqb (%rax), %ymm6, %ymm0 ++ mov $-1, %r10d ++ sub %rax, %rcx ++ shl %cl, %r10d ++ vpmovmskb %ymm0, %edx ++ and %r10d, %edx ++ jnz L(exit) ++ ++L(align_vec_size_start): ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 4), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ 
vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 ++ add $(VEC_SIZE * 5), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 ++ add $VEC_SIZE, %rax ++ vpmovmskb %ymm3, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ add $VEC_SIZE, %rax ++ ++ .p2align 4 ++L(align_four_vec_loop): ++ vmovaps (%rax), %ymm4 ++ vpminub VEC_SIZE(%rax), %ymm4, %ymm4 ++ vmovaps (VEC_SIZE * 2)(%rax), %ymm5 ++ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 ++ add $(VEC_SIZE * 4), %rax ++ vpminub %ymm4, %ymm5, %ymm5 ++ vpcmpeqb %ymm5, %ymm6, %ymm5 ++ vpmovmskb %ymm5, %edx ++ test %edx, %edx ++ jz L(align_four_vec_loop) ++ ++ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 ++ sub $(VEC_SIZE * 5), %rax ++ vpmovmskb %ymm0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 ++ vpmovmskb %ymm1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 ++ vpmovmskb %ymm2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 ++ vpmovmskb %ymm3, %edx ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit): ++ sub %rdi, %rax ++L(exit_null_on_first_vector): ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_second_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $VEC_SIZE, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_third_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 2), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fourth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 3), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fifth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ ++ .p2align 4 ++L(StartStrcpyPart): ++ lea (%r9, %rax), %rdi ++ mov %rsi, %rcx ++ mov %r9, %rax /* save result */ ++ ++# ifdef USE_AS_STRNCAT ++ test %r8, %r8 ++ jz L(ExitZero) ++# define USE_AS_STRNCPY ++# endif ++ ++# include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S +new file mode 100644 +index 000000000..7d8a44c81 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S +@@ -0,0 +1,277 @@ ++/* strchr/strchrnul optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRCHR ++# define STRCHR strchr_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMPEQ vpcmpeqd ++# define CHAR_REG esi ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMPEQ vpcmpeqb ++# define CHAR_REG sil ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCHR) ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM0. */ ++ vmovd %esi, %xmm0 ++ vpxor %xmm9, %xmm9, %xmm9 ++ VPBROADCAST %xmm0, %ymm0 ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null byte. */ ++ vmovdqu (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ vmovdqu (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. */ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ /* Found CHAR or the null byte. */ ++ tzcntl %eax, %eax ++ addq %rcx, %rax ++# ifdef USE_AS_STRCHRNUL ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ addq $VEC_SIZE, %rdi ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vmovdqa (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ vmovdqa VEC_SIZE(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (%rdi), %ymm5 ++ vmovdqa VEC_SIZE(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ ++ VPCMPEQ %ymm5, %ymm0, %ymm1 ++ VPCMPEQ %ymm6, %ymm0, %ymm2 ++ VPCMPEQ %ymm7, %ymm0, %ymm3 ++ VPCMPEQ %ymm8, %ymm0, %ymm4 ++ ++ VPCMPEQ %ymm5, %ymm9, %ymm5 ++ VPCMPEQ %ymm6, %ymm9, %ymm6 ++ VPCMPEQ %ymm7, %ymm9, %ymm7 ++ VPCMPEQ %ymm8, %ymm9, %ymm8 ++ ++ vpor %ymm1, %ymm5, %ymm1 ++ vpor %ymm2, %ymm6, %ymm2 ++ vpor %ymm3, %ymm7, %ymm3 ++ vpor %ymm4, %ymm8, %ymm4 ++ ++ vpor %ymm1, %ymm2, %ymm5 ++ vpor %ymm3, %ymm4, %ymm6 ++ ++ vpor %ymm5, %ymm6, %ymm5 ++ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ jmp L(loop_4x_vec) ++ ++ .p2align 4 ++L(first_vec_x0): ++ /* Found CHAR or the null byte. */ ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq VEC_SIZE(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ vpmovmskb %ymm4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRCHRNUL ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++# else ++ xorl %edx, %edx ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++END (STRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S +new file mode 100644 +index 000000000..b241812d8 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S +@@ -0,0 +1,885 @@ ++/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. ++ Copyright (C) 2018-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRCMP ++# define STRCMP strcmp_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# define PAGE_SIZE 4096 ++ ++/* VEC_SIZE = Number of bytes in a ymm register */ ++# define VEC_SIZE 32 ++ ++/* Shift for dividing by (VEC_SIZE * 4). */ ++# define DIVIDE_BY_VEC_4_SHIFT 7 ++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++/* Compare packed dwords. */ ++# define VPCMPEQ vpcmpeqd ++/* Compare packed dwords and store minimum. */ ++# define VPMINU vpminud ++/* 1 dword char == 4 bytes. */ ++# define SIZE_OF_CHAR 4 ++# else ++/* Compare packed bytes. */ ++# define VPCMPEQ vpcmpeqb ++/* Compare packed bytes and store minimum. */ ++# define VPMINU vpminub ++/* 1 byte char == 1 byte. */ ++# define SIZE_OF_CHAR 1 ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++/* Warning! ++ wcscmp/wcsncmp have to use SIGNED comparison for elements. ++ strcmp/strncmp have to use UNSIGNED comparison for elements. ++*/ ++ ++/* The main idea of the string comparison (byte or dword) using AVX2 ++ consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on ++ either packed bytes or dwords depending on USE_AS_WCSCMP. In order ++ to check the null char, the algorithm keeps the matched bytes/dwords, ++ requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, ++ the costs of comparing VEC_SIZE bytes (32 bytes) are two VPCMPEQ and ++ one VPMINU instructions, together with movdqu and testl instructions. ++ The main loop (away from the page boundary) compares 4 vectors at a time, ++ effectively comparing 4 x VEC_SIZE bytes (128 bytes) in each iteration. ++ ++ The strncmp/wcsncmp routine (enabled by defining USE_AS_STRNCMP) uses ++ the same logic as strcmp, except that a maximum offset is tracked. If ++ the maximum offset is reached before a difference is found, zero is ++ returned. */ ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCMP) ++# ifdef USE_AS_STRNCMP ++ /* Check for simple cases (0 or 1) in offset. */ ++ cmp $1, %RDX_LP ++ je L(char0) ++ jb L(zero) ++# ifdef USE_AS_WCSCMP ++ /* Convert units: from wide to byte char. */ ++ shl $2, %RDX_LP ++# endif ++ /* Register %r11 tracks the maximum offset. */ ++ mov %RDX_LP, %R11_LP ++# endif ++ movl %edi, %eax ++ xorl %edx, %edx ++ /* Make %xmm7 (%ymm7) all zeros in this function.
*/ ++ vpxor %xmm7, %xmm7, %xmm7 ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax ++ jg L(cross_page) ++ /* Start comparing 4 vectors. */ ++ vmovdqu (%rdi), %ymm1 ++ VPCMPEQ (%rsi), %ymm1, %ymm0 ++ VPMINU %ymm1, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ je L(next_3_vectors) ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx) is after the maximum ++ offset (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ je L(return) ++L(wcscmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++L(return): ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after ++ the maximum offset (%r11). */ ++ addq $VEC_SIZE, %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rdx), %ecx ++ cmpl VEC_SIZE(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rdx), %eax ++ movzbl VEC_SIZE(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_2_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 2), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_3_vec_size): ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is ++ after the maximum offset (%r11). 
*/ ++ addq $(VEC_SIZE * 3), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(next_3_vectors): ++ vmovdqu VEC_SIZE(%rdi), %ymm6 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 ++ VPMINU %ymm6, %ymm3, %ymm3 ++ VPCMPEQ %ymm7, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jne L(return_vec_size) ++ vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 ++ vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 ++ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 ++ VPMINU %ymm5, %ymm2, %ymm2 ++ VPCMPEQ %ymm4, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm2, %ymm2 ++ vpmovmskb %ymm2, %ecx ++ testl %ecx, %ecx ++ jne L(return_2_vec_size) ++ VPMINU %ymm4, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ jne L(return_3_vec_size) ++L(main_loop_header): ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ movl $PAGE_SIZE, %ecx ++ /* Align load via RAX. */ ++ andq $-(VEC_SIZE * 4), %rdx ++ subq %rdi, %rdx ++ leaq (%rdi, %rdx), %rax ++# ifdef USE_AS_STRNCMP ++ /* Starting from this point, the maximum offset, or simply the ++ 'offset', DECREASES by the same amount when base pointers are ++ moved forward. Return 0 when: ++ 1) On match: offset <= the matched vector index. ++ 2) On mismatch, offset is before the mismatched index. ++ */ ++ subq %rdx, %r11 ++ jbe L(zero) ++# endif ++ addq %rsi, %rdx ++ movq %rdx, %rsi ++ andl $(PAGE_SIZE - 1), %esi ++ /* Number of bytes before page crossing. */ ++ subq %rsi, %rcx ++ /* Number of VEC_SIZE * 4 blocks before page crossing. */ ++ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx ++ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ ++ movl %ecx, %esi ++ jmp L(loop_start) ++ ++ .p2align 4 ++L(loop): ++# ifdef USE_AS_STRNCMP ++ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease ++ the maximum offset (%r11) by the same amount. */ ++ subq $(VEC_SIZE * 4), %r11 ++ jbe L(zero) ++# endif ++ addq $(VEC_SIZE * 4), %rax ++ addq $(VEC_SIZE * 4), %rdx ++L(loop_start): ++ testl %esi, %esi ++ leal -1(%esi), %esi ++ je L(loop_cross_page) ++L(back_to_loop): ++ /* Main loop, comparing 4 vectors at a time. */ ++ vmovdqa (%rax), %ymm0 ++ vmovdqa VEC_SIZE(%rax), %ymm3 ++ VPCMPEQ (%rdx), %ymm0, %ymm4 ++ VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 ++ VPMINU %ymm0, %ymm4, %ymm4 ++ VPMINU %ymm3, %ymm1, %ymm1 ++ vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 ++ VPMINU %ymm1, %ymm4, %ymm0 ++ vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 ++ VPMINU %ymm2, %ymm5, %ymm5 ++ VPMINU %ymm3, %ymm6, %ymm6 ++ VPMINU %ymm5, %ymm0, %ymm0 ++ VPMINU %ymm6, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ ++ /* Test each mask (32 bits) individually because for VEC_SIZE ++ == 32 it is not possible to OR the four masks and keep all bits ++ in a 64-bit integer register, differing from SSE2 strcmp ++ where ORing is possible.
*/ ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ je L(loop) ++ VPCMPEQ %ymm7, %ymm4, %ymm0 ++ vpmovmskb %ymm0, %edi ++ testl %edi, %edi ++ je L(test_vec) ++ tzcntl %edi, %ecx ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first vector matched. Return 0 if the maximum offset ++ (%r11) <= VEC_SIZE. */ ++ cmpq $VEC_SIZE, %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm1, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ je L(test_2_vec) ++ tzcntl %ecx, %edi ++# ifdef USE_AS_STRNCMP ++ addq $VEC_SIZE, %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl VEC_SIZE(%rsi, %rdi), %ecx ++ cmpl VEC_SIZE(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rax, %rdi), %eax ++ movzbl VEC_SIZE(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_2_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 2 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 2 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 2), %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm5, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ je L(test_3_vec) ++ tzcntl %ecx, %edi ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx ++ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(test_3_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 3 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 3 * VEC_SIZE. 
*/ ++ cmpq $(VEC_SIZE * 3), %r11 ++ jbe L(zero) ++# endif ++ VPCMPEQ %ymm7, %ymm6, %ymm6 ++ vpmovmskb %ymm6, %esi ++ tzcntl %esi, %ecx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 3), %rcx ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %esi ++ cmpl (%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi ++ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(loop_cross_page): ++ xorl %r10d, %r10d ++ movq %rdx, %rcx ++ /* Align load via RDX. We load the extra ECX bytes which should ++ be ignored. */ ++ andl $((VEC_SIZE * 4) - 1), %ecx ++ /* R10 is -RCX. */ ++ subq %rcx, %r10 ++ ++ /* This works only if VEC_SIZE * 2 == 64. */ ++# if (VEC_SIZE * 2) != 64 ++# error (VEC_SIZE * 2) != 64 ++# endif ++ ++ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ ++ cmpl $(VEC_SIZE * 2), %ecx ++ jge L(loop_cross_page_2_vec) ++ ++ vmovdqu (%rax, %r10), %ymm2 ++ vmovdqu VEC_SIZE(%rax, %r10), %ymm3 ++ VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 ++ VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 ++ VPMINU %ymm2, %ymm0, %ymm0 ++ VPMINU %ymm3, %ymm1, %ymm1 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm1, %ymm1 ++ ++ vpmovmskb %ymm0, %edi ++ vpmovmskb %ymm1, %esi ++ ++ salq $32, %rsi ++ xorq %rsi, %rdi ++ ++ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ ++ shrq %cl, %rdi ++ ++ testq %rdi, %rdi ++ je L(loop_cross_page_2_vec) ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(loop_cross_page_2_vec): ++ /* The first VEC_SIZE * 2 bytes match or are ignored. */ ++ vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 ++ vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 ++ VPMINU %ymm2, %ymm5, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 ++ VPCMPEQ %ymm7, %ymm5, %ymm5 ++ VPMINU %ymm3, %ymm6, %ymm6 ++ VPCMPEQ %ymm7, %ymm6, %ymm6 ++ ++ vpmovmskb %ymm5, %edi ++ vpmovmskb %ymm6, %esi ++ ++ salq $32, %rsi ++ xorq %rsi, %rdi ++ ++ xorl %r8d, %r8d ++ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ ++ subl $(VEC_SIZE * 2), %ecx ++ jle 1f ++ /* Skip ECX bytes. */ ++ shrq %cl, %rdi ++ /* R8 has number of bytes skipped. */ ++ movl %ecx, %r8d ++1: ++ /* Before jumping back to the loop, set ESI to the number of ++ VEC_SIZE * 4 blocks before page crossing. */ ++ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi ++ ++ testq %rdi, %rdi ++# ifdef USE_AS_STRNCMP ++ /* At this point, if %rdi value is 0, it already tested ++ VEC_SIZE*4+%r10 byte starting from %rax. This label ++ checks whether strncmp maximum offset reached or not. 
*/ ++ je L(string_nbyte_offset_check) ++# else ++ je L(back_to_loop) ++# endif ++ tzcntq %rdi, %rcx ++ addq %r10, %rcx ++ /* Adjust for number of bytes skipped. */ ++ addq %r8, %rcx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rcx ++ subq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi ++ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_STRNCMP ++L(string_nbyte_offset_check): ++ leaq (VEC_SIZE * 4)(%r10), %r10 ++ cmpq %r10, %r11 ++ jbe L(zero) ++ jmp L(back_to_loop) ++# endif ++ ++ .p2align 4 ++L(cross_page_loop): ++ /* Check one byte/dword at a time. */ ++# ifdef USE_AS_WCSCMP ++ cmpl %ecx, %eax ++# else ++ subl %ecx, %eax ++# endif ++ jne L(different) ++ addl $SIZE_OF_CHAR, %edx ++ cmpl $(VEC_SIZE * 4), %edx ++ je L(main_loop_header) ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ /* Check null char. */ ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED ++ comparisons. */ ++ subl %ecx, %eax ++# ifndef USE_AS_WCSCMP ++L(different): ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_WCSCMP ++ .p2align 4 ++L(different): ++ /* Use movl to avoid modifying EFLAGS. */ ++ movl $0, %eax ++ setl %al ++ negl %eax ++ orl $1, %eax ++ VZEROUPPER ++ ret ++# endif ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(char0): ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER ++ ret ++# endif ++ ++ .p2align 4 ++L(last_vector): ++ addq %rdx, %rdi ++ addq %rdx, %rsi ++# ifdef USE_AS_STRNCMP ++ subq %rdx, %r11 ++# endif ++ tzcntl %ecx, %edx ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ VZEROUPPER ++ ret ++ ++ /* Comparing on page boundary region requires special treatment: ++ It must done one vector at the time, starting with the wider ++ ymm vector if possible, if not, with xmm. If fetching 16 bytes ++ (xmm) still passes the boundary, byte comparison must be done. ++ */ ++ .p2align 4 ++L(cross_page): ++ /* Try one ymm vector at a time. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_1_vector) ++L(loop_1_vector): ++ vmovdqu (%rdi, %rdx), %ymm1 ++ VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 ++ VPMINU %ymm1, %ymm0, %ymm0 ++ VPCMPEQ %ymm7, %ymm0, %ymm0 ++ vpmovmskb %ymm0, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $VEC_SIZE, %edx ++ ++ addl $VEC_SIZE, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). 
*/ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jle L(loop_1_vector) ++L(cross_page_1_vector): ++ /* Less than 32 bytes to check, try one xmm vector. */ ++ cmpl $(PAGE_SIZE - 16), %eax ++ jg L(cross_page_1_xmm) ++ vmovdqu (%rdi, %rdx), %xmm1 ++ VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $16, %edx ++# ifndef USE_AS_WCSCMP ++ addl $16, %eax ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_1_xmm): ++# ifndef USE_AS_WCSCMP ++ /* Less than 16 bytes to check, try 8 byte vector. NB: No need ++ for wcscmp nor wcsncmp since wide char is 4 bytes. */ ++ cmpl $(PAGE_SIZE - 8), %eax ++ jg L(cross_page_8bytes) ++ vmovq (%rdi, %rdx), %xmm1 ++ vmovq (%rsi, %rdx), %xmm0 ++ VPCMPEQ %xmm0, %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $8, %edx ++ addl $8, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_8bytes): ++ /* Less than 8 bytes to check, try 4 byte vector. */ ++ cmpl $(PAGE_SIZE - 4), %eax ++ jg L(cross_page_4bytes) ++ vmovd (%rdi, %rdx), %xmm1 ++ vmovd (%rsi, %rdx), %xmm0 ++ VPCMPEQ %xmm0, %xmm1, %xmm0 ++ VPMINU %xmm1, %xmm0, %xmm0 ++ VPCMPEQ %xmm7, %xmm0, %xmm0 ++ vpmovmskb %xmm0, %ecx ++ /* Only last 4 bits are valid. */ ++ andl $0xf, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $4, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_4bytes): ++# endif ++ /* Less than 4 bytes to check, try one byte/dword at a time. */ ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ subl %ecx, %eax ++ VZEROUPPER ++ ret ++END (STRCMP) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S +new file mode 100644 +index 000000000..809a9ac00 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S +@@ -0,0 +1,1046 @@ ++/* strcpy with AVX2 ++ Copyright (C) 2011-2020 Free Software Foundation, Inc. ++ Contributed by Intel Corporation. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++ ++# ifndef USE_AS_STRCAT ++ ++# ifndef STRCPY ++# define STRCPY strcpy_avx2 ++# endif ++ ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++/* Number of bytes in a vector register */ ++# ifndef VEC_SIZE ++# define VEC_SIZE 32 ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++/* zero register */ ++#define xmmZ xmm0 ++#define ymmZ ymm0 ++ ++/* mask register */ ++#define ymmM ymm1 ++ ++# ifndef USE_AS_STRCAT ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRCPY) ++# ifdef USE_AS_STRNCPY ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP ++ jz L(ExitZero) ++# endif ++ mov %rsi, %rcx ++# ifndef USE_AS_STPCPY ++ mov %rdi, %rax /* save result */ ++# endif ++ ++# endif ++ ++ vpxor %xmmZ, %xmmZ, %xmmZ ++ ++ and $((VEC_SIZE * 4) - 1), %ecx ++ cmp $(VEC_SIZE * 2), %ecx ++ jbe L(SourceStringAlignmentLessTwoVecSize) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ ++ vpcmpeqb (%rsi), %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ shr %cl, %rdx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ mov $VEC_SIZE, %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# else ++ mov $(VEC_SIZE + 1), %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# endif ++ jbe L(CopyVecSizeTailCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail) ++ ++ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 ++ vpmovmskb %ymm2, %edx ++ ++# ifdef USE_AS_STRNCPY ++ add $VEC_SIZE, %r10 ++ cmp %r10, %r8 ++ jbe L(CopyTwoVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize) ++ ++ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ ++ vmovdqu %ymm2, (%rdi) ++ ++/* If source address alignment != destination address alignment */ ++ .p2align 4 ++L(UnalignVecSizeBoth): ++ sub %rcx, %rdi ++# ifdef USE_AS_STRNCPY ++ add %rcx, %r8 ++ sbb %rcx, %rcx ++ or %rcx, %r8 ++# endif ++ mov $VEC_SIZE, %rcx ++ vmovdqa (%rsi, %rcx), %ymm2 ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 3), %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm3, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm4, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 
++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 ++ vmovdqu %ymm2, (%rdi, %rcx) ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vmovdqu %ymm3, (%rdi, %rcx) ++ mov %rsi, %rdx ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ and $-(VEC_SIZE * 4), %rsi ++ sub %rsi, %rdx ++ sub %rdx, %rdi ++# ifdef USE_AS_STRNCPY ++ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 ++# endif ++L(UnalignedFourVecSizeLoop): ++ vmovdqa (%rsi), %ymm4 ++ vmovdqa VEC_SIZE(%rsi), %ymm5 ++ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 ++ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 ++ vpminub %ymm5, %ymm4, %ymm2 ++ vpminub %ymm7, %ymm6, %ymm3 ++ vpminub %ymm2, %ymm3, %ymm3 ++ vpcmpeqb %ymmM, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(UnalignedFourVecSizeLeave) ++ ++L(UnalignedFourVecSizeLoop_start): ++ add $(VEC_SIZE * 4), %rdi ++ add $(VEC_SIZE * 4), %rsi ++ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) ++ vmovdqa (%rsi), %ymm4 ++ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) ++ vmovdqa VEC_SIZE(%rsi), %ymm5 ++ vpminub %ymm5, %ymm4, %ymm2 ++ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) ++ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 ++ vmovdqu %ymm7, -VEC_SIZE(%rdi) ++ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 ++ vpminub %ymm7, %ymm6, %ymm3 ++ vpminub %ymm2, %ymm3, %ymm3 ++ vpcmpeqb %ymmM, %ymm3, %ymm3 ++ vpmovmskb %ymm3, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jz L(UnalignedFourVecSizeLoop_start) ++ ++L(UnalignedFourVecSizeLeave): ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_0) ++ ++ vpcmpeqb %ymm5, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %ecx ++ test %ecx, %ecx ++ jnz L(CopyVecSizeUnaligned_16) ++ ++ vpcmpeqb %ymm6, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_32) ++ ++ vpcmpeqb %ymm7, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %ecx ++ bsf %ecx, %edx ++ vmovdqu %ymm4, (%rdi) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 3), %rsi ++ add $(VEC_SIZE * 3), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++/* If source address alignment == destination address alignment */ ++ 
++L(SourceStringAlignmentLessTwoVecSize): ++ vmovdqu (%rsi), %ymm3 ++ vmovdqu VEC_SIZE(%rsi), %ymm2 ++ vpcmpeqb %ymm3, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $VEC_SIZE, %r8 ++# else ++ cmp $(VEC_SIZE + 1), %r8 ++# endif ++ jbe L(CopyVecSizeTail1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail1) ++ ++ vmovdqu %ymm3, (%rdi) ++ vpcmpeqb %ymm2, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $(VEC_SIZE * 2), %r8 ++# else ++ cmp $((VEC_SIZE * 2) + 1), %r8 ++# endif ++ jbe L(CopyTwoVecSize1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize1) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ jmp L(UnalignVecSizeBoth) ++ ++/*------End of main part with loops---------------------*/ ++ ++/* Case1 */ ++ ++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) ++ .p2align 4 ++L(CopyVecSize): ++ add %rcx, %rdi ++# endif ++L(CopyVecSizeTail): ++ add %rcx, %rsi ++L(CopyVecSizeTail1): ++ bsf %edx, %edx ++L(CopyVecSizeExit): ++ cmp $32, %edx ++ jae L(Exit32_63) ++ cmp $16, %edx ++ jae L(Exit16_31) ++ cmp $8, %edx ++ jae L(Exit8_15) ++ cmp $4, %edx ++ jae L(Exit4_7) ++ cmp $3, %edx ++ je L(Exit3) ++ cmp $1, %edx ++ ja L(Exit2) ++ je L(Exit1) ++ movb $0, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea (%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $1, %r8 ++ lea 1(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(CopyTwoVecSize1): ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $VEC_SIZE, %r8 ++# endif ++ jmp L(CopyVecSizeTail1) ++ ++ .p2align 4 ++L(CopyTwoVecSize): ++ bsf %edx, %edx ++ add %rcx, %rsi ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ jmp L(CopyVecSizeExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_0): ++ bsf %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm4, (%rdi) ++ add $((VEC_SIZE * 4) - 1), %r8 ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_16): ++ bsf %ecx, %edx ++ vmovdqu %ymm4, (%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea VEC_SIZE(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ add $((VEC_SIZE * 3) - 1), %r8 ++ sub %rdx, %r8 ++ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_32): ++ bsf %edx, %edx ++ vmovdqu %ymm4, (%rdi) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax ++# endif ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ add $((VEC_SIZE * 2) - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 2), %rsi ++ add $(VEC_SIZE * 2), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++# ifdef USE_AS_STRNCPY ++# ifndef USE_AS_STRCAT ++ .p2align 4 ++L(CopyVecSizeUnalignedVec6): ++ vmovdqu %ymm6, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec5): ++ vmovdqu %ymm5, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ 
.p2align 4 ++L(CopyVecSizeUnalignedVec4): ++ vmovdqu %ymm4, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec3): ++ vmovdqu %ymm3, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++# endif ++ ++/* Case2 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTailCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTail1Case2): ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++/* Case2 or Case3, Case3 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeCase2) ++L(CopyVecSizeCase3): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyTwoVecSizeCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyVecSizeTailCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTailCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSize1Case2OrCase3): ++ add $VEC_SIZE, %rdi ++ add $VEC_SIZE, %rsi ++ sub $VEC_SIZE, %r8 ++L(CopyVecSizeTail1Case2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTail1Case2) ++ jmp L(StrncpyExit) ++# endif ++ ++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ ++ ++ .p2align 4 ++L(Exit1): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $2, %r8 ++ lea 2(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit2): ++ movzwl (%rsi), %ecx ++ mov %cx, (%rdi) ++ movb $0, 2(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $3, %r8 ++ lea 3(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit3): ++ mov (%rsi), %edx ++ mov %edx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 3(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $4, %r8 ++ lea 4(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit4_7): ++ mov (%rsi), %ecx ++ mov %ecx, (%rdi) ++ mov -3(%rsi, %rdx), %ecx ++ mov %ecx, -3(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit8_15): ++ mov (%rsi), %rcx ++ mov -7(%rsi, %rdx), %r9 ++ mov %rcx, (%rdi) ++ mov %r9, -7(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit16_31): ++ vmovdqu (%rsi), %xmm2 ++ vmovdqu -15(%rsi, %rdx), %xmm3 ++ vmovdqu %xmm2, (%rdi) ++ vmovdqu %xmm3, -15(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Exit32_63): ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu -31(%rsi, %rdx), %ymm3 ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, -31(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ VZEROUPPER ++ ret ++ ++# ifdef USE_AS_STRNCPY ++ ++ .p2align 4 ++L(StrncpyExit1): ++ movzbl (%rsi), %edx ++ mov %dl, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 1(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit2): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 2(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit3_4): ++ movzwl (%rsi), %ecx ++ movzwl -2(%rsi, %r8), %edx ++ mov %cx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit5_8): ++ mov (%rsi), %ecx ++ mov -4(%rsi, %r8), %edx ++ mov %ecx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit9_16): ++ mov (%rsi), %rcx ++ mov -8(%rsi, %r8), %rdx ++ mov %rcx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit17_32): ++ vmovdqu (%rsi), %xmm2 ++ vmovdqu -16(%rsi, %r8), %xmm3 ++ vmovdqu %xmm2, (%rdi) ++ vmovdqu %xmm3, -16(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit33_64): ++ /* 0/32, 31/16 */ ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(StrncpyExit65): ++ /* 0/32, 32/32, 64/1 */ ++ vmovdqu (%rsi), %ymm2 ++ vmovdqu 32(%rsi), %ymm3 ++ mov 64(%rsi), %cl ++ vmovdqu %ymm2, (%rdi) ++ vmovdqu %ymm3, 32(%rdi) ++ mov %cl, 64(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 65(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 65(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++# ifndef USE_AS_STRCAT ++ ++ .p2align 4 ++L(Fill1): ++ mov %dl, (%rdi) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill2): ++ mov %dx, (%rdi) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill3_4): ++ mov %dx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill5_8): ++ mov %edx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill9_16): ++ mov %rdx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(Fill17_32): ++ vmovdqu %xmmZ, (%rdi) ++ vmovdqu %xmmZ, -16(%rdi, %r8) ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec2): ++ vmovdqu %ymm2, (%rdi, %rcx) ++ ++ .p2align 4 ++L(CopyVecSizeVecExit): ++ bsf %edx, %edx ++ add $(VEC_SIZE - 1), %r8 ++ add %rcx, %rdi ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), 
%rax ++# endif ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ ++ .p2align 4 ++L(StrncpyFillTailWithZero): ++ xor %edx, %edx ++ sub $VEC_SIZE, %r8 ++ jbe L(StrncpyFillExit) ++ ++ vmovdqu %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ ++ mov %rdi, %rsi ++ and $(VEC_SIZE - 1), %esi ++ sub %rsi, %rdi ++ add %rsi, %r8 ++ sub $(VEC_SIZE * 4), %r8 ++ jb L(StrncpyFillLessFourVecSize) ++ ++L(StrncpyFillLoopVmovdqa): ++ vmovdqa %ymmZ, (%rdi) ++ vmovdqa %ymmZ, VEC_SIZE(%rdi) ++ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) ++ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE * 4), %rdi ++ sub $(VEC_SIZE * 4), %r8 ++ jae L(StrncpyFillLoopVmovdqa) ++ ++L(StrncpyFillLessFourVecSize): ++ add $(VEC_SIZE * 2), %r8 ++ jl L(StrncpyFillLessTwoVecSize) ++ vmovdqa %ymmZ, (%rdi) ++ vmovdqa %ymmZ, VEC_SIZE(%rdi) ++ add $(VEC_SIZE * 2), %rdi ++ sub $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ vmovdqa %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillLessTwoVecSize): ++ add $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ vmovdqa %ymmZ, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillExit): ++ add $VEC_SIZE, %r8 ++L(Fill): ++ cmp $17, %r8d ++ jae L(Fill17_32) ++ cmp $9, %r8d ++ jae L(Fill9_16) ++ cmp $5, %r8d ++ jae L(Fill5_8) ++ cmp $3, %r8d ++ jae L(Fill3_4) ++ cmp $1, %r8d ++ ja L(Fill2) ++ je L(Fill1) ++ VZEROUPPER ++ ret ++ ++/* end of ifndef USE_AS_STRCAT */ ++# endif ++ ++ .p2align 4 ++L(UnalignedLeaveCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(UnalignedFourVecSizeLeaveCase2) ++L(UnalignedFourVecSizeLeaveCase3): ++ lea (VEC_SIZE * 4)(%r8), %rcx ++ and $-VEC_SIZE, %rcx ++ add $(VEC_SIZE * 3), %r8 ++ jl L(CopyVecSizeCase3) ++ vmovdqu %ymm4, (%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 4)(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (VEC_SIZE * 4)(%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(UnalignedFourVecSizeLeaveCase2): ++ xor %ecx, %ecx ++ vpcmpeqb %ymm4, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ add $(VEC_SIZE * 3), %r8 ++ jle L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ vpcmpeqb %ymm5, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm4, (%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec5) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpeqb %ymm6, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm5, VEC_SIZE(%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec6) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpeqb %ymm7, %ymmZ, %ymmM ++ vpmovmskb %ymmM, %edx ++ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) ++ lea VEC_SIZE(%rdi, %rcx), %rdi ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++L(StrncpyExit): ++ cmp $65, %r8d ++ je L(StrncpyExit65) ++ cmp $33, %r8d ++ jae L(StrncpyExit33_64) ++ cmp $17, %r8d ++ jae L(StrncpyExit17_32) ++ cmp $9, %r8d ++ jae L(StrncpyExit9_16) ++ cmp $5, %r8d ++ jae L(StrncpyExit5_8) ++ cmp $3, %r8d ++ jae L(StrncpyExit3_4) ++ cmp $1, %r8d ++ ja L(StrncpyExit2) ++ je L(StrncpyExit1) ++# 
ifdef USE_AS_STPCPY ++ mov %rdi, %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi) ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(ExitZero): ++# ifndef USE_AS_STRCAT ++ mov %rdi, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++# endif ++ ++# ifndef USE_AS_STRCAT ++END (STRCPY) ++# else ++END (STRCAT) ++# endif +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S +new file mode 100644 +index 000000000..912d771b4 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S +@@ -0,0 +1,418 @@ ++/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++# ifndef STRLEN ++# define STRLEN strlen_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMPEQ vpcmpeqd ++# define VPMINU vpminud ++# else ++# define VPCMPEQ vpcmpeqb ++# define VPMINU vpminub ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRLEN) ++# ifdef USE_AS_STRNLEN ++ /* Check for zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(zero) ++# ifdef USE_AS_WCSLEN ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++ mov %RSI_LP, %R8_LP ++# endif ++ movl %edi, %ecx ++ movq %rdi, %rdx ++ vpxor %xmm0, %xmm0, %xmm0 ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_STRNLEN ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rsi ++ jbe L(max) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++ ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. 
*/ ++ sarl %cl, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_STRNLEN ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifdef USE_AS_STRNLEN ++ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" ++ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" ++ to void possible addition overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rsi ++ jbe L(max) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (%rdi), %ymm1 ++ vmovdqa VEC_SIZE(%rdi), %ymm2 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 ++ VPMINU %ymm1, %ymm2, %ymm5 ++ VPMINU %ymm3, %ymm4, %ymm6 ++ VPMINU %ymm5, %ymm6, %ymm5 ++ ++ VPCMPEQ %ymm5, %ymm0, %ymm5 ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_STRNLEN ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rsi ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %esi ++ jle L(last_2x_vec) ++ ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %esi ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(max): ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ VPCMPEQ %ymm2, %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ VPCMPEQ %ymm3, %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++END (STRLEN) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S +new file mode 100644 +index 000000000..71e1a46c2 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT strncat_avx2 ++#include "avx2-strcat-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S +new file mode 100644 +index 000000000..b21a19134 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S +@@ -0,0 +1,4 @@ ++#define STRCMP strncmp_avx2 ++#define USE_AS_STRNCMP 1 ++#include "avx_regs.h" ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S +new file mode 100644 +index 000000000..7ad840667 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STRNCPY ++#define STRCPY strncpy_avx2 ++#include "avx_regs.h" ++#include "avx2-strcpy-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S +new file mode 100644 +index 000000000..22cc5c527 +--- /dev/null ++++ 
b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S +@@ -0,0 +1,4 @@ ++#define STRLEN strnlen_avx2 ++#define USE_AS_STRNLEN 1 ++#include "avx_regs.h" ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S +new file mode 100644 +index 000000000..b3a65fbc6 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S +@@ -0,0 +1,258 @@ ++/* strrchr/wcsrchr optimized with AVX2. ++ Copyright (C) 2017-2020 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++# ifndef STRRCHR ++# define STRRCHR strrchr_avx2 ++# endif ++ ++# ifndef L ++# define L(label) .L##label ++# endif ++ ++# ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++# endif ++ ++# ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++# endif ++ ++# ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++# endif ++ ++# ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++# endif ++ ++# ifdef USE_AS_WCSRCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# ifndef VZEROUPPER ++# define VZEROUPPER vzeroupper ++# endif ++ ++# define VEC_SIZE 32 ++ ++ .section .text.avx,"ax",@progbits ++ENTRY (STRRCHR) ++ movd %esi, %xmm4 ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMM4. */ ++ VPBROADCAST %xmm4, %ymm4 ++ vpxor %xmm0, %xmm0, %xmm0 ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ vmovdqu (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ addq $VEC_SIZE, %rdi ++ ++ testl %eax, %eax ++ jnz L(first_vec) ++ ++ testl %ecx, %ecx ++ jnz L(return_null) ++ ++ andq $-VEC_SIZE, %rdi ++ xorl %edx, %edx ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(first_vec): ++ /* Check if there is a nul CHAR. */ ++ testl %ecx, %ecx ++ jnz L(char_and_nul_in_first_vec) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %edx ++ vpmovmskb %ymm3, %eax ++ shrl %cl, %edx ++ shrl %cl, %eax ++ addq $VEC_SIZE, %rdi ++ ++ /* Check if there is a CHAR. */ ++ testl %eax, %eax ++ jnz L(found_char) ++ ++ testl %edx, %edx ++ jnz L(return_null) ++ ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(found_char): ++ testl %edx, %edx ++ jnz L(char_and_nul) ++ ++ /* Remember the match and keep searching. 
*/ ++ movl %eax, %edx ++ leaq (%rdi, %rcx), %rsi ++ ++ .p2align 4 ++L(aligned_loop): ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ add $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ vmovdqa (%rdi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm2 ++ addq $VEC_SIZE, %rdi ++ VPCMPEQ %ymm1, %ymm4, %ymm3 ++ vpmovmskb %ymm2, %ecx ++ vpmovmskb %ymm3, %eax ++ orl %eax, %ecx ++ jz L(aligned_loop) ++ ++ .p2align 4 ++L(char_nor_null): ++ /* Find a CHAR or a nul CHAR in a loop. */ ++ testl %eax, %eax ++ jnz L(match) ++L(return_value): ++ testl %edx, %edx ++ jz L(return_null) ++ movl %edx, %eax ++ movq %rsi, %rdi ++ ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %eax ++# endif ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(match): ++ /* Find a CHAR. Check if there is a nul CHAR. */ ++ vpmovmskb %ymm2, %ecx ++ testl %ecx, %ecx ++ jnz L(find_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(find_nul): ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %ecx ++ andl $0x11111111, %eax ++# endif ++ /* Mask out any matching bits after the nul CHAR. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* If there is no CHAR here, return the remembered one. */ ++ jz L(return_value) ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(char_and_nul): ++ /* Find both a CHAR and a nul CHAR. */ ++ addq %rcx, %rdi ++ movl %edx, %ecx ++L(char_and_nul_in_first_vec): ++# ifdef USE_AS_WCSRCHR ++ /* Keep the first bit for each matching CHAR for bsr. */ ++ andl $0x11111111, %ecx ++ andl $0x11111111, %eax ++# endif ++ /* Mask out any matching bits after the nul CHAR. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* Return null pointer if the nul CHAR comes first. 
*/ ++ jz L(return_null) ++ bsrl %eax, %eax ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++ VZEROUPPER ++ ret ++ ++ .p2align 4 ++L(return_null): ++ xorl %eax, %eax ++ VZEROUPPER ++ ret ++ ++END (STRRCHR) +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S +new file mode 100644 +index 000000000..b03124767 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S +@@ -0,0 +1,3 @@ ++#define STRCHR wcschr_avx2 ++#define USE_AS_WCSCHR 1 ++#include "avx2-strchr-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S +new file mode 100644 +index 000000000..bcbcd4ce7 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S +@@ -0,0 +1,4 @@ ++#define STRCMP wcscmp_avx2 ++#define USE_AS_WCSCMP 1 ++ ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S +new file mode 100644 +index 000000000..f1b973572 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S +@@ -0,0 +1,4 @@ ++#define STRLEN wcslen_avx2 ++#define USE_AS_WCSLEN 1 ++ ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S +new file mode 100644 +index 000000000..7603169c1 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S +@@ -0,0 +1,6 @@ ++#define STRCMP wcsncmp_avx2 ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "avx_regs.h" ++#include "avx2-strcmp-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S +new file mode 100644 +index 000000000..2095cd8e0 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S +@@ -0,0 +1,6 @@ ++#define STRLEN wcsnlen_avx2 ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "avx_regs.h" ++#include "avx2-strlen-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S +new file mode 100644 +index 000000000..fbec1286c +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S +@@ -0,0 +1,3 @@ ++#define STRRCHR wcsrchr_avx2 ++#define USE_AS_WCSRCHR 1 ++#include "avx2-strrchr-kbl.S" +diff --git a/libc/arch-x86_64/kabylake/string/avx_regs.h b/libc/arch-x86_64/kabylake/string/avx_regs.h +new file mode 100644 +index 000000000..223d97e3e +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx_regs.h +@@ -0,0 +1,26 @@ ++/* Long and pointer size in bytes. */ ++#define LP_SIZE 8 ++ ++/* Instruction to operate on long and pointer. */ ++#define LP_OP(insn) insn##q ++ ++/* Assembler address directive. */ ++#define ASM_ADDR .quad ++ ++/* Registers to hold long and pointer. 
*/ ++#define RAX_LP rax ++#define RBP_LP rbp ++#define RBX_LP rbx ++#define RCX_LP rcx ++#define RDI_LP rdi ++#define RDX_LP rdx ++#define RSI_LP rsi ++#define RSP_LP rsp ++#define R8_LP r8 ++#define R9_LP r9 ++#define R10_LP r10 ++#define R11_LP r11 ++#define R12_LP r12 ++#define R13_LP r13 ++#define R14_LP r14 ++#define R15_LP r15 +diff --git a/libc/arch-x86_64/include/cache.h b/libc/arch-x86_64/kabylake/string/cache.h +similarity index 100% +rename from libc/arch-x86_64/include/cache.h +rename to libc/arch-x86_64/kabylake/string/cache.h +diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h +new file mode 100644 +index 000000000..3606d2a1a +--- /dev/null ++++ b/libc/arch-x86_64/silvermont/string/cache.h +@@ -0,0 +1,36 @@ ++/* ++Copyright (c) 2014, Intel Corporation ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ ++ * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ ++ * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++/* Values are optimized for Silvermont */ ++#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ ++#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ ++ ++#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) ++#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) +diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +index 0ad2d44cf..ce15cdf1c 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STPCPY +-#define STRCPY stpcpy ++#define STRCPY stpcpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +index 30666850b..02b4df02d 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S +@@ -30,5 +30,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + #define USE_AS_STRNCPY + #define USE_AS_STPCPY +-#define STRCPY stpncpy ++#define STRCPY stpncpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +index dd8207ff5..007adfe95 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S +@@ -29,7 +29,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #ifndef STRCAT +-# define STRCAT strcat ++# define STRCAT strcat_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +index 3e146bfbc..ade9eac4f 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #ifndef USE_AS_STRCAT + + # ifndef STRCPY +-# define STRCPY strcpy ++# define STRCPY strcpy_generic + # endif + + # ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +index 3772fe770..df24f9de2 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S +@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #ifndef USE_AS_STRCAT + + #ifndef STRLEN +-# define STRLEN strlen ++# define STRLEN strlen_generic + #endif + + #ifndef L +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +index 6b4a43084..c5394f9d5 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STRNCAT +-#define STRCAT strncat ++#define STRCAT strncat_generic + #include "sse2-strcat-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +index 594e78f74..2e8d68d12 100644 +--- a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S ++++ b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + #define USE_AS_STRNCPY +-#define STRCPY strncpy ++#define STRCPY strncpy_generic + #include "sse2-strcpy-slm.S" +diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +index e8acd5ba4..fa2542f00 100644 +--- a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S +@@ -43,7 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #else + #define UPDATE_STRNCMP_COUNTER + #ifndef STRCMP +-#define STRCMP strcmp ++#define STRCMP strcmp_generic + #endif + #endif + +diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +index 0e4077517..5d20a483f 100644 +--- a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S ++++ b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S +@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + #define USE_AS_STRNCMP +-#define STRCMP strncmp ++#define STRCMP strncmp_generic + #include "ssse3-strcmp-slm.S" +diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S +index 979ce4f18..5c0f1f2ba 100644 +--- a/libc/arch-x86_64/static_function_dispatch.S ++++ b/libc/arch-x86_64/static_function_dispatch.S +@@ -38,6 +38,25 @@ FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) + FUNCTION_DELEGATE(memcmp, memcmp_generic) + FUNCTION_DELEGATE(memcpy, memmove_generic) + FUNCTION_DELEGATE(memmove, memmove_generic) +-FUNCTION_DELEGATE(memchr, memchr_openbsd) +-FUNCTION_DELEGATE(memrchr, memrchr_openbsd) +-//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) ++FUNCTION_DELEGATE(memchr, memchr_generic) ++FUNCTION_DELEGATE(memrchr, memrchr_generic) ++//FUNCTION_DELEGATE(wmemset, wmemset_generic) ++FUNCTION_DELEGATE(strcmp, strcmp_generic) ++FUNCTION_DELEGATE(strncmp, strncmp_generic) ++FUNCTION_DELEGATE(strcpy, strcpy_generic) ++FUNCTION_DELEGATE(strncpy, strncpy_generic) ++FUNCTION_DELEGATE(stpcpy, stpcpy_generic) ++FUNCTION_DELEGATE(stpncpy, stpncpy_generic) ++FUNCTION_DELEGATE(strlen, strlen_generic) ++FUNCTION_DELEGATE(strnlen, strnlen_generic) ++FUNCTION_DELEGATE(strchr, strchr_generic) ++FUNCTION_DELEGATE(strrchr, strrchr_generic) ++FUNCTION_DELEGATE(strcat, strcat_generic) ++FUNCTION_DELEGATE(strncat, strncat_generic) ++FUNCTION_DELEGATE(wcscmp, wcscmp_generic) ++FUNCTION_DELEGATE(wcsncmp, wcsncmp_generic) ++FUNCTION_DELEGATE(wcslen, wcslen_generic) ++FUNCTION_DELEGATE(wcsnlen, wcsnlen_generic) ++FUNCTION_DELEGATE(wcschr, wcschr_generic) ++FUNCTION_DELEGATE(wcsrchr, wcsrchr_generic) ++ +-- +2.25.1 + diff --git a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch new file mode 100644 index 0000000000..6f47b3414b --- /dev/null +++ b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch @@ -0,0 +1,645 @@ +From 05ace70e6407263d0bef91800005942a079058d6 Mon Sep 17 00:00:00 2001 +From: "Reddy, Alavala Srinivasa" +Date: Wed, 1 Nov 2023 18:43:18 +0530 +Subject: [PATCH 5/5] avx2 implementation for memmove api + +This patch includes handwritten avx2 assembly +implementation for memmove 64-bit. + +Test done: Build and boot is fine, Run the benchmarks suite. 
+ +Signed-off-by: ahs +--- + libc/Android.bp | 1 + + .../arch-x86_64/dynamic_function_dispatch.cpp | 2 + + .../kabylake/string/avx2-memmove-kbl.S | 593 ++++++++++++++++++ + 3 files changed, 596 insertions(+) + create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S + +diff --git a/libc/Android.bp b/libc/Android.bp +index 92483e833..5deb88b48 100644 +--- a/libc/Android.bp ++++ b/libc/Android.bp +@@ -1235,6 +1235,7 @@ cc_library_static { + "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", + "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", ++ "arch-x86_64/kabylake/string/avx2-memmove-kbl.S", + "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", + "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", +diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp +index 182eb4200..5bcf63e4c 100644 +--- a/libc/arch-x86_64/dynamic_function_dispatch.cpp ++++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp +@@ -55,6 +55,8 @@ DEFINE_IFUNC_FOR(memcmp) { + + typedef void* memmove_func(void* __dst, const void* __src, size_t __n); + DEFINE_IFUNC_FOR(memmove) { ++ __builtin_cpu_init(); ++ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memmove_func, memmove_avx2); + RETURN_FUNC(memmove_func, memmove_generic); + } + +diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S +new file mode 100644 +index 000000000..02e9ec1d2 +--- /dev/null ++++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S +@@ -0,0 +1,593 @@ ++/* ++Copyright (c) 2014, Intel Corporation ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ ++ * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ ++ * Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ ++ * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#include "cache.h" ++ ++#ifndef MEMMOVE ++# define MEMMOVE memmove_avx2 ++#endif ++ ++#ifndef L ++# define L(label) .L##label ++#endif ++ ++#ifndef cfi_startproc ++# define cfi_startproc .cfi_startproc ++#endif ++ ++#ifndef cfi_endproc ++# define cfi_endproc .cfi_endproc ++#endif ++ ++#ifndef cfi_rel_offset ++# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off ++#endif ++ ++#ifndef cfi_restore ++# define cfi_restore(reg) .cfi_restore reg ++#endif ++ ++#ifndef cfi_adjust_cfa_offset ++# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off ++#endif ++ ++#ifndef ENTRY ++# define ENTRY(name) \ ++ .type name, @function; \ ++ .globl name; \ ++ .p2align 4; \ ++name: \ ++ cfi_startproc ++#endif ++ ++#ifndef ALIAS_SYMBOL ++# define ALIAS_SYMBOL(alias, original) \ ++ .globl alias; \ ++ .equ alias, original ++#endif ++ ++#ifndef END ++# define END(name) \ ++ cfi_endproc; \ ++ .size name, .-name ++#endif ++ ++#define CFI_PUSH(REG) \ ++ cfi_adjust_cfa_offset (4); \ ++ cfi_rel_offset (REG, 0) ++ ++#define CFI_POP(REG) \ ++ cfi_adjust_cfa_offset (-4); \ ++ cfi_restore (REG) ++ ++#define PUSH(REG) push REG; ++#define POP(REG) pop REG; ++ ++#define ENTRANCE PUSH (%rbx); ++#define RETURN_END POP (%rbx); ret ++#define RETURN RETURN_END; ++ ++ .section .text.avx2,"ax",@progbits ++ENTRY (MEMMOVE) ++ ENTRANCE ++ mov %rdi, %rax ++ ++/* Check whether we should copy backward or forward. */ ++ cmp %rsi, %rdi ++ je L(mm_return) ++ jg L(mm_len_0_or_more_backward) ++ ++/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] ++ separately. */ ++ cmp $16, %rdx ++ jbe L(mm_len_0_16_bytes_forward) ++ ++ cmp $32, %rdx ++ ja L(mm_len_32_or_more_forward) ++ ++/* Copy [0..32] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_32_or_more_forward): ++ cmp $64, %rdx ++ ja L(mm_len_64_or_more_forward) ++ ++/* Copy [0..64] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu -16(%rsi, %rdx), %xmm2 ++ movdqu -32(%rsi, %rdx), %xmm3 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, -16(%rdi, %rdx) ++ movdqu %xmm3, -32(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_64_or_more_forward): ++ cmp $128, %rdx ++ ja L(mm_len_128_or_more_forward) ++ ++/* Copy [0..128] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu -64(%rsi, %rdx), %xmm4 ++ movdqu -48(%rsi, %rdx), %xmm5 ++ movdqu -32(%rsi, %rdx), %xmm6 ++ movdqu -16(%rsi, %rdx), %xmm7 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, -64(%rdi, %rdx) ++ movdqu %xmm5, -48(%rdi, %rdx) ++ movdqu %xmm6, -32(%rdi, %rdx) ++ movdqu %xmm7, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_128_or_more_forward): ++ cmp $256, %rdx ++ ja L(mm_len_256_or_more_forward) ++ ++/* Copy [0..256] and return. 
*/ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu 64(%rsi), %xmm4 ++ movdqu 80(%rsi), %xmm5 ++ movdqu 96(%rsi), %xmm6 ++ movdqu 112(%rsi), %xmm7 ++ movdqu -128(%rsi, %rdx), %xmm8 ++ movdqu -112(%rsi, %rdx), %xmm9 ++ movdqu -96(%rsi, %rdx), %xmm10 ++ movdqu -80(%rsi, %rdx), %xmm11 ++ movdqu -64(%rsi, %rdx), %xmm12 ++ movdqu -48(%rsi, %rdx), %xmm13 ++ movdqu -32(%rsi, %rdx), %xmm14 ++ movdqu -16(%rsi, %rdx), %xmm15 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, 64(%rdi) ++ movdqu %xmm5, 80(%rdi) ++ movdqu %xmm6, 96(%rdi) ++ movdqu %xmm7, 112(%rdi) ++ movdqu %xmm8, -128(%rdi, %rdx) ++ movdqu %xmm9, -112(%rdi, %rdx) ++ movdqu %xmm10, -96(%rdi, %rdx) ++ movdqu %xmm11, -80(%rdi, %rdx) ++ movdqu %xmm12, -64(%rdi, %rdx) ++ movdqu %xmm13, -48(%rdi, %rdx) ++ movdqu %xmm14, -32(%rdi, %rdx) ++ movdqu %xmm15, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_256_or_more_forward): ++/* Aligning the address of destination. */ ++/* save first unaligned 128 bytes */ ++ vmovdqu (%rsi), %ymm0 ++ vmovdqu 32(%rsi), %ymm1 ++ vmovdqu 64(%rsi), %ymm2 ++ vmovdqu 96(%rsi), %ymm3 ++ ++ lea 128(%rdi), %r8 ++ and $-128, %r8 /* r8 now aligned to next 128 byte boundary */ ++ sub %rdi, %rsi /* rsi = src - dst = diff */ ++ ++ vmovdqu (%r8, %rsi), %ymm4 ++ vmovdqu 32(%r8, %rsi), %ymm5 ++ vmovdqu 64(%r8, %rsi), %ymm6 ++ vmovdqu 96(%r8, %rsi), %ymm7 ++ ++ vmovdqu %ymm0, (%rdi) ++ vmovdqu %ymm1, 32(%rdi) ++ vmovdqu %ymm2, 64(%rdi) ++ vmovdqu %ymm3, 96(%rdi) ++ vmovdqa %ymm4, (%r8) ++ vmovaps %ymm5, 32(%r8) ++ vmovaps %ymm6, 64(%r8) ++ vmovaps %ymm7, 96(%r8) ++ add $128, %r8 ++ ++ lea (%rdi, %rdx), %rbx ++ and $-128, %rbx ++ cmp %r8, %rbx ++ jbe L(mm_copy_remaining_forward) ++ ++ cmp $SHARED_CACHE_SIZE_HALF, %rdx ++ jae L(mm_large_page_loop_forward) ++ ++ .p2align 4 ++L(mm_main_loop_forward): ++ prefetcht0 128(%r8, %rsi) ++ vmovdqu (%r8, %rsi), %ymm0 ++ vmovdqu 32(%r8, %rsi), %ymm1 ++ vmovdqa %ymm0, (%r8) ++ vmovaps %ymm1, 32(%r8) ++ lea 64(%r8), %r8 ++ cmp %r8, %rbx ++ ja L(mm_main_loop_forward) ++ ++L(mm_copy_remaining_forward): ++ add %rdi, %rdx ++ sub %r8, %rdx ++/* We copied all up till %rdi position in the dst. ++ In %rdx now is how many bytes are left to copy. ++ Now we need to advance %r8. 
*/ ++ lea (%r8, %rsi), %r9 ++ ++L(mm_remaining_0_128_bytes_forward): ++ cmp $64, %rdx ++ ja L(mm_remaining_65_128_bytes_forward) ++ cmp $32, %rdx ++ ja L(mm_remaining_33_64_bytes_forward) ++ vzeroupper ++ cmp $16, %rdx ++ ja L(mm_remaining_17_32_bytes_forward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ ++ cmpb $8, %dl ++ ja L(mm_remaining_9_16_bytes_forward) ++ cmpb $4, %dl ++ .p2align 4,,5 ++ ja L(mm_remaining_5_8_bytes_forward) ++ cmpb $2, %dl ++ .p2align 4,,1 ++ ja L(mm_remaining_3_4_bytes_forward) ++ movzbl -1(%r9,%rdx), %esi ++ movzbl (%r9), %ebx ++ movb %sil, -1(%r8,%rdx) ++ movb %bl, (%r8) ++ jmp L(mm_return) ++ ++L(mm_remaining_65_128_bytes_forward): ++ vmovdqu (%r9), %ymm0 ++ vmovdqu 32(%r9), %ymm1 ++ vmovdqu -64(%r9, %rdx), %ymm2 ++ vmovdqu -32(%r9, %rdx), %ymm3 ++ vmovdqu %ymm0, (%r8) ++ vmovdqu %ymm1, 32(%r8) ++ vmovdqu %ymm2, -64(%r8, %rdx) ++ vmovdqu %ymm3, -32(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_33_64_bytes_forward): ++ vmovdqu (%r9), %ymm0 ++ vmovdqu -32(%r9, %rdx), %ymm1 ++ vmovdqu %ymm0, (%r8) ++ vmovdqu %ymm1, -32(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_17_32_bytes_forward): ++ movdqu (%r9), %xmm0 ++ movdqu -16(%r9, %rdx), %xmm1 ++ movdqu %xmm0, (%r8) ++ movdqu %xmm1, -16(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_5_8_bytes_forward): ++ movl (%r9), %esi ++ movl -4(%r9,%rdx), %ebx ++ movl %esi, (%r8) ++ movl %ebx, -4(%r8,%rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_9_16_bytes_forward): ++ mov (%r9), %rsi ++ mov -8(%r9, %rdx), %rbx ++ mov %rsi, (%r8) ++ mov %rbx, -8(%r8, %rdx) ++ jmp L(mm_return) ++ ++L(mm_remaining_3_4_bytes_forward): ++ movzwl -2(%r9,%rdx), %esi ++ movzwl (%r9), %ebx ++ movw %si, -2(%r8,%rdx) ++ movw %bx, (%r8) ++ jmp L(mm_return) ++ ++L(mm_len_0_16_bytes_forward): ++ testb $24, %dl ++ jne L(mm_len_9_16_bytes_forward) ++ testb $4, %dl ++ .p2align 4,,5 ++ jne L(mm_len_5_8_bytes_forward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ testb $2, %dl ++ .p2align 4,,1 ++ jne L(mm_len_2_4_bytes_forward) ++ movzbl -1(%rsi,%rdx), %ebx ++ movzbl (%rsi), %esi ++ movb %bl, -1(%rdi,%rdx) ++ movb %sil, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_2_4_bytes_forward): ++ movzwl -2(%rsi,%rdx), %ebx ++ movzwl (%rsi), %esi ++ movw %bx, -2(%rdi,%rdx) ++ movw %si, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_5_8_bytes_forward): ++ movl (%rsi), %ebx ++ movl -4(%rsi,%rdx), %esi ++ movl %ebx, (%rdi) ++ movl %esi, -4(%rdi,%rdx) ++ jmp L(mm_return) ++ ++L(mm_len_9_16_bytes_forward): ++ mov (%rsi), %rbx ++ mov -8(%rsi, %rdx), %rsi ++ mov %rbx, (%rdi) ++ mov %rsi, -8(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_recalc_len): ++/* Compute in %rdx how many bytes are left to copy after ++ the main loop stops. */ ++ vzeroupper ++ mov %rbx, %rdx ++ sub %rdi, %rdx ++/* The code for copying backwards. */ ++L(mm_len_0_or_more_backward): ++ ++/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] ++ separately. */ ++ cmp $16, %rdx ++ jbe L(mm_len_0_16_bytes_backward) ++ ++ cmp $32, %rdx ++ ja L(mm_len_32_or_more_backward) ++ ++/* Copy [0..32] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_32_or_more_backward): ++ cmp $64, %rdx ++ ja L(mm_len_64_or_more_backward) ++ ++/* Copy [0..64] and return. 
*/ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu -16(%rsi, %rdx), %xmm2 ++ movdqu -32(%rsi, %rdx), %xmm3 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, -16(%rdi, %rdx) ++ movdqu %xmm3, -32(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_64_or_more_backward): ++ cmp $128, %rdx ++ ja L(mm_len_128_or_more_backward) ++ ++/* Copy [0..128] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu -64(%rsi, %rdx), %xmm4 ++ movdqu -48(%rsi, %rdx), %xmm5 ++ movdqu -32(%rsi, %rdx), %xmm6 ++ movdqu -16(%rsi, %rdx), %xmm7 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, -64(%rdi, %rdx) ++ movdqu %xmm5, -48(%rdi, %rdx) ++ movdqu %xmm6, -32(%rdi, %rdx) ++ movdqu %xmm7, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_128_or_more_backward): ++ cmp $256, %rdx ++ ja L(mm_len_256_or_more_backward) ++ ++/* Copy [0..256] and return. */ ++ movdqu (%rsi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ movdqu 32(%rsi), %xmm2 ++ movdqu 48(%rsi), %xmm3 ++ movdqu 64(%rsi), %xmm4 ++ movdqu 80(%rsi), %xmm5 ++ movdqu 96(%rsi), %xmm6 ++ movdqu 112(%rsi), %xmm7 ++ movdqu -128(%rsi, %rdx), %xmm8 ++ movdqu -112(%rsi, %rdx), %xmm9 ++ movdqu -96(%rsi, %rdx), %xmm10 ++ movdqu -80(%rsi, %rdx), %xmm11 ++ movdqu -64(%rsi, %rdx), %xmm12 ++ movdqu -48(%rsi, %rdx), %xmm13 ++ movdqu -32(%rsi, %rdx), %xmm14 ++ movdqu -16(%rsi, %rdx), %xmm15 ++ movdqu %xmm0, (%rdi) ++ movdqu %xmm1, 16(%rdi) ++ movdqu %xmm2, 32(%rdi) ++ movdqu %xmm3, 48(%rdi) ++ movdqu %xmm4, 64(%rdi) ++ movdqu %xmm5, 80(%rdi) ++ movdqu %xmm6, 96(%rdi) ++ movdqu %xmm7, 112(%rdi) ++ movdqu %xmm8, -128(%rdi, %rdx) ++ movdqu %xmm9, -112(%rdi, %rdx) ++ movdqu %xmm10, -96(%rdi, %rdx) ++ movdqu %xmm11, -80(%rdi, %rdx) ++ movdqu %xmm12, -64(%rdi, %rdx) ++ movdqu %xmm13, -48(%rdi, %rdx) ++ movdqu %xmm14, -32(%rdi, %rdx) ++ movdqu %xmm15, -16(%rdi, %rdx) ++ jmp L(mm_return) ++ ++L(mm_len_256_or_more_backward): ++/* Aligning the address of destination. We need to save ++ 128 bytes from the source in order not to overwrite them. */ ++ vmovdqu -32(%rsi, %rdx), %ymm0 ++ vmovdqu -64(%rsi, %rdx), %ymm1 ++ vmovdqu -96(%rsi, %rdx), %ymm2 ++ vmovdqu -128(%rsi, %rdx), %ymm3 ++ ++ lea (%rdi, %rdx), %r9 ++ and $-128, %r9 /* r9 = aligned dst */ ++ ++ mov %rsi, %r8 ++ sub %rdi, %r8 /* r8 = src - dst, diff */ ++ ++ vmovdqu -32(%r9, %r8), %ymm4 ++ vmovdqu -64(%r9, %r8), %ymm5 ++ vmovdqu -96(%r9, %r8), %ymm6 ++ vmovdqu -128(%r9, %r8), %ymm7 ++ ++ vmovdqu %ymm0, -32(%rdi, %rdx) ++ vmovdqu %ymm1, -64(%rdi, %rdx) ++ vmovdqu %ymm2, -96(%rdi, %rdx) ++ vmovdqu %ymm3, -128(%rdi, %rdx) ++ vmovdqa %ymm4, -32(%r9) ++ vmovdqa %ymm5, -64(%r9) ++ vmovdqa %ymm6, -96(%r9) ++ vmovdqa %ymm7, -128(%r9) ++ lea -128(%r9), %r9 ++ ++ lea 128(%rdi), %rbx ++ and $-128, %rbx ++ ++ cmp %r9, %rbx ++ jae L(mm_recalc_len) ++ ++ cmp $SHARED_CACHE_SIZE_HALF, %rdx ++ jae L(mm_large_page_loop_backward) ++ ++ .p2align 4 ++L(mm_main_loop_backward): ++ prefetcht0 -128(%r9, %r8) ++ ++ vmovdqu -64(%r9, %r8), %ymm0 ++ vmovdqu -32(%r9, %r8), %ymm1 ++ vmovdqa %ymm0, -64(%r9) ++ vmovaps %ymm1, -32(%r9) ++ lea -64(%r9), %r9 ++ cmp %r9, %rbx ++ jb L(mm_main_loop_backward) ++ jmp L(mm_recalc_len) ++ ++/* Copy [0..16] and return. 
*/ ++L(mm_len_0_16_bytes_backward): ++ testb $24, %dl ++ jnz L(mm_len_9_16_bytes_backward) ++ testb $4, %dl ++ .p2align 4,,5 ++ jnz L(mm_len_5_8_bytes_backward) ++ test %rdx, %rdx ++ .p2align 4,,2 ++ je L(mm_return) ++ testb $2, %dl ++ .p2align 4,,1 ++ jne L(mm_len_3_4_bytes_backward) ++ movzbl -1(%rsi,%rdx), %ebx ++ movzbl (%rsi), %ecx ++ movb %bl, -1(%rdi,%rdx) ++ movb %cl, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_3_4_bytes_backward): ++ movzwl -2(%rsi,%rdx), %ebx ++ movzwl (%rsi), %ecx ++ movw %bx, -2(%rdi,%rdx) ++ movw %cx, (%rdi) ++ jmp L(mm_return) ++ ++L(mm_len_9_16_bytes_backward): ++ movl -4(%rsi,%rdx), %ebx ++ movl -8(%rsi,%rdx), %ecx ++ movl %ebx, -4(%rdi,%rdx) ++ movl %ecx, -8(%rdi,%rdx) ++ sub $8, %rdx ++ jmp L(mm_len_0_16_bytes_backward) ++ ++L(mm_len_5_8_bytes_backward): ++ movl (%rsi), %ebx ++ movl -4(%rsi,%rdx), %ecx ++ movl %ebx, (%rdi) ++ movl %ecx, -4(%rdi,%rdx) ++ ++L(mm_return): ++ vzeroupper ++ RETURN ++ ++/* Big length copy forward part. */ ++ ++ .p2align 4 ++L(mm_large_page_loop_forward): ++ vmovdqu (%r8, %rsi), %ymm0 ++ vmovdqu 32(%r8, %rsi), %ymm1 ++ vmovdqu 64(%r8, %rsi), %ymm2 ++ vmovdqu 96(%r8, %rsi), %ymm3 ++ vmovntdq %ymm0, (%r8) ++ vmovntdq %ymm1, 32(%r8) ++ vmovntdq %ymm2, 64(%r8) ++ vmovntdq %ymm3, 96(%r8) ++ lea 128(%r8), %r8 ++ cmp %r8, %rbx ++ ja L(mm_large_page_loop_forward) ++ sfence ++ jmp L(mm_copy_remaining_forward) ++ ++/* Big length copy backward part. */ ++ .p2align 4 ++L(mm_large_page_loop_backward): ++ vmovdqu -64(%r9, %r8), %ymm0 ++ vmovdqu -32(%r9, %r8), %ymm1 ++ vmovntdq %ymm0, -64(%r9) ++ vmovntdq %ymm1, -32(%r9) ++ lea -64(%r9), %r9 ++ cmp %r9, %rbx ++ jb L(mm_large_page_loop_backward) ++ sfence ++ jmp L(mm_recalc_len) ++ ++END (MEMMOVE) ++ ++//ALIAS_SYMBOL(memcpy, MEMMOVE) +-- +2.25.1 +
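
The ifunc hook patched into dynamic_function_dispatch.cpp above resolves memmove to memmove_avx2 only when __builtin_cpu_supports("avx2") reports true, and the assembly itself splits the work into an early return for dst == src, a forward path for dst < src, a backward path for dst > src, small-size buckets up to 256 bytes, and a non-temporal loop once the length reaches SHARED_CACHE_SIZE_HALF. The commit message only notes that a benchmark suite was run, so as a quick correctness sketch (not part of the patch, independent of bionic's own test suite, and valid whether the binary ends up in memmove_avx2 or memmove_generic) a standalone C program like the following exercises each of those paths through the platform's memmove and compares the result against a byte-wise reference:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Byte-wise reference with the overlap semantics memmove guarantees. */
static void ref_move(unsigned char *dst, const unsigned char *src, size_t n) {
  if (dst < src) {
    for (size_t i = 0; i < n; i++) dst[i] = src[i];   /* copy forward */
  } else {
    for (size_t i = n; i-- > 0;) dst[i] = src[i];     /* copy backward */
  }
}

static void check(size_t n, ptrdiff_t shift) {
  size_t len = n + 512;                /* slack so shifted copies stay in bounds */
  unsigned char *a = malloc(len);
  unsigned char *b = malloc(len);
  assert(a && b);
  for (size_t i = 0; i < len; i++) a[i] = b[i] = (unsigned char)(i * 131u + 7u);

  /* shift > 0: dst > src, backward path; shift < 0: dst < src, forward path;
     shift == 0: dst == src, early-return path. */
  memmove(a + 256 + shift, a + 256, n);
  ref_move(b + 256 + shift, b + 256, n);

  /* Comparing the whole buffer also catches stray writes outside [dst, dst+n). */
  assert(memcmp(a, b, len) == 0);
  free(a);
  free(b);
}

int main(void) {
  /* Sizes straddle the [0..16] through [0..256] buckets plus the large-copy loop. */
  const size_t sizes[] = {0, 1, 5, 16, 17, 32, 33, 64, 65,
                          128, 129, 256, 257, 4096, 1u << 20};
  for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
    check(sizes[i], 3);
    check(sizes[i], -3);
    check(sizes[i], 0);
  }
  printf("memmove overlap checks passed\n");
  return 0;
}

Run on AVX2-capable hardware with the patched bionic, the calls go through memmove_avx2; on older CPUs the dispatcher falls back to memmove_generic, so the same check doubles as a regression guard for both variants.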