diff --git a/IDE/WIN/user_settings.h b/IDE/WIN/user_settings.h index 225108946a..a1011abf8c 100644 --- a/IDE/WIN/user_settings.h +++ b/IDE/WIN/user_settings.h @@ -74,6 +74,9 @@ #if 0 #define HAVE_INTEL_AVX2 #endif + + #define USE_INTEL_CHACHA_SPEEDUP + #define USE_INTEL_POLY1305_SPEEDUP #endif /* Single Precision Support for RSA/DH 1024/2048/3072 and diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index e53fbc45f8..7e13f99ab7 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -1971,6 +1971,7 @@ static int numBlocks = NUM_BLOCKS; static word32 bench_size = BENCH_SIZE; static int base2 = 1; static int digest_stream = 1; +static int encrypt_only = 0; #ifdef MULTI_VALUE_STATISTICS static int minimum_runs = 0; @@ -5820,27 +5821,54 @@ void bench_chacha(void) XMEMSET(enc, 0, sizeof(ChaCha)); wc_Chacha_SetKey(enc, bench_key, 16); - bench_stats_start(&count, &start); - do { - for (i = 0; i < numBlocks; i++) { - ret = wc_Chacha_SetIV(enc, bench_iv, 0); - if (ret < 0) { - printf("wc_Chacha_SetIV error: %d\n", ret); - goto exit; + if (encrypt_only) { + ret = wc_Chacha_SetIV(enc, bench_iv, 0); + if (ret < 0) { + printf("wc_Chacha_SetIV error: %d\n", ret); + goto exit; + } + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_Chacha_Process(enc, bench_cipher, bench_plain, + bench_size); + if (ret < 0) { + printf("wc_Chacha_Process error: %d\n", ret); + goto exit; + } + RECORD_MULTI_VALUE_STATS(); } - ret = wc_Chacha_Process(enc, bench_cipher, bench_plain, bench_size); - if (ret < 0) { - printf("wc_Chacha_Process error: %d\n", ret); - goto exit; + count += i; + } while (bench_stats_check(start) + #ifdef MULTI_VALUE_STATISTICS + || runs < minimum_runs + #endif + ); + } + else { + bench_stats_start(&count, &start); + do { + for (i = 0; i < numBlocks; i++) { + ret = wc_Chacha_SetIV(enc, bench_iv, 0); + if (ret < 0) { + printf("wc_Chacha_SetIV error: %d\n", ret); + goto exit; + } + ret = wc_Chacha_Process(enc, bench_cipher, bench_plain, + bench_size); + if (ret < 0) { + printf("wc_Chacha_Process error: %d\n", ret); + goto exit; + } + RECORD_MULTI_VALUE_STATS(); } - RECORD_MULTI_VALUE_STATS(); - } - count += i; - } while (bench_stats_check(start) -#ifdef MULTI_VALUE_STATISTICS - || runs < minimum_runs -#endif - ); + count += i; + } while (bench_stats_check(start) + #ifdef MULTI_VALUE_STATISTICS + || runs < minimum_runs + #endif + ); + } bench_stats_sym_finish("CHACHA", 0, count, bench_size, start, 0); #ifdef MULTI_VALUE_STATISTICS @@ -13470,6 +13498,8 @@ int wolfcrypt_benchmark_main(int argc, char** argv) #endif else if (string_matches(argv[1], "-dgst_full")) digest_stream = 0; + else if (string_matches(argv[1], "-enc_only")) + encrypt_only = 1; #ifndef NO_RSA else if (string_matches(argv[1], "-rsa_sign")) rsa_sign_verify = 1; diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index c0e3682fe3..5b3fc7f636 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -1,6 +1,6 @@ ; /* aes_gcm_asm.asm */ ; /* -; * Copyright (C) 2006-2023 wolfSSL Inc. +; * Copyright (C) 2006-2024 wolfSSL Inc. ; * ; * This file is part of wolfSSL. 
; * @@ -116,16 +116,16 @@ AES_GCM_encrypt_aesni PROC mov r15, QWORD PTR [rsp+136] mov r10d, DWORD PTR [rsp+144] sub rsp, 320 - movdqu [rsp+160], xmm6 - movdqu [rsp+176], xmm7 - movdqu [rsp+192], xmm8 - movdqu [rsp+208], xmm9 - movdqu [rsp+224], xmm10 - movdqu [rsp+240], xmm11 - movdqu [rsp+256], xmm12 - movdqu [rsp+272], xmm13 - movdqu [rsp+288], xmm14 - movdqu [rsp+304], xmm15 + movdqu OWORD PTR [rsp+160], xmm6 + movdqu OWORD PTR [rsp+176], xmm7 + movdqu OWORD PTR [rsp+192], xmm8 + movdqu OWORD PTR [rsp+208], xmm9 + movdqu OWORD PTR [rsp+224], xmm10 + movdqu OWORD PTR [rsp+240], xmm11 + movdqu OWORD PTR [rsp+256], xmm12 + movdqu OWORD PTR [rsp+272], xmm13 + movdqu OWORD PTR [rsp+288], xmm14 + movdqu OWORD PTR [rsp+304], xmm15 pxor xmm4, xmm4 pxor xmm6, xmm6 cmp ebx, 12 @@ -189,7 +189,7 @@ L_AES_GCM_encrypt_aesni_calc_iv_12_last: aesenclast xmm5, xmm7 aesenclast xmm1, xmm7 pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask - movdqu [rsp+144], xmm1 + movdqu OWORD PTR [rsp+144], xmm1 jmp L_AES_GCM_encrypt_aesni_iv_done L_AES_GCM_encrypt_aesni_iv_not_12: ; Calculate values when IV is not 12 bytes @@ -227,7 +227,7 @@ L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_encrypt_aesni_calc_iv_lt16 and edx, 4294967280 L_AES_GCM_encrypt_aesni_calc_iv_16_loop: - movdqu xmm8, [rax+rcx] + movdqu xmm8, OWORD PTR [rax+rcx] pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm8 pshufd xmm1, xmm4, 78 @@ -294,7 +294,7 @@ L_AES_GCM_encrypt_aesni_calc_iv_lt16: sub rsp, 16 pxor xmm8, xmm8 xor ebx, ebx - movdqu [rsp], xmm8 + movdqu OWORD PTR [rsp], xmm8 L_AES_GCM_encrypt_aesni_calc_iv_loop: movzx r13d, BYTE PTR [rax+rcx] mov BYTE PTR [rsp+rbx], r13b @@ -302,7 +302,7 @@ L_AES_GCM_encrypt_aesni_calc_iv_loop: inc ebx cmp ecx, edx jl L_AES_GCM_encrypt_aesni_calc_iv_loop - movdqu xmm8, [rsp] + movdqu xmm8, OWORD PTR [rsp] add rsp, 16 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm8 @@ -446,7 +446,7 @@ L_AES_GCM_encrypt_aesni_calc_iv_done: movdqa xmm9, OWORD PTR [r15+224] L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last: aesenclast xmm8, xmm9 - movdqu [rsp+144], xmm8 + movdqu OWORD PTR [rsp+144], xmm8 L_AES_GCM_encrypt_aesni_iv_done: ; Additional authentication data mov edx, r11d @@ -457,7 +457,7 @@ L_AES_GCM_encrypt_aesni_iv_done: jl L_AES_GCM_encrypt_aesni_calc_aad_lt16 and edx, 4294967280 L_AES_GCM_encrypt_aesni_calc_aad_16_loop: - movdqu xmm8, [r12+rcx] + movdqu xmm8, OWORD PTR [r12+rcx] pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 pshufd xmm1, xmm6, 78 @@ -524,7 +524,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_lt16: sub rsp, 16 pxor xmm8, xmm8 xor ebx, ebx - movdqu [rsp], xmm8 + movdqu OWORD PTR [rsp], xmm8 L_AES_GCM_encrypt_aesni_calc_aad_loop: movzx r13d, BYTE PTR [r12+rcx] mov BYTE PTR [rsp+rbx], r13b @@ -532,7 +532,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_loop: inc ebx cmp ecx, edx jl L_AES_GCM_encrypt_aesni_calc_aad_loop - movdqu xmm8, [rsp] + movdqu xmm8, OWORD PTR [rsp] add rsp, 16 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 @@ -596,7 +596,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: movdqa xmm9, xmm5 paddd xmm4, OWORD PTR L_aes_gcm_one movdqa xmm8, xmm5 - movdqu [rsp+128], xmm4 + movdqu OWORD PTR [rsp+128], xmm4 psrlq xmm9, 63 psllq xmm8, 1 pslldq xmm9, 8 @@ -612,7 +612,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: and r13d, 4294967168 movdqa xmm2, xmm6 ; H ^ 1 - movdqu [rsp], xmm5 + movdqu OWORD PTR [rsp], xmm5 ; H ^ 2 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm5, 78 @@ -654,7 +654,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor 
xmm0, xmm14 - movdqu [rsp+16], xmm0 + movdqu OWORD PTR [rsp+16], xmm0 ; H ^ 3 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm0, 78 @@ -696,7 +696,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm1, xmm14 - movdqu [rsp+32], xmm1 + movdqu OWORD PTR [rsp+32], xmm1 ; H ^ 4 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm0, 78 @@ -738,7 +738,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm3, xmm14 - movdqu [rsp+48], xmm3 + movdqu OWORD PTR [rsp+48], xmm3 ; H ^ 5 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm1, 78 @@ -780,7 +780,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+64], xmm7 + movdqu OWORD PTR [rsp+64], xmm7 ; H ^ 6 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm1, 78 @@ -822,7 +822,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+80], xmm7 + movdqu OWORD PTR [rsp+80], xmm7 ; H ^ 7 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm3, 78 @@ -864,7 +864,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+96], xmm7 + movdqu OWORD PTR [rsp+96], xmm7 ; H ^ 8 pshufd xmm9, xmm3, 78 pshufd xmm10, xmm3, 78 @@ -906,9 +906,9 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+112], xmm7 + movdqu OWORD PTR [rsp+112], xmm7 ; First 128 bytes of input - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -935,7 +935,7 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [r15] - movdqu [rsp+128], xmm0 + movdqu OWORD PTR [rsp+128], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -1069,36 +1069,36 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: L_AES_GCM_encrypt_aesni_enc_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [rdi] - movdqu xmm1, [rdi+16] + movdqu xmm0, OWORD PTR [rdi] + movdqu xmm1, OWORD PTR [rdi+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [rsi], xmm8 - movdqu [rsi+16], xmm9 + movdqu OWORD PTR [rsi], xmm8 + movdqu OWORD PTR [rsi+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [rdi+32] - movdqu xmm1, [rdi+48] + movdqu xmm0, OWORD PTR [rdi+32] + movdqu xmm1, OWORD PTR [rdi+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [rsi+32], xmm10 - movdqu [rsi+48], xmm11 + movdqu OWORD PTR [rsi+32], xmm10 + movdqu OWORD PTR [rsi+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [rdi+64] - movdqu xmm1, [rdi+80] + movdqu xmm0, OWORD PTR [rdi+64] + movdqu xmm1, OWORD PTR [rdi+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [rsi+64], xmm12 - movdqu [rsi+80], xmm13 + movdqu OWORD PTR [rsi+64], xmm12 + movdqu OWORD PTR [rsi+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [rdi+96] - movdqu xmm1, [rdi+112] + movdqu xmm0, OWORD PTR [rdi+96] + movdqu xmm1, OWORD PTR [rdi+112] pxor xmm14, xmm0 pxor xmm15, xmm1 - movdqu [rsi+96], xmm14 - movdqu [rsi+112], xmm15 + movdqu OWORD PTR [rsi+96], xmm14 + movdqu OWORD PTR [rsi+112], xmm15 cmp r13d, 128 mov ebx, 128 jle L_AES_GCM_encrypt_aesni_end_128 @@ -1106,7 +1106,7 @@ L_AES_GCM_encrypt_aesni_enc_done: L_AES_GCM_encrypt_aesni_ghash_128: lea rcx, QWORD PTR [rdi+rbx] lea rdx, QWORD PTR [rsi+rbx] - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -1133,7 +1133,7 @@ 
L_AES_GCM_encrypt_aesni_ghash_128: pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [r15] - movdqu [rsp+128], xmm0 + movdqu OWORD PTR [rsp+128], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -1142,8 +1142,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm13, xmm7 pxor xmm14, xmm7 pxor xmm15, xmm7 - movdqu xmm7, [rsp+112] - movdqu xmm0, [rdx+-128] + movdqu xmm7, OWORD PTR [rsp+112] + movdqu xmm0, OWORD PTR [rdx+-128] aesenc xmm8, [r15+16] pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask pxor xmm0, xmm2 @@ -1165,8 +1165,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: aesenc xmm15, [r15+16] pxor xmm1, xmm2 pxor xmm1, xmm3 - movdqu xmm7, [rsp+96] - movdqu xmm0, [rdx+-112] + movdqu xmm7, OWORD PTR [rsp+96] + movdqu xmm0, OWORD PTR [rdx+-112] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+32] @@ -1189,8 +1189,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+80] - movdqu xmm0, [rdx+-96] + movdqu xmm7, OWORD PTR [rsp+80] + movdqu xmm0, OWORD PTR [rdx+-96] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+48] @@ -1213,8 +1213,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+64] - movdqu xmm0, [rdx+-80] + movdqu xmm7, OWORD PTR [rsp+64] + movdqu xmm0, OWORD PTR [rdx+-80] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+64] @@ -1237,8 +1237,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+48] - movdqu xmm0, [rdx+-64] + movdqu xmm7, OWORD PTR [rsp+48] + movdqu xmm0, OWORD PTR [rdx+-64] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+80] @@ -1261,8 +1261,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+32] - movdqu xmm0, [rdx+-48] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm0, OWORD PTR [rdx+-48] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+96] @@ -1285,8 +1285,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+16] - movdqu xmm0, [rdx+-32] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm0, OWORD PTR [rdx+-32] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+112] @@ -1309,8 +1309,8 @@ L_AES_GCM_encrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp] - movdqu xmm0, [rdx+-16] + movdqu xmm7, OWORD PTR [rsp] + movdqu xmm0, OWORD PTR [rdx+-16] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+128] @@ -1413,36 +1413,36 @@ L_AES_GCM_encrypt_aesni_ghash_128: L_AES_GCM_encrypt_aesni_aesenc_128_ghash_avx_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [rcx] - movdqu xmm1, [rcx+16] + movdqu xmm0, OWORD PTR [rcx] + movdqu xmm1, OWORD PTR [rcx+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [rdx], xmm8 - movdqu [rdx+16], xmm9 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [rcx+32] - movdqu xmm1, [rcx+48] + movdqu xmm0, OWORD PTR [rcx+32] + movdqu xmm1, OWORD PTR [rcx+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [rdx+32], xmm10 - movdqu [rdx+48], xmm11 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [rcx+64] - movdqu xmm1, [rcx+80] + movdqu xmm0, 
OWORD PTR [rcx+64] + movdqu xmm1, OWORD PTR [rcx+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [rdx+64], xmm12 - movdqu [rdx+80], xmm13 + movdqu OWORD PTR [rdx+64], xmm12 + movdqu OWORD PTR [rdx+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [rcx+96] - movdqu xmm1, [rcx+112] + movdqu xmm0, OWORD PTR [rcx+96] + movdqu xmm1, OWORD PTR [rcx+112] pxor xmm14, xmm0 pxor xmm15, xmm1 - movdqu [rdx+96], xmm14 - movdqu [rdx+112], xmm15 + movdqu OWORD PTR [rdx+96], xmm14 + movdqu OWORD PTR [rdx+112], xmm15 add ebx, 128 cmp ebx, r13d jl L_AES_GCM_encrypt_aesni_ghash_128 @@ -1457,7 +1457,7 @@ L_AES_GCM_encrypt_aesni_end_128: pshufb xmm13, xmm4 pshufb xmm14, xmm4 pshufb xmm15, xmm4 - movdqu xmm7, [rsp+112] + movdqu xmm7, OWORD PTR [rsp+112] pshufd xmm1, xmm8, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1476,7 +1476,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+96] + movdqu xmm7, OWORD PTR [rsp+96] pshufd xmm1, xmm9, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1495,7 +1495,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+80] + movdqu xmm7, OWORD PTR [rsp+80] pshufd xmm1, xmm10, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1514,7 +1514,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+64] + movdqu xmm7, OWORD PTR [rsp+64] pshufd xmm1, xmm11, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1533,7 +1533,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+48] + movdqu xmm7, OWORD PTR [rsp+48] pshufd xmm1, xmm12, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1552,7 +1552,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+32] + movdqu xmm7, OWORD PTR [rsp+32] pshufd xmm1, xmm13, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1571,7 +1571,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+16] + movdqu xmm7, OWORD PTR [rsp+16] pshufd xmm1, xmm14, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1590,7 +1590,7 @@ L_AES_GCM_encrypt_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp] + movdqu xmm7, OWORD PTR [rsp] pshufd xmm1, xmm15, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -1632,7 +1632,7 @@ L_AES_GCM_encrypt_aesni_end_128: pxor xmm2, xmm1 pxor xmm2, xmm4 pxor xmm6, xmm2 - movdqu xmm5, [rsp] + movdqu xmm5, OWORD PTR [rsp] L_AES_GCM_encrypt_aesni_done_128: mov edx, r9d cmp ebx, edx @@ -1643,12 +1643,12 @@ L_AES_GCM_encrypt_aesni_done_128: jge L_AES_GCM_encrypt_aesni_last_block_done lea rcx, QWORD PTR [rdi+rbx] lea rdx, QWORD PTR [rsi+rbx] - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [r15] - movdqu [rsp+128], xmm9 + movdqu OWORD PTR [rsp+128], xmm9 aesenc xmm8, [r15+16] aesenc xmm8, [r15+32] aesenc xmm8, [r15+48] @@ -1671,9 +1671,9 @@ L_AES_GCM_encrypt_aesni_done_128: movdqa xmm9, OWORD PTR [r15+224] L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 add ebx, 16 @@ -1682,12 +1682,12 @@ L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last: L_AES_GCM_encrypt_aesni_last_block_start: lea rcx, QWORD PTR 
[rdi+rbx] lea rdx, QWORD PTR [rsi+rbx] - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [r15] - movdqu [rsp+128], xmm9 + movdqu OWORD PTR [rsp+128], xmm9 movdqa xmm10, xmm6 pclmulqdq xmm10, xmm5, 16 aesenc xmm8, [r15+16] @@ -1735,9 +1735,9 @@ L_AES_GCM_encrypt_aesni_last_block_start: movdqa xmm9, OWORD PTR [r15+224] L_AES_GCM_encrypt_aesni_aesenc_gfmul_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 add ebx, 16 @@ -1789,7 +1789,7 @@ L_AES_GCM_encrypt_aesni_last_block_done: mov edx, ecx and ecx, 15 jz L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done - movdqu xmm4, [rsp+128] + movdqu xmm4, OWORD PTR [rsp+128] pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 pxor xmm4, [r15] aesenc xmm4, [r15+16] @@ -1816,7 +1816,7 @@ L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last: aesenclast xmm4, xmm9 sub rsp, 16 xor ecx, ecx - movdqu [rsp], xmm4 + movdqu OWORD PTR [rsp], xmm4 L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop: movzx r13d, BYTE PTR [rdi+rbx] xor r13b, BYTE PTR [rsp+rcx] @@ -1835,7 +1835,7 @@ L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop: cmp ecx, 16 jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc: - movdqu xmm4, [rsp] + movdqu xmm4, OWORD PTR [rsp] add rsp, 16 pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm4 @@ -1929,12 +1929,12 @@ L_AES_GCM_encrypt_aesni_done_enc: pxor xmm14, xmm8 pxor xmm6, xmm14 pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask - movdqu xmm0, [rsp+144] + movdqu xmm0, OWORD PTR [rsp+144] pxor xmm0, xmm6 cmp r14d, 16 je L_AES_GCM_encrypt_aesni_store_tag_16 xor rcx, rcx - movdqu [rsp], xmm0 + movdqu OWORD PTR [rsp], xmm0 L_AES_GCM_encrypt_aesni_store_tag_loop: movzx r13d, BYTE PTR [rsp+rcx] mov BYTE PTR [r8+rcx], r13b @@ -1943,18 +1943,18 @@ L_AES_GCM_encrypt_aesni_store_tag_loop: jne L_AES_GCM_encrypt_aesni_store_tag_loop jmp L_AES_GCM_encrypt_aesni_store_tag_done L_AES_GCM_encrypt_aesni_store_tag_16: - movdqu [r8], xmm0 + movdqu OWORD PTR [r8], xmm0 L_AES_GCM_encrypt_aesni_store_tag_done: - movdqu xmm6, [rsp+160] - movdqu xmm7, [rsp+176] - movdqu xmm8, [rsp+192] - movdqu xmm9, [rsp+208] - movdqu xmm10, [rsp+224] - movdqu xmm11, [rsp+240] - movdqu xmm12, [rsp+256] - movdqu xmm13, [rsp+272] - movdqu xmm14, [rsp+288] - movdqu xmm15, [rsp+304] + movdqu xmm6, OWORD PTR [rsp+160] + movdqu xmm7, OWORD PTR [rsp+176] + movdqu xmm8, OWORD PTR [rsp+192] + movdqu xmm9, OWORD PTR [rsp+208] + movdqu xmm10, OWORD PTR [rsp+224] + movdqu xmm11, OWORD PTR [rsp+240] + movdqu xmm12, OWORD PTR [rsp+256] + movdqu xmm13, OWORD PTR [rsp+272] + movdqu xmm14, OWORD PTR [rsp+288] + movdqu xmm15, OWORD PTR [rsp+304] add rsp, 320 pop r15 pop r14 @@ -1989,16 +1989,16 @@ AES_GCM_decrypt_aesni PROC mov r10d, DWORD PTR [rsp+152] mov rbp, QWORD PTR [rsp+160] sub rsp, 328 - movdqu [rsp+168], xmm6 - movdqu [rsp+184], xmm7 - movdqu [rsp+200], xmm8 - movdqu [rsp+216], xmm9 - movdqu [rsp+232], xmm10 - movdqu [rsp+248], xmm11 - movdqu [rsp+264], xmm12 - movdqu [rsp+280], xmm13 - movdqu [rsp+296], xmm14 - movdqu [rsp+312], xmm15 + movdqu OWORD PTR [rsp+168], xmm6 + movdqu OWORD PTR [rsp+184], xmm7 + movdqu OWORD PTR [rsp+200], xmm8 + movdqu OWORD PTR [rsp+216], xmm9 + movdqu OWORD PTR [rsp+232], xmm10 + movdqu OWORD PTR [rsp+248], xmm11 + 
movdqu OWORD PTR [rsp+264], xmm12 + movdqu OWORD PTR [rsp+280], xmm13 + movdqu OWORD PTR [rsp+296], xmm14 + movdqu OWORD PTR [rsp+312], xmm15 pxor xmm4, xmm4 pxor xmm6, xmm6 cmp ebx, 12 @@ -2062,7 +2062,7 @@ L_AES_GCM_decrypt_aesni_calc_iv_12_last: aesenclast xmm5, xmm7 aesenclast xmm1, xmm7 pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask - movdqu [rsp+144], xmm1 + movdqu OWORD PTR [rsp+144], xmm1 jmp L_AES_GCM_decrypt_aesni_iv_done L_AES_GCM_decrypt_aesni_iv_not_12: ; Calculate values when IV is not 12 bytes @@ -2100,7 +2100,7 @@ L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_decrypt_aesni_calc_iv_lt16 and edx, 4294967280 L_AES_GCM_decrypt_aesni_calc_iv_16_loop: - movdqu xmm8, [rax+rcx] + movdqu xmm8, OWORD PTR [rax+rcx] pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm8 pshufd xmm1, xmm4, 78 @@ -2167,7 +2167,7 @@ L_AES_GCM_decrypt_aesni_calc_iv_lt16: sub rsp, 16 pxor xmm8, xmm8 xor ebx, ebx - movdqu [rsp], xmm8 + movdqu OWORD PTR [rsp], xmm8 L_AES_GCM_decrypt_aesni_calc_iv_loop: movzx r13d, BYTE PTR [rax+rcx] mov BYTE PTR [rsp+rbx], r13b @@ -2175,7 +2175,7 @@ L_AES_GCM_decrypt_aesni_calc_iv_loop: inc ebx cmp ecx, edx jl L_AES_GCM_decrypt_aesni_calc_iv_loop - movdqu xmm8, [rsp] + movdqu xmm8, OWORD PTR [rsp] add rsp, 16 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm8 @@ -2319,7 +2319,7 @@ L_AES_GCM_decrypt_aesni_calc_iv_done: movdqa xmm9, OWORD PTR [r15+224] L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last: aesenclast xmm8, xmm9 - movdqu [rsp+144], xmm8 + movdqu OWORD PTR [rsp+144], xmm8 L_AES_GCM_decrypt_aesni_iv_done: ; Additional authentication data mov edx, r11d @@ -2330,7 +2330,7 @@ L_AES_GCM_decrypt_aesni_iv_done: jl L_AES_GCM_decrypt_aesni_calc_aad_lt16 and edx, 4294967280 L_AES_GCM_decrypt_aesni_calc_aad_16_loop: - movdqu xmm8, [r12+rcx] + movdqu xmm8, OWORD PTR [r12+rcx] pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 pshufd xmm1, xmm6, 78 @@ -2397,7 +2397,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_lt16: sub rsp, 16 pxor xmm8, xmm8 xor ebx, ebx - movdqu [rsp], xmm8 + movdqu OWORD PTR [rsp], xmm8 L_AES_GCM_decrypt_aesni_calc_aad_loop: movzx r13d, BYTE PTR [r12+rcx] mov BYTE PTR [rsp+rbx], r13b @@ -2405,7 +2405,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_loop: inc ebx cmp ecx, edx jl L_AES_GCM_decrypt_aesni_calc_aad_loop - movdqu xmm8, [rsp] + movdqu xmm8, OWORD PTR [rsp] add rsp, 16 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 @@ -2469,7 +2469,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: movdqa xmm9, xmm5 paddd xmm4, OWORD PTR L_aes_gcm_one movdqa xmm8, xmm5 - movdqu [rsp+128], xmm4 + movdqu OWORD PTR [rsp+128], xmm4 psrlq xmm9, 63 psllq xmm8, 1 pslldq xmm9, 8 @@ -2485,7 +2485,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: and r13d, 4294967168 movdqa xmm2, xmm6 ; H ^ 1 - movdqu [rsp], xmm5 + movdqu OWORD PTR [rsp], xmm5 ; H ^ 2 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm5, 78 @@ -2527,7 +2527,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm0, xmm14 - movdqu [rsp+16], xmm0 + movdqu OWORD PTR [rsp+16], xmm0 ; H ^ 3 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm0, 78 @@ -2569,7 +2569,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm1, xmm14 - movdqu [rsp+32], xmm1 + movdqu OWORD PTR [rsp+32], xmm1 ; H ^ 4 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm0, 78 @@ -2611,7 +2611,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm3, xmm14 - movdqu [rsp+48], xmm3 + movdqu OWORD PTR [rsp+48], xmm3 ; H ^ 5 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm1, 78 @@ 
-2653,7 +2653,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+64], xmm7 + movdqu OWORD PTR [rsp+64], xmm7 ; H ^ 6 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm1, 78 @@ -2695,7 +2695,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+80], xmm7 + movdqu OWORD PTR [rsp+80], xmm7 ; H ^ 7 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm3, 78 @@ -2737,7 +2737,7 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+96], xmm7 + movdqu OWORD PTR [rsp+96], xmm7 ; H ^ 8 pshufd xmm9, xmm3, 78 pshufd xmm10, xmm3, 78 @@ -2779,11 +2779,11 @@ L_AES_GCM_decrypt_aesni_calc_aad_done: pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+112], xmm7 + movdqu OWORD PTR [rsp+112], xmm7 L_AES_GCM_decrypt_aesni_ghash_128: lea rcx, QWORD PTR [rdi+rbx] lea rdx, QWORD PTR [rsi+rbx] - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -2810,7 +2810,7 @@ L_AES_GCM_decrypt_aesni_ghash_128: pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [r15] - movdqu [rsp+128], xmm0 + movdqu OWORD PTR [rsp+128], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -2819,8 +2819,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm13, xmm7 pxor xmm14, xmm7 pxor xmm15, xmm7 - movdqu xmm7, [rsp+112] - movdqu xmm0, [rcx] + movdqu xmm7, OWORD PTR [rsp+112] + movdqu xmm0, OWORD PTR [rcx] aesenc xmm8, [r15+16] pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask pxor xmm0, xmm2 @@ -2842,8 +2842,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: aesenc xmm15, [r15+16] pxor xmm1, xmm2 pxor xmm1, xmm3 - movdqu xmm7, [rsp+96] - movdqu xmm0, [rcx+16] + movdqu xmm7, OWORD PTR [rsp+96] + movdqu xmm0, OWORD PTR [rcx+16] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+32] @@ -2866,8 +2866,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+80] - movdqu xmm0, [rcx+32] + movdqu xmm7, OWORD PTR [rsp+80] + movdqu xmm0, OWORD PTR [rcx+32] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+48] @@ -2890,8 +2890,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+64] - movdqu xmm0, [rcx+48] + movdqu xmm7, OWORD PTR [rsp+64] + movdqu xmm0, OWORD PTR [rcx+48] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+64] @@ -2914,8 +2914,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+48] - movdqu xmm0, [rcx+64] + movdqu xmm7, OWORD PTR [rsp+48] + movdqu xmm0, OWORD PTR [rcx+64] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+80] @@ -2938,8 +2938,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+32] - movdqu xmm0, [rcx+80] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm0, OWORD PTR [rcx+80] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+96] @@ -2962,8 +2962,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+16] - movdqu xmm0, [rcx+96] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm0, OWORD PTR [rcx+96] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+112] @@ -2986,8 +2986,8 @@ L_AES_GCM_decrypt_aesni_ghash_128: 
pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp] - movdqu xmm0, [rcx+112] + movdqu xmm7, OWORD PTR [rsp] + movdqu xmm0, OWORD PTR [rcx+112] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [r15+128] @@ -3090,41 +3090,41 @@ L_AES_GCM_decrypt_aesni_ghash_128: L_AES_GCM_decrypt_aesni_aesenc_128_ghash_avx_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [rcx] - movdqu xmm1, [rcx+16] + movdqu xmm0, OWORD PTR [rcx] + movdqu xmm1, OWORD PTR [rcx+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [rdx], xmm8 - movdqu [rdx+16], xmm9 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [rcx+32] - movdqu xmm1, [rcx+48] + movdqu xmm0, OWORD PTR [rcx+32] + movdqu xmm1, OWORD PTR [rcx+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [rdx+32], xmm10 - movdqu [rdx+48], xmm11 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [rcx+64] - movdqu xmm1, [rcx+80] + movdqu xmm0, OWORD PTR [rcx+64] + movdqu xmm1, OWORD PTR [rcx+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [rdx+64], xmm12 - movdqu [rdx+80], xmm13 + movdqu OWORD PTR [rdx+64], xmm12 + movdqu OWORD PTR [rdx+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [rcx+96] - movdqu xmm1, [rcx+112] + movdqu xmm0, OWORD PTR [rcx+96] + movdqu xmm1, OWORD PTR [rcx+112] pxor xmm14, xmm0 pxor xmm15, xmm1 - movdqu [rdx+96], xmm14 - movdqu [rdx+112], xmm15 + movdqu OWORD PTR [rdx+96], xmm14 + movdqu OWORD PTR [rdx+112], xmm15 add ebx, 128 cmp ebx, r13d jl L_AES_GCM_decrypt_aesni_ghash_128 movdqa xmm6, xmm2 - movdqu xmm5, [rsp] + movdqu xmm5, OWORD PTR [rsp] L_AES_GCM_decrypt_aesni_done_128: mov edx, r9d cmp ebx, edx @@ -3136,16 +3136,16 @@ L_AES_GCM_decrypt_aesni_done_128: L_AES_GCM_decrypt_aesni_last_block_start: lea rcx, QWORD PTR [rdi+rbx] lea rdx, QWORD PTR [rsi+rbx] - movdqu xmm1, [rcx] + movdqu xmm1, OWORD PTR [rcx] movdqa xmm0, xmm5 pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask pxor xmm1, xmm6 - movdqu xmm8, [rsp+128] + movdqu xmm8, OWORD PTR [rsp+128] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [r15] - movdqu [rsp+128], xmm9 + movdqu OWORD PTR [rsp+128], xmm9 movdqa xmm10, xmm1 pclmulqdq xmm10, xmm0, 16 aesenc xmm8, [r15+16] @@ -3193,9 +3193,9 @@ L_AES_GCM_decrypt_aesni_last_block_start: movdqa xmm9, OWORD PTR [r15+224] L_AES_GCM_decrypt_aesni_aesenc_gfmul_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 add ebx, 16 cmp ebx, r13d jl L_AES_GCM_decrypt_aesni_last_block_start @@ -3204,7 +3204,7 @@ L_AES_GCM_decrypt_aesni_last_block_done: mov edx, ecx and ecx, 15 jz L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done - movdqu xmm4, [rsp+128] + movdqu xmm4, OWORD PTR [rsp+128] pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 pxor xmm4, [r15] aesenc xmm4, [r15+16] @@ -3231,9 +3231,9 @@ L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last: aesenclast xmm4, xmm9 sub rsp, 32 xor ecx, ecx - movdqu [rsp], xmm4 + movdqu OWORD PTR [rsp], xmm4 pxor xmm0, xmm0 - movdqu [rsp+16], xmm0 + movdqu OWORD PTR [rsp+16], xmm0 L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop: movzx r13d, BYTE PTR [rdi+rbx] mov BYTE PTR [rsp+rcx+16], r13b @@ -3243,7 +3243,7 @@ L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop: inc ecx cmp ebx, edx jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop - 
movdqu xmm4, [rsp+16] + movdqu xmm4, OWORD PTR [rsp+16] add rsp, 32 pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm4 @@ -3337,14 +3337,14 @@ L_AES_GCM_decrypt_aesni_done_dec: pxor xmm14, xmm8 pxor xmm6, xmm14 pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask - movdqu xmm0, [rsp+144] + movdqu xmm0, OWORD PTR [rsp+144] pxor xmm0, xmm6 cmp r14d, 16 je L_AES_GCM_decrypt_aesni_cmp_tag_16 sub rsp, 16 xor rcx, rcx xor rbx, rbx - movdqu [rsp], xmm0 + movdqu OWORD PTR [rsp], xmm0 L_AES_GCM_decrypt_aesni_cmp_tag_loop: movzx r13d, BYTE PTR [rsp+rcx] xor r13b, BYTE PTR [r8+rcx] @@ -3352,13 +3352,13 @@ L_AES_GCM_decrypt_aesni_cmp_tag_loop: inc ecx cmp ecx, r14d jne L_AES_GCM_decrypt_aesni_cmp_tag_loop - cmp rbx, 0 + cmp bl, 0 sete bl add rsp, 16 xor rcx, rcx jmp L_AES_GCM_decrypt_aesni_cmp_tag_done L_AES_GCM_decrypt_aesni_cmp_tag_16: - movdqu xmm1, [r8] + movdqu xmm1, OWORD PTR [r8] pcmpeqb xmm0, xmm1 pmovmskb rdx, xmm0 ; %%edx == 0xFFFF then return 1 else => return 0 @@ -3367,16 +3367,16 @@ L_AES_GCM_decrypt_aesni_cmp_tag_16: sete bl L_AES_GCM_decrypt_aesni_cmp_tag_done: mov DWORD PTR [rbp], ebx - movdqu xmm6, [rsp+168] - movdqu xmm7, [rsp+184] - movdqu xmm8, [rsp+200] - movdqu xmm9, [rsp+216] - movdqu xmm10, [rsp+232] - movdqu xmm11, [rsp+248] - movdqu xmm12, [rsp+264] - movdqu xmm13, [rsp+280] - movdqu xmm14, [rsp+296] - movdqu xmm15, [rsp+312] + movdqu xmm6, OWORD PTR [rsp+168] + movdqu xmm7, OWORD PTR [rsp+184] + movdqu xmm8, OWORD PTR [rsp+200] + movdqu xmm9, OWORD PTR [rsp+216] + movdqu xmm10, OWORD PTR [rsp+232] + movdqu xmm11, OWORD PTR [rsp+248] + movdqu xmm12, OWORD PTR [rsp+264] + movdqu xmm13, OWORD PTR [rsp+280] + movdqu xmm14, OWORD PTR [rsp+296] + movdqu xmm15, OWORD PTR [rsp+312] add rsp, 328 pop rbp pop r15 @@ -3404,10 +3404,10 @@ AES_GCM_init_aesni PROC mov r8, QWORD PTR [rsp+88] mov r9, QWORD PTR [rsp+96] sub rsp, 80 - movdqu [rsp+16], xmm6 - movdqu [rsp+32], xmm7 - movdqu [rsp+48], xmm8 - movdqu [rsp+64], xmm15 + movdqu OWORD PTR [rsp+16], xmm6 + movdqu OWORD PTR [rsp+32], xmm7 + movdqu OWORD PTR [rsp+48], xmm8 + movdqu OWORD PTR [rsp+64], xmm15 pxor xmm4, xmm4 mov edx, r11d cmp edx, 12 @@ -3508,7 +3508,7 @@ L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: jl L_AES_GCM_init_aesni_calc_iv_lt16 and edx, 4294967280 L_AES_GCM_init_aesni_calc_iv_16_loop: - movdqu xmm7, [r10+rcx] + movdqu xmm7, OWORD PTR [r10+rcx] pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm7 pshufd xmm1, xmm4, 78 @@ -3575,7 +3575,7 @@ L_AES_GCM_init_aesni_calc_iv_lt16: sub rsp, 16 pxor xmm7, xmm7 xor r13d, r13d - movdqu [rsp], xmm7 + movdqu OWORD PTR [rsp], xmm7 L_AES_GCM_init_aesni_calc_iv_loop: movzx r12d, BYTE PTR [r10+rcx] mov BYTE PTR [rsp+r13], r12b @@ -3583,7 +3583,7 @@ L_AES_GCM_init_aesni_calc_iv_loop: inc r13d cmp ecx, edx jl L_AES_GCM_init_aesni_calc_iv_loop - movdqu xmm7, [rsp] + movdqu xmm7, OWORD PTR [rsp] add rsp, 16 pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm7 @@ -3734,10 +3734,10 @@ L_AES_GCM_init_aesni_iv_done: paddd xmm4, OWORD PTR L_aes_gcm_one movdqa OWORD PTR [rax], xmm5 movdqa OWORD PTR [r8], xmm4 - movdqu xmm6, [rsp+16] - movdqu xmm7, [rsp+32] - movdqu xmm8, [rsp+48] - movdqu xmm15, [rsp+64] + movdqu xmm6, OWORD PTR [rsp+16] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm8, OWORD PTR [rsp+48] + movdqu xmm15, OWORD PTR [rsp+64] add rsp, 80 pop r14 pop r13 @@ -3751,13 +3751,13 @@ _text SEGMENT READONLY PARA AES_GCM_aad_update_aesni PROC mov rax, rcx sub rsp, 32 - movdqu [rsp], xmm6 - movdqu [rsp+16], xmm7 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], 
xmm7 movdqa xmm5, OWORD PTR [r8] movdqa xmm6, OWORD PTR [r9] xor ecx, ecx L_AES_GCM_aad_update_aesni_16_loop: - movdqu xmm7, [rax+rcx] + movdqu xmm7, OWORD PTR [rax+rcx] pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask pxor xmm5, xmm7 pshufd xmm1, xmm5, 78 @@ -3818,8 +3818,8 @@ L_AES_GCM_aad_update_aesni_16_loop: cmp ecx, edx jl L_AES_GCM_aad_update_aesni_16_loop movdqa OWORD PTR [r8], xmm5 - movdqu xmm6, [rsp] - movdqu xmm7, [rsp+16] + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] add rsp, 32 ret AES_GCM_aad_update_aesni ENDP @@ -3829,12 +3829,12 @@ AES_GCM_encrypt_block_aesni PROC mov r10, r8 mov r11, r9 mov rax, QWORD PTR [rsp+40] - movdqu xmm0, [rax] + movdqu xmm0, OWORD PTR [rax] movdqa xmm1, xmm0 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm1, OWORD PTR L_aes_gcm_one pxor xmm0, [rcx] - movdqu [rax], xmm1 + movdqu OWORD PTR [rax], xmm1 aesenc xmm0, [rcx+16] aesenc xmm0, [rcx+32] aesenc xmm0, [rcx+48] @@ -3857,9 +3857,9 @@ AES_GCM_encrypt_block_aesni PROC movdqa xmm1, OWORD PTR [rcx+224] L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: aesenclast xmm0, xmm1 - movdqu xmm1, [r11] + movdqu xmm1, OWORD PTR [r11] pxor xmm0, xmm1 - movdqu [r10], xmm0 + movdqu OWORD PTR [r10], xmm0 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask ret AES_GCM_encrypt_block_aesni ENDP @@ -3867,11 +3867,11 @@ _text ENDS _text SEGMENT READONLY PARA AES_GCM_ghash_block_aesni PROC sub rsp, 32 - movdqu [rsp], xmm6 - movdqu [rsp+16], xmm7 + movdqu OWORD PTR [rsp], xmm6 + movdqu OWORD PTR [rsp+16], xmm7 movdqa xmm4, OWORD PTR [rdx] movdqa xmm5, OWORD PTR [r8] - movdqu xmm7, [rcx] + movdqu xmm7, OWORD PTR [rcx] pshufb xmm7, OWORD PTR L_aes_gcm_bswap_mask pxor xmm4, xmm7 pshufd xmm1, xmm4, 78 @@ -3929,8 +3929,8 @@ AES_GCM_ghash_block_aesni PROC pxor xmm2, xmm6 pxor xmm4, xmm2 movdqa OWORD PTR [rdx], xmm4 - movdqu xmm6, [rsp] - movdqu xmm7, [rsp+16] + movdqu xmm6, OWORD PTR [rsp] + movdqu xmm7, OWORD PTR [rsp+16] add rsp, 32 ret AES_GCM_ghash_block_aesni ENDP @@ -3951,16 +3951,16 @@ AES_GCM_encrypt_update_aesni PROC mov r14, QWORD PTR [rsp+96] mov r15, QWORD PTR [rsp+104] sub rsp, 320 - movdqu [rsp+160], xmm6 - movdqu [rsp+176], xmm7 - movdqu [rsp+192], xmm8 - movdqu [rsp+208], xmm9 - movdqu [rsp+224], xmm10 - movdqu [rsp+240], xmm11 - movdqu [rsp+256], xmm12 - movdqu [rsp+272], xmm13 - movdqu [rsp+288], xmm14 - movdqu [rsp+304], xmm15 + movdqu OWORD PTR [rsp+160], xmm6 + movdqu OWORD PTR [rsp+176], xmm7 + movdqu OWORD PTR [rsp+192], xmm8 + movdqu OWORD PTR [rsp+208], xmm9 + movdqu OWORD PTR [rsp+224], xmm10 + movdqu OWORD PTR [rsp+240], xmm11 + movdqu OWORD PTR [rsp+256], xmm12 + movdqu OWORD PTR [rsp+272], xmm13 + movdqu OWORD PTR [rsp+288], xmm14 + movdqu OWORD PTR [rsp+304], xmm15 movdqa xmm6, OWORD PTR [r12] movdqa xmm5, OWORD PTR [r14] movdqa xmm9, xmm5 @@ -3980,7 +3980,7 @@ AES_GCM_encrypt_update_aesni PROC and r13d, 4294967168 movdqa xmm2, xmm6 ; H ^ 1 - movdqu [rsp], xmm5 + movdqu OWORD PTR [rsp], xmm5 ; H ^ 2 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm5, 78 @@ -4022,7 +4022,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm0, xmm14 - movdqu [rsp+16], xmm0 + movdqu OWORD PTR [rsp+16], xmm0 ; H ^ 3 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm0, 78 @@ -4064,7 +4064,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm1, xmm14 - movdqu [rsp+32], xmm1 + movdqu OWORD PTR [rsp+32], xmm1 ; H ^ 4 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm0, 78 @@ -4106,7 +4106,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm3, 
xmm14 - movdqu [rsp+48], xmm3 + movdqu OWORD PTR [rsp+48], xmm3 ; H ^ 5 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm1, 78 @@ -4148,7 +4148,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+64], xmm7 + movdqu OWORD PTR [rsp+64], xmm7 ; H ^ 6 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm1, 78 @@ -4190,7 +4190,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+80], xmm7 + movdqu OWORD PTR [rsp+80], xmm7 ; H ^ 7 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm3, 78 @@ -4232,7 +4232,7 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+96], xmm7 + movdqu OWORD PTR [rsp+96], xmm7 ; H ^ 8 pshufd xmm9, xmm3, 78 pshufd xmm10, xmm3, 78 @@ -4274,9 +4274,9 @@ AES_GCM_encrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+112], xmm7 + movdqu OWORD PTR [rsp+112], xmm7 ; First 128 bytes of input - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -4303,7 +4303,7 @@ AES_GCM_encrypt_update_aesni PROC pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [rax] - movdqu [r15], xmm0 + movdqu OWORD PTR [r15], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -4437,36 +4437,36 @@ AES_GCM_encrypt_update_aesni PROC L_AES_GCM_encrypt_update_aesni_enc_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [r11] - movdqu xmm1, [r11+16] + movdqu xmm0, OWORD PTR [r11] + movdqu xmm1, OWORD PTR [r11+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [r10], xmm8 - movdqu [r10+16], xmm9 + movdqu OWORD PTR [r10], xmm8 + movdqu OWORD PTR [r10+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [r11+32] - movdqu xmm1, [r11+48] + movdqu xmm0, OWORD PTR [r11+32] + movdqu xmm1, OWORD PTR [r11+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [r10+32], xmm10 - movdqu [r10+48], xmm11 + movdqu OWORD PTR [r10+32], xmm10 + movdqu OWORD PTR [r10+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [r11+64] - movdqu xmm1, [r11+80] + movdqu xmm0, OWORD PTR [r11+64] + movdqu xmm1, OWORD PTR [r11+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [r10+64], xmm12 - movdqu [r10+80], xmm13 + movdqu OWORD PTR [r10+64], xmm12 + movdqu OWORD PTR [r10+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [r11+96] - movdqu xmm1, [r11+112] + movdqu xmm0, OWORD PTR [r11+96] + movdqu xmm1, OWORD PTR [r11+112] pxor xmm14, xmm0 pxor xmm15, xmm1 - movdqu [r10+96], xmm14 - movdqu [r10+112], xmm15 + movdqu OWORD PTR [r10+96], xmm14 + movdqu OWORD PTR [r10+112], xmm15 cmp r13d, 128 mov edi, 128 jle L_AES_GCM_encrypt_update_aesni_end_128 @@ -4474,7 +4474,7 @@ L_AES_GCM_encrypt_update_aesni_enc_done: L_AES_GCM_encrypt_update_aesni_ghash_128: lea rcx, QWORD PTR [r11+rdi] lea rdx, QWORD PTR [r10+rdi] - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -4501,7 +4501,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [rax] - movdqu [r15], xmm0 + movdqu OWORD PTR [r15], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -4510,8 +4510,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm13, xmm7 pxor xmm14, xmm7 pxor xmm15, xmm7 - movdqu xmm7, [rsp+112] - movdqu xmm0, [rdx+-128] + movdqu xmm7, OWORD PTR [rsp+112] + movdqu xmm0, OWORD PTR [rdx+-128] aesenc 
xmm8, [rax+16] pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask pxor xmm0, xmm2 @@ -4533,8 +4533,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: aesenc xmm15, [rax+16] pxor xmm1, xmm2 pxor xmm1, xmm3 - movdqu xmm7, [rsp+96] - movdqu xmm0, [rdx+-112] + movdqu xmm7, OWORD PTR [rsp+96] + movdqu xmm0, OWORD PTR [rdx+-112] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+32] @@ -4557,8 +4557,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+80] - movdqu xmm0, [rdx+-96] + movdqu xmm7, OWORD PTR [rsp+80] + movdqu xmm0, OWORD PTR [rdx+-96] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+48] @@ -4581,8 +4581,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+64] - movdqu xmm0, [rdx+-80] + movdqu xmm7, OWORD PTR [rsp+64] + movdqu xmm0, OWORD PTR [rdx+-80] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+64] @@ -4605,8 +4605,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+48] - movdqu xmm0, [rdx+-64] + movdqu xmm7, OWORD PTR [rsp+48] + movdqu xmm0, OWORD PTR [rdx+-64] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+80] @@ -4629,8 +4629,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+32] - movdqu xmm0, [rdx+-48] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm0, OWORD PTR [rdx+-48] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+96] @@ -4653,8 +4653,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+16] - movdqu xmm0, [rdx+-32] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm0, OWORD PTR [rdx+-32] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+112] @@ -4677,8 +4677,8 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp] - movdqu xmm0, [rdx+-16] + movdqu xmm7, OWORD PTR [rsp] + movdqu xmm0, OWORD PTR [rdx+-16] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+128] @@ -4781,36 +4781,36 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [rcx] - movdqu xmm1, [rcx+16] + movdqu xmm0, OWORD PTR [rcx] + movdqu xmm1, OWORD PTR [rcx+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [rdx], xmm8 - movdqu [rdx+16], xmm9 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [rcx+32] - movdqu xmm1, [rcx+48] + movdqu xmm0, OWORD PTR [rcx+32] + movdqu xmm1, OWORD PTR [rcx+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [rdx+32], xmm10 - movdqu [rdx+48], xmm11 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [rcx+64] - movdqu xmm1, [rcx+80] + movdqu xmm0, OWORD PTR [rcx+64] + movdqu xmm1, OWORD PTR [rcx+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [rdx+64], xmm12 - movdqu [rdx+80], xmm13 + movdqu OWORD PTR [rdx+64], xmm12 + movdqu OWORD PTR [rdx+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [rcx+96] - movdqu xmm1, [rcx+112] + movdqu xmm0, OWORD PTR [rcx+96] + movdqu xmm1, OWORD PTR [rcx+112] pxor xmm14, xmm0 pxor xmm15, 
xmm1 - movdqu [rdx+96], xmm14 - movdqu [rdx+112], xmm15 + movdqu OWORD PTR [rdx+96], xmm14 + movdqu OWORD PTR [rdx+112], xmm15 add edi, 128 cmp edi, r13d jl L_AES_GCM_encrypt_update_aesni_ghash_128 @@ -4825,7 +4825,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: pshufb xmm13, xmm4 pshufb xmm14, xmm4 pshufb xmm15, xmm4 - movdqu xmm7, [rsp+112] + movdqu xmm7, OWORD PTR [rsp+112] pshufd xmm1, xmm8, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4844,7 +4844,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+96] + movdqu xmm7, OWORD PTR [rsp+96] pshufd xmm1, xmm9, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4863,7 +4863,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+80] + movdqu xmm7, OWORD PTR [rsp+80] pshufd xmm1, xmm10, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4882,7 +4882,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+64] + movdqu xmm7, OWORD PTR [rsp+64] pshufd xmm1, xmm11, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4901,7 +4901,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+48] + movdqu xmm7, OWORD PTR [rsp+48] pshufd xmm1, xmm12, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4920,7 +4920,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+32] + movdqu xmm7, OWORD PTR [rsp+32] pshufd xmm1, xmm13, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4939,7 +4939,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp+16] + movdqu xmm7, OWORD PTR [rsp+16] pshufd xmm1, xmm14, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -4958,7 +4958,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq xmm1, 8 pxor xmm4, xmm2 pxor xmm6, xmm1 - movdqu xmm7, [rsp] + movdqu xmm7, OWORD PTR [rsp] pshufd xmm1, xmm15, 78 pshufd xmm2, xmm7, 78 movdqa xmm3, xmm7 @@ -5000,7 +5000,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: pxor xmm2, xmm1 pxor xmm2, xmm4 pxor xmm6, xmm2 - movdqu xmm5, [rsp] + movdqu xmm5, OWORD PTR [rsp] L_AES_GCM_encrypt_update_aesni_done_128: mov edx, r9d cmp edi, edx @@ -5011,12 +5011,12 @@ L_AES_GCM_encrypt_update_aesni_done_128: jge L_AES_GCM_encrypt_update_aesni_last_block_done lea rcx, QWORD PTR [r11+rdi] lea rdx, QWORD PTR [r10+rdi] - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [rax] - movdqu [r15], xmm9 + movdqu OWORD PTR [r15], xmm9 aesenc xmm8, [rax+16] aesenc xmm8, [rax+32] aesenc xmm8, [rax+48] @@ -5039,9 +5039,9 @@ L_AES_GCM_encrypt_update_aesni_done_128: movdqa xmm9, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 add edi, 16 @@ -5050,12 +5050,12 @@ L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: L_AES_GCM_encrypt_update_aesni_last_block_start: lea rcx, QWORD PTR [r11+rdi] lea rdx, QWORD PTR [r10+rdi] - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [rax] - movdqu [r15], xmm9 + movdqu OWORD PTR [r15], xmm9 movdqa xmm10, xmm6 pclmulqdq xmm10, 
xmm5, 16 aesenc xmm8, [rax+16] @@ -5103,9 +5103,9 @@ L_AES_GCM_encrypt_update_aesni_last_block_start: movdqa xmm9, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask pxor xmm6, xmm8 add edi, 16 @@ -5155,16 +5155,16 @@ L_AES_GCM_encrypt_update_aesni_last_block_ghash: L_AES_GCM_encrypt_update_aesni_last_block_done: L_AES_GCM_encrypt_update_aesni_done_enc: movdqa OWORD PTR [r12], xmm6 - movdqu xmm6, [rsp+160] - movdqu xmm7, [rsp+176] - movdqu xmm8, [rsp+192] - movdqu xmm9, [rsp+208] - movdqu xmm10, [rsp+224] - movdqu xmm11, [rsp+240] - movdqu xmm12, [rsp+256] - movdqu xmm13, [rsp+272] - movdqu xmm14, [rsp+288] - movdqu xmm15, [rsp+304] + movdqu xmm6, OWORD PTR [rsp+160] + movdqu xmm7, OWORD PTR [rsp+176] + movdqu xmm8, OWORD PTR [rsp+192] + movdqu xmm9, OWORD PTR [rsp+208] + movdqu xmm10, OWORD PTR [rsp+224] + movdqu xmm11, OWORD PTR [rsp+240] + movdqu xmm12, OWORD PTR [rsp+256] + movdqu xmm13, OWORD PTR [rsp+272] + movdqu xmm14, OWORD PTR [rsp+288] + movdqu xmm15, OWORD PTR [rsp+304] add rsp, 320 pop rdi pop r15 @@ -5186,14 +5186,14 @@ AES_GCM_encrypt_final_aesni PROC mov r12, QWORD PTR [rsp+72] mov r14, QWORD PTR [rsp+80] sub rsp, 144 - movdqu [rsp+16], xmm6 - movdqu [rsp+32], xmm7 - movdqu [rsp+48], xmm8 - movdqu [rsp+64], xmm9 - movdqu [rsp+80], xmm10 - movdqu [rsp+96], xmm11 - movdqu [rsp+112], xmm12 - movdqu [rsp+128], xmm13 + movdqu OWORD PTR [rsp+16], xmm6 + movdqu OWORD PTR [rsp+32], xmm7 + movdqu OWORD PTR [rsp+48], xmm8 + movdqu OWORD PTR [rsp+64], xmm9 + movdqu OWORD PTR [rsp+80], xmm10 + movdqu OWORD PTR [rsp+96], xmm11 + movdqu OWORD PTR [rsp+112], xmm12 + movdqu OWORD PTR [rsp+128], xmm13 movdqa xmm4, OWORD PTR [rax] movdqa xmm5, OWORD PTR [r12] movdqa xmm6, OWORD PTR [r14] @@ -5260,7 +5260,7 @@ AES_GCM_encrypt_final_aesni PROC cmp r8d, 16 je L_AES_GCM_encrypt_final_aesni_store_tag_16 xor rcx, rcx - movdqu [rsp], xmm0 + movdqu OWORD PTR [rsp], xmm0 L_AES_GCM_encrypt_final_aesni_store_tag_loop: movzx r13d, BYTE PTR [rsp+rcx] mov BYTE PTR [r9+rcx], r13b @@ -5269,16 +5269,16 @@ L_AES_GCM_encrypt_final_aesni_store_tag_loop: jne L_AES_GCM_encrypt_final_aesni_store_tag_loop jmp L_AES_GCM_encrypt_final_aesni_store_tag_done L_AES_GCM_encrypt_final_aesni_store_tag_16: - movdqu [r9], xmm0 + movdqu OWORD PTR [r9], xmm0 L_AES_GCM_encrypt_final_aesni_store_tag_done: - movdqu xmm6, [rsp+16] - movdqu xmm7, [rsp+32] - movdqu xmm8, [rsp+48] - movdqu xmm9, [rsp+64] - movdqu xmm10, [rsp+80] - movdqu xmm11, [rsp+96] - movdqu xmm12, [rsp+112] - movdqu xmm13, [rsp+128] + movdqu xmm6, OWORD PTR [rsp+16] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm8, OWORD PTR [rsp+48] + movdqu xmm9, OWORD PTR [rsp+64] + movdqu xmm10, OWORD PTR [rsp+80] + movdqu xmm11, OWORD PTR [rsp+96] + movdqu xmm12, OWORD PTR [rsp+112] + movdqu xmm13, OWORD PTR [rsp+128] add rsp, 144 pop r14 pop r12 @@ -5303,16 +5303,16 @@ AES_GCM_decrypt_update_aesni PROC mov r14, QWORD PTR [rsp+104] mov r15, QWORD PTR [rsp+112] sub rsp, 328 - movdqu [rsp+168], xmm6 - movdqu [rsp+184], xmm7 - movdqu [rsp+200], xmm8 - movdqu [rsp+216], xmm9 - movdqu [rsp+232], xmm10 - movdqu [rsp+248], xmm11 - movdqu [rsp+264], xmm12 - movdqu [rsp+280], xmm13 - movdqu [rsp+296], xmm14 - movdqu [rsp+312], xmm15 + movdqu OWORD PTR [rsp+168], xmm6 + movdqu OWORD PTR [rsp+184], xmm7 + movdqu OWORD PTR [rsp+200], xmm8 + movdqu OWORD PTR [rsp+216], xmm9 + movdqu 
OWORD PTR [rsp+232], xmm10 + movdqu OWORD PTR [rsp+248], xmm11 + movdqu OWORD PTR [rsp+264], xmm12 + movdqu OWORD PTR [rsp+280], xmm13 + movdqu OWORD PTR [rsp+296], xmm14 + movdqu OWORD PTR [rsp+312], xmm15 movdqa xmm6, OWORD PTR [r12] movdqa xmm5, OWORD PTR [r14] movdqa xmm9, xmm5 @@ -5332,7 +5332,7 @@ AES_GCM_decrypt_update_aesni PROC and r13d, 4294967168 movdqa xmm2, xmm6 ; H ^ 1 - movdqu [rsp], xmm5 + movdqu OWORD PTR [rsp], xmm5 ; H ^ 2 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm5, 78 @@ -5374,7 +5374,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm0, xmm14 - movdqu [rsp+16], xmm0 + movdqu OWORD PTR [rsp+16], xmm0 ; H ^ 3 pshufd xmm9, xmm5, 78 pshufd xmm10, xmm0, 78 @@ -5416,7 +5416,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm1, xmm14 - movdqu [rsp+32], xmm1 + movdqu OWORD PTR [rsp+32], xmm1 ; H ^ 4 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm0, 78 @@ -5458,7 +5458,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm3, xmm14 - movdqu [rsp+48], xmm3 + movdqu OWORD PTR [rsp+48], xmm3 ; H ^ 5 pshufd xmm9, xmm0, 78 pshufd xmm10, xmm1, 78 @@ -5500,7 +5500,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+64], xmm7 + movdqu OWORD PTR [rsp+64], xmm7 ; H ^ 6 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm1, 78 @@ -5542,7 +5542,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+80], xmm7 + movdqu OWORD PTR [rsp+80], xmm7 ; H ^ 7 pshufd xmm9, xmm1, 78 pshufd xmm10, xmm3, 78 @@ -5584,7 +5584,7 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+96], xmm7 + movdqu OWORD PTR [rsp+96], xmm7 ; H ^ 8 pshufd xmm9, xmm3, 78 pshufd xmm10, xmm3, 78 @@ -5626,11 +5626,11 @@ AES_GCM_decrypt_update_aesni PROC pxor xmm14, xmm13 pxor xmm14, xmm8 pxor xmm7, xmm14 - movdqu [rsp+112], xmm7 + movdqu OWORD PTR [rsp+112], xmm7 L_AES_GCM_decrypt_update_aesni_ghash_128: lea rcx, QWORD PTR [r11+rdi] lea rdx, QWORD PTR [r10+rdi] - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 movdqa xmm0, xmm8 pshufb xmm8, xmm1 @@ -5657,7 +5657,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pshufb xmm15, xmm1 paddd xmm0, OWORD PTR L_aes_gcm_eight movdqa xmm7, OWORD PTR [rax] - movdqu [r15], xmm0 + movdqu OWORD PTR [r15], xmm0 pxor xmm8, xmm7 pxor xmm9, xmm7 pxor xmm10, xmm7 @@ -5666,8 +5666,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm13, xmm7 pxor xmm14, xmm7 pxor xmm15, xmm7 - movdqu xmm7, [rsp+112] - movdqu xmm0, [rcx] + movdqu xmm7, OWORD PTR [rsp+112] + movdqu xmm0, OWORD PTR [rcx] aesenc xmm8, [rax+16] pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask pxor xmm0, xmm2 @@ -5689,8 +5689,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: aesenc xmm15, [rax+16] pxor xmm1, xmm2 pxor xmm1, xmm3 - movdqu xmm7, [rsp+96] - movdqu xmm0, [rcx+16] + movdqu xmm7, OWORD PTR [rsp+96] + movdqu xmm0, OWORD PTR [rcx+16] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+32] @@ -5713,8 +5713,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+80] - movdqu xmm0, [rcx+32] + movdqu xmm7, OWORD PTR [rsp+80] + movdqu xmm0, OWORD PTR [rcx+32] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+48] @@ -5737,8 +5737,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+64] - movdqu 
xmm0, [rcx+48] + movdqu xmm7, OWORD PTR [rsp+64] + movdqu xmm0, OWORD PTR [rcx+48] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+64] @@ -5761,8 +5761,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+48] - movdqu xmm0, [rcx+64] + movdqu xmm7, OWORD PTR [rsp+48] + movdqu xmm0, OWORD PTR [rcx+64] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+80] @@ -5785,8 +5785,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+32] - movdqu xmm0, [rcx+80] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm0, OWORD PTR [rcx+80] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+96] @@ -5809,8 +5809,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp+16] - movdqu xmm0, [rcx+96] + movdqu xmm7, OWORD PTR [rsp+16] + movdqu xmm0, OWORD PTR [rcx+96] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+112] @@ -5833,8 +5833,8 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor xmm1, xmm6 pxor xmm3, xmm6 pxor xmm1, xmm4 - movdqu xmm7, [rsp] - movdqu xmm0, [rcx+112] + movdqu xmm7, OWORD PTR [rsp] + movdqu xmm0, OWORD PTR [rcx+112] pshufd xmm4, xmm7, 78 pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask aesenc xmm8, [rax+128] @@ -5937,41 +5937,41 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done: aesenclast xmm8, xmm7 aesenclast xmm9, xmm7 - movdqu xmm0, [rcx] - movdqu xmm1, [rcx+16] + movdqu xmm0, OWORD PTR [rcx] + movdqu xmm1, OWORD PTR [rcx+16] pxor xmm8, xmm0 pxor xmm9, xmm1 - movdqu [rdx], xmm8 - movdqu [rdx+16], xmm9 + movdqu OWORD PTR [rdx], xmm8 + movdqu OWORD PTR [rdx+16], xmm9 aesenclast xmm10, xmm7 aesenclast xmm11, xmm7 - movdqu xmm0, [rcx+32] - movdqu xmm1, [rcx+48] + movdqu xmm0, OWORD PTR [rcx+32] + movdqu xmm1, OWORD PTR [rcx+48] pxor xmm10, xmm0 pxor xmm11, xmm1 - movdqu [rdx+32], xmm10 - movdqu [rdx+48], xmm11 + movdqu OWORD PTR [rdx+32], xmm10 + movdqu OWORD PTR [rdx+48], xmm11 aesenclast xmm12, xmm7 aesenclast xmm13, xmm7 - movdqu xmm0, [rcx+64] - movdqu xmm1, [rcx+80] + movdqu xmm0, OWORD PTR [rcx+64] + movdqu xmm1, OWORD PTR [rcx+80] pxor xmm12, xmm0 pxor xmm13, xmm1 - movdqu [rdx+64], xmm12 - movdqu [rdx+80], xmm13 + movdqu OWORD PTR [rdx+64], xmm12 + movdqu OWORD PTR [rdx+80], xmm13 aesenclast xmm14, xmm7 aesenclast xmm15, xmm7 - movdqu xmm0, [rcx+96] - movdqu xmm1, [rcx+112] + movdqu xmm0, OWORD PTR [rcx+96] + movdqu xmm1, OWORD PTR [rcx+112] pxor xmm14, xmm0 pxor xmm15, xmm1 - movdqu [rdx+96], xmm14 - movdqu [rdx+112], xmm15 + movdqu OWORD PTR [rdx+96], xmm14 + movdqu OWORD PTR [rdx+112], xmm15 add edi, 128 cmp edi, r13d jl L_AES_GCM_decrypt_update_aesni_ghash_128 movdqa xmm6, xmm2 - movdqu xmm5, [rsp] + movdqu xmm5, OWORD PTR [rsp] L_AES_GCM_decrypt_update_aesni_done_128: mov edx, r9d cmp edi, edx @@ -5983,16 +5983,16 @@ L_AES_GCM_decrypt_update_aesni_done_128: L_AES_GCM_decrypt_update_aesni_last_block_start: lea rcx, QWORD PTR [r11+rdi] lea rdx, QWORD PTR [r10+rdi] - movdqu xmm1, [rcx] + movdqu xmm1, OWORD PTR [rcx] movdqa xmm0, xmm5 pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask pxor xmm1, xmm6 - movdqu xmm8, [r15] + movdqu xmm8, OWORD PTR [r15] movdqa xmm9, xmm8 pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 paddd xmm9, OWORD PTR L_aes_gcm_one pxor xmm8, [rax] - movdqu [r15], xmm9 + movdqu OWORD PTR [r15], xmm9 movdqa xmm10, xmm1 
pclmulqdq xmm10, xmm0, 16 aesenc xmm8, [rax+16] @@ -6040,25 +6040,25 @@ L_AES_GCM_decrypt_update_aesni_last_block_start: movdqa xmm9, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last: aesenclast xmm8, xmm9 - movdqu xmm9, [rcx] + movdqu xmm9, OWORD PTR [rcx] pxor xmm8, xmm9 - movdqu [rdx], xmm8 + movdqu OWORD PTR [rdx], xmm8 add edi, 16 cmp edi, r13d jl L_AES_GCM_decrypt_update_aesni_last_block_start L_AES_GCM_decrypt_update_aesni_last_block_done: L_AES_GCM_decrypt_update_aesni_done_dec: movdqa OWORD PTR [r12], xmm6 - movdqu xmm6, [rsp+168] - movdqu xmm7, [rsp+184] - movdqu xmm8, [rsp+200] - movdqu xmm9, [rsp+216] - movdqu xmm10, [rsp+232] - movdqu xmm11, [rsp+248] - movdqu xmm12, [rsp+264] - movdqu xmm13, [rsp+280] - movdqu xmm14, [rsp+296] - movdqu xmm15, [rsp+312] + movdqu xmm6, OWORD PTR [rsp+168] + movdqu xmm7, OWORD PTR [rsp+184] + movdqu xmm8, OWORD PTR [rsp+200] + movdqu xmm9, OWORD PTR [rsp+216] + movdqu xmm10, OWORD PTR [rsp+232] + movdqu xmm11, OWORD PTR [rsp+248] + movdqu xmm12, OWORD PTR [rsp+264] + movdqu xmm13, OWORD PTR [rsp+280] + movdqu xmm14, OWORD PTR [rsp+296] + movdqu xmm15, OWORD PTR [rsp+312] add rsp, 328 pop rsi pop rdi @@ -6084,15 +6084,15 @@ AES_GCM_decrypt_final_aesni PROC mov r14, QWORD PTR [rsp+96] mov rbp, QWORD PTR [rsp+104] sub rsp, 160 - movdqu [rsp+16], xmm6 - movdqu [rsp+32], xmm7 - movdqu [rsp+48], xmm8 - movdqu [rsp+64], xmm9 - movdqu [rsp+80], xmm10 - movdqu [rsp+96], xmm11 - movdqu [rsp+112], xmm12 - movdqu [rsp+128], xmm13 - movdqu [rsp+144], xmm15 + movdqu OWORD PTR [rsp+16], xmm6 + movdqu OWORD PTR [rsp+32], xmm7 + movdqu OWORD PTR [rsp+48], xmm8 + movdqu OWORD PTR [rsp+64], xmm9 + movdqu OWORD PTR [rsp+80], xmm10 + movdqu OWORD PTR [rsp+96], xmm11 + movdqu OWORD PTR [rsp+112], xmm12 + movdqu OWORD PTR [rsp+128], xmm13 + movdqu OWORD PTR [rsp+144], xmm15 movdqa xmm6, OWORD PTR [rax] movdqa xmm5, OWORD PTR [r12] movdqa xmm15, OWORD PTR [r14] @@ -6161,7 +6161,7 @@ AES_GCM_decrypt_final_aesni PROC sub rsp, 16 xor rcx, rcx xor r15, r15 - movdqu [rsp], xmm0 + movdqu OWORD PTR [rsp], xmm0 L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: movzx r13d, BYTE PTR [rsp+rcx] xor r13b, BYTE PTR [r9+rcx] @@ -6169,13 +6169,13 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: inc ecx cmp ecx, r8d jne L_AES_GCM_decrypt_final_aesni_cmp_tag_loop - cmp r15, 0 + cmp r15b, 0 sete r15b add rsp, 16 xor rcx, rcx jmp L_AES_GCM_decrypt_final_aesni_cmp_tag_done L_AES_GCM_decrypt_final_aesni_cmp_tag_16: - movdqu xmm1, [r9] + movdqu xmm1, OWORD PTR [r9] pcmpeqb xmm0, xmm1 pmovmskb rdx, xmm0 ; %%edx == 0xFFFF then return 1 else => return 0 @@ -6184,15 +6184,15 @@ L_AES_GCM_decrypt_final_aesni_cmp_tag_16: sete r15b L_AES_GCM_decrypt_final_aesni_cmp_tag_done: mov DWORD PTR [rbp], r15d - movdqu xmm6, [rsp+16] - movdqu xmm7, [rsp+32] - movdqu xmm8, [rsp+48] - movdqu xmm9, [rsp+64] - movdqu xmm10, [rsp+80] - movdqu xmm11, [rsp+96] - movdqu xmm12, [rsp+112] - movdqu xmm13, [rsp+128] - movdqu xmm15, [rsp+144] + movdqu xmm6, OWORD PTR [rsp+16] + movdqu xmm7, OWORD PTR [rsp+32] + movdqu xmm8, OWORD PTR [rsp+48] + movdqu xmm9, OWORD PTR [rsp+64] + movdqu xmm10, OWORD PTR [rsp+80] + movdqu xmm11, OWORD PTR [rsp+96] + movdqu xmm12, OWORD PTR [rsp+112] + movdqu xmm13, OWORD PTR [rsp+128] + movdqu xmm15, OWORD PTR [rsp+144] add rsp, 160 pop r15 pop rbp @@ -9012,7 +9012,7 @@ L_AES_GCM_decrypt_avx1_cmp_tag_loop: inc ecx cmp ecx, r14d jne L_AES_GCM_decrypt_avx1_cmp_tag_loop - cmp rbx, 0 + cmp bl, 0 sete bl add rsp, 16 xor rcx, rcx @@ -11398,7 +11398,7 @@ 
L_AES_GCM_decrypt_final_avx1_cmp_tag_loop: inc ecx cmp ecx, r8d jne L_AES_GCM_decrypt_final_avx1_cmp_tag_loop - cmp r15, 0 + cmp r15b, 0 sete r15b add rsp, 16 xor rcx, rcx @@ -13765,7 +13765,7 @@ L_AES_GCM_decrypt_avx2_cmp_tag_loop: inc edx cmp edx, r15d jne L_AES_GCM_decrypt_avx2_cmp_tag_loop - cmp rax, 0 + cmp al, 0 sete al jmp L_AES_GCM_decrypt_avx2_cmp_tag_done L_AES_GCM_decrypt_avx2_cmp_tag_16: @@ -15764,7 +15764,7 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_loop: inc r13d cmp r13d, r8d jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop - cmp r10, 0 + cmp r10b, 0 sete r10b jmp L_AES_GCM_decrypt_final_avx2_cmp_tag_done L_AES_GCM_decrypt_final_avx2_cmp_tag_16: diff --git a/wolfcrypt/src/asn.c b/wolfcrypt/src/asn.c index 6bb8edc127..f5c181e3ed 100644 --- a/wolfcrypt/src/asn.c +++ b/wolfcrypt/src/asn.c @@ -3496,7 +3496,7 @@ int CheckBitString(const byte* input, word32* inOutIdx, int* len, #else ASNGetData dataASN[bitStringASN_Length]; int ret; - int bits; + int bits = 0; /* Parse BIT_STRING and check validity of unused bits. */ XMEMSET(dataASN, 0, sizeof(dataASN)); @@ -7227,7 +7227,7 @@ int wc_CreatePKCS8Key(byte* out, word32* outSz, byte* key, word32 keySz, return (int)(tmpSz + sz); #else DECL_ASNSETDATA(dataASN, pkcs8KeyASN_Length); - int sz; + int sz = 0; int ret = 0; word32 keyIdx = 0; word32 tmpAlgId = 0; @@ -8903,7 +8903,7 @@ int DecryptContent(byte* input, word32 sz, const char* password, int passwordSz) DECL_ASNGETDATA(dataASN, pbes2ParamsASN_Length); int ret = 0; int id = 0; - int version; + int version = 0; word32 idx = 0; word32 pIdx = 0; word32 iterations = 0; @@ -14430,7 +14430,7 @@ static int GetCertName(DecodedCert* cert, char* full, byte* hash, int nameType, DECL_ASNGETDATA(dataASN, rdnASN_Length); int ret = 0; word32 idx = 0; - int len; + int len = 0; word32 srcIdx = *inOutIdx; #ifdef WOLFSSL_X509_NAME_AVAILABLE WOLFSSL_X509_NAME* dName = NULL; @@ -16139,7 +16139,7 @@ word32 wc_EncodeSignature(byte* out, const byte* digest, word32 digSz, #else DECL_ASNSETDATA(dataASN, digestInfoASN_Length); int ret = 0; - int sz; + int sz = 0; unsigned char dgst[WC_MAX_DIGEST_SIZE]; CALLOC_ASNSETDATA(dataASN, digestInfoASN_Length, ret, NULL); @@ -21727,9 +21727,9 @@ static int DecodeCertInternal(DecodedCert* cert, int verify, int* criticalExt, DECL_ASNGETDATA(dataASN, x509CertASN_Length); int ret = 0; int badDate = 0; - byte version; + byte version = 0; word32 idx; - word32 serialSz; + word32 serialSz = 0; const unsigned char* issuer = NULL; word32 issuerSz = 0; const unsigned char* subject = NULL; @@ -34365,7 +34365,8 @@ int wc_BuildEccKeyDer(ecc_key* key, byte* output, word32 *inLen, return (int)totalSz; #else DECL_ASNSETDATA(dataASN, eccKeyASN_Length); - word32 privSz, pubSz; + word32 privSz = 0; + word32 pubSz = 0; int sz = 0; int ret = 0; int curveIdSz = 0; diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index c84829b772..f4975604c7 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -72,6 +72,10 @@ Public domain. 
#elif defined(__clang__) && defined(NO_AVX2_SUPPORT) #undef NO_AVX2_SUPPORT #endif + #if defined(_MSC_VER) && (_MSC_VER <= 1900) + #undef NO_AVX2_SUPPORT + #define NO_AVX2_SUPPORT + #endif #ifndef NO_AVX2_SUPPORT #define HAVE_INTEL_AVX2 diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S index 93b3a3fef7..0ef0978369 100644 --- a/wolfcrypt/src/chacha_asm.S +++ b/wolfcrypt/src/chacha_asm.S @@ -868,7 +868,7 @@ L_chacha20_avx1_loop128: vmovdqa 240(%r9), %xmm15 jmp L_chacha20_avx1_start128 L_chacha20_avx1_done128: - shl $2, %eax + shll $2, %eax addl %eax, 48(%rdi) L_chacha20_avx1_end128: cmpl $0x40, %ecx @@ -1456,7 +1456,7 @@ L_chacha20_avx2_loop256: vmovdqa 480(%r9), %ymm15 jmp L_chacha20_avx2_start256 L_chacha20_avx2_done256: - shl $3, %eax + shll $3, %eax addl %eax, 48(%rdi) L_chacha20_avx2_end256: #ifndef __APPLE__ diff --git a/wolfcrypt/src/chacha_asm.asm b/wolfcrypt/src/chacha_asm.asm new file mode 100644 index 0000000000..78de305818 --- /dev/null +++ b/wolfcrypt/src/chacha_asm.asm @@ -0,0 +1,1426 @@ +; /* chacha_asm.asm */ +; /* +; * Copyright (C) 2006-2024 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 2 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. +; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +_text SEGMENT READONLY PARA +chacha_encrypt_x64 PROC + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + cmp r9d, 64 + jl L_chacha_x64_small +L_chacha_x64_start: + sub rsp, 48 + mov QWORD PTR [rsp+24], r8 + mov QWORD PTR [rsp+32], rdx + mov QWORD PTR [rsp+40], r9 + mov rax, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+40] + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], rbx + mov eax, DWORD PTR [rcx] + mov ebx, DWORD PTR [rcx+4] + mov r9d, DWORD PTR [rcx+8] + mov r8d, DWORD PTR [rcx+12] + mov r8d, DWORD PTR [rcx+16] + mov r9d, DWORD PTR [rcx+20] + mov r10d, DWORD PTR [rcx+24] + mov r11d, DWORD PTR [rcx+28] + mov r12d, DWORD PTR [rcx+48] + mov r13d, DWORD PTR [rcx+52] + mov r14d, DWORD PTR [rcx+56] + mov r15d, DWORD PTR [rcx+60] + mov BYTE PTR [rsp], 10 + mov edx, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] +L_chacha_x64_block_crypt_start: + add eax, r8d + add ebx, r9d + xor r12d, eax + xor r13d, ebx + rol r12d, 16 + rol r13d, 16 + add edx, r12d + add ebp, r13d + xor r8d, edx + xor r9d, ebp + rol r8d, 12 + rol r9d, 12 + add eax, r8d + add ebx, r9d + xor r12d, eax + xor r13d, ebx + rol r12d, 8 + rol r13d, 8 + add edx, r12d + add ebp, r13d + xor r8d, edx + xor r9d, ebp + rol r8d, 7 + rol r9d, 7 + mov DWORD PTR 
[rsp+8], edx + mov DWORD PTR [rsp+12], ebp + mov edx, DWORD PTR [rsp+16] + mov ebp, DWORD PTR [rsp+20] + add r9d, r10d + add r8d, r11d + xor r14d, r9d + xor r15d, r8d + rol r14d, 16 + rol r15d, 16 + add edx, r14d + add ebp, r15d + xor r10d, edx + xor r11d, ebp + rol r10d, 12 + rol r11d, 12 + add r9d, r10d + add r8d, r11d + xor r14d, r9d + xor r15d, r8d + rol r14d, 8 + rol r15d, 8 + add edx, r14d + add ebp, r15d + xor r10d, edx + xor r11d, ebp + rol r10d, 7 + rol r11d, 7 + add eax, r9d + add ebx, r10d + xor r15d, eax + xor r12d, ebx + rol r15d, 16 + rol r12d, 16 + add edx, r15d + add ebp, r12d + xor r9d, edx + xor r10d, ebp + rol r9d, 12 + rol r10d, 12 + add eax, r9d + add ebx, r10d + xor r15d, eax + xor r12d, ebx + rol r15d, 8 + rol r12d, 8 + add edx, r15d + add ebp, r12d + xor r9d, edx + xor r10d, ebp + rol r9d, 7 + rol r10d, 7 + mov DWORD PTR [rsp+16], edx + mov DWORD PTR [rsp+20], ebp + mov edx, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + add r9d, r11d + add r8d, r8d + xor r13d, r9d + xor r14d, r8d + rol r13d, 16 + rol r14d, 16 + add edx, r13d + add ebp, r14d + xor r11d, edx + xor r8d, ebp + rol r11d, 12 + rol r8d, 12 + add r9d, r11d + add r8d, r8d + xor r13d, r9d + xor r14d, r8d + rol r13d, 8 + rol r14d, 8 + add edx, r13d + add ebp, r14d + xor r11d, edx + xor r8d, ebp + rol r11d, 7 + rol r8d, 7 + dec BYTE PTR [rsp] + jnz L_chacha_x64_block_crypt_start + mov DWORD PTR [rsp+8], edx + mov DWORD PTR [rsp+12], ebp + mov rdx, QWORD PTR [rsp+32] + mov rbp, QWORD PTR [rsp+24] + add eax, DWORD PTR [rcx] + add ebx, DWORD PTR [rcx+4] + add r9d, DWORD PTR [rcx+8] + add r8d, DWORD PTR [rcx+12] + add r8d, DWORD PTR [rcx+16] + add r9d, DWORD PTR [rcx+20] + add r10d, DWORD PTR [rcx+24] + add r11d, DWORD PTR [rcx+28] + add r12d, DWORD PTR [rcx+48] + add r13d, DWORD PTR [rcx+52] + add r14d, DWORD PTR [rcx+56] + add r15d, DWORD PTR [rcx+60] + xor eax, DWORD PTR [rdx] + xor ebx, DWORD PTR [rdx+4] + xor r9d, DWORD PTR [rdx+8] + xor r8d, DWORD PTR [rdx+12] + xor r8d, DWORD PTR [rdx+16] + xor r9d, DWORD PTR [rdx+20] + xor r10d, DWORD PTR [rdx+24] + xor r11d, DWORD PTR [rdx+28] + xor r12d, DWORD PTR [rdx+48] + xor r13d, DWORD PTR [rdx+52] + xor r14d, DWORD PTR [rdx+56] + xor r15d, DWORD PTR [rdx+60] + mov DWORD PTR [rbp], eax + mov DWORD PTR [rbp+4], ebx + mov DWORD PTR [rbp+8], r9d + mov DWORD PTR [rbp+12], r8d + mov DWORD PTR [rbp+16], r8d + mov DWORD PTR [rbp+20], r9d + mov DWORD PTR [rbp+24], r10d + mov DWORD PTR [rbp+28], r11d + mov DWORD PTR [rbp+48], r12d + mov DWORD PTR [rbp+52], r13d + mov DWORD PTR [rbp+56], r14d + mov DWORD PTR [rbp+60], r15d + mov eax, DWORD PTR [rsp+8] + mov ebx, DWORD PTR [rsp+12] + mov r9d, DWORD PTR [rsp+16] + mov r8d, DWORD PTR [rsp+20] + add eax, DWORD PTR [rcx+32] + add ebx, DWORD PTR [rcx+36] + add r9d, DWORD PTR [rcx+40] + add r8d, DWORD PTR [rcx+44] + xor eax, DWORD PTR [rdx+32] + xor ebx, DWORD PTR [rdx+36] + xor r9d, DWORD PTR [rdx+40] + xor r8d, DWORD PTR [rdx+44] + mov DWORD PTR [rbp+32], eax + mov DWORD PTR [rbp+36], ebx + mov DWORD PTR [rbp+40], r9d + mov DWORD PTR [rbp+44], r8d + mov r8, QWORD PTR [rsp+24] + mov r9, QWORD PTR [rsp+40] + add DWORD PTR [rcx+48], 1 + add rsp, 48 + sub r9d, 64 + add rdx, 64 + add r8, 64 + cmp r9d, 64 + jge L_chacha_x64_start +L_chacha_x64_small: + cmp r9d, 0 + je L_chacha_x64_done + sub rsp, 48 + mov QWORD PTR [rsp+24], r8 + mov QWORD PTR [rsp+32], rdx + mov QWORD PTR [rsp+40], r9 + mov rax, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+40] + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], rbx + mov eax, DWORD PTR [rcx] + 
mov ebx, DWORD PTR [rcx+4] + mov r9d, DWORD PTR [rcx+8] + mov r8d, DWORD PTR [rcx+12] + mov r8d, DWORD PTR [rcx+16] + mov r9d, DWORD PTR [rcx+20] + mov r10d, DWORD PTR [rcx+24] + mov r11d, DWORD PTR [rcx+28] + mov r12d, DWORD PTR [rcx+48] + mov r13d, DWORD PTR [rcx+52] + mov r14d, DWORD PTR [rcx+56] + mov r15d, DWORD PTR [rcx+60] + mov BYTE PTR [rsp], 10 + mov edx, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] +L_chacha_x64_partial_crypt_start: + add eax, r8d + add ebx, r9d + xor r12d, eax + xor r13d, ebx + rol r12d, 16 + rol r13d, 16 + add edx, r12d + add ebp, r13d + xor r8d, edx + xor r9d, ebp + rol r8d, 12 + rol r9d, 12 + add eax, r8d + add ebx, r9d + xor r12d, eax + xor r13d, ebx + rol r12d, 8 + rol r13d, 8 + add edx, r12d + add ebp, r13d + xor r8d, edx + xor r9d, ebp + rol r8d, 7 + rol r9d, 7 + mov DWORD PTR [rsp+8], edx + mov DWORD PTR [rsp+12], ebp + mov edx, DWORD PTR [rsp+16] + mov ebp, DWORD PTR [rsp+20] + add r9d, r10d + add r8d, r11d + xor r14d, r9d + xor r15d, r8d + rol r14d, 16 + rol r15d, 16 + add edx, r14d + add ebp, r15d + xor r10d, edx + xor r11d, ebp + rol r10d, 12 + rol r11d, 12 + add r9d, r10d + add r8d, r11d + xor r14d, r9d + xor r15d, r8d + rol r14d, 8 + rol r15d, 8 + add edx, r14d + add ebp, r15d + xor r10d, edx + xor r11d, ebp + rol r10d, 7 + rol r11d, 7 + add eax, r9d + add ebx, r10d + xor r15d, eax + xor r12d, ebx + rol r15d, 16 + rol r12d, 16 + add edx, r15d + add ebp, r12d + xor r9d, edx + xor r10d, ebp + rol r9d, 12 + rol r10d, 12 + add eax, r9d + add ebx, r10d + xor r15d, eax + xor r12d, ebx + rol r15d, 8 + rol r12d, 8 + add edx, r15d + add ebp, r12d + xor r9d, edx + xor r10d, ebp + rol r9d, 7 + rol r10d, 7 + mov DWORD PTR [rsp+16], edx + mov DWORD PTR [rsp+20], ebp + mov edx, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + add r9d, r11d + add r8d, r8d + xor r13d, r9d + xor r14d, r8d + rol r13d, 16 + rol r14d, 16 + add edx, r13d + add ebp, r14d + xor r11d, edx + xor r8d, ebp + rol r11d, 12 + rol r8d, 12 + add r9d, r11d + add r8d, r8d + xor r13d, r9d + xor r14d, r8d + rol r13d, 8 + rol r14d, 8 + add edx, r13d + add ebp, r14d + xor r11d, edx + xor r8d, ebp + rol r11d, 7 + rol r8d, 7 + dec BYTE PTR [rsp] + jnz L_chacha_x64_partial_crypt_start + mov DWORD PTR [rsp+8], edx + mov DWORD PTR [rsp+12], ebp + mov rdx, QWORD PTR [rsp+32] + add eax, DWORD PTR [rcx] + add ebx, DWORD PTR [rcx+4] + add r9d, DWORD PTR [rcx+8] + add r8d, DWORD PTR [rcx+12] + add r8d, DWORD PTR [rcx+16] + add r9d, DWORD PTR [rcx+20] + add r10d, DWORD PTR [rcx+24] + add r11d, DWORD PTR [rcx+28] + add r12d, DWORD PTR [rcx+48] + add r13d, DWORD PTR [rcx+52] + add r14d, DWORD PTR [rcx+56] + add r15d, DWORD PTR [rcx+60] + lea rbp, QWORD PTR [rcx+80] + mov DWORD PTR [rbp], eax + mov DWORD PTR [rbp+4], ebx + mov DWORD PTR [rbp+8], r9d + mov DWORD PTR [rbp+12], r8d + mov DWORD PTR [rbp+16], r8d + mov DWORD PTR [rbp+20], r9d + mov DWORD PTR [rbp+24], r10d + mov DWORD PTR [rbp+28], r11d + mov DWORD PTR [rbp+48], r12d + mov DWORD PTR [rbp+52], r13d + mov DWORD PTR [rbp+56], r14d + mov DWORD PTR [rbp+60], r15d + mov eax, DWORD PTR [rsp+8] + mov ebx, DWORD PTR [rsp+12] + mov r9d, DWORD PTR [rsp+16] + mov r8d, DWORD PTR [rsp+20] + add eax, DWORD PTR [rcx+32] + add ebx, DWORD PTR [rcx+36] + add r9d, DWORD PTR [rcx+40] + add r8d, DWORD PTR [rcx+44] + mov DWORD PTR [rbp+32], eax + mov DWORD PTR [rbp+36], ebx + mov DWORD PTR [rbp+40], r9d + mov DWORD PTR [rbp+44], r8d + mov r8, QWORD PTR [rsp+24] + mov r9, QWORD PTR [rsp+40] + add DWORD PTR [rcx+48], 1 + add rsp, 48 + mov r8d, r9d + xor rbx, rbx + and r8d, 
7 + jz L_chacha_x64_partial_start64 +L_chacha_x64_partial_start8: + movzx eax, BYTE PTR [rbp+rbx] + xor al, BYTE PTR [rdx+rbx] + mov BYTE PTR [r8+rbx], al + inc ebx + cmp ebx, r8d + jne L_chacha_x64_partial_start8 + je L_chacha_x64_partial_end64 +L_chacha_x64_partial_start64: + mov rax, QWORD PTR [rbp+rbx] + xor rax, QWORD PTR [rdx+rbx] + mov QWORD PTR [r8+rbx], rax + add ebx, 8 +L_chacha_x64_partial_end64: + cmp ebx, r9d + jne L_chacha_x64_partial_start64 + mov r9d, 64 + sub r9d, ebx + mov DWORD PTR [rcx+76], r9d +L_chacha_x64_done: + add rsp, 64 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret +chacha_encrypt_x64 ENDP +_text ENDS +IFDEF HAVE_INTEL_AVX1 +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx1_rotl8 QWORD 433757367256023043, 1012478749960636427 +ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx1_rotl16 QWORD 361421592464458498, 940142975169071882 +ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx1_add QWORD 4294967296, 12884901890 +ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx1_four QWORD 17179869188, 17179869188 +ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four +_DATA ENDS +_text SEGMENT READONLY PARA +chacha_encrypt_avx1 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + sub rsp, 560 + vmovdqu OWORD PTR [rsp+400], xmm6 + vmovdqu OWORD PTR [rsp+416], xmm7 + vmovdqu OWORD PTR [rsp+432], xmm8 + vmovdqu OWORD PTR [rsp+448], xmm9 + vmovdqu OWORD PTR [rsp+464], xmm10 + vmovdqu OWORD PTR [rsp+480], xmm11 + vmovdqu OWORD PTR [rsp+496], xmm12 + vmovdqu OWORD PTR [rsp+512], xmm13 + vmovdqu OWORD PTR [rsp+528], xmm14 + vmovdqu OWORD PTR [rsp+544], xmm15 + mov r11, rsp + lea r12, QWORD PTR [rsp+256] + mov r14, QWORD PTR [ptr_L_chacha20_avx1_rotl8] + mov r15, QWORD PTR [ptr_L_chacha20_avx1_rotl16] + mov rdi, QWORD PTR [ptr_L_chacha20_avx1_add] + mov rsi, QWORD PTR [ptr_L_chacha20_avx1_four] + add r11, 15 + add r12, 15 + and r11, -16 + and r12, -16 + mov eax, r9d + shr eax, 8 + jz L_chacha20_avx1_end128 + vpshufd xmm0, [rcx], 0 + vpshufd xmm1, [rcx+4], 0 + vpshufd xmm2, [rcx+8], 0 + vpshufd xmm3, [rcx+12], 0 + vpshufd xmm4, [rcx+16], 0 + vpshufd xmm5, [rcx+20], 0 + vpshufd xmm6, [rcx+24], 0 + vpshufd xmm7, [rcx+28], 0 + vpshufd xmm8, [rcx+32], 0 + vpshufd xmm9, [rcx+36], 0 + vpshufd xmm10, [rcx+40], 0 + vpshufd xmm11, [rcx+44], 0 + vpshufd xmm12, [rcx+48], 0 + vpshufd xmm13, [rcx+52], 0 + vpshufd xmm14, [rcx+56], 0 + vpshufd xmm15, [rcx+60], 0 + vpaddd xmm12, xmm12, OWORD PTR [rdi] + vmovdqa OWORD PTR [r11], xmm0 + vmovdqa OWORD PTR [r11+16], xmm1 + vmovdqa OWORD PTR [r11+32], xmm2 + vmovdqa OWORD PTR [r11+48], xmm3 + vmovdqa OWORD PTR [r11+64], xmm4 + vmovdqa OWORD PTR [r11+80], xmm5 + vmovdqa OWORD PTR [r11+96], xmm6 + vmovdqa OWORD PTR [r11+112], xmm7 + vmovdqa OWORD PTR [r11+128], xmm8 + vmovdqa OWORD PTR [r11+144], xmm9 + vmovdqa OWORD PTR [r11+160], xmm10 + vmovdqa OWORD PTR [r11+176], xmm11 + vmovdqa OWORD PTR [r11+192], xmm12 + vmovdqa OWORD PTR [r11+208], xmm13 + vmovdqa OWORD PTR [r11+224], xmm14 + vmovdqa OWORD PTR [r11+240], xmm15 +L_chacha20_avx1_start128: + vmovdqa OWORD PTR [r12+48], xmm11 + mov r10b, 10 +L_chacha20_avx1_loop128: + vpaddd xmm0, xmm0, xmm4 + vpxor xmm12, xmm12, xmm0 + vmovdqa xmm11, OWORD PTR [r12+48] + vpshufb xmm12, xmm12, OWORD PTR [r15] + vpaddd xmm8, xmm8, xmm12 + vpxor xmm4, xmm4, xmm8 + vpaddd xmm1, xmm1, xmm5 + vpxor xmm13, xmm13, xmm1 + vpshufb 
xmm13, xmm13, OWORD PTR [r15] + vpaddd xmm9, xmm9, xmm13 + vpxor xmm5, xmm5, xmm9 + vpaddd xmm2, xmm2, xmm6 + vpxor xmm14, xmm14, xmm2 + vpshufb xmm14, xmm14, OWORD PTR [r15] + vpaddd xmm10, xmm10, xmm14 + vpxor xmm6, xmm6, xmm10 + vpaddd xmm3, xmm3, xmm7 + vpxor xmm15, xmm15, xmm3 + vpshufb xmm15, xmm15, OWORD PTR [r15] + vpaddd xmm11, xmm11, xmm15 + vpxor xmm7, xmm7, xmm11 + vmovdqa OWORD PTR [r12+48], xmm11 + vpsrld xmm11, xmm4, 20 + vpslld xmm4, xmm4, 12 + vpxor xmm4, xmm4, xmm11 + vpsrld xmm11, xmm5, 20 + vpslld xmm5, xmm5, 12 + vpxor xmm5, xmm5, xmm11 + vpsrld xmm11, xmm6, 20 + vpslld xmm6, xmm6, 12 + vpxor xmm6, xmm6, xmm11 + vpsrld xmm11, xmm7, 20 + vpslld xmm7, xmm7, 12 + vpxor xmm7, xmm7, xmm11 + vpaddd xmm0, xmm0, xmm4 + vpxor xmm12, xmm12, xmm0 + vmovdqa xmm11, OWORD PTR [r12+48] + vpshufb xmm12, xmm12, OWORD PTR [r14] + vpaddd xmm8, xmm8, xmm12 + vpxor xmm4, xmm4, xmm8 + vpaddd xmm1, xmm1, xmm5 + vpxor xmm13, xmm13, xmm1 + vpshufb xmm13, xmm13, OWORD PTR [r14] + vpaddd xmm9, xmm9, xmm13 + vpxor xmm5, xmm5, xmm9 + vpaddd xmm2, xmm2, xmm6 + vpxor xmm14, xmm14, xmm2 + vpshufb xmm14, xmm14, OWORD PTR [r14] + vpaddd xmm10, xmm10, xmm14 + vpxor xmm6, xmm6, xmm10 + vpaddd xmm3, xmm3, xmm7 + vpxor xmm15, xmm15, xmm3 + vpshufb xmm15, xmm15, OWORD PTR [r14] + vpaddd xmm11, xmm11, xmm15 + vpxor xmm7, xmm7, xmm11 + vmovdqa OWORD PTR [r12+48], xmm11 + vpsrld xmm11, xmm4, 25 + vpslld xmm4, xmm4, 7 + vpxor xmm4, xmm4, xmm11 + vpsrld xmm11, xmm5, 25 + vpslld xmm5, xmm5, 7 + vpxor xmm5, xmm5, xmm11 + vpsrld xmm11, xmm6, 25 + vpslld xmm6, xmm6, 7 + vpxor xmm6, xmm6, xmm11 + vpsrld xmm11, xmm7, 25 + vpslld xmm7, xmm7, 7 + vpxor xmm7, xmm7, xmm11 + vpaddd xmm0, xmm0, xmm5 + vpxor xmm15, xmm15, xmm0 + vmovdqa xmm11, OWORD PTR [r12+48] + vpshufb xmm15, xmm15, OWORD PTR [r15] + vpaddd xmm10, xmm10, xmm15 + vpxor xmm5, xmm5, xmm10 + vpaddd xmm1, xmm1, xmm6 + vpxor xmm12, xmm12, xmm1 + vpshufb xmm12, xmm12, OWORD PTR [r15] + vpaddd xmm11, xmm11, xmm12 + vpxor xmm6, xmm6, xmm11 + vpaddd xmm2, xmm2, xmm7 + vpxor xmm13, xmm13, xmm2 + vpshufb xmm13, xmm13, OWORD PTR [r15] + vpaddd xmm8, xmm8, xmm13 + vpxor xmm7, xmm7, xmm8 + vpaddd xmm3, xmm3, xmm4 + vpxor xmm14, xmm14, xmm3 + vpshufb xmm14, xmm14, OWORD PTR [r15] + vpaddd xmm9, xmm9, xmm14 + vpxor xmm4, xmm4, xmm9 + vmovdqa OWORD PTR [r12+48], xmm11 + vpsrld xmm11, xmm5, 20 + vpslld xmm5, xmm5, 12 + vpxor xmm5, xmm5, xmm11 + vpsrld xmm11, xmm6, 20 + vpslld xmm6, xmm6, 12 + vpxor xmm6, xmm6, xmm11 + vpsrld xmm11, xmm7, 20 + vpslld xmm7, xmm7, 12 + vpxor xmm7, xmm7, xmm11 + vpsrld xmm11, xmm4, 20 + vpslld xmm4, xmm4, 12 + vpxor xmm4, xmm4, xmm11 + vpaddd xmm0, xmm0, xmm5 + vpxor xmm15, xmm15, xmm0 + vmovdqa xmm11, OWORD PTR [r12+48] + vpshufb xmm15, xmm15, OWORD PTR [r14] + vpaddd xmm10, xmm10, xmm15 + vpxor xmm5, xmm5, xmm10 + vpaddd xmm1, xmm1, xmm6 + vpxor xmm12, xmm12, xmm1 + vpshufb xmm12, xmm12, OWORD PTR [r14] + vpaddd xmm11, xmm11, xmm12 + vpxor xmm6, xmm6, xmm11 + vpaddd xmm2, xmm2, xmm7 + vpxor xmm13, xmm13, xmm2 + vpshufb xmm13, xmm13, OWORD PTR [r14] + vpaddd xmm8, xmm8, xmm13 + vpxor xmm7, xmm7, xmm8 + vpaddd xmm3, xmm3, xmm4 + vpxor xmm14, xmm14, xmm3 + vpshufb xmm14, xmm14, OWORD PTR [r14] + vpaddd xmm9, xmm9, xmm14 + vpxor xmm4, xmm4, xmm9 + vmovdqa OWORD PTR [r12+48], xmm11 + vpsrld xmm11, xmm5, 25 + vpslld xmm5, xmm5, 7 + vpxor xmm5, xmm5, xmm11 + vpsrld xmm11, xmm6, 25 + vpslld xmm6, xmm6, 7 + vpxor xmm6, xmm6, xmm11 + vpsrld xmm11, xmm7, 25 + vpslld xmm7, xmm7, 7 + vpxor xmm7, xmm7, xmm11 + vpsrld xmm11, xmm4, 25 + vpslld xmm4, xmm4, 7 
+ vpxor xmm4, xmm4, xmm11 + dec r10b + jnz L_chacha20_avx1_loop128 + vmovdqa xmm11, OWORD PTR [r12+48] + vpaddd xmm0, xmm0, OWORD PTR [r11] + vpaddd xmm1, xmm1, OWORD PTR [r11+16] + vpaddd xmm2, xmm2, OWORD PTR [r11+32] + vpaddd xmm3, xmm3, OWORD PTR [r11+48] + vpaddd xmm4, xmm4, OWORD PTR [r11+64] + vpaddd xmm5, xmm5, OWORD PTR [r11+80] + vpaddd xmm6, xmm6, OWORD PTR [r11+96] + vpaddd xmm7, xmm7, OWORD PTR [r11+112] + vpaddd xmm8, xmm8, OWORD PTR [r11+128] + vpaddd xmm9, xmm9, OWORD PTR [r11+144] + vpaddd xmm10, xmm10, OWORD PTR [r11+160] + vpaddd xmm11, xmm11, OWORD PTR [r11+176] + vpaddd xmm12, xmm12, OWORD PTR [r11+192] + vpaddd xmm13, xmm13, OWORD PTR [r11+208] + vpaddd xmm14, xmm14, OWORD PTR [r11+224] + vpaddd xmm15, xmm15, OWORD PTR [r11+240] + vmovdqa OWORD PTR [r12], xmm8 + vmovdqa OWORD PTR [r12+16], xmm9 + vmovdqa OWORD PTR [r12+32], xmm10 + vmovdqa OWORD PTR [r12+48], xmm11 + vmovdqa OWORD PTR [r12+64], xmm12 + vmovdqa OWORD PTR [r12+80], xmm13 + vmovdqa OWORD PTR [r12+96], xmm14 + vmovdqa OWORD PTR [r12+112], xmm15 + vpunpckldq xmm8, xmm0, xmm1 + vpunpckldq xmm9, xmm2, xmm3 + vpunpckhdq xmm12, xmm0, xmm1 + vpunpckhdq xmm13, xmm2, xmm3 + vpunpckldq xmm10, xmm4, xmm5 + vpunpckldq xmm11, xmm6, xmm7 + vpunpckhdq xmm14, xmm4, xmm5 + vpunpckhdq xmm15, xmm6, xmm7 + vpunpcklqdq xmm0, xmm8, xmm9 + vpunpcklqdq xmm1, xmm10, xmm11 + vpunpckhqdq xmm2, xmm8, xmm9 + vpunpckhqdq xmm3, xmm10, xmm11 + vpunpcklqdq xmm4, xmm12, xmm13 + vpunpcklqdq xmm5, xmm14, xmm15 + vpunpckhqdq xmm6, xmm12, xmm13 + vpunpckhqdq xmm7, xmm14, xmm15 + vmovdqu xmm8, OWORD PTR [rdx] + vmovdqu xmm9, OWORD PTR [rdx+16] + vmovdqu xmm10, OWORD PTR [rdx+64] + vmovdqu xmm11, OWORD PTR [rdx+80] + vmovdqu xmm12, OWORD PTR [rdx+128] + vmovdqu xmm13, OWORD PTR [rdx+144] + vmovdqu xmm14, OWORD PTR [rdx+192] + vmovdqu xmm15, OWORD PTR [rdx+208] + vpxor xmm0, xmm0, xmm8 + vpxor xmm1, xmm1, xmm9 + vpxor xmm2, xmm2, xmm10 + vpxor xmm3, xmm3, xmm11 + vpxor xmm4, xmm4, xmm12 + vpxor xmm5, xmm5, xmm13 + vpxor xmm6, xmm6, xmm14 + vpxor xmm7, xmm7, xmm15 + vmovdqu OWORD PTR [r8], xmm0 + vmovdqu OWORD PTR [r8+16], xmm1 + vmovdqu OWORD PTR [r8+64], xmm2 + vmovdqu OWORD PTR [r8+80], xmm3 + vmovdqu OWORD PTR [r8+128], xmm4 + vmovdqu OWORD PTR [r8+144], xmm5 + vmovdqu OWORD PTR [r8+192], xmm6 + vmovdqu OWORD PTR [r8+208], xmm7 + vmovdqa xmm0, OWORD PTR [r12] + vmovdqa xmm1, OWORD PTR [r12+16] + vmovdqa xmm2, OWORD PTR [r12+32] + vmovdqa xmm3, OWORD PTR [r12+48] + vmovdqa xmm4, OWORD PTR [r12+64] + vmovdqa xmm5, OWORD PTR [r12+80] + vmovdqa xmm6, OWORD PTR [r12+96] + vmovdqa xmm7, OWORD PTR [r12+112] + vpunpckldq xmm8, xmm0, xmm1 + vpunpckldq xmm9, xmm2, xmm3 + vpunpckhdq xmm12, xmm0, xmm1 + vpunpckhdq xmm13, xmm2, xmm3 + vpunpckldq xmm10, xmm4, xmm5 + vpunpckldq xmm11, xmm6, xmm7 + vpunpckhdq xmm14, xmm4, xmm5 + vpunpckhdq xmm15, xmm6, xmm7 + vpunpcklqdq xmm0, xmm8, xmm9 + vpunpcklqdq xmm1, xmm10, xmm11 + vpunpckhqdq xmm2, xmm8, xmm9 + vpunpckhqdq xmm3, xmm10, xmm11 + vpunpcklqdq xmm4, xmm12, xmm13 + vpunpcklqdq xmm5, xmm14, xmm15 + vpunpckhqdq xmm6, xmm12, xmm13 + vpunpckhqdq xmm7, xmm14, xmm15 + vmovdqu xmm8, OWORD PTR [rdx+32] + vmovdqu xmm9, OWORD PTR [rdx+48] + vmovdqu xmm10, OWORD PTR [rdx+96] + vmovdqu xmm11, OWORD PTR [rdx+112] + vmovdqu xmm12, OWORD PTR [rdx+160] + vmovdqu xmm13, OWORD PTR [rdx+176] + vmovdqu xmm14, OWORD PTR [rdx+224] + vmovdqu xmm15, OWORD PTR [rdx+240] + vpxor xmm0, xmm0, xmm8 + vpxor xmm1, xmm1, xmm9 + vpxor xmm2, xmm2, xmm10 + vpxor xmm3, xmm3, xmm11 + vpxor xmm4, xmm4, xmm12 + vpxor xmm5, xmm5, xmm13 + vpxor 
xmm6, xmm6, xmm14 + vpxor xmm7, xmm7, xmm15 + vmovdqu OWORD PTR [r8+32], xmm0 + vmovdqu OWORD PTR [r8+48], xmm1 + vmovdqu OWORD PTR [r8+96], xmm2 + vmovdqu OWORD PTR [r8+112], xmm3 + vmovdqu OWORD PTR [r8+160], xmm4 + vmovdqu OWORD PTR [r8+176], xmm5 + vmovdqu OWORD PTR [r8+224], xmm6 + vmovdqu OWORD PTR [r8+240], xmm7 + vmovdqa xmm12, OWORD PTR [r11+192] + add rdx, 256 + add r8, 256 + vpaddd xmm12, xmm12, OWORD PTR [rsi] + sub r9d, 256 + vmovdqa OWORD PTR [r11+192], xmm12 + cmp r9d, 256 + jl L_chacha20_avx1_done128 + vmovdqa xmm0, OWORD PTR [r11] + vmovdqa xmm1, OWORD PTR [r11+16] + vmovdqa xmm2, OWORD PTR [r11+32] + vmovdqa xmm3, OWORD PTR [r11+48] + vmovdqa xmm4, OWORD PTR [r11+64] + vmovdqa xmm5, OWORD PTR [r11+80] + vmovdqa xmm6, OWORD PTR [r11+96] + vmovdqa xmm7, OWORD PTR [r11+112] + vmovdqa xmm8, OWORD PTR [r11+128] + vmovdqa xmm9, OWORD PTR [r11+144] + vmovdqa xmm10, OWORD PTR [r11+160] + vmovdqa xmm11, OWORD PTR [r11+176] + vmovdqa xmm12, OWORD PTR [r11+192] + vmovdqa xmm13, OWORD PTR [r11+208] + vmovdqa xmm14, OWORD PTR [r11+224] + vmovdqa xmm15, OWORD PTR [r11+240] + jmp L_chacha20_avx1_start128 +L_chacha20_avx1_done128: + shl eax, 2 + add DWORD PTR [rcx+48], eax +L_chacha20_avx1_end128: + cmp r9d, 64 + jl L_chacha20_avx1_block_done +L_chacha20_avx1_block_start: + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vmovdqa xmm5, xmm0 + vmovdqa xmm6, xmm1 + vmovdqa xmm7, xmm2 + vmovdqa xmm8, xmm3 + mov al, 10 +L_chacha20_avx1_block_crypt_start: + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r15] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 20 + vpslld xmm1, xmm1, 12 + vpxor xmm1, xmm1, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r14] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 25 + vpslld xmm1, xmm1, 7 + vpxor xmm1, xmm1, xmm4 + vpshufd xmm1, xmm1, 57 + vpshufd xmm2, xmm2, 78 + vpshufd xmm3, xmm3, 147 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r15] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 20 + vpslld xmm1, xmm1, 12 + vpxor xmm1, xmm1, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r14] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 25 + vpslld xmm1, xmm1, 7 + vpxor xmm1, xmm1, xmm4 + vpshufd xmm1, xmm1, 147 + vpshufd xmm2, xmm2, 78 + vpshufd xmm3, xmm3, 57 + dec al + jnz L_chacha20_avx1_block_crypt_start + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm1, xmm1, xmm6 + vpaddd xmm2, xmm2, xmm7 + vpaddd xmm3, xmm3, xmm8 + vmovdqu xmm5, OWORD PTR [rdx] + vmovdqu xmm6, OWORD PTR [rdx+16] + vmovdqu xmm7, OWORD PTR [rdx+32] + vmovdqu xmm8, OWORD PTR [rdx+48] + vpxor xmm0, xmm0, xmm5 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm8 + vmovdqu OWORD PTR [r8], xmm0 + vmovdqu OWORD PTR [r8+16], xmm1 + vmovdqu OWORD PTR [r8+32], xmm2 + vmovdqu OWORD PTR [r8+48], xmm3 + add DWORD PTR [rcx+48], 1 + sub r9d, 64 + add rdx, 64 + add r8, 64 + cmp r9d, 64 + jge L_chacha20_avx1_block_start +L_chacha20_avx1_block_done: + cmp r9d, 0 + je L_chacha20_avx1_partial_done + lea r12, QWORD PTR [rcx+80] + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vmovdqa xmm5, xmm0 + vmovdqa xmm6, xmm1 + vmovdqa xmm7, xmm2 + vmovdqa xmm8, xmm3 + mov al, 10 
+L_chacha20_avx1_partial_crypt_start: + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r15] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 20 + vpslld xmm1, xmm1, 12 + vpxor xmm1, xmm1, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r14] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 25 + vpslld xmm1, xmm1, 7 + vpxor xmm1, xmm1, xmm4 + vpshufd xmm1, xmm1, 57 + vpshufd xmm2, xmm2, 78 + vpshufd xmm3, xmm3, 147 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r15] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 20 + vpslld xmm1, xmm1, 12 + vpxor xmm1, xmm1, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, OWORD PTR [r14] + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm4, xmm1, 25 + vpslld xmm1, xmm1, 7 + vpxor xmm1, xmm1, xmm4 + vpshufd xmm1, xmm1, 147 + vpshufd xmm2, xmm2, 78 + vpshufd xmm3, xmm3, 57 + dec al + jnz L_chacha20_avx1_partial_crypt_start + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm1, xmm1, xmm6 + vpaddd xmm2, xmm2, xmm7 + vpaddd xmm3, xmm3, xmm8 + vmovdqu OWORD PTR [r12], xmm0 + vmovdqu OWORD PTR [r12+16], xmm1 + vmovdqu OWORD PTR [r12+32], xmm2 + vmovdqu OWORD PTR [r12+48], xmm3 + add DWORD PTR [rcx+48], 1 + mov r10d, r9d + xor r13, r13 + and r10d, 7 + jz L_chacha20_avx1_partial_start64 +L_chacha20_avx1_partial_start8: + movzx eax, BYTE PTR [r12+r13] + xor al, BYTE PTR [rdx+r13] + mov BYTE PTR [r8+r13], al + inc r13d + cmp r13d, r10d + jne L_chacha20_avx1_partial_start8 + je L_chacha20_avx1_partial_end64 +L_chacha20_avx1_partial_start64: + mov rax, QWORD PTR [r12+r13] + xor rax, QWORD PTR [rdx+r13] + mov QWORD PTR [r8+r13], rax + add r13d, 8 +L_chacha20_avx1_partial_end64: + cmp r13d, r9d + jne L_chacha20_avx1_partial_start64 + mov r10d, 64 + sub r10d, r13d + mov DWORD PTR [rcx+76], r10d +L_chacha20_avx1_partial_done: + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+400] + vmovdqu xmm7, OWORD PTR [rsp+416] + vmovdqu xmm8, OWORD PTR [rsp+432] + vmovdqu xmm9, OWORD PTR [rsp+448] + vmovdqu xmm10, OWORD PTR [rsp+464] + vmovdqu xmm11, OWORD PTR [rsp+480] + vmovdqu xmm12, OWORD PTR [rsp+496] + vmovdqu xmm13, OWORD PTR [rsp+512] + vmovdqu xmm14, OWORD PTR [rsp+528] + vmovdqu xmm15, OWORD PTR [rsp+544] + add rsp, 560 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +chacha_encrypt_avx1 ENDP +_text ENDS +ENDIF +IFDEF HAVE_INTEL_AVX2 +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx2_rotl8 QWORD 433757367256023043, 1012478749960636427, + 433757367256023043, 1012478749960636427 +ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx2_rotl16 QWORD 361421592464458498, 940142975169071882, + 361421592464458498, 940142975169071882 +ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx2_add QWORD 4294967296, 12884901890, + 21474836484, 30064771078 +ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_chacha20_avx2_eight QWORD 34359738376, 34359738376, + 34359738376, 34359738376 +ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight +_DATA ENDS +_text SEGMENT READONLY PARA +chacha_encrypt_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 960 + vmovdqu OWORD PTR [rsp+800], xmm6 + vmovdqu OWORD PTR [rsp+816], xmm7 + vmovdqu OWORD PTR [rsp+832], xmm8 + vmovdqu OWORD PTR [rsp+848], xmm9 + vmovdqu OWORD PTR 
[rsp+864], xmm10 + vmovdqu OWORD PTR [rsp+880], xmm11 + vmovdqu OWORD PTR [rsp+896], xmm12 + vmovdqu OWORD PTR [rsp+912], xmm13 + vmovdqu OWORD PTR [rsp+928], xmm14 + vmovdqu OWORD PTR [rsp+944], xmm15 + mov r11, rsp + mov r13, QWORD PTR [ptr_L_chacha20_avx2_rotl8] + mov r14, QWORD PTR [ptr_L_chacha20_avx2_rotl16] + mov r15, QWORD PTR [ptr_L_chacha20_avx2_add] + mov rdi, QWORD PTR [ptr_L_chacha20_avx2_eight] + lea r12, QWORD PTR [rsp+512] + add r11, 31 + add r12, 31 + and r11, -32 + and r12, -32 + mov eax, r9d + shr eax, 9 + jz L_chacha20_avx2_end256 + vpbroadcastd ymm0, DWORD PTR [rcx] + vpbroadcastd ymm1, DWORD PTR [rcx+4] + vpbroadcastd ymm2, DWORD PTR [rcx+8] + vpbroadcastd ymm3, DWORD PTR [rcx+12] + vpbroadcastd ymm4, DWORD PTR [rcx+16] + vpbroadcastd ymm5, DWORD PTR [rcx+20] + vpbroadcastd ymm6, DWORD PTR [rcx+24] + vpbroadcastd ymm7, DWORD PTR [rcx+28] + vpbroadcastd ymm8, DWORD PTR [rcx+32] + vpbroadcastd ymm9, DWORD PTR [rcx+36] + vpbroadcastd ymm10, DWORD PTR [rcx+40] + vpbroadcastd ymm11, DWORD PTR [rcx+44] + vpbroadcastd ymm12, DWORD PTR [rcx+48] + vpbroadcastd ymm13, DWORD PTR [rcx+52] + vpbroadcastd ymm14, DWORD PTR [rcx+56] + vpbroadcastd ymm15, DWORD PTR [rcx+60] + vpaddd ymm12, ymm12, YMMWORD PTR [r15] + vmovdqa YMMWORD PTR [r11], ymm0 + vmovdqa YMMWORD PTR [r11+32], ymm1 + vmovdqa YMMWORD PTR [r11+64], ymm2 + vmovdqa YMMWORD PTR [r11+96], ymm3 + vmovdqa YMMWORD PTR [r11+128], ymm4 + vmovdqa YMMWORD PTR [r11+160], ymm5 + vmovdqa YMMWORD PTR [r11+192], ymm6 + vmovdqa YMMWORD PTR [r11+224], ymm7 + vmovdqa YMMWORD PTR [r11+256], ymm8 + vmovdqa YMMWORD PTR [r11+288], ymm9 + vmovdqa YMMWORD PTR [r11+320], ymm10 + vmovdqa YMMWORD PTR [r11+352], ymm11 + vmovdqa YMMWORD PTR [r11+384], ymm12 + vmovdqa YMMWORD PTR [r11+416], ymm13 + vmovdqa YMMWORD PTR [r11+448], ymm14 + vmovdqa YMMWORD PTR [r11+480], ymm15 +L_chacha20_avx2_start256: + mov r10b, 10 + vmovdqa YMMWORD PTR [r12+96], ymm11 +L_chacha20_avx2_loop256: + vpaddd ymm0, ymm0, ymm4 + vpxor ymm12, ymm12, ymm0 + vmovdqa ymm11, YMMWORD PTR [r12+96] + vpshufb ymm12, ymm12, YMMWORD PTR [r14] + vpaddd ymm8, ymm8, ymm12 + vpxor ymm4, ymm4, ymm8 + vpaddd ymm1, ymm1, ymm5 + vpxor ymm13, ymm13, ymm1 + vpshufb ymm13, ymm13, YMMWORD PTR [r14] + vpaddd ymm9, ymm9, ymm13 + vpxor ymm5, ymm5, ymm9 + vpaddd ymm2, ymm2, ymm6 + vpxor ymm14, ymm14, ymm2 + vpshufb ymm14, ymm14, YMMWORD PTR [r14] + vpaddd ymm10, ymm10, ymm14 + vpxor ymm6, ymm6, ymm10 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm15, ymm15, ymm3 + vpshufb ymm15, ymm15, YMMWORD PTR [r14] + vpaddd ymm11, ymm11, ymm15 + vpxor ymm7, ymm7, ymm11 + vmovdqa YMMWORD PTR [r12+96], ymm11 + vpsrld ymm11, ymm4, 20 + vpslld ymm4, ymm4, 12 + vpxor ymm4, ymm4, ymm11 + vpsrld ymm11, ymm5, 20 + vpslld ymm5, ymm5, 12 + vpxor ymm5, ymm5, ymm11 + vpsrld ymm11, ymm6, 20 + vpslld ymm6, ymm6, 12 + vpxor ymm6, ymm6, ymm11 + vpsrld ymm11, ymm7, 20 + vpslld ymm7, ymm7, 12 + vpxor ymm7, ymm7, ymm11 + vpaddd ymm0, ymm0, ymm4 + vpxor ymm12, ymm12, ymm0 + vmovdqa ymm11, YMMWORD PTR [r12+96] + vpshufb ymm12, ymm12, YMMWORD PTR [r13] + vpaddd ymm8, ymm8, ymm12 + vpxor ymm4, ymm4, ymm8 + vpaddd ymm1, ymm1, ymm5 + vpxor ymm13, ymm13, ymm1 + vpshufb ymm13, ymm13, YMMWORD PTR [r13] + vpaddd ymm9, ymm9, ymm13 + vpxor ymm5, ymm5, ymm9 + vpaddd ymm2, ymm2, ymm6 + vpxor ymm14, ymm14, ymm2 + vpshufb ymm14, ymm14, YMMWORD PTR [r13] + vpaddd ymm10, ymm10, ymm14 + vpxor ymm6, ymm6, ymm10 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm15, ymm15, ymm3 + vpshufb ymm15, ymm15, YMMWORD PTR [r13] + vpaddd ymm11, ymm11, ymm15 + vpxor ymm7, ymm7, ymm11 
+ vmovdqa YMMWORD PTR [r12+96], ymm11 + vpsrld ymm11, ymm4, 25 + vpslld ymm4, ymm4, 7 + vpxor ymm4, ymm4, ymm11 + vpsrld ymm11, ymm5, 25 + vpslld ymm5, ymm5, 7 + vpxor ymm5, ymm5, ymm11 + vpsrld ymm11, ymm6, 25 + vpslld ymm6, ymm6, 7 + vpxor ymm6, ymm6, ymm11 + vpsrld ymm11, ymm7, 25 + vpslld ymm7, ymm7, 7 + vpxor ymm7, ymm7, ymm11 + vpaddd ymm0, ymm0, ymm5 + vpxor ymm15, ymm15, ymm0 + vmovdqa ymm11, YMMWORD PTR [r12+96] + vpshufb ymm15, ymm15, YMMWORD PTR [r14] + vpaddd ymm10, ymm10, ymm15 + vpxor ymm5, ymm5, ymm10 + vpaddd ymm1, ymm1, ymm6 + vpxor ymm12, ymm12, ymm1 + vpshufb ymm12, ymm12, YMMWORD PTR [r14] + vpaddd ymm11, ymm11, ymm12 + vpxor ymm6, ymm6, ymm11 + vpaddd ymm2, ymm2, ymm7 + vpxor ymm13, ymm13, ymm2 + vpshufb ymm13, ymm13, YMMWORD PTR [r14] + vpaddd ymm8, ymm8, ymm13 + vpxor ymm7, ymm7, ymm8 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm14, ymm14, ymm3 + vpshufb ymm14, ymm14, YMMWORD PTR [r14] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm4, ymm4, ymm9 + vmovdqa YMMWORD PTR [r12+96], ymm11 + vpsrld ymm11, ymm5, 20 + vpslld ymm5, ymm5, 12 + vpxor ymm5, ymm5, ymm11 + vpsrld ymm11, ymm6, 20 + vpslld ymm6, ymm6, 12 + vpxor ymm6, ymm6, ymm11 + vpsrld ymm11, ymm7, 20 + vpslld ymm7, ymm7, 12 + vpxor ymm7, ymm7, ymm11 + vpsrld ymm11, ymm4, 20 + vpslld ymm4, ymm4, 12 + vpxor ymm4, ymm4, ymm11 + vpaddd ymm0, ymm0, ymm5 + vpxor ymm15, ymm15, ymm0 + vmovdqa ymm11, YMMWORD PTR [r12+96] + vpshufb ymm15, ymm15, YMMWORD PTR [r13] + vpaddd ymm10, ymm10, ymm15 + vpxor ymm5, ymm5, ymm10 + vpaddd ymm1, ymm1, ymm6 + vpxor ymm12, ymm12, ymm1 + vpshufb ymm12, ymm12, YMMWORD PTR [r13] + vpaddd ymm11, ymm11, ymm12 + vpxor ymm6, ymm6, ymm11 + vpaddd ymm2, ymm2, ymm7 + vpxor ymm13, ymm13, ymm2 + vpshufb ymm13, ymm13, YMMWORD PTR [r13] + vpaddd ymm8, ymm8, ymm13 + vpxor ymm7, ymm7, ymm8 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm14, ymm14, ymm3 + vpshufb ymm14, ymm14, YMMWORD PTR [r13] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm4, ymm4, ymm9 + vmovdqa YMMWORD PTR [r12+96], ymm11 + vpsrld ymm11, ymm5, 25 + vpslld ymm5, ymm5, 7 + vpxor ymm5, ymm5, ymm11 + vpsrld ymm11, ymm6, 25 + vpslld ymm6, ymm6, 7 + vpxor ymm6, ymm6, ymm11 + vpsrld ymm11, ymm7, 25 + vpslld ymm7, ymm7, 7 + vpxor ymm7, ymm7, ymm11 + vpsrld ymm11, ymm4, 25 + vpslld ymm4, ymm4, 7 + vpxor ymm4, ymm4, ymm11 + dec r10b + jnz L_chacha20_avx2_loop256 + vmovdqa ymm11, YMMWORD PTR [r12+96] + vpaddd ymm0, ymm0, YMMWORD PTR [r11] + vpaddd ymm1, ymm1, YMMWORD PTR [r11+32] + vpaddd ymm2, ymm2, YMMWORD PTR [r11+64] + vpaddd ymm3, ymm3, YMMWORD PTR [r11+96] + vpaddd ymm4, ymm4, YMMWORD PTR [r11+128] + vpaddd ymm5, ymm5, YMMWORD PTR [r11+160] + vpaddd ymm6, ymm6, YMMWORD PTR [r11+192] + vpaddd ymm7, ymm7, YMMWORD PTR [r11+224] + vpaddd ymm8, ymm8, YMMWORD PTR [r11+256] + vpaddd ymm9, ymm9, YMMWORD PTR [r11+288] + vpaddd ymm10, ymm10, YMMWORD PTR [r11+320] + vpaddd ymm11, ymm11, YMMWORD PTR [r11+352] + vpaddd ymm12, ymm12, YMMWORD PTR [r11+384] + vpaddd ymm13, ymm13, YMMWORD PTR [r11+416] + vpaddd ymm14, ymm14, YMMWORD PTR [r11+448] + vpaddd ymm15, ymm15, YMMWORD PTR [r11+480] + vmovdqa YMMWORD PTR [r12], ymm8 + vmovdqa YMMWORD PTR [r12+32], ymm9 + vmovdqa YMMWORD PTR [r12+64], ymm10 + vmovdqa YMMWORD PTR [r12+96], ymm11 + vmovdqa YMMWORD PTR [r12+128], ymm12 + vmovdqa YMMWORD PTR [r12+160], ymm13 + vmovdqa YMMWORD PTR [r12+192], ymm14 + vmovdqa YMMWORD PTR [r12+224], ymm15 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckldq ymm9, ymm2, ymm3 + vpunpckhdq ymm12, ymm0, ymm1 + vpunpckhdq ymm13, ymm2, ymm3 + vpunpckldq ymm10, ymm4, ymm5 + vpunpckldq ymm11, ymm6, ymm7 + vpunpckhdq ymm14, ymm4, 
ymm5 + vpunpckhdq ymm15, ymm6, ymm7 + vpunpcklqdq ymm0, ymm8, ymm9 + vpunpcklqdq ymm1, ymm10, ymm11 + vpunpckhqdq ymm2, ymm8, ymm9 + vpunpckhqdq ymm3, ymm10, ymm11 + vpunpcklqdq ymm4, ymm12, ymm13 + vpunpcklqdq ymm5, ymm14, ymm15 + vpunpckhqdq ymm6, ymm12, ymm13 + vpunpckhqdq ymm7, ymm14, ymm15 + vperm2i128 ymm8, ymm0, ymm1, 32 + vperm2i128 ymm9, ymm2, ymm3, 32 + vperm2i128 ymm12, ymm0, ymm1, 49 + vperm2i128 ymm13, ymm2, ymm3, 49 + vperm2i128 ymm10, ymm4, ymm5, 32 + vperm2i128 ymm11, ymm6, ymm7, 32 + vperm2i128 ymm14, ymm4, ymm5, 49 + vperm2i128 ymm15, ymm6, ymm7, 49 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+64] + vmovdqu ymm2, YMMWORD PTR [rdx+128] + vmovdqu ymm3, YMMWORD PTR [rdx+192] + vmovdqu ymm4, YMMWORD PTR [rdx+256] + vmovdqu ymm5, YMMWORD PTR [rdx+320] + vmovdqu ymm6, YMMWORD PTR [rdx+384] + vmovdqu ymm7, YMMWORD PTR [rdx+448] + vpxor ymm8, ymm8, ymm0 + vpxor ymm9, ymm9, ymm1 + vpxor ymm10, ymm10, ymm2 + vpxor ymm11, ymm11, ymm3 + vpxor ymm12, ymm12, ymm4 + vpxor ymm13, ymm13, ymm5 + vpxor ymm14, ymm14, ymm6 + vpxor ymm15, ymm15, ymm7 + vmovdqu YMMWORD PTR [r8], ymm8 + vmovdqu YMMWORD PTR [r8+64], ymm9 + vmovdqu YMMWORD PTR [r8+128], ymm10 + vmovdqu YMMWORD PTR [r8+192], ymm11 + vmovdqu YMMWORD PTR [r8+256], ymm12 + vmovdqu YMMWORD PTR [r8+320], ymm13 + vmovdqu YMMWORD PTR [r8+384], ymm14 + vmovdqu YMMWORD PTR [r8+448], ymm15 + vmovdqa ymm0, YMMWORD PTR [r12] + vmovdqa ymm1, YMMWORD PTR [r12+32] + vmovdqa ymm2, YMMWORD PTR [r12+64] + vmovdqa ymm3, YMMWORD PTR [r12+96] + vmovdqa ymm4, YMMWORD PTR [r12+128] + vmovdqa ymm5, YMMWORD PTR [r12+160] + vmovdqa ymm6, YMMWORD PTR [r12+192] + vmovdqa ymm7, YMMWORD PTR [r12+224] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckldq ymm9, ymm2, ymm3 + vpunpckhdq ymm12, ymm0, ymm1 + vpunpckhdq ymm13, ymm2, ymm3 + vpunpckldq ymm10, ymm4, ymm5 + vpunpckldq ymm11, ymm6, ymm7 + vpunpckhdq ymm14, ymm4, ymm5 + vpunpckhdq ymm15, ymm6, ymm7 + vpunpcklqdq ymm0, ymm8, ymm9 + vpunpcklqdq ymm1, ymm10, ymm11 + vpunpckhqdq ymm2, ymm8, ymm9 + vpunpckhqdq ymm3, ymm10, ymm11 + vpunpcklqdq ymm4, ymm12, ymm13 + vpunpcklqdq ymm5, ymm14, ymm15 + vpunpckhqdq ymm6, ymm12, ymm13 + vpunpckhqdq ymm7, ymm14, ymm15 + vperm2i128 ymm8, ymm0, ymm1, 32 + vperm2i128 ymm9, ymm2, ymm3, 32 + vperm2i128 ymm12, ymm0, ymm1, 49 + vperm2i128 ymm13, ymm2, ymm3, 49 + vperm2i128 ymm10, ymm4, ymm5, 32 + vperm2i128 ymm11, ymm6, ymm7, 32 + vperm2i128 ymm14, ymm4, ymm5, 49 + vperm2i128 ymm15, ymm6, ymm7, 49 + vmovdqu ymm0, YMMWORD PTR [rdx+32] + vmovdqu ymm1, YMMWORD PTR [rdx+96] + vmovdqu ymm2, YMMWORD PTR [rdx+160] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vmovdqu ymm4, YMMWORD PTR [rdx+288] + vmovdqu ymm5, YMMWORD PTR [rdx+352] + vmovdqu ymm6, YMMWORD PTR [rdx+416] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpxor ymm8, ymm8, ymm0 + vpxor ymm9, ymm9, ymm1 + vpxor ymm10, ymm10, ymm2 + vpxor ymm11, ymm11, ymm3 + vpxor ymm12, ymm12, ymm4 + vpxor ymm13, ymm13, ymm5 + vpxor ymm14, ymm14, ymm6 + vpxor ymm15, ymm15, ymm7 + vmovdqu YMMWORD PTR [r8+32], ymm8 + vmovdqu YMMWORD PTR [r8+96], ymm9 + vmovdqu YMMWORD PTR [r8+160], ymm10 + vmovdqu YMMWORD PTR [r8+224], ymm11 + vmovdqu YMMWORD PTR [r8+288], ymm12 + vmovdqu YMMWORD PTR [r8+352], ymm13 + vmovdqu YMMWORD PTR [r8+416], ymm14 + vmovdqu YMMWORD PTR [r8+480], ymm15 + vmovdqa ymm12, YMMWORD PTR [r11+384] + add rdx, 512 + add r8, 512 + vpaddd ymm12, ymm12, YMMWORD PTR [rdi] + sub r9d, 512 + vmovdqa YMMWORD PTR [r11+384], ymm12 + cmp r9d, 512 + jl L_chacha20_avx2_done256 + vmovdqa ymm0, YMMWORD PTR [r11] + vmovdqa ymm1, YMMWORD PTR [r11+32] 
+ vmovdqa ymm2, YMMWORD PTR [r11+64] + vmovdqa ymm3, YMMWORD PTR [r11+96] + vmovdqa ymm4, YMMWORD PTR [r11+128] + vmovdqa ymm5, YMMWORD PTR [r11+160] + vmovdqa ymm6, YMMWORD PTR [r11+192] + vmovdqa ymm7, YMMWORD PTR [r11+224] + vmovdqa ymm8, YMMWORD PTR [r11+256] + vmovdqa ymm9, YMMWORD PTR [r11+288] + vmovdqa ymm10, YMMWORD PTR [r11+320] + vmovdqa ymm11, YMMWORD PTR [r11+352] + vmovdqa ymm12, YMMWORD PTR [r11+384] + vmovdqa ymm13, YMMWORD PTR [r11+416] + vmovdqa ymm14, YMMWORD PTR [r11+448] + vmovdqa ymm15, YMMWORD PTR [r11+480] + jmp L_chacha20_avx2_start256 +L_chacha20_avx2_done256: + shl eax, 3 + add DWORD PTR [rcx+48], eax +L_chacha20_avx2_end256: + call chacha_encrypt_avx1 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+800] + vmovdqu xmm7, OWORD PTR [rsp+816] + vmovdqu xmm8, OWORD PTR [rsp+832] + vmovdqu xmm9, OWORD PTR [rsp+848] + vmovdqu xmm10, OWORD PTR [rsp+864] + vmovdqu xmm11, OWORD PTR [rsp+880] + vmovdqu xmm12, OWORD PTR [rsp+896] + vmovdqu xmm13, OWORD PTR [rsp+912] + vmovdqu xmm14, OWORD PTR [rsp+928] + vmovdqu xmm15, OWORD PTR [rsp+944] + add rsp, 960 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +chacha_encrypt_avx2 ENDP +_text ENDS +ENDIF +END diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 2001bc0a89..308aaae034 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -1,6 +1,6 @@ /* fe_x25519_asm.S */ /* - * Copyright (C) 2006-2023 wolfSSL Inc. + * Copyright (C) 2006-2024 wolfSSL Inc. * * This file is part of wolfSSL. * @@ -4140,7 +4140,7 @@ _fe_sq2_x64: adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 - mov %r10, %rax + movq %r10, %rax shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 shldq $0x01, %rcx, %r8 @@ -5946,7 +5946,7 @@ _ge_p2_dbl_x64: adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 - mov %r12, %rax + movq %r12, %rax shldq $0x01, %r11, %r12 shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 @@ -8676,7 +8676,7 @@ _sc_reduce_x64: movq $0xa7ed9ce5a30a2c13, %rcx movq %r12, %rax mulq %rcx - mov $0x00, %rbp + movq $0x00, %rbp addq %rax, %r8 adcq %rdx, %rbp movq %r13, %rax @@ -8689,7 +8689,7 @@ _sc_reduce_x64: addq %rbp, %r9 adcq %rax, %r10 adcq %rdx, %r11 - mov $0x00, %rbx + movq $0x00, %rbx adcq $0x00, %rbx movq %r15, %rax mulq %rcx @@ -8699,7 +8699,7 @@ _sc_reduce_x64: movq $0xeb2106215d086329, %rcx movq %r12, %rax mulq %rcx - mov $0x00, %rbp + movq $0x00, %rbp addq %rax, %r9 adcq %rdx, %rbp movq %r13, %rax @@ -8712,7 +8712,7 @@ _sc_reduce_x64: addq %rbp, %r10 adcq %rax, %r11 adcq %rdx, %rbx - mov $0x00, %rbp + movq $0x00, %rbp adcq $0x00, %rbp movq %r15, %rax mulq %rcx @@ -8962,7 +8962,7 @@ _sc_muladd_x64: movq $0xa7ed9ce5a30a2c13, %rbx movq %r12, %rax mulq %rbx - mov $0x00, %rbp + movq $0x00, %rbp addq %rax, %r8 adcq %rdx, %rbp movq %r13, %rax @@ -8975,7 +8975,7 @@ _sc_muladd_x64: addq %rbp, %r9 adcq %rax, %r10 adcq %rdx, %r11 - mov $0x00, %rsi + movq $0x00, %rsi adcq $0x00, %rsi movq %r15, %rax mulq %rbx @@ -8985,7 +8985,7 @@ _sc_muladd_x64: movq $0xeb2106215d086329, %rbx movq %r12, %rax mulq %rbx - mov $0x00, %rbp + movq $0x00, %rbp addq %rax, %r9 adcq %rdx, %rbp movq %r13, %rax @@ -8998,7 +8998,7 @@ _sc_muladd_x64: addq %rbp, %r10 adcq %rax, %r11 adcq %rdx, %rsi - mov $0x00, %rbp + movq $0x00, %rbp adcq $0x00, %rbp movq %r15, %rax mulq %rbx @@ -11371,7 +11371,7 @@ _fe_sq2_avx2: adcxq %rax, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 - mov %r11, %rax + movq %r11, %rax shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 @@ -12862,7 +12862,7 @@ _ge_p2_dbl_avx2: adcxq %r9, %r12 adoxq %rbx, %r13 
adcxq %rcx, %r13 - mov %r13, %r9 + movq %r13, %r9 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 shldq $0x01, %r10, %r11 @@ -15206,33 +15206,33 @@ _sc_reduce_avx2: adcq $0x00, %r15 # Sub product of top 4 words and order movq $0xa7ed9ce5a30a2c13, %rdx - mulx %r12, %rcx, %rax + mulxq %r12, %rcx, %rax addq %rcx, %r8 adcq %rax, %r9 - mulx %r14, %rcx, %rax + mulxq %r14, %rcx, %rax adcq %rcx, %r10 adcq %rax, %r11 - mov $0x00, %rsi + movq $0x00, %rsi adcq $0x00, %rsi - mulx %r13, %rcx, %rax + mulxq %r13, %rcx, %rax addq %rcx, %r9 adcq %rax, %r10 - mulx %r15, %rcx, %rax + mulxq %r15, %rcx, %rax adcq %rcx, %r11 adcq %rax, %rsi movq $0xeb2106215d086329, %rdx - mulx %r12, %rcx, %rax + mulxq %r12, %rcx, %rax addq %rcx, %r9 adcq %rax, %r10 - mulx %r14, %rcx, %rax + mulxq %r14, %rcx, %rax adcq %rcx, %r11 adcq %rax, %rsi - mov $0x00, %rbx + movq $0x00, %rbx adcq $0x00, %rbx - mulx %r13, %rcx, %rax + mulxq %r13, %rcx, %rax addq %rcx, %r10 adcq %rax, %r11 - mulx %r15, %rcx, %rax + mulxq %r15, %rcx, %rax adcq %rcx, %rsi adcq %rax, %rbx subq %r12, %r10 @@ -15265,21 +15265,21 @@ _sc_reduce_avx2: # Sub product of top 2 words and order # * -5812631a5cf5d3ed movq $0xa7ed9ce5a30a2c13, %rdx - mulx %r12, %rbp, %rax + mulxq %r12, %rbp, %rax movq $0x00, %rsi addq %rbp, %r8 adcq %rax, %r9 - mulx %r13, %rbp, %rax + mulxq %r13, %rbp, %rax adcq $0x00, %rsi addq %rbp, %r9 adcq %rax, %rsi # * -14def9dea2f79cd7 movq $0xeb2106215d086329, %rdx - mulx %r12, %rbp, %rax + mulxq %r12, %rbp, %rax movq $0x00, %rbx addq %rbp, %r9 adcq %rax, %r10 - mulx %r13, %rbp, %rax + mulxq %r13, %rbp, %rax adcq $0x00, %rbx addq %rbp, %r10 adcq %rax, %rbx @@ -15450,33 +15450,33 @@ _sc_muladd_avx2: adcq $0x00, %rbp # Sub product of top 4 words and order movq $0xa7ed9ce5a30a2c13, %rdx - mulx %r14, %rcx, %rax + mulxq %r14, %rcx, %rax addq %rcx, %r10 adcq %rax, %r11 - mulx %rbx, %rcx, %rax + mulxq %rbx, %rcx, %rax adcq %rcx, %r12 adcq %rax, %r13 - mov $0x00, %rsi + movq $0x00, %rsi adcq $0x00, %rsi - mulx %r15, %rcx, %rax + mulxq %r15, %rcx, %rax addq %rcx, %r11 adcq %rax, %r12 - mulx %rbp, %rcx, %rax + mulxq %rbp, %rcx, %rax adcq %rcx, %r13 adcq %rax, %rsi movq $0xeb2106215d086329, %rdx - mulx %r14, %rcx, %rax + mulxq %r14, %rcx, %rax addq %rcx, %r11 adcq %rax, %r12 - mulx %rbx, %rcx, %rax + mulxq %rbx, %rcx, %rax adcq %rcx, %r13 adcq %rax, %rsi - mov $0x00, %r8 + movq $0x00, %r8 adcq $0x00, %r8 - mulx %r15, %rcx, %rax + mulxq %r15, %rcx, %rax addq %rcx, %r12 adcq %rax, %r13 - mulx %rbp, %rcx, %rax + mulxq %rbp, %rcx, %rax adcq %rcx, %rsi adcq %rax, %r8 subq %r14, %r12 @@ -15509,21 +15509,21 @@ _sc_muladd_avx2: # Sub product of top 2 words and order # * -5812631a5cf5d3ed movq $0xa7ed9ce5a30a2c13, %rdx - mulx %r14, %r9, %rax + mulxq %r14, %r9, %rax movq $0x00, %rsi addq %r9, %r10 adcq %rax, %r11 - mulx %r15, %r9, %rax + mulxq %r15, %r9, %rax adcq $0x00, %rsi addq %r9, %r11 adcq %rax, %rsi # * -14def9dea2f79cd7 movq $0xeb2106215d086329, %rdx - mulx %r14, %r9, %rax + mulxq %r14, %r9, %rax movq $0x00, %r8 addq %r9, %r11 adcq %rax, %r12 - mulx %r15, %r9, %rax + mulxq %r15, %r9, %rax adcq $0x00, %r8 addq %r9, %r12 adcq %rax, %r8 diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 11d726bea6..6c9b6bcc5c 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -15,6 +15,8 @@ EXTRA_DIST += wolfcrypt/src/evp.c EXTRA_DIST += wolfcrypt/src/asm.c EXTRA_DIST += wolfcrypt/src/aes_asm.asm EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm +EXTRA_DIST += wolfcrypt/src/chacha_asm.asm +EXTRA_DIST += wolfcrypt/src/poly1305_asm.asm EXTRA_DIST 
+= wolfcrypt/src/wc_dsp.c EXTRA_DIST += wolfcrypt/src/sp_dsp32.c EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index f56b3fdadd..cde754752a 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -55,7 +55,7 @@ and Daniel J. Bernstein #pragma warning(disable: 4127) #endif -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP #include #include @@ -70,6 +70,10 @@ and Daniel J. Bernstein #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) #undef NO_AVX2_SUPPORT #endif + #if defined(_MSC_VER) && (_MSC_VER <= 1900) + #undef NO_AVX2_SUPPORT + #define NO_AVX2_SUPPORT + #endif #define HAVE_INTEL_AVX1 #ifndef NO_AVX2_SUPPORT @@ -77,13 +81,12 @@ and Daniel J. Bernstein #endif #endif -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP static word32 intel_flags = 0; static word32 cpu_flags_set = 0; #endif -#if (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) || \ - defined(POLY130564) +#if defined(USE_INTEL_POLY1305_SPEEDUP) || defined(POLY130564) #if defined(_MSC_VER) #define POLY1305_NOINLINE __declspec(noinline) #elif defined(__GNUC__) @@ -123,7 +126,7 @@ static word32 cpu_flags_set = 0; #endif #endif -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP #ifdef __cplusplus extern "C" { #endif @@ -266,7 +269,7 @@ with a given ctx pointer to a Poly1305 structure. static int poly1305_blocks(Poly1305* ctx, const unsigned char *m, size_t bytes) { -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP /* AVX2 is handled in wc_Poly1305Update. */ SAVE_VECTOR_REGISTERS(return _svr_ret;); poly1305_blocks_avx(ctx, m, bytes); @@ -400,7 +403,7 @@ number of bytes is less than the block size. */ static int poly1305_block(Poly1305* ctx, const unsigned char *m) { -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. 
*/ SAVE_VECTOR_REGISTERS(return _svr_ret;); poly1305_block_avx(ctx, m); @@ -415,8 +418,7 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m) #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) { -#if defined(POLY130564) && \ - !(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) +#if defined(POLY130564) && !defined(USE_INTEL_POLY1305_SPEEDUP) word64 t0,t1; #endif @@ -437,7 +439,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) if (keySz != 32 || ctx == NULL) return BAD_FUNC_ARG; -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP if (!cpu_flags_set) { intel_flags = cpuid_get_flags(); cpu_flags_set = 1; @@ -504,7 +506,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) int wc_Poly1305Final(Poly1305* ctx, byte* mac) { -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP #elif defined(POLY130564) word64 h0,h1,h2,c; @@ -523,7 +525,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) if (ctx == NULL || mac == NULL) return BAD_FUNC_ARG; -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP SAVE_VECTOR_REGISTERS(return _svr_ret;); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) @@ -709,7 +711,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) printf("\n"); #endif -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { SAVE_VECTOR_REGISTERS(return _svr_ret;); diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S index 5eaa6d36fb..b995670f8b 100644 --- a/wolfcrypt/src/poly1305_asm.S +++ b/wolfcrypt/src/poly1305_asm.S @@ -1106,7 +1106,7 @@ L_poly1305_avx2_final_start_copy: incb %cl incb %dl L_poly1305_avx2_final_cmp_copy: - cmp %rcx, %rax + cmpb %cl, %al jne L_poly1305_avx2_final_start_copy #ifndef __APPLE__ callq poly1305_final_avx@plt diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm new file mode 100644 index 0000000000..972a62c63e --- /dev/null +++ b/wolfcrypt/src/poly1305_asm.asm @@ -0,0 +1,1060 @@ +; /* poly1305_asm.asm */ +; /* +; * Copyright (C) 2006-2024 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 2 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +IFDEF HAVE_INTEL_AVX1 +_text SEGMENT READONLY PARA +poly1305_setkey_avx PROC + push r12 + push r13 + mov r12, 1152921487695413247 + mov r13, 1152921487695413244 + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + and rax, r12 + and r8, r13 + mov r12, rax + mov r13, r8 + xor r11, r11 + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r8 + mov QWORD PTR [rcx+24], r11 + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+40], r11 + mov QWORD PTR [rcx+48], r9 + mov QWORD PTR [rcx+56], r10 + mov QWORD PTR [rcx+352], r11 + mov QWORD PTR [rcx+408], r11 + mov QWORD PTR [rcx+360], rax + mov QWORD PTR [rcx+416], r8 + add r12, rax + add r13, r8 + mov QWORD PTR [rcx+368], r12 + mov QWORD PTR [rcx+424], r13 + add r12, rax + add r13, r8 + mov QWORD PTR [rcx+376], r12 + mov QWORD PTR [rcx+432], r13 + add r12, rax + add r13, r8 + mov QWORD PTR [rcx+384], r12 + mov QWORD PTR [rcx+440], r13 + add r12, rax + add r13, r8 + mov QWORD PTR [rcx+392], r12 + mov QWORD PTR [rcx+448], r13 + add r12, rax + add r13, r8 + mov QWORD PTR [rcx+400], r12 + mov QWORD PTR [rcx+456], r13 + mov QWORD PTR [rcx+608], r11 + mov BYTE PTR [rcx+616], 1 + pop r13 + pop r12 + ret +poly1305_setkey_avx ENDP +_text ENDS +_text SEGMENT READONLY PARA +poly1305_block_avx PROC + push r15 + push rbx + push r12 + push r13 + push r14 + mov r15, QWORD PTR [rcx] + mov rbx, QWORD PTR [rcx+8] + mov r8, QWORD PTR [rcx+24] + mov r9, QWORD PTR [rcx+32] + mov r10, QWORD PTR [rcx+40] + xor r14, r14 + mov r14b, BYTE PTR [rcx+616] + ; h += m + mov r11, QWORD PTR [rdx] + mov r12, QWORD PTR [rdx+8] + add r8, r11 + adc r9, r12 + mov rax, rbx + adc r10, r14 + ; r[1] * h[0] => rdx, rax ==> t2, t1 + mul r8 + mov r12, rax + mov r13, rdx + ; r[0] * h[1] => rdx, rax ++> t2, t1 + mov rax, r15 + mul r9 + add r12, rax + mov rax, r15 + adc r13, rdx + ; r[0] * h[0] => rdx, rax ==> t4, t0 + mul r8 + mov r11, rax + mov r8, rdx + ; r[1] * h[1] => rdx, rax =+> t3, t2 + mov rax, rbx + mul r9 + ; r[0] * h[2] +> t2 + add r13, QWORD PTR [rcx+8*r10+352] + mov r14, rdx + add r12, r8 + adc r13, rax + ; r[1] * h[2] +> t3 + adc r14, QWORD PTR [rcx+8*r10+408] + ; r * h in r14, r13, r12, r11 + ; h = (r * h) mod 2^130 - 5 + mov r10, r13 + and r13, -4 + and r10, 3 + add r11, r13 + mov r8, r13 + adc r12, r14 + adc r10, 0 + shrd r8, r14, 2 + shr r14, 2 + add r8, r11 + adc r12, r14 + mov r9, r12 + adc r10, 0 + ; h in r10, r9, r8 + ; Store h to ctx + mov QWORD PTR [rcx+24], r8 + mov QWORD PTR [rcx+32], r9 + mov QWORD PTR [rcx+40], r10 + pop r14 + pop r13 + pop r12 + pop rbx + pop r15 + ret +poly1305_block_avx ENDP +_text ENDS +_text SEGMENT READONLY PARA +poly1305_blocks_avx PROC + push rdi + push rsi + push r15 + push rbx + push r12 + push r13 + push r14 + mov rdi, rcx + mov rsi, rdx + mov rcx, r8 + mov r15, QWORD PTR [rdi] + mov rbx, QWORD PTR [rdi+8] + mov r8, QWORD PTR [rdi+24] + mov r9, QWORD PTR [rdi+32] + 
mov r10, QWORD PTR [rdi+40] +L_poly1305_avx_blocks_start: + ; h += m + mov r11, QWORD PTR [rsi] + mov r12, QWORD PTR [rsi+8] + add r8, r11 + adc r9, r12 + mov rax, rbx + adc r10, 0 + ; r[1] * h[0] => rdx, rax ==> t2, t1 + mul r8 + mov r12, rax + mov r13, rdx + ; r[0] * h[1] => rdx, rax ++> t2, t1 + mov rax, r15 + mul r9 + add r12, rax + mov rax, r15 + adc r13, rdx + ; r[0] * h[0] => rdx, rax ==> t4, t0 + mul r8 + mov r11, rax + mov r8, rdx + ; r[1] * h[1] => rdx, rax =+> t3, t2 + mov rax, rbx + mul r9 + ; r[0] * h[2] +> t2 + add r13, QWORD PTR [rdi+8*r10+360] + mov r14, rdx + add r12, r8 + adc r13, rax + ; r[1] * h[2] +> t3 + adc r14, QWORD PTR [rdi+8*r10+416] + ; r * h in r14, r13, r12, r11 + ; h = (r * h) mod 2^130 - 5 + mov r10, r13 + and r13, -4 + and r10, 3 + add r11, r13 + mov r8, r13 + adc r12, r14 + adc r10, 0 + shrd r8, r14, 2 + shr r14, 2 + add r8, r11 + adc r12, r14 + mov r9, r12 + adc r10, 0 + ; h in r10, r9, r8 + ; Next block from message + add rsi, 16 + sub rcx, 16 + jg L_poly1305_avx_blocks_start + ; Store h to ctx + mov QWORD PTR [rdi+24], r8 + mov QWORD PTR [rdi+32], r9 + mov QWORD PTR [rdi+40], r10 + pop r14 + pop r13 + pop r12 + pop rbx + pop r15 + pop rsi + pop rdi + ret +poly1305_blocks_avx ENDP +_text ENDS +_text SEGMENT READONLY PARA +poly1305_final_avx PROC + push rdi + push rbx + push r12 + mov rdi, rcx + mov rbx, rdx + mov rax, QWORD PTR [rdi+608] + test rax, rax + je L_poly1305_avx_final_no_more + mov BYTE PTR [rdi+rax+480], 1 + jmp L_poly1305_avx_final_cmp_rem +L_poly1305_avx_final_zero_rem: + mov BYTE PTR [rdi+rax+480], 0 +L_poly1305_avx_final_cmp_rem: + inc al + cmp rax, 16 + jl L_poly1305_avx_final_zero_rem + mov BYTE PTR [rdi+616], 0 + lea rdx, QWORD PTR [rdi+480] + call poly1305_block_avx +L_poly1305_avx_final_no_more: + mov rax, QWORD PTR [rdi+24] + mov rdx, QWORD PTR [rdi+32] + mov rcx, QWORD PTR [rdi+40] + mov r11, QWORD PTR [rdi+48] + mov r12, QWORD PTR [rdi+56] + ; h %= p + ; h = (h + pad) + ; mod 2^130 - 5 + mov r8, rcx + and rcx, 3 + shr r8, 2 + ; Multiply by 5 + lea r8, QWORD PTR [r8+4*r8+0] + add rax, r8 + adc rdx, 0 + adc rcx, 0 + ; Fixup when between (1 << 130) - 1 and (1 << 130) - 5 + mov r8, rax + mov r9, rdx + mov r10, rcx + add r8, 5 + adc r9, 0 + adc r10, 0 + cmp r10, 4 + cmove rax, r8 + cmove rdx, r9 + ; h += pad + add rax, r11 + adc rdx, r12 + mov QWORD PTR [rbx], rax + mov QWORD PTR [rbx+8], rdx + ; Zero out r + mov QWORD PTR [rdi], 0 + mov QWORD PTR [rdi+8], 0 + ; Zero out h + mov QWORD PTR [rdi+24], 0 + mov QWORD PTR [rdi+32], 0 + mov QWORD PTR [rdi+40], 0 + ; Zero out pad + mov QWORD PTR [rdi+48], 0 + mov QWORD PTR [rdi+56], 0 + pop r12 + pop rbx + pop rdi + ret +poly1305_final_avx ENDP +_text ENDS +ENDIF +IFDEF HAVE_INTEL_AVX2 +_text SEGMENT READONLY PARA +poly1305_calc_powers_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+8] + xor r10, r10 + ; Convert to 26 bits in 32 + mov rax, r8 + mov rdx, r8 + mov rsi, r8 + mov rbx, r9 + mov rbp, r9 + shr rdx, 26 + shrd rsi, r9, 52 + shr rbx, 14 + shrd rbp, r10, 40 + and rax, 67108863 + and rdx, 67108863 + and rsi, 67108863 + and rbx, 67108863 + and rbp, 67108863 + mov DWORD PTR [rcx+224], eax + mov DWORD PTR [rcx+228], edx + mov DWORD PTR [rcx+232], esi + mov DWORD PTR [rcx+236], ebx + mov DWORD PTR [rcx+240], ebp + mov DWORD PTR [rcx+244], 0 + ; Square 128-bit + mov rax, r9 + mul r8 + xor r14, r14 + mov r12, rax + mov r13, rdx + add r12, rax + adc r13, rdx + adc r14, 0 + mov rax, r8 + mul rax + 
mov r11, rax + mov rdi, rdx + mov rax, r9 + mul rax + add r12, rdi + adc r13, rax + adc r14, rdx + ; Reduce 256-bit to 130-bit + mov rax, r13 + mov rdx, r14 + and rax, -4 + and r13, 3 + add r11, rax + adc r12, rdx + adc r13, 0 + shrd rax, rdx, 2 + shr rdx, 2 + add r11, rax + adc r12, rdx + adc r13, 0 + mov rax, r13 + shr rax, 2 + lea rax, QWORD PTR [rax+4*rax+0] + and r13, 3 + add r11, rax + adc r12, 0 + adc r13, 0 + ; Convert to 26 bits in 32 + mov rax, r11 + mov rdx, r11 + mov rsi, r11 + mov rbx, r12 + mov rbp, r12 + shr rdx, 26 + shrd rsi, r12, 52 + shr rbx, 14 + shrd rbp, r13, 40 + and rax, 67108863 + and rdx, 67108863 + and rsi, 67108863 + and rbx, 67108863 + and rbp, 67108863 + mov DWORD PTR [rcx+256], eax + mov DWORD PTR [rcx+260], edx + mov DWORD PTR [rcx+264], esi + mov DWORD PTR [rcx+268], ebx + mov DWORD PTR [rcx+272], ebp + mov DWORD PTR [rcx+276], 0 + ; Multiply 128-bit by 130-bit + ; r1[0] * r2[0] + mov rax, r8 + mul r11 + mov r14, rax + mov r15, rdx + ; r1[0] * r2[1] + mov rax, r8 + mul r12 + mov rdi, 0 + add r15, rax + adc rdi, rdx + ; r1[1] * r2[0] + mov rax, r9 + mul r11 + mov rsi, 0 + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; r1[0] * r2[2] + mov rax, r8 + mul r13 + add rdi, rax + adc rsi, rdx + ; r1[1] * r2[1] + mov rax, r9 + mul r12 + mov rbx, 0 + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; r1[1] * r2[2] + mov rax, r9 + mul r13 + add rsi, rax + adc rbx, rdx + ; Reduce 260-bit to 130-bit + mov rax, rdi + mov rdx, rsi + mov rbx, rbx + and rax, -4 + and rdi, 3 + add r14, rax + adc r15, rdx + adc rdi, rbx + shrd rax, rdx, 2 + shrd rdx, rbx, 2 + shr rbx, 2 + add r14, rax + adc r15, rdx + adc rdi, rbx + mov rax, rdi + and rdi, 3 + shr rax, 2 + lea rax, QWORD PTR [rax+4*rax+0] + add r14, rax + adc r15, 0 + adc rdi, 0 + ; Convert to 26 bits in 32 + mov rax, r14 + mov rdx, r14 + mov rsi, r14 + mov rbx, r15 + mov rbp, r15 + shr rdx, 26 + shrd rsi, r15, 52 + shr rbx, 14 + shrd rbp, rdi, 40 + and rax, 67108863 + and rdx, 67108863 + and rsi, 67108863 + and rbx, 67108863 + and rbp, 67108863 + mov DWORD PTR [rcx+288], eax + mov DWORD PTR [rcx+292], edx + mov DWORD PTR [rcx+296], esi + mov DWORD PTR [rcx+300], ebx + mov DWORD PTR [rcx+304], ebp + mov DWORD PTR [rcx+308], 0 + ; Square 130-bit + mov rax, r12 + mul r11 + xor r14, r14 + mov r9, rax + mov r10, rdx + add r9, rax + adc r10, rdx + adc r14, 0 + mov rax, r11 + mul rax + mov r8, rax + mov rdi, rdx + mov rax, r12 + mul rax + add r9, rdi + adc r10, rax + adc r14, rdx + mov rax, r13 + mul rax + mov r15, rax + mov rax, r13 + mul r11 + add r10, rax + adc r14, rdx + adc r15, 0 + add r10, rax + adc r14, rdx + adc r15, 0 + mov rax, r13 + mul r12 + add r14, rax + adc r15, rdx + add r14, rax + adc r15, rdx + ; Reduce 260-bit to 130-bit + mov rax, r10 + mov rdx, r14 + mov rdi, r15 + and rax, -4 + and r10, 3 + add r8, rax + adc r9, rdx + adc r10, rdi + shrd rax, rdx, 2 + shrd rdx, rdi, 2 + shr rdi, 2 + add r8, rax + adc r9, rdx + adc r10, rdi + mov rax, r10 + and r10, 3 + shr rax, 2 + lea rax, QWORD PTR [rax+4*rax+0] + add r8, rax + adc r9, 0 + adc r10, 0 + ; Convert to 26 bits in 32 + mov rax, r8 + mov rdx, r8 + mov rsi, r8 + mov rbx, r9 + mov rbp, r9 + shr rdx, 26 + shrd rsi, r9, 52 + shr rbx, 14 + shrd rbp, r10, 40 + and rax, 67108863 + and rdx, 67108863 + and rsi, 67108863 + and rbx, 67108863 + and rbp, 67108863 + mov DWORD PTR [rcx+320], eax + mov DWORD PTR [rcx+324], edx + mov DWORD PTR [rcx+328], esi + mov DWORD PTR [rcx+332], ebx + mov DWORD PTR [rcx+336], ebp + mov DWORD PTR [rcx+340], 0 + pop rbp + pop rbx + pop rsi + pop rdi + 
pop r15 + pop r14 + pop r13 + pop r12 + ret +poly1305_calc_powers_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +poly1305_setkey_avx2 PROC + call poly1305_setkey_avx + vpxor ymm0, ymm0, ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm0 + vmovdqu YMMWORD PTR [rcx+192], ymm0 + mov QWORD PTR [rcx+608], 0 + mov WORD PTR [rcx+616], 0 + ret +poly1305_setkey_avx2 ENDP +_text ENDS +_DATA SEGMENT +ALIGN 16 +L_poly1305_avx2_blocks_mask QWORD 67108863, 67108863, + 67108863, 67108863 +ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_poly1305_avx2_blocks_hibit QWORD 16777216, 16777216, + 16777216, 16777216 +ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit +_DATA ENDS +_text SEGMENT READONLY PARA +poly1305_blocks_avx2 PROC + push r12 + push rdi + push rsi + push rbx + push r13 + push r14 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + sub rsp, 480 + vmovdqu OWORD PTR [rsp+320], xmm6 + vmovdqu OWORD PTR [rsp+336], xmm7 + vmovdqu OWORD PTR [rsp+352], xmm8 + vmovdqu OWORD PTR [rsp+368], xmm9 + vmovdqu OWORD PTR [rsp+384], xmm10 + vmovdqu OWORD PTR [rsp+400], xmm11 + vmovdqu OWORD PTR [rsp+416], xmm12 + vmovdqu OWORD PTR [rsp+432], xmm13 + vmovdqu OWORD PTR [rsp+448], xmm14 + vmovdqu OWORD PTR [rsp+464], xmm15 + mov r13, QWORD PTR [ptr_L_poly1305_avx2_blocks_mask] + mov r14, QWORD PTR [ptr_L_poly1305_avx2_blocks_hibit] + mov rcx, rsp + and rcx, -32 + add rcx, 32 + vpxor ymm15, ymm15, ymm15 + mov rbx, rcx + lea rax, QWORD PTR [rdi+64] + add rbx, 160 + cmp WORD PTR [rdi+616], 0 + jne L_poly1305_avx2_blocks_begin_h + ; Load the message data + vmovdqu ymm0, YMMWORD PTR [rsi] + vmovdqu ymm1, YMMWORD PTR [rsi+32] + vperm2i128 ymm2, ymm0, ymm1, 32 + vperm2i128 ymm0, ymm0, ymm1, 49 + vpunpckldq ymm1, ymm2, ymm0 + vpunpckhdq ymm3, ymm2, ymm0 + vpunpckldq ymm0, ymm1, ymm15 + vpunpckhdq ymm1, ymm1, ymm15 + vpunpckldq ymm2, ymm3, ymm15 + vpunpckhdq ymm3, ymm3, ymm15 + vmovdqu ymm4, YMMWORD PTR [r14] + vpsllq ymm1, ymm1, 6 + vpsllq ymm2, ymm2, 12 + vpsllq ymm3, ymm3, 18 + vmovdqu ymm14, YMMWORD PTR [r13] + ; Reduce, in place, the message data + vpsrlq ymm10, ymm0, 26 + vpsrlq ymm11, ymm3, 26 + vpand ymm0, ymm0, ymm14 + vpand ymm3, ymm3, ymm14 + vpaddq ymm1, ymm10, ymm1 + vpaddq ymm4, ymm11, ymm4 + vpsrlq ymm10, ymm1, 26 + vpsrlq ymm11, ymm4, 26 + vpand ymm1, ymm1, ymm14 + vpand ymm4, ymm4, ymm14 + vpaddq ymm2, ymm10, ymm2 + vpslld ymm12, ymm11, 2 + vpaddd ymm12, ymm11, ymm12 + vpsrlq ymm10, ymm2, 26 + vpaddq ymm0, ymm12, ymm0 + vpsrlq ymm11, ymm0, 26 + vpand ymm2, ymm2, ymm14 + vpand ymm0, ymm0, ymm14 + vpaddq ymm3, ymm10, ymm3 + vpaddq ymm1, ymm11, ymm1 + vpsrlq ymm10, ymm3, 26 + vpand ymm3, ymm3, ymm14 + vpaddq ymm4, ymm10, ymm4 + add rsi, 64 + sub rdx, 64 + jz L_poly1305_avx2_blocks_store + jmp L_poly1305_avx2_blocks_load_r4 +L_poly1305_avx2_blocks_begin_h: + ; Load the H values. + vmovdqu ymm0, YMMWORD PTR [rax] + vmovdqu ymm1, YMMWORD PTR [rax+32] + vmovdqu ymm2, YMMWORD PTR [rax+64] + vmovdqu ymm3, YMMWORD PTR [rax+96] + vmovdqu ymm4, YMMWORD PTR [rax+128] + ; Check if there is a power of r to load - otherwise use r^4. + cmp BYTE PTR [rdi+616], 0 + je L_poly1305_avx2_blocks_load_r4 + ; Load the 4 powers of r - r^4, r^3, r^2, r^1. 
+ vmovdqu ymm8, YMMWORD PTR [rdi+224] + vmovdqu ymm7, YMMWORD PTR [rdi+256] + vmovdqu ymm6, YMMWORD PTR [rdi+288] + vmovdqu ymm5, YMMWORD PTR [rdi+320] + vpermq ymm5, ymm5, 216 + vpermq ymm6, ymm6, 216 + vpermq ymm7, ymm7, 216 + vpermq ymm8, ymm8, 216 + vpunpcklqdq ymm10, ymm5, ymm6 + vpunpckhqdq ymm11, ymm5, ymm6 + vpunpcklqdq ymm12, ymm7, ymm8 + vpunpckhqdq ymm13, ymm7, ymm8 + vperm2i128 ymm5, ymm10, ymm12, 32 + vperm2i128 ymm7, ymm10, ymm12, 49 + vperm2i128 ymm9, ymm11, ymm13, 32 + vpsrlq ymm6, ymm5, 32 + vpsrlq ymm8, ymm7, 32 + jmp L_poly1305_avx2_blocks_mul_5 +L_poly1305_avx2_blocks_load_r4: + ; Load r^4 into all four positions. + vmovdqu ymm13, YMMWORD PTR [rdi+320] + vpermq ymm5, ymm13, 0 + vpsrlq ymm14, ymm13, 32 + vpermq ymm7, ymm13, 85 + vpermq ymm9, ymm13, 170 + vpermq ymm6, ymm14, 0 + vpermq ymm8, ymm14, 85 +L_poly1305_avx2_blocks_mul_5: + ; Multiply top 4 26-bit values of all four H by 5 + vpslld ymm10, ymm6, 2 + vpslld ymm11, ymm7, 2 + vpslld ymm12, ymm8, 2 + vpslld ymm13, ymm9, 2 + vpaddq ymm10, ymm6, ymm10 + vpaddq ymm11, ymm7, ymm11 + vpaddq ymm12, ymm8, ymm12 + vpaddq ymm13, ymm9, ymm13 + ; Store powers of r and multiple of 5 for use in multiply. + vmovdqa YMMWORD PTR [rbx], ymm10 + vmovdqa YMMWORD PTR [rbx+32], ymm11 + vmovdqa YMMWORD PTR [rbx+64], ymm12 + vmovdqa YMMWORD PTR [rbx+96], ymm13 + vmovdqa YMMWORD PTR [rcx], ymm5 + vmovdqa YMMWORD PTR [rcx+32], ymm6 + vmovdqa YMMWORD PTR [rcx+64], ymm7 + vmovdqa YMMWORD PTR [rcx+96], ymm8 + vmovdqa YMMWORD PTR [rcx+128], ymm9 + vmovdqu ymm14, YMMWORD PTR [r13] + ; If not finished then loop over data + cmp BYTE PTR [rdi+616], 1 + jne L_poly1305_avx2_blocks_start + ; Do last multiply, reduce, add the four H together and move to + ; 32-bit registers + vpmuludq ymm5, ymm4, [rbx] + vpmuludq ymm10, ymm3, [rbx+32] + vpmuludq ymm6, ymm4, [rbx+32] + vpmuludq ymm11, ymm3, [rbx+64] + vpmuludq ymm7, ymm4, [rbx+64] + vpaddq ymm5, ymm10, ymm5 + vpmuludq ymm12, ymm2, [rbx+64] + vpmuludq ymm8, ymm4, [rbx+96] + vpaddq ymm6, ymm11, ymm6 + vpmuludq ymm13, ymm1, [rbx+96] + vpmuludq ymm10, ymm2, [rbx+96] + vpaddq ymm5, ymm12, ymm5 + vpmuludq ymm11, ymm3, [rbx+96] + vpmuludq ymm12, ymm3, [rcx] + vpaddq ymm5, ymm13, ymm5 + vpmuludq ymm9, ymm4, [rcx] + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm13, ymm0, [rcx] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm10, ymm1, [rcx] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm11, ymm2, [rcx] + vpmuludq ymm12, ymm2, [rcx+32] + vpaddq ymm5, ymm13, ymm5 + vpmuludq ymm13, ymm3, [rcx+32] + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm10, ymm0, [rcx+32] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm11, ymm1, [rcx+32] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm12, ymm1, [rcx+64] + vpaddq ymm9, ymm13, ymm9 + vpmuludq ymm13, ymm2, [rcx+64] + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm10, ymm0, [rcx+64] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm11, ymm0, [rcx+96] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm12, ymm1, [rcx+96] + vpaddq ymm9, ymm13, ymm9 + vpaddq ymm7, ymm10, ymm7 + vpmuludq ymm13, ymm0, [rcx+128] + vpaddq ymm8, ymm11, ymm8 + vpaddq ymm9, ymm12, ymm9 + vpaddq ymm9, ymm13, ymm9 + vpsrlq ymm10, ymm5, 26 + vpsrlq ymm11, ymm8, 26 + vpand ymm5, ymm5, ymm14 + vpand ymm8, ymm8, ymm14 + vpaddq ymm6, ymm10, ymm6 + vpaddq ymm9, ymm11, ymm9 + vpsrlq ymm10, ymm6, 26 + vpsrlq ymm11, ymm9, 26 + vpand ymm1, ymm6, ymm14 + vpand ymm4, ymm9, ymm14 + vpaddq ymm7, ymm10, ymm7 + vpslld ymm12, ymm11, 2 + vpaddd ymm12, ymm11, ymm12 + vpsrlq ymm10, ymm7, 26 + vpaddq ymm5, ymm12, ymm5 + vpsrlq ymm11, ymm5, 26 + vpand ymm2, ymm7, ymm14 + vpand ymm0, ymm5, ymm14 + 
vpaddq ymm8, ymm10, ymm8 + vpaddq ymm1, ymm11, ymm1 + vpsrlq ymm10, ymm8, 26 + vpand ymm3, ymm8, ymm14 + vpaddq ymm4, ymm10, ymm4 + vpsrldq ymm5, ymm0, 8 + vpsrldq ymm6, ymm1, 8 + vpsrldq ymm7, ymm2, 8 + vpsrldq ymm8, ymm3, 8 + vpsrldq ymm9, ymm4, 8 + vpaddq ymm0, ymm5, ymm0 + vpaddq ymm1, ymm6, ymm1 + vpaddq ymm2, ymm7, ymm2 + vpaddq ymm3, ymm8, ymm3 + vpaddq ymm4, ymm9, ymm4 + vpermq ymm5, ymm0, 2 + vpermq ymm6, ymm1, 2 + vpermq ymm7, ymm2, 2 + vpermq ymm8, ymm3, 2 + vpermq ymm9, ymm4, 2 + vpaddq ymm0, ymm5, ymm0 + vpaddq ymm1, ymm6, ymm1 + vpaddq ymm2, ymm7, ymm2 + vpaddq ymm3, ymm8, ymm3 + vpaddq ymm4, ymm9, ymm4 + vmovd r8d, xmm0 + vmovd r9d, xmm1 + vmovd r10d, xmm2 + vmovd r11d, xmm3 + vmovd r12d, xmm4 + jmp L_poly1305_avx2_blocks_end_calc +L_poly1305_avx2_blocks_start: + vmovdqu ymm5, YMMWORD PTR [rsi] + vmovdqu ymm6, YMMWORD PTR [rsi+32] + vperm2i128 ymm7, ymm5, ymm6, 32 + vperm2i128 ymm5, ymm5, ymm6, 49 + vpunpckldq ymm6, ymm7, ymm5 + vpunpckhdq ymm8, ymm7, ymm5 + vpunpckldq ymm5, ymm6, ymm15 + vpunpckhdq ymm6, ymm6, ymm15 + vpunpckldq ymm7, ymm8, ymm15 + vpunpckhdq ymm8, ymm8, ymm15 + vmovdqu ymm9, YMMWORD PTR [r14] + vpsllq ymm6, ymm6, 6 + vpsllq ymm7, ymm7, 12 + vpsllq ymm8, ymm8, 18 + vpmuludq ymm10, ymm4, [rbx] + vpaddq ymm5, ymm10, ymm5 + vpmuludq ymm10, ymm3, [rbx+32] + vpmuludq ymm11, ymm4, [rbx+32] + vpaddq ymm6, ymm11, ymm6 + vpmuludq ymm11, ymm3, [rbx+64] + vpmuludq ymm12, ymm4, [rbx+64] + vpaddq ymm7, ymm12, ymm7 + vpaddq ymm5, ymm10, ymm5 + vpmuludq ymm12, ymm2, [rbx+64] + vpmuludq ymm13, ymm4, [rbx+96] + vpaddq ymm8, ymm13, ymm8 + vpaddq ymm6, ymm11, ymm6 + vpmuludq ymm13, ymm1, [rbx+96] + vpmuludq ymm10, ymm2, [rbx+96] + vpaddq ymm5, ymm12, ymm5 + vpmuludq ymm11, ymm3, [rbx+96] + vpmuludq ymm12, ymm3, [rcx] + vpaddq ymm5, ymm13, ymm5 + vpmuludq ymm13, ymm4, [rcx] + vpaddq ymm9, ymm13, ymm9 + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm13, ymm0, [rcx] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm10, ymm1, [rcx] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm11, ymm2, [rcx] + vpmuludq ymm12, ymm2, [rcx+32] + vpaddq ymm5, ymm13, ymm5 + vpmuludq ymm13, ymm3, [rcx+32] + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm10, ymm0, [rcx+32] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm11, ymm1, [rcx+32] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm12, ymm1, [rcx+64] + vpaddq ymm9, ymm13, ymm9 + vpmuludq ymm13, ymm2, [rcx+64] + vpaddq ymm6, ymm10, ymm6 + vpmuludq ymm10, ymm0, [rcx+64] + vpaddq ymm7, ymm11, ymm7 + vpmuludq ymm11, ymm0, [rcx+96] + vpaddq ymm8, ymm12, ymm8 + vpmuludq ymm12, ymm1, [rcx+96] + vpaddq ymm9, ymm13, ymm9 + vpaddq ymm7, ymm10, ymm7 + vpmuludq ymm13, ymm0, [rcx+128] + vpaddq ymm8, ymm11, ymm8 + vpaddq ymm9, ymm12, ymm9 + vpaddq ymm9, ymm13, ymm9 + vpsrlq ymm10, ymm5, 26 + vpsrlq ymm11, ymm8, 26 + vpand ymm5, ymm5, ymm14 + vpand ymm8, ymm8, ymm14 + vpaddq ymm6, ymm10, ymm6 + vpaddq ymm9, ymm11, ymm9 + vpsrlq ymm10, ymm6, 26 + vpsrlq ymm11, ymm9, 26 + vpand ymm1, ymm6, ymm14 + vpand ymm4, ymm9, ymm14 + vpaddq ymm7, ymm10, ymm7 + vpslld ymm12, ymm11, 2 + vpaddd ymm12, ymm11, ymm12 + vpsrlq ymm10, ymm7, 26 + vpaddq ymm5, ymm12, ymm5 + vpsrlq ymm11, ymm5, 26 + vpand ymm2, ymm7, ymm14 + vpand ymm0, ymm5, ymm14 + vpaddq ymm8, ymm10, ymm8 + vpaddq ymm1, ymm11, ymm1 + vpsrlq ymm10, ymm8, 26 + vpand ymm3, ymm8, ymm14 + vpaddq ymm4, ymm10, ymm4 + add rsi, 64 + sub rdx, 64 + jnz L_poly1305_avx2_blocks_start +L_poly1305_avx2_blocks_store: + ; Store four H values - state + vmovdqu YMMWORD PTR [rax], ymm0 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu YMMWORD PTR 
[rax+96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 +L_poly1305_avx2_blocks_end_calc: + cmp BYTE PTR [rdi+616], 0 + je L_poly1305_avx2_blocks_complete + mov rax, r8 + mov rdx, r10 + mov rcx, r12 + shr rdx, 12 + shr rcx, 24 + shl r9, 26 + shl r10, 52 + shl r11, 14 + shl r12, 40 + add rax, r9 + adc rax, r10 + adc rdx, r11 + adc rdx, r12 + adc rcx, 0 + mov r8, rcx + and rcx, 3 + shr r8, 2 + lea r8, QWORD PTR [r8+4*r8+0] + add rax, r8 + adc rdx, 0 + adc rcx, 0 + mov QWORD PTR [rdi+24], rax + mov QWORD PTR [rdi+32], rdx + mov QWORD PTR [rdi+40], rcx +L_poly1305_avx2_blocks_complete: + mov BYTE PTR [rdi+617], 1 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+320] + vmovdqu xmm7, OWORD PTR [rsp+336] + vmovdqu xmm8, OWORD PTR [rsp+352] + vmovdqu xmm9, OWORD PTR [rsp+368] + vmovdqu xmm10, OWORD PTR [rsp+384] + vmovdqu xmm11, OWORD PTR [rsp+400] + vmovdqu xmm12, OWORD PTR [rsp+416] + vmovdqu xmm13, OWORD PTR [rsp+432] + vmovdqu xmm14, OWORD PTR [rsp+448] + vmovdqu xmm15, OWORD PTR [rsp+464] + add rsp, 480 + pop r14 + pop r13 + pop rbx + pop rsi + pop rdi + pop r12 + ret +poly1305_blocks_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +poly1305_final_avx2 PROC + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov BYTE PTR [rdi+616], 1 + mov cl, BYTE PTR [rdi+617] + cmp cl, 0 + je L_poly1305_avx2_final_done_blocks_X4 + push rsi + mov r8, 64 + xor rdx, rdx + mov rcx, rdi + call poly1305_blocks_avx2 + pop rsi +L_poly1305_avx2_final_done_blocks_X4: + mov rax, QWORD PTR [rdi+608] + mov rcx, rax + and rcx, -16 + cmp cl, 0 + je L_poly1305_avx2_final_done_blocks + push rcx + push rax + push rsi + mov r8, rcx + lea rdx, QWORD PTR [rdi+480] + mov rcx, rdi + call poly1305_blocks_avx + pop rsi + pop rax + pop rcx +L_poly1305_avx2_final_done_blocks: + sub QWORD PTR [rdi+608], rcx + xor rdx, rdx + jmp L_poly1305_avx2_final_cmp_copy +L_poly1305_avx2_final_start_copy: + mov r8b, BYTE PTR [rdi+rcx+480] + mov BYTE PTR [rdi+rdx+480], r8b + inc cl + inc dl +L_poly1305_avx2_final_cmp_copy: + cmp al, cl + jne L_poly1305_avx2_final_start_copy + mov rcx, rdi + mov rdx, rsi + call poly1305_final_avx + vpxor ymm0, ymm0, ymm0 + vmovdqu YMMWORD PTR [rdi+64], ymm0 + vmovdqu YMMWORD PTR [rdi+96], ymm0 + vmovdqu YMMWORD PTR [rdi+128], ymm0 + vmovdqu YMMWORD PTR [rdi+160], ymm0 + vmovdqu YMMWORD PTR [rdi+192], ymm0 + vmovdqu YMMWORD PTR [rdi+224], ymm0 + vmovdqu YMMWORD PTR [rdi+256], ymm0 + vmovdqu YMMWORD PTR [rdi+288], ymm0 + vmovdqu YMMWORD PTR [rdi+320], ymm0 + mov QWORD PTR [rdi+608], 0 + mov WORD PTR [rdi+616], 0 + vzeroupper + pop rsi + pop rdi + ret +poly1305_final_avx2 ENDP +_text ENDS +ENDIF +END diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 6c025c3e08..a0abe62dc8 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -3471,14 +3471,6 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t sha256_test(void) ERROR_OUT(WC_TEST_RET_ENC_EC(ret), exit); } if (XMEMCMP(hash, b.output, WC_SHA256_DIGEST_SIZE) != 0) { -{ - for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++) - fprintf(stderr, " %02x", hash[ii]); - fprintf(stderr, "\n"); - for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++) - fprintf(stderr, " %02x", b.output[ii]); - fprintf(stderr, "\n"); -} ERROR_OUT(WC_TEST_RET_ENC_NC, exit); } #endif diff --git a/wolfssl.vcproj b/wolfssl.vcproj index 02cd2dac3a..8f07d8bf11 100644 --- a/wolfssl.vcproj +++ b/wolfssl.vcproj @@ -199,6 +199,10 @@ RelativePath=".\wolfcrypt\src\aes_asm.asm" > + + @@ -227,6 +231,10 @@ RelativePath=".\wolfcrypt\src\chacha.c" > + + @@ -331,6 +339,10 @@ 
RelativePath=".\wolfcrypt\src\poly1305.c" > + + diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj index 5904e2cd88..662922b488 100644 --- a/wolfssl.vcxproj +++ b/wolfssl.vcxproj @@ -384,6 +384,34 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h index 848edf6816..6c9577b110 100644 --- a/wolfssl/wolfcrypt/chacha.h +++ b/wolfssl/wolfcrypt/chacha.h @@ -77,7 +77,7 @@ enum { typedef struct ChaCha { word32 X[CHACHA_CHUNK_WORDS]; /* state of cipher */ -#ifdef HAVE_INTEL_AVX1 +#if defined(USE_INTEL_CHACHA_SPEEDUP) /* vpshufd reads 16 bytes but we only use bottom 4. */ byte extra[12]; #endif diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index c0a5b8dfc1..cc312546e3 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -48,7 +48,14 @@ #define WC_HAS_GCC_4_4_64BIT #endif -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef WOLFSSL_X86_64_BUILD +#if defined(USE_INTEL_SPEEDUP) && !defined(NO_POLY1305_ASM) + #define USE_INTEL_POLY1305_SPEEDUP + #define HAVE_INTEL_AVX1 +#endif +#endif + +#if defined(USE_INTEL_POLY1305_SPEEDUP) #elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \ defined(WC_HAS_GCC_4_4_64BIT)) #define POLY130564 @@ -67,7 +74,7 @@ enum { /* Poly1305 state */ typedef struct Poly1305 { -#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) +#ifdef USE_INTEL_POLY1305_SPEEDUP word64 r[3]; word64 h[3]; word64 pad[2];