From 66566955db554ed187e6fbb8ced944bbbf8222ff Mon Sep 17 00:00:00 2001 From: Daniel Pouzzner Date: Mon, 23 Feb 2026 23:03:08 -0600 Subject: [PATCH 1/3] wolfssl/wolfcrypt/wc_port.h, wolfssl/wolfcrypt/sha256.h, wolfssl/wolfcrypt/sha512.h, wolfssl/wolfcrypt/sp.h, wolfssl/wolfcrypt/wc_mlkem.h: add WC_NO_INLINE. --- wolfssl/wolfcrypt/sha256.h | 8 +------- wolfssl/wolfcrypt/sha512.h | 8 +------- wolfssl/wolfcrypt/sp.h | 13 +------------ wolfssl/wolfcrypt/wc_mlkem.h | 10 +--------- wolfssl/wolfcrypt/wc_port.h | 14 ++++++++++++++ 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/wolfssl/wolfcrypt/sha256.h b/wolfssl/wolfcrypt/sha256.h index 398722955f0..1cc27b070a2 100644 --- a/wolfssl/wolfcrypt/sha256.h +++ b/wolfssl/wolfcrypt/sha256.h @@ -102,13 +102,7 @@ #define WOLFSSL_NO_HASH_RAW #endif -#if defined(_MSC_VER) - #define SHA256_NOINLINE __declspec(noinline) -#elif defined(__IAR_SYSTEMS_ICC__) || defined(__GNUC__) - #define SHA256_NOINLINE __attribute__((noinline)) -#else - #define SHA256_NOINLINE -#endif +#define SHA256_NOINLINE WC_NO_INLINE #if !defined(NO_OLD_SHA_NAMES) #define SHA256 WC_SHA256 diff --git a/wolfssl/wolfcrypt/sha512.h b/wolfssl/wolfcrypt/sha512.h index 7d9724e5f1e..a6e906f1c87 100644 --- a/wolfssl/wolfcrypt/sha512.h +++ b/wolfssl/wolfcrypt/sha512.h @@ -80,13 +80,7 @@ #include #endif -#if defined(_MSC_VER) - #define SHA512_NOINLINE __declspec(noinline) -#elif defined(__IAR_SYSTEMS_ICC__) || defined(__GNUC__) - #define SHA512_NOINLINE __attribute__((noinline)) -#else - #define SHA512_NOINLINE -#endif +#define SHA512_NOINLINE WC_NO_INLINE #ifdef WOLFSSL_SHA512 diff --git a/wolfssl/wolfcrypt/sp.h b/wolfssl/wolfcrypt/sp.h index 1ebd39efad0..c0f028a9b3a 100644 --- a/wolfssl/wolfcrypt/sp.h +++ b/wolfssl/wolfcrypt/sp.h @@ -48,18 +48,7 @@ #undef WOLFSSL_HAVE_SP_ECC #endif -#ifdef noinline - #define SP_NOINLINE noinline -#elif defined(_MSC_VER) - #define SP_NOINLINE __declspec(noinline) -#elif defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - 
#define SP_NOINLINE _Pragma("inline = never") -#elif defined(__GNUC__) || defined(__KEIL__) || defined(__DCC__) - #define SP_NOINLINE __attribute__((noinline)) -#else - #define SP_NOINLINE -#endif - +#define SP_NOINLINE WC_NO_INLINE #ifdef __cplusplus extern "C" { diff --git a/wolfssl/wolfcrypt/wc_mlkem.h b/wolfssl/wolfcrypt/wc_mlkem.h index 1ee4225787e..27f12264c3b 100644 --- a/wolfssl/wolfcrypt/wc_mlkem.h +++ b/wolfssl/wolfcrypt/wc_mlkem.h @@ -44,15 +44,7 @@ #define WOLFSSL_MLKEM_NO_DECAPSULATE #endif -#ifdef noinline - #define MLKEM_NOINLINE noinline -#elif defined(_MSC_VER) - #define MLKEM_NOINLINE __declspec(noinline) -#elif defined(__GNUC__) - #define MLKEM_NOINLINE __attribute__((noinline)) -#else - #define MLKEM_NOINLINE -#endif +#define MLKEM_NOINLINE WC_NO_INLINE enum { /* Flags of Kyber keys. */ diff --git a/wolfssl/wolfcrypt/wc_port.h b/wolfssl/wolfcrypt/wc_port.h index 76cb11b5b16..f58fd0e8544 100644 --- a/wolfssl/wolfcrypt/wc_port.h +++ b/wolfssl/wolfcrypt/wc_port.h @@ -143,6 +143,20 @@ #endif #endif +#ifndef WC_NO_INLINE + #ifdef noinline + #define WC_NO_INLINE noinline + #elif defined(_MSC_VER) + #define WC_NO_INLINE __declspec(noinline) + #elif defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + #define WC_NO_INLINE _Pragma("inline = never") + #elif defined(__GNUC__) || defined(__KEIL__) || defined(__DCC__) + #define WC_NO_INLINE __attribute__((noinline)) + #else + #define WC_NO_INLINE + #endif +#endif + #ifndef WC_OMIT_FRAME_POINTER #if defined(__GNUC__) #define WC_OMIT_FRAME_POINTER \ From 39987a9d53aaf4d0b267c9009f5288c6c10f9819 Mon Sep 17 00:00:00 2001 From: Daniel Pouzzner Date: Tue, 24 Feb 2026 13:59:12 -0600 Subject: [PATCH 2/3] wolfcrypt/src/aes.c, wolfcrypt/src/cmac.c, wolfssl/wolfcrypt/aes.h, wolfssl/wolfcrypt/types.h: optimizations to mitigate performance regressions from 299e7bd097 (#9783): * add prefetch_ptr flag argument to AesEncrypt_C() and AesDecrypt_C(), and call PreFetchTe() and PreFetchSBox() only if *prefetch_ptr is zero, 
whereupon it is set to 1; * when C implementations are available, add prefetch_ptr arg to wc_AesEncrypt() and wc_AesDecrypt(), and pass it through; * in functions that directly call the AES block encryption methods, opportunistically inhibit prefetch on all but the first call; * move AES-specific code in wc_CmacUpdate() in cmac.c to wc_local_CmacUpdateAes() in aes.c to let it use conditional prefetching; * add WC_ARG_NOT_NULL(), WC_ARGS_NOT_NULL(), and WC_ALL_ARGS_NOT_NULL attribute abstractions. --- .wolfssl_known_macro_extras | 1 + wolfcrypt/src/aes.c | 298 ++++++++++++++++++++++++++++++------ wolfcrypt/src/cmac.c | 14 +- wolfssl/wolfcrypt/aes.h | 7 + wolfssl/wolfcrypt/types.h | 25 +++ 5 files changed, 289 insertions(+), 56 deletions(-) diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 97decd7acb9..24552e1037f 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -1094,6 +1094,7 @@ __clang_major__ __cplusplus __ghc__ __ghs__ +__has_attribute __hpux__ __i386 __i386__ diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 29911ccfa20..4833803a565 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -70,9 +70,9 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#if defined(WOLFSSL_AES_SIV) +#ifdef WOLFSSL_CMAC #include -#endif /* WOLFSSL_AES_SIV */ +#endif #if defined(WOLFSSL_HAVE_PSA) && !defined(WOLFSSL_PSA_NO_AES) #include @@ -2142,8 +2142,12 @@ static word32 GetTable8_4(const byte* t, byte o0, byte o1, byte o2, byte o3) * @param [out] outBlock Encrypted block. * @param [in] r Rounds divided by 2. 
*/ +#define WC_AES_HAVE_PREFETCH_ARG +static int always_prefetch = 0; +WC_MAYBE_UNUSED static int never_prefetch = 1; +WC_ARGS_NOT_NULL((1, 2, 3, 5)) static void AesEncrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, - word32 r) + word32 r, int *prefetch_ptr) { word32 s0 = 0, s1 = 0, s2 = 0, s3 = 0; word32 t0 = 0, t1 = 0, t2 = 0, t3 = 0; @@ -2178,8 +2182,15 @@ static void AesEncrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, s3 ^= rk[3]; #ifndef WOLFSSL_AES_SMALL_TABLES + #ifndef WC_NO_CACHE_RESISTANT - s0 |= PreFetchTe(); + if (*prefetch_ptr == 0) { + s0 |= PreFetchTe(); + if (prefetch_ptr != &always_prefetch) + *prefetch_ptr = 1; + } +#else + (void)prefetch_ptr; #endif #ifndef WOLFSSL_AES_TOUCH_LINES @@ -2320,9 +2331,17 @@ static void AesEncrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, s2 ^= u2 & 0x000000ff; s3 ^= u3 & 0x000000ff; } #endif -#else + +#else /* WOLFSSL_AES_SMALL_TABLES */ + #ifndef WC_NO_CACHE_RESISTANT - s0 |= PreFetchSBox(); + if (*prefetch_ptr == 0) { + s0 |= PreFetchSBox(); + if (prefetch_ptr != &always_prefetch) + *prefetch_ptr = 1; + } +#else + (void)prefetch_ptr; #endif r *= 2; @@ -2399,7 +2418,8 @@ static void AesEncrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, s1 = t1 ^ rk[1]; s2 = t2 ^ rk[2]; s3 = t3 ^ rk[3]; -#endif + +#endif /* WOLFSSL_AES_SMALL_TABLES */ /* write out */ #ifdef LITTLE_ENDIAN_ORDER @@ -2429,9 +2449,10 @@ static void AesEncrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, static void AesEncryptBlocks_C(Aes* aes, const byte* in, byte* out, word32 sz) { word32 i; + int did_prefetches = 0; for (i = 0; i < sz; i += WC_AES_BLOCK_SIZE) { - AesEncrypt_C(aes, in, out, aes->rounds >> 1); + AesEncrypt_C(aes, in, out, aes->rounds >> 1, &did_prefetches); in += WC_AES_BLOCK_SIZE; out += WC_AES_BLOCK_SIZE; } @@ -2998,10 +3019,18 @@ extern void AesEncryptBlocks_C(Aes* aes, const byte* in, byte* out, word32 sz); #endif /* !WC_AES_BITSLICED */ -/* this section disabled with NO_AES_192 */ -/* calling this one 
when missing NO_AES_192 */ +#ifdef WC_AES_HAVE_PREFETCH_ARG +#define wc_AesEncrypt(aes, inBlock, outBlock) \ + AesEncrypt_preFetchOpt(aes, inBlock, outBlock, &always_prefetch) +WC_ALL_ARGS_NOT_NULL static WARN_UNUSED_RESULT int AesEncrypt_preFetchOpt( + Aes* aes, const byte* inBlock, byte* outBlock, int *prefetch_ptr) +#else +#define AesEncrypt_preFetchOpt(aes, inBlock, outBlock, prefetch_ptr) \ + wc_AesEncrypt(aes, inBlock, outBlock) +WC_ALL_ARGS_NOT_NULL static WARN_UNUSED_RESULT int wc_AesEncrypt( Aes* aes, const byte* inBlock, byte* outBlock) +#endif { #if defined(MAX3266X_AES) word32 keySize; @@ -3153,7 +3182,11 @@ static WARN_UNUSED_RESULT int wc_AesEncrypt( } #endif +#ifdef WC_AES_HAVE_PREFETCH_ARG + AesEncrypt_C(aes, inBlock, outBlock, r, prefetch_ptr); +#else AesEncrypt_C(aes, inBlock, outBlock, r); +#endif return 0; } /* wc_AesEncrypt */ @@ -3211,8 +3244,14 @@ static WARN_UNUSED_RESULT WC_INLINE word32 PreFetchTd4(void) * @param [out] outBlock Encrypted block. * @param [in] r Rounds divided by 2. */ +#ifndef WC_AES_HAVE_PREFETCH_ARG + #define WC_AES_HAVE_PREFETCH_ARG + static int always_prefetch = 0; + WC_MAYBE_UNUSED static int never_prefetch = 1; +#endif +WC_ARGS_NOT_NULL((1, 2, 3, 5)) static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, - word32 r) + word32 r, int *prefetch_ptr) { word32 s0 = 0, s1 = 0, s2 = 0, s3 = 0; word32 t0 = 0, t1 = 0, t2 = 0, t3 = 0; @@ -3246,8 +3285,16 @@ static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, s3 ^= rk[3]; #ifndef WOLFSSL_AES_SMALL_TABLES + #ifndef WC_NO_CACHE_RESISTANT - s0 |= PreFetchTd(); + if (*prefetch_ptr == 0) { + s0 |= PreFetchTd(); + /* don't set the prefetched flag here -- PreFetchTd4() is called + * below. 
+ */ + } +#else + (void)prefetch_ptr; #endif #ifndef WOLFSSL_AES_TOUCH_LINES @@ -3330,7 +3377,13 @@ static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, */ #ifndef WC_NO_CACHE_RESISTANT - t0 |= PreFetchTd4(); + if (*prefetch_ptr == 0) { + t0 |= PreFetchTd4(); + if (prefetch_ptr != &always_prefetch) + *prefetch_ptr = 1; + } +#else + (void)prefetch_ptr; #endif s0 = GetTable8_4(Td4, GETBYTE(t0, 3), GETBYTE(t3, 2), @@ -3341,9 +3394,17 @@ static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, GETBYTE(t0, 1), GETBYTE(t3, 0)) ^ rk[2]; s3 = GetTable8_4(Td4, GETBYTE(t3, 3), GETBYTE(t2, 2), GETBYTE(t1, 1), GETBYTE(t0, 0)) ^ rk[3]; -#else + +#else /* WOLFSSL_AES_SMALL_TABLES */ + #ifndef WC_NO_CACHE_RESISTANT - s0 |= PreFetchTd4(); + if (*prefetch_ptr == 0) { + s0 |= PreFetchTd4(); + if (prefetch_ptr != &always_prefetch) + *prefetch_ptr = 1; + } +#else + (void)prefetch_ptr; #endif r *= 2; @@ -3419,7 +3480,8 @@ static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, s1 = t1 ^ rk[1]; s2 = t2 ^ rk[2]; s3 = t3 ^ rk[3]; -#endif + +#endif /* WOLFSSL_AES_SMALL_TABLES */ /* write out */ #ifdef LITTLE_ENDIAN_ORDER @@ -3450,9 +3512,10 @@ static void AesDecrypt_C(Aes* aes, const byte* inBlock, byte* outBlock, static void AesDecryptBlocks_C(Aes* aes, const byte* in, byte* out, word32 sz) { word32 i; + int did_prefetches = 0; for (i = 0; i < sz; i += WC_AES_BLOCK_SIZE) { - AesDecrypt_C(aes, in, out, aes->rounds >> 1); + AesDecrypt_C(aes, in, out, aes->rounds >> 1, &did_prefetches); in += WC_AES_BLOCK_SIZE; out += WC_AES_BLOCK_SIZE; } @@ -3808,8 +3871,18 @@ static void AesDecryptBlocks_C(Aes* aes, const byte* in, byte* out, word32 sz) #if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) #if !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) /* Software AES - ECB Decrypt */ -static WARN_UNUSED_RESULT int wc_AesDecrypt( + +#ifdef WC_AES_HAVE_PREFETCH_ARG +#define wc_AesDecrypt(aes, inBlock, outBlock) \ + AesDecrypt_preFetchOpt(aes, 
inBlock, outBlock, &always_prefetch) +WC_ALL_ARGS_NOT_NULL static WARN_UNUSED_RESULT int AesDecrypt_preFetchOpt( + Aes* aes, const byte* inBlock, byte* outBlock, int *prefetch_ptr) +#else +#define AesDecrypt_preFetchOpt(aes, inBlock, outBlock, prefetch_ptr) \ + wc_AesDecrypt(aes, inBlock, outBlock) +WC_ALL_ARGS_NOT_NULL static WARN_UNUSED_RESULT int wc_AesDecrypt( Aes* aes, const byte* inBlock, byte* outBlock) +#endif { #if defined(MAX3266X_AES) word32 keySize; @@ -3935,7 +4008,11 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( } #endif +#ifdef WC_AES_HAVE_PREFETCH_ARG + AesDecrypt_C(aes, inBlock, outBlock, r, prefetch_ptr); +#else AesDecrypt_C(aes, inBlock, outBlock, r); +#endif return 0; } /* wc_AesDecrypt[_SW]() */ @@ -3946,7 +4023,16 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( #endif /* NEED_AES_TABLES */ - +#ifndef WC_AES_HAVE_PREFETCH_ARG + #ifndef AesEncrypt_preFetchOpt + #define AesEncrypt_preFetchOpt(aes, inBlock, outBlock, do_preFetch) \ + wc_AesEncrypt(aes, inBlock, outBlock) + #endif + #ifndef AesDecrypt_preFetchOpt + #define AesDecrypt_preFetchOpt(aes, inBlock, outBlock, do_preFetch) \ + wc_AesDecrypt(aes, inBlock, outBlock) + #endif +#endif /* wc_AesSetKey */ #if defined(STM32_CRYPTO) @@ -5335,6 +5421,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv) #else /* Allow direct access to one block encrypt */ + /* Note, the in and out args are swapped compared to wc_AesEncrypt(). */ int wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in) { int ret; @@ -5355,6 +5442,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv) #ifdef HAVE_AES_DECRYPT /* Allow direct access to one block decrypt */ + /* Note, the in and out args are swapped compared to wc_AesDecrypt(). 
*/ int wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in) { int ret; @@ -6097,7 +6185,6 @@ int wc_AesSetIV(Aes* aes, const byte* iv) offset += WC_AES_BLOCK_SIZE; } - return 0; } #endif /* HAVE_AES_DECRYPT */ @@ -6471,10 +6558,15 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) else #endif { +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif ret = 0; while (blocks--) { xorbuf((byte*)aes->reg, in, WC_AES_BLOCK_SIZE); - ret = wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, + (byte*)aes->reg, + &did_prefetches); if (ret != 0) break; XMEMCPY(out, aes->reg, WC_AES_BLOCK_SIZE); @@ -6713,9 +6805,13 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } } #else +#ifdef WC_AES_HAVE_PREFETCH_ARG + { + int did_prefetches = 0; +#endif while (blocks--) { XMEMCPY(aes->tmp, in, WC_AES_BLOCK_SIZE); - ret = wc_AesDecrypt(aes, in, out); + ret = AesDecrypt_preFetchOpt(aes, in, out, &did_prefetches); if (ret != 0) return ret; xorbuf(out, (byte*)aes->reg, WC_AES_BLOCK_SIZE); @@ -6725,6 +6821,9 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) out += WC_AES_BLOCK_SIZE; in += WC_AES_BLOCK_SIZE; } +#ifdef WC_AES_HAVE_PREFETCH_ARG + } +#endif #endif } @@ -6967,6 +7066,9 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) int ret = 0; #endif word32 processed; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif #if !(!defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) @@ -7118,7 +7220,9 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #ifdef XTRANSFORM_AESCTRBLOCK XTRANSFORM_AESCTRBLOCK(aes, out, in); #else - ret = wc_AesEncrypt(aes, (byte*)aes->reg, scratch); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, + scratch, + &did_prefetches); if (ret != 0) break; xorbuf(scratch, in, WC_AES_BLOCK_SIZE); @@ -7136,7 +7240,9 @@ int wc_AesCbcEncrypt(Aes* 
aes, byte* out, const byte* in, word32 sz) /* handle non block size remaining and store unused byte count in left */ if ((ret == 0) && sz) { - ret = wc_AesEncrypt(aes, (byte*)aes->reg, (byte*)aes->tmp); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, + (byte*)aes->tmp, + &did_prefetches); if (ret == 0) { IncrementAesCounter((byte*)aes->reg); aes->left = WC_AES_BLOCK_SIZE - sz; @@ -7175,6 +7281,16 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #endif /* WOLFSSL_AES_COUNTER */ #endif /* !WOLFSSL_RISCV_ASM */ +#ifndef WC_AES_HAVE_PREFETCH_ARG + #ifndef AesEncrypt_preFetchOpt + #define AesEncrypt_preFetchOpt(aes, inBlock, outBlock, do_preFetch) \ + wc_AesEncrypt(aes, inBlock, outBlock) + #endif + #ifndef AesDecrypt_preFetchOpt + #define AesDecrypt_preFetchOpt(aes, inBlock, outBlock, do_preFetch) \ + wc_AesDecrypt(aes, inBlock, outBlock) + #endif +#endif /* * The IV for AES GCM and CCM, stored in struct Aes's member reg, is comprised @@ -9616,6 +9732,9 @@ WARN_UNUSED_RESULT int AES_GCM_encrypt_C( ALIGN16 byte counter[WC_AES_BLOCK_SIZE]; ALIGN16 byte initialCounter[WC_AES_BLOCK_SIZE]; ALIGN16 byte scratch[WC_AES_BLOCK_SIZE]; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif if (ivSz == GCM_NONCE_MID_SZ) { /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. 
*/ @@ -9674,7 +9793,8 @@ WARN_UNUSED_RESULT int AES_GCM_encrypt_C( while (blocks--) { IncrementGcmCounter(counter); #if !defined(WOLFSSL_PIC32MZ_CRYPT) - ret = wc_AesEncrypt(aes, counter, scratch); + ret = AesEncrypt_preFetchOpt(aes, counter, scratch, + &did_prefetches); if (ret != 0) return ret; xorbufout(c, scratch, p, WC_AES_BLOCK_SIZE); @@ -9686,14 +9806,15 @@ WARN_UNUSED_RESULT int AES_GCM_encrypt_C( if (partial != 0) { IncrementGcmCounter(counter); - ret = wc_AesEncrypt(aes, counter, scratch); + ret = AesEncrypt_preFetchOpt(aes, counter, scratch, &did_prefetches); if (ret != 0) return ret; xorbufout(c, scratch, p, partial); } if (authTag) { GHASH(&aes->gcm, authIn, authInSz, out, sz, authTag, authTagSz); - ret = wc_AesEncrypt(aes, initialCounter, scratch); + ret = AesEncrypt_preFetchOpt(aes, initialCounter, scratch, + &did_prefetches); if (ret != 0) return ret; xorbuf(authTag, scratch, authTagSz); @@ -12814,7 +12935,11 @@ static WARN_UNUSED_RESULT int roll_x( in += WC_AES_BLOCK_SIZE; inSz -= WC_AES_BLOCK_SIZE; - ret = wc_AesEncrypt(aes, out, out); + /* wc_AesCcmEncrypt(), wc_AesCcmDecrypt(), and roll_auth() only call + * roll_x() after the AES cache lines are already hot -- no need to + * absorb additional prefetch overhead here. + */ + ret = AesEncrypt_preFetchOpt(aes, out, out, &never_prefetch); if (ret != 0) return ret; } @@ -12822,7 +12947,11 @@ static WARN_UNUSED_RESULT int roll_x( /* process remainder of the data */ if (inSz > 0) { xorbuf(out, in, inSz); - ret = wc_AesEncrypt(aes, out, out); + /* wc_AesCcmEncrypt(), wc_AesCcmDecrypt(), and roll_auth() only call + * roll_x() after the AES cache lines are already hot -- no need to + * absorb additional prefetch overhead here. 
+ */ + ret = AesEncrypt_preFetchOpt(aes, out, out, &never_prefetch); if (ret != 0) return ret; } @@ -12870,7 +12999,11 @@ static WARN_UNUSED_RESULT int roll_auth( xorbuf(out + authLenSz, in, inSz); inSz = 0; } - ret = wc_AesEncrypt(aes, out, out); + /* wc_AesCcmEncrypt() and wc_AesCcmDecrypt() only call roll_auth() after the + * AES cache lines are already hot -- no need to absorb additional prefetch + * overhead here. + */ + ret = AesEncrypt_preFetchOpt(aes, out, out, &never_prefetch); if ((ret == 0) && (inSz > 0)) { ret = roll_x(aes, in, inSz, out); @@ -12996,6 +13129,9 @@ int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, #endif VECTOR_REGISTERS_PUSH; + /* note this wc_AesEncrypt() will perform cache prefetches if needed, so + * that the later encrypt ops don't need to. + */ ret = wc_AesEncrypt(aes, B, A); #ifdef WOLFSSL_CHECK_MEM_ZERO if (ret == 0) @@ -13014,7 +13150,7 @@ int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, B[0] = (byte)(lenSz - 1U); for (i = 0; i < lenSz; i++) B[WC_AES_BLOCK_SIZE - 1 - i] = 0; - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &never_prefetch); } if (ret == 0) { @@ -13042,7 +13178,7 @@ int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, #endif if (ret == 0) { while (inSz >= WC_AES_BLOCK_SIZE) { - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &never_prefetch); if (ret != 0) break; xorbuf(A, in, WC_AES_BLOCK_SIZE); @@ -13055,7 +13191,7 @@ int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz, } } if ((ret == 0) && (inSz > 0)) { - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &never_prefetch); } if ((ret == 0) && (inSz > 0)) { xorbuf(A, in, inSz); @@ -13095,6 +13231,9 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, byte mask = 0xFF; const word32 wordSz = (word32)sizeof(word32); int ret = 0; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; 
+#endif /* sanity check on arguments */ if (aes == NULL || (inSz != 0 && (in == NULL || out == NULL)) || @@ -13165,7 +13304,7 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, #endif while (oSz >= WC_AES_BLOCK_SIZE) { - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &did_prefetches); if (ret != 0) break; xorbuf(A, in, WC_AES_BLOCK_SIZE); @@ -13177,14 +13316,14 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, } if ((ret == 0) && (inSz > 0)) - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &did_prefetches); if ((ret == 0) && (inSz > 0)) { xorbuf(A, in, oSz); XMEMCPY(o, A, oSz); for (i = 0; i < lenSz; i++) B[WC_AES_BLOCK_SIZE - 1 - i] = 0; - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &did_prefetches); } if (ret == 0) { @@ -13200,7 +13339,7 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, B[WC_AES_BLOCK_SIZE - 1 - i] = (byte)((inSz >> ((8 * i) & mask)) & mask); } - ret = wc_AesEncrypt(aes, B, A); + ret = AesEncrypt_preFetchOpt(aes, B, A, &did_prefetches); } if (ret == 0) { @@ -13214,7 +13353,7 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, B[0] = (byte)(lenSz - 1U); for (i = 0; i < lenSz; i++) B[WC_AES_BLOCK_SIZE - 1 - i] = 0; - ret = wc_AesEncrypt(aes, B, B); + ret = AesEncrypt_preFetchOpt(aes, B, B, &did_prefetches); } if (ret == 0) @@ -13772,9 +13911,12 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( AesEncryptBlocks_C(aes, in, out, sz); #else word32 i; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif for (i = 0; i < sz; i += WC_AES_BLOCK_SIZE) { - ret = wc_AesEncryptDirect(aes, out, in); + ret = AesEncrypt_preFetchOpt(aes, in, out, &did_prefetches); if (ret != 0) break; in += WC_AES_BLOCK_SIZE; @@ -13936,6 +14078,9 @@ static WARN_UNUSED_RESULT int AesCfbEncrypt_C(Aes* aes, byte* out, { int ret = 0; word32 processed; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int 
did_prefetches = 0; +#endif if ((aes == NULL) || (out == NULL) || (in == NULL)) { return BAD_FUNC_ARG; @@ -13960,7 +14105,8 @@ static WARN_UNUSED_RESULT int AesCfbEncrypt_C(Aes* aes, byte* out, VECTOR_REGISTERS_PUSH; while (sz >= WC_AES_BLOCK_SIZE) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->reg, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->reg, + &did_prefetches); if (ret != 0) { break; } @@ -13973,7 +14119,8 @@ static WARN_UNUSED_RESULT int AesCfbEncrypt_C(Aes* aes, byte* out, /* encrypt left over data */ if ((ret == 0) && sz) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret == 0) { xorbufout(out, in, aes->tmp, sz); XMEMCPY(aes->reg, out, sz); @@ -14004,6 +14151,9 @@ static WARN_UNUSED_RESULT int AesCfbDecrypt_C(Aes* aes, byte* out, { int ret = 0; word32 processed; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif (void)mode; @@ -14050,7 +14200,8 @@ static WARN_UNUSED_RESULT int AesCfbDecrypt_C(Aes* aes, byte* out, } #endif while (sz >= WC_AES_BLOCK_SIZE) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret != 0) { break; } @@ -14063,7 +14214,8 @@ static WARN_UNUSED_RESULT int AesCfbDecrypt_C(Aes* aes, byte* out, /* decrypt left over data */ if ((ret == 0) && sz) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret == 0) { XMEMCPY(aes->reg, in, sz); xorbufout(out, in, aes->tmp, sz); @@ -14144,6 +14296,9 @@ static WARN_UNUSED_RESULT int wc_AesFeedbackCFB8( { byte *pt; int ret = 0; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif if (aes == NULL || out == NULL || in == NULL) { return BAD_FUNC_ARG; @@ -14156,7 +14311,8 @@ static WARN_UNUSED_RESULT 
int wc_AesFeedbackCFB8( VECTOR_REGISTERS_PUSH; while (sz > 0) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret != 0) break; if (dir == AES_DECRYPTION) { @@ -14200,6 +14356,9 @@ static WARN_UNUSED_RESULT int wc_AesFeedbackCFB1( byte* pt; int bit = 7; int ret = 0; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif if (aes == NULL || out == NULL || in == NULL) { return BAD_FUNC_ARG; @@ -14212,7 +14371,8 @@ static WARN_UNUSED_RESULT int wc_AesFeedbackCFB1( VECTOR_REGISTERS_PUSH; while (sz > 0) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret != 0) break; if (dir == AES_DECRYPTION) { @@ -14351,6 +14511,9 @@ static WARN_UNUSED_RESULT int AesOfbCrypt_C(Aes* aes, byte* out, const byte* in, { int ret = 0; word32 processed; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif if ((aes == NULL) || (out == NULL) || (in == NULL)) { return BAD_FUNC_ARG; @@ -14373,7 +14536,8 @@ static WARN_UNUSED_RESULT int AesOfbCrypt_C(Aes* aes, byte* out, const byte* in, VECTOR_REGISTERS_PUSH; while (sz >= WC_AES_BLOCK_SIZE) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->reg, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->reg, + &did_prefetches); if (ret != 0) { break; } @@ -14385,7 +14549,8 @@ static WARN_UNUSED_RESULT int AesOfbCrypt_C(Aes* aes, byte* out, const byte* in, /* encrypt left over data */ if ((ret == 0) && sz) { - ret = wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg); + ret = AesEncrypt_preFetchOpt(aes, (byte*)aes->reg, (byte*)aes->tmp, + &did_prefetches); if (ret == 0) { XMEMCPY(aes->reg, aes->tmp, WC_AES_BLOCK_SIZE); xorbufout(out, in, aes->tmp, sz); @@ -16096,6 +16261,45 @@ int wc_AesXtsDecryptConsecutiveSectors(XtsAes* aes, byte* out, const byte* in, #endif /* 
HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ +#ifdef WOLFSSL_CMAC + +int wc_local_CmacUpdateAes(struct Cmac *cmac, const byte* in, word32 inSz) { + int ret = 0; + Aes *aes = &cmac->aes; +#ifdef WC_AES_HAVE_PREFETCH_ARG + int did_prefetches = 0; +#endif + + VECTOR_REGISTERS_PUSH; + + while ((ret == 0) && (inSz != 0)) { + word32 add = min(inSz, WC_AES_BLOCK_SIZE - cmac->bufferSz); + XMEMCPY(&cmac->buffer[cmac->bufferSz], in, add); + + cmac->bufferSz += add; + inSz -= add; + in += add; + + if (cmac->bufferSz == WC_AES_BLOCK_SIZE && inSz != 0) { + if (cmac->totalSz != 0) { + xorbuf(cmac->buffer, cmac->digest, WC_AES_BLOCK_SIZE); + } + ret = AesEncrypt_preFetchOpt(aes, cmac->buffer, + cmac->digest, &did_prefetches); + if (ret == 0) { + cmac->totalSz += WC_AES_BLOCK_SIZE; + cmac->bufferSz = 0; + } + } + } + + VECTOR_REGISTERS_POP; + + return ret; +} + +#endif /* WOLFSSL_CMAC */ + #ifdef WOLFSSL_AES_SIV /* diff --git a/wolfcrypt/src/cmac.c b/wolfcrypt/src/cmac.c index ac9a14811e3..66e45f92477 100644 --- a/wolfcrypt/src/cmac.c +++ b/wolfcrypt/src/cmac.c @@ -228,6 +228,7 @@ int wc_CmacUpdate(Cmac* cmac, const byte* in, word32 inSz) #if !defined(NO_AES) && defined(WOLFSSL_AES_DIRECT) case WC_CMAC_AES: { +#ifdef HAVE_SELFTEST while ((ret == 0) && (inSz != 0)) { word32 add = min(inSz, WC_AES_BLOCK_SIZE - cmac->bufferSz); XMEMCPY(&cmac->buffer[cmac->bufferSz], in, add); @@ -240,21 +241,16 @@ int wc_CmacUpdate(Cmac* cmac, const byte* in, word32 inSz) if (cmac->totalSz != 0) { xorbuf(cmac->buffer, cmac->digest, WC_AES_BLOCK_SIZE); } -#ifndef HAVE_SELFTEST - ret = wc_AesEncryptDirect(&cmac->aes, cmac->digest, - cmac->buffer); - if (ret == 0) { - cmac->totalSz += WC_AES_BLOCK_SIZE; - cmac->bufferSz = 0; - } -#else wc_AesEncryptDirect(&cmac->aes, cmac->digest, cmac->buffer); cmac->totalSz += WC_AES_BLOCK_SIZE; cmac->bufferSz = 0; -#endif } } +#else + (void)ret; + ret = wc_local_CmacUpdateAes(cmac, in, inSz); +#endif }; break; #endif /* !NO_AES && WOLFSSL_AES_DIRECT */ default: 
diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index da27ccc709b..3c4a7db5eb6 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -806,6 +806,13 @@ int wc_AesSivDecrypt_ex(const byte* key, word32 keySz, const AesSivAssoc* assoc, const byte* in, word32 inSz, byte* siv, byte* out); #endif +#ifdef WOLFSSL_CMAC +/* forward declaration, in case aes.h is being included by cmac.h */ +struct Cmac; +WOLFSSL_LOCAL int wc_local_CmacUpdateAes(struct Cmac *cmac, const byte* in, + word32 inSz); +#endif + #ifdef WOLFSSL_AES_EAX /* Because of the circular dependency between AES and CMAC, we need to prevent diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h index 28bb12fc17b..4ba6dfbe753 100644 --- a/wolfssl/wolfcrypt/types.h +++ b/wolfssl/wolfcrypt/types.h @@ -2056,6 +2056,31 @@ WOLFSSL_API word32 CheckRunTimeSettings(void); #define WC_NORETURN #endif +#if defined(__has_attribute) && __has_attribute(nonnull) + #ifndef WC_ARG_NOT_NULL + #define WC_ARG_NOT_NULL(a) __attribute__((nonnull(a))) + #endif + #ifndef WC_ARGS_NOT_NULL + /* double-parenthesize, a la WC_ARGS_NOT_NULL((1, 2)) -- this approach + * maintains compatibility with WOLF_NO_VARIADIC_MACROS. 
+ */ + #define WC_ARGS_NOT_NULL(p_a) __attribute__((nonnull p_a)) + #endif + #ifndef WC_ALL_ARGS_NOT_NULL + #define WC_ALL_ARGS_NOT_NULL __attribute__((nonnull)) + #endif +#else + #ifndef WC_ARG_NOT_NULL + #define WC_ARG_NOT_NULL(a) /* null expansion */ + #endif + #ifndef WC_ARGS_NOT_NULL + #define WC_ARGS_NOT_NULL(p_a) /* null expansion */ + #endif + #ifndef WC_ALL_ARGS_NOT_NULL + #define WC_ALL_ARGS_NOT_NULL + #endif +#endif + #if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \ defined(WOLFSSL_DEBUG_MATH) || defined(DEBUG_WOLFSSL) || \ defined(WOLFSSL_PUBLIC_MP) || defined(OPENSSL_EXTRA) || \ From 314da6d6bc1f388ae1f9b3e35c40dccab912a42c Mon Sep 17 00:00:00 2001 From: Daniel Pouzzner Date: Tue, 24 Feb 2026 15:41:11 -0600 Subject: [PATCH 3/3] wolfssl/wolfcrypt/types.h: work around limitations of Watcom and Windows preprocessors, re WC_ARG_NOT_NULL and friends. --- wolfssl/wolfcrypt/types.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h index 4ba6dfbe753..af7d6b6224a 100644 --- a/wolfssl/wolfcrypt/types.h +++ b/wolfssl/wolfcrypt/types.h @@ -2056,7 +2056,8 @@ WOLFSSL_API word32 CheckRunTimeSettings(void); #define WC_NORETURN #endif -#if defined(__has_attribute) && __has_attribute(nonnull) +#ifdef __has_attribute +#if __has_attribute(nonnull) #ifndef WC_ARG_NOT_NULL #define WC_ARG_NOT_NULL(a) __attribute__((nonnull(a))) #endif @@ -2069,16 +2070,17 @@ WOLFSSL_API word32 CheckRunTimeSettings(void); #ifndef WC_ALL_ARGS_NOT_NULL #define WC_ALL_ARGS_NOT_NULL __attribute__((nonnull)) #endif -#else - #ifndef WC_ARG_NOT_NULL - #define WC_ARG_NOT_NULL(a) /* null expansion */ - #endif - #ifndef WC_ARGS_NOT_NULL - #define WC_ARGS_NOT_NULL(p_a) /* null expansion */ - #endif - #ifndef WC_ALL_ARGS_NOT_NULL - #define WC_ALL_ARGS_NOT_NULL - #endif +#endif /* __has_attribute(nonnull) */ +#endif /* defined(__has_attribute) */ + +#ifndef WC_ARG_NOT_NULL + #define 
WC_ARG_NOT_NULL(a) /* null expansion */ +#endif +#ifndef WC_ARGS_NOT_NULL + #define WC_ARGS_NOT_NULL(p_a) /* null expansion */ +#endif +#ifndef WC_ALL_ARGS_NOT_NULL + #define WC_ALL_ARGS_NOT_NULL #endif #if defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \