From 00d92c29891e27fa024e08f5511ef45f9067a279 Mon Sep 17 00:00:00 2001
From: Valdemar Bucilko
Date: Wed, 31 Jan 2018 14:27:07 +0200
Subject: [PATCH] x64 intrinsics for msvc in bn_mul, timing and aesni modules

AES-NI implementation for MSVC x64 using intrinsics.
Implement rdtsc timing function using intrinsics for x64.
Use 128-bit-result multiply on msvc/x64.
---
 include/mbedtls/aesni.h        |   5 +
 include/mbedtls/bn_mul.h       |  24 ++-
 include/mbedtls/check_config.h |   2 +-
 library/aesni.c                | 265 +++++++++++++++++++++++++++++++++
 library/timing.c               |  17 +++
 5 files changed, 310 insertions(+), 3 deletions(-)

diff --git a/include/mbedtls/aesni.h b/include/mbedtls/aesni.h
index c1d22f59a..a9c46b147 100644
--- a/include/mbedtls/aesni.h
+++ b/include/mbedtls/aesni.h
@@ -42,6 +42,11 @@
 #define MBEDTLS_HAVE_X86_64
 #endif
 
+#if defined(_MSC_VER) && defined(_M_X64) && \
+    ! defined(MBEDTLS_HAVE_X86_64)
+#define MBEDTLS_HAVE_X86_64
+#endif
+
 #if defined(MBEDTLS_HAVE_X86_64)
 
 #ifdef __cplusplus
diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index 31137cd4c..5b86dff9f 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -924,10 +924,30 @@
          __asm   mov     s, esi                 \
 
 #endif /* SSE2 */
-#endif /* MSVC */
-
+#endif /* (MSVC && _M_IX86) || __WATCOMC__ */
 #endif /* MBEDTLS_HAVE_ASM */
 
+#if defined(_MSC_VER) && defined(_M_X64)
+
+#include <intrin.h>
+
+#define MULADDC_INIT                                \
+{                                                   \
+    mbedtls_mpi_uint r0, r1;                        \
+    unsigned char carry;
+
+#define MULADDC_CORE                                \
+    r0 = _umul128( *(s++), b, &r1 );                \
+    carry = _addcarry_u64( 0, r0, c, &r0 );         \
+    _addcarry_u64( carry, r1, 0, &r1 );             \
+    carry = _addcarry_u64( 0, r0, *d, &r0 );        \
+    _addcarry_u64( carry, r1, 0, &r1 );             \
+    c = r1; *(d++) = r0;
+
+#define MULADDC_STOP                                \
+}
+#endif /* _MSC_VER && _M_X64 */
+
 #if !defined(MULADDC_CORE)
 #if defined(MBEDTLS_HAVE_UDBL)
 
diff --git a/include/mbedtls/check_config.h b/include/mbedtls/check_config.h
index 703c84c45..cd522ae3d 100644
--- a/include/mbedtls/check_config.h
+++ b/include/mbedtls/check_config.h
@@ -68,7 +68,7 @@
 #error "MBEDTLS_HAVE_TIME_DATE without MBEDTLS_HAVE_TIME does not make sense"
 #endif
 
-#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM)
+#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM) && !(defined(_MSC_VER) && defined(_M_X64))
 #error "MBEDTLS_AESNI_C defined, but not all prerequisites"
 #endif
 
diff --git a/library/aesni.c b/library/aesni.c
index 996292ff6..5af0e4c11 100644
--- a/library/aesni.c
+++ b/library/aesni.c
@@ -42,6 +42,11 @@
 
 #if defined(MBEDTLS_HAVE_X86_64)
 
+#if defined(_MSC_VER) && defined(_M_X64)
+#define MBEDTLS_HAVE_MSVC_X64_INTRINSICS
+#include <intrin.h>
+#endif
+
 /*
  * AES-NI support detection routine
  */
@@ -52,11 +57,17 @@ int mbedtls_aesni_has_support( unsigned int what )
 
     if( ! done )
     {
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+        int regs[4]; // eax, ebx, ecx, edx
+        __cpuid( regs, 1 );
+        c = regs[2];
+#else
         asm( "movl  $1, %%eax   \n\t"
              "cpuid             \n\t"
             : "=c" (c)
             :
             : "eax", "ebx", "edx" );
+#endif
         done = 1;
     }
 
@@ -97,6 +108,28 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
                              const unsigned char input[16],
                              unsigned char output[16] )
 {
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+    __m128i* rk, a;
+    int i;
+
+    rk = (__m128i*)ctx->rk;
+    a = _mm_xor_si128( _mm_loadu_si128( (__m128i*)input ), _mm_loadu_si128( rk++ ) );
+
+    if (mode == MBEDTLS_AES_ENCRYPT)
+    {
+        for (i = ctx->nr - 1; i; --i)
+            a = _mm_aesenc_si128( a, _mm_loadu_si128( rk++ ) );
+        a = _mm_aesenclast_si128( a, _mm_loadu_si128( rk ) );
+    }
+    else
+    {
+        for (i = ctx->nr - 1; i; --i)
+            a = _mm_aesdec_si128( a, _mm_loadu_si128( rk++ ) );
+        a = _mm_aesdeclast_si128( a, _mm_loadu_si128( rk ) );
+    }
+
+    _mm_storeu_si128( (__m128i*)output, a );
+#else
     asm( "movdqu    (%3), %%xmm0    \n\t" // load input
          "movdqu    (%1), %%xmm1    \n\t" // load round key 0
          "pxor      %%xmm1, %%xmm0  \n\t" // round 0
@@ -130,10 +163,70 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
          :
          : "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
          : "memory", "cc", "xmm0", "xmm1" );
+#endif
 
     return( 0 );
 }
 
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+
+static inline void clmul256( __m128i a, __m128i b, __m128i* r0, __m128i* r1 )
+{
+    __m128i c, d, e, f, ef;
+    c = _mm_clmulepi64_si128( a, b, 0x00 );
+    d = _mm_clmulepi64_si128( a, b, 0x11 );
+    e = _mm_clmulepi64_si128( a, b, 0x10 );
+    f = _mm_clmulepi64_si128( a, b, 0x01 );
+
+    // r0 = f0^e0^c1:c0 = c1:c0 ^ f0^e0:0
+    // r1 = d1:f1^e1^d0 = d1:d0 ^ 0:f1^e1
+
+    ef = _mm_xor_si128( e, f );
+    *r0 = _mm_xor_si128( c, _mm_slli_si128( ef, 8 ) );
+    *r1 = _mm_xor_si128( d, _mm_srli_si128( ef, 8 ) );
+}
+
+static inline void sll256( __m128i a0, __m128i a1, __m128i* s0, __m128i* s1 )
+{
+    __m128i l0, l1, r0, r1;
+
+    l0 = _mm_slli_epi64( a0, 1 );
+    l1 = _mm_slli_epi64( a1, 1 );
+
+    r0 = _mm_srli_epi64( a0, 63 );
+    r1 = _mm_srli_epi64( a1, 63 );
+
+    *s0 = _mm_or_si128( l0, _mm_slli_si128( r0, 8 ) );
+    *s1 = _mm_or_si128( _mm_or_si128( l1, _mm_srli_si128( r0, 8 ) ), _mm_slli_si128( r1, 8 ) );
+}
+
+static inline __m128i reducemod128( __m128i x10, __m128i x32 )
+{
+    __m128i a, b, c, dx0, e, f, g, h;
+
+    // (1) left shift x0 by 63, 62 and 57
+    a = _mm_slli_epi64( x10, 63 );
+    b = _mm_slli_epi64( x10, 62 );
+    c = _mm_slli_epi64( x10, 57 );
+
+    // (2) compute D xor'ing a, b, c and x1
+    // d:x0 = x1:x0 ^ [a^b^c:0]
+    dx0 = _mm_xor_si128( x10, _mm_slli_si128( _mm_xor_si128( _mm_xor_si128( a, b ), c ), 8 ) );
+
+    // (3) right shift [d:x0] by 1, 2, 7
+    e = _mm_or_si128( _mm_srli_epi64( dx0, 1 ), _mm_srli_si128( _mm_slli_epi64( dx0, 63 ), 8 ) );
+    f = _mm_or_si128( _mm_srli_epi64( dx0, 2 ), _mm_srli_si128( _mm_slli_epi64( dx0, 62 ), 8 ) );
+    g = _mm_or_si128( _mm_srli_epi64( dx0, 7 ), _mm_srli_si128( _mm_slli_epi64( dx0, 57 ), 8 ) );
+
+    // (4) compute h = d^e1^f1^g1 : x0^e0^f0^g0
+    h = _mm_xor_si128( dx0, _mm_xor_si128( e, _mm_xor_si128( f, g ) ) );
+
+    // result is x3^h1:x2^h0
+    return _mm_xor_si128( x32, h );
+}
+
+#endif
+
 /*
  * GCM multiplication: c = a times b in GF(2^128)
  * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
@@ -142,6 +235,22 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
                              const unsigned char a[16],
                              const unsigned char b[16] )
 {
+
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+    __m128i xa, xb, m0, m1, x10, x32, r;
+
+    xa.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)a + 0) );
+    xa.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)a + 1) );
+    xb.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)b + 0) );
+    xb.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)b + 1) );
+
+    clmul256( xa, xb, &m0, &m1 );
+    sll256( m0, m1, &x10, &x32 );
+    r = reducemod128( x10, x32 );
+
+    *((unsigned __int64*)c + 0) = _byteswap_uint64( r.m128i_u64[1] );
+    *((unsigned __int64*)c + 1) = _byteswap_uint64( r.m128i_u64[0] );
+#else
     unsigned char aa[16], bb[16], cc[16];
     size_t i;
 
@@ -242,6 +351,7 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
     /* Now byte-reverse the outputs */
     for( i = 0; i < 16; i++ )
         c[i] = cc[15 - i];
+#endif
 
     return;
 }
@@ -258,22 +368,109 @@ void mbedtls_aesni_inverse_key( unsigned char *invkey,
     memcpy( ik, fk, 16 );
 
     for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+        _mm_storeu_si128( (__m128i*)ik, _mm_aesimc_si128( _mm_loadu_si128( (__m128i*)fk) ) );
+#else
         asm( "movdqu (%0), %%xmm0       \n\t"
              AESIMC  xmm0_xmm0          "\n\t"
             "movdqu %%xmm0, (%1)       \n\t"
             :
            : "r" (fk), "r" (ik)
           : "memory", "xmm0" );
+#endif
 
     memcpy( ik, fk, 16 );
 }
 
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+inline static __m128i aes_key_128_assist( __m128i key, __m128i kg )
+{
+    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
+    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
+    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
+    kg = _mm_shuffle_epi32( kg, _MM_SHUFFLE( 3, 3, 3, 3 ) );
+    return _mm_xor_si128( key, kg );
+}
+
+// [AES-WP] Part of Fig. 25 page 32
+inline static void aes_key_192_assist( __m128i* temp1, __m128i * temp3, __m128i kg )
+{
+    __m128i temp4;
+    kg = _mm_shuffle_epi32( kg, 0x55 );
+    temp4 = _mm_slli_si128( *temp1, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    *temp1 = _mm_xor_si128( *temp1, kg );
+    kg = _mm_shuffle_epi32( *temp1, 0xff );
+    temp4 = _mm_slli_si128( *temp3, 0x4 );
+    *temp3 = _mm_xor_si128( *temp3, temp4 );
+    *temp3 = _mm_xor_si128( *temp3, kg );
+}
+
+// [AES-WP] Part of Fig. 26 page 34
+inline static void aes_key_256_assist_1( __m128i* temp1, __m128i kg )
+{
+    __m128i temp4;
+    kg = _mm_shuffle_epi32( kg, 0xff );
+    temp4 = _mm_slli_si128( *temp1, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp1 = _mm_xor_si128( *temp1, temp4 );
+    *temp1 = _mm_xor_si128( *temp1, kg );
+}
+
+inline static void aes_key_256_assist_2( __m128i* temp1, __m128i* temp3 )
+{
+    __m128i temp2, temp4;
+    temp4 = _mm_aeskeygenassist_si128( *temp1, 0x0 );
+    temp2 = _mm_shuffle_epi32( temp4, 0xaa );
+    temp4 = _mm_slli_si128( *temp3, 0x4 );
+    *temp3 = _mm_xor_si128( *temp3, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp3 = _mm_xor_si128( *temp3, temp4 );
+    temp4 = _mm_slli_si128( temp4, 0x4 );
+    *temp3 = _mm_xor_si128( *temp3, temp4 );
+    *temp3 = _mm_xor_si128( *temp3, temp2 );
+}
+#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
+
 /*
  * Key expansion, 128-bit case
  */
 static void aesni_setkey_enc_128( unsigned char *rk,
                                   const unsigned char *key )
 {
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+    __m128i* xrk, k;
+
+    xrk = (__m128i*)rk;
+
+#define EXPAND_ROUND(k, rcon) \
+    _mm_storeu_si128( xrk++, k ); \
+    k = aes_key_128_assist( k, _mm_aeskeygenassist_si128( k, rcon ) )
+
+    k = _mm_loadu_si128( (__m128i*)key );
+    EXPAND_ROUND( k, 0x01 );
+    EXPAND_ROUND( k, 0x02 );
+    EXPAND_ROUND( k, 0x04 );
+    EXPAND_ROUND( k, 0x08 );
+    EXPAND_ROUND( k, 0x10 );
+    EXPAND_ROUND( k, 0x20 );
+    EXPAND_ROUND( k, 0x40 );
+    EXPAND_ROUND( k, 0x80 );
+    EXPAND_ROUND( k, 0x1b );
+    EXPAND_ROUND( k, 0x36 );
+    _mm_storeu_si128( xrk, k );
+
+#undef EXPAND_ROUND
+
+#else
+
     asm( "movdqu (%1), %%xmm0               \n\t" // copy the original key
          "movdqu %%xmm0, (%0)               \n\t" // as round key 0
          "jmp 2f                            \n\t" // skip auxiliary routine
@@ -316,6 +513,7 @@ static void aesni_setkey_enc_128( unsigned char *rk,
          :
          : "r" (rk), "r" (key)
          : "memory", "cc", "0" );
+#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 
 /*
@@ -324,6 +522,37 @@ static void aesni_setkey_enc_128( unsigned char *rk,
 static void aesni_setkey_enc_192( unsigned char *rk,
                                   const unsigned char *key )
 {
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+    __m128i temp1, temp3;
+    __m128i *key_schedule = (__m128i*)rk;
+    temp1 = _mm_loadu_si128( (__m128i*)key );
+    temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
+    key_schedule[0] = temp1;
+    key_schedule[1] = temp3;
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128(temp3, 0x1) );
+    key_schedule[1] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[1] ), _mm_castsi128_pd( temp1 ), 0 ) );
+    key_schedule[2] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x2 ) );
+    key_schedule[3] = temp1;
+    key_schedule[4] = temp3;
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x4 ) );
+    key_schedule[4] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[4] ), _mm_castsi128_pd( temp1 ), 0 ) );
+    key_schedule[5] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x8 ) );
+    key_schedule[6] = temp1;
+    key_schedule[7] = temp3;
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
+    key_schedule[7] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[7] ), _mm_castsi128_pd( temp1 ), 0 ) );
+    key_schedule[8] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
+    key_schedule[9] = temp1;
+    key_schedule[10] = temp3;
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
+    key_schedule[10] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[10] ), _mm_castsi128_pd( temp1 ), 0 ) );
+    key_schedule[11] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
+    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x80 ) );
+    key_schedule[12] = temp1;
+#else
     asm( "movdqu (%1), %%xmm0   \n\t" // copy original round key
          "movdqu %%xmm0, (%0)   \n\t"
          "add $16, %0           \n\t"
@@ -373,6 +602,7 @@ static void aesni_setkey_enc_192( unsigned char *rk,
          :
          : "r" (rk), "r" (key)
          : "memory", "cc", "0" );
+#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 
 /*
@@ -381,6 +611,40 @@ static void aesni_setkey_enc_192( unsigned char *rk,
 static void aesni_setkey_enc_256( unsigned char *rk,
                                   const unsigned char *key )
 {
+#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
+    __m128i temp1, temp3;
+    __m128i *key_schedule = (__m128i*)rk;
+    temp1 = _mm_loadu_si128( (__m128i*)key );
+    temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
+    key_schedule[0] = temp1;
+    key_schedule[1] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x01 ) );
+    key_schedule[2] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[3] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x02 ) );
+    key_schedule[4] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[5] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x04 ) );
+    key_schedule[6] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[7] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x08 ) );
+    key_schedule[8] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[9] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
+    key_schedule[10] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[11] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
+    key_schedule[12] = temp1;
+    aes_key_256_assist_2( &temp1, &temp3 );
+    key_schedule[13] = temp3;
+    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
+    key_schedule[14] = temp1;
+#else
     asm( "movdqu (%1), %%xmm0   \n\t"
          "movdqu %%xmm0, (%0)   \n\t"
          "add $16, %0           \n\t"
@@ -439,6 +703,7 @@ static void aesni_setkey_enc_256( unsigned char *rk,
          :
          : "r" (rk), "r" (key)
          : "memory", "cc", "0" );
+#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 
 /*
diff --git a/library/timing.c b/library/timing.c
index 57bc9bcc1..639e160ac 100644
--- a/library/timing.c
+++ b/library/timing.c
@@ -112,6 +112,23 @@ unsigned long mbedtls_timing_hardclock( void )
 #endif /* !HAVE_HARDCLOCK && MBEDTLS_HAVE_ASM &&
           __GNUC__ && ( __amd64__ || __x86_64__ ) */
 
+#if !defined(HAVE_HARDCLOCK) && defined(_MSC_VER) && defined(_M_X64)
+
+#define HAVE_HARDCLOCK
+
+unsigned long mbedtls_timing_hardclock(void)
+{
+    union
+    {
+        unsigned __int64 u64;
+        struct { unsigned long lo, hi; } u32;
+    } tsc;
+    tsc.u64 = __rdtsc();
+    return tsc.u32.lo;
+}
+#endif /* !HAVE_HARDCLOCK && _MSC_VER && _M_X64 */
+
+
 #if !defined(HAVE_HARDCLOCK) && defined(MBEDTLS_HAVE_ASM) && \
     defined(__GNUC__) && ( defined(__powerpc__) || defined(__ppc__) )
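
Not part of the patch above: a minimal sanity-check sketch, assuming an mbed TLS build with MBEDTLS_AESNI_C enabled and compiled with MSVC for x64, that the intrinsics code path still produces the FIPS-197 Appendix C.1 AES-128 result. It goes through the public mbedtls_aes_* API, which dispatches to the AES-NI routines when mbedtls_aesni_has_support() reports support; the file name test_aesni_msvc.c is only a suggestion.

/* test_aesni_msvc.c - hypothetical sanity check, not part of this patch. */
#include <stdio.h>
#include <string.h>

#include "mbedtls/aes.h"
#include "mbedtls/aesni.h"

int main( void )
{
    /* FIPS-197 Appendix C.1: AES-128 single-block test vector */
    static const unsigned char key[16] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
    static const unsigned char plaintext[16] = {
        0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
        0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };
    static const unsigned char expected[16] = {
        0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
        0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a };
    unsigned char output[16];
    mbedtls_aes_context ctx;

    /* Report whether the AES-NI path is actually taken on this CPU */
    printf( "AES-NI support: %d\n",
            mbedtls_aesni_has_support( MBEDTLS_AESNI_AES ) );

    mbedtls_aes_init( &ctx );
    mbedtls_aes_setkey_enc( &ctx, key, 128 );
    mbedtls_aes_crypt_ecb( &ctx, MBEDTLS_AES_ENCRYPT, plaintext, output );
    mbedtls_aes_free( &ctx );

    printf( "AES-128 ECB: %s\n",
            memcmp( output, expected, 16 ) == 0 ? "ok" : "MISMATCH" );

    return( 0 );
}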