x64 intrinsics for msvc in bn_mul, timing and aesni modules

Add an AES-NI implementation for MSVC x64 using compiler intrinsics. Implement the rdtsc-based timing function with intrinsics on x64. Use the 128-bit-result multiply intrinsic in bn_mul on MSVC x64.
2024-11-22 22:15:43 +01:00 · 2018-01-31 14:27:07 +02:00 · 2018-01-31 14:27:07 +02:00 · 00d92c2989
commit 00d92c2989
parent 6e270b44b1
5 changed files with 310 additions and 3 deletions
--- a/include/mbedtls/aesni.h
+++ b/include/mbedtls/aesni.h
@ -42,6 +42,11 @@
 #define MBEDTLS_HAVE_X86_64
 #endif
 #if defined(_MSC_VER) && defined(_M_X64) &&  \
    ! defined(MBEDTLS_HAVE_X86_64)
 #define MBEDTLS_HAVE_X86_64
 #endif
 #if defined(MBEDTLS_HAVE_X86_64)
 #ifdef __cplusplus
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@ -924,10 +924,30 @@
    __asm   mov     s, esi                      \
 #endif /* SSE2 */
-#endif /* MSVC */
+#endif /* (MSVC && _M_IX86) || __WATCOMC__ */
 #endif /* MBEDTLS_HAVE_ASM */
 #if defined(_MSC_VER) && defined(_M_X64)
 #include <intrin.h>
 /*
  * MSVC x64 multiply-accumulate step built on intrinsics.
  *
  * MULADDC_INIT opens a scope holding the temporaries, MULADDC_CORE performs
  * one step and MULADDC_STOP closes the scope again.  One step computes the
  * 128-bit value (*s) * b + c + (*d), writes its low word to *d, keeps its
  * high word in c and advances both s and d.
  */
 #define MULADDC_INIT                        \
 {                                           \
    mbedtls_mpi_uint lo, hi;                \
    unsigned char cf;
 #define MULADDC_CORE                        \
    lo = _umul128( *(s++), b, &hi );        \
    cf = _addcarry_u64( 0, lo, c, &lo );    \
    _addcarry_u64( cf, hi, 0, &hi );        \
    cf = _addcarry_u64( 0, lo, *d, &lo );   \
    _addcarry_u64( cf, hi, 0, &hi );        \
    c = hi; *(d++) = lo;
 #define MULADDC_STOP                        \
 }
 #endif /* _MSC_VER && _M_X64 */
 #if !defined(MULADDC_CORE)
 #if defined(MBEDTLS_HAVE_UDBL)
--- a/include/mbedtls/check_config.h
+++ b/include/mbedtls/check_config.h
@ -68,7 +68,7 @@
 #error "MBEDTLS_HAVE_TIME_DATE without MBEDTLS_HAVE_TIME does not make sense"
 #endif
-#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM)
+#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM) && !(defined(_MSC_VER) && defined(_M_X64))
 #error "MBEDTLS_AESNI_C defined, but not all prerequisites"
 #endif
--- a/library/aesni.c
+++ b/library/aesni.c
@ -42,6 +42,11 @@
 #if defined(MBEDTLS_HAVE_X86_64)
 #if defined(_MSC_VER) && defined(_M_X64)
 #define MBEDTLS_HAVE_MSVC_X64_INTRINSICS
 #include <intrin.h>
 #endif
 /*
 * AES-NI support detection routine
 */
@ -52,11 +57,17 @@ int mbedtls_aesni_has_support( unsigned int what )
    if( ! done )
    {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
        int regs[4]; // eax, ebx, ecx, edx
        __cpuid( regs, 1 );
        c = regs[2];
 #else
        asm( "movl  $1, %%eax   \n\t"
             "cpuid             \n\t"
             : "=c" (c)
             :
             : "eax", "ebx", "edx" );
 #endif
        done = 1;
    }
@ -97,6 +108,28 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
                     const unsigned char input[16],
                     unsigned char output[16] )
 {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
    __m128i* rk, a;
    int i;
    rk = (__m128i*)ctx->rk;
    a = _mm_xor_si128( _mm_loadu_si128( (__m128i*)input ), _mm_loadu_si128( rk++ ) );
    if (mode == MBEDTLS_AES_ENCRYPT)
    {
        for (i = ctx->nr - 1; i; --i)
            a = _mm_aesenc_si128( a, _mm_loadu_si128( rk++ ) );
        a = _mm_aesenclast_si128( a, _mm_loadu_si128( rk ) );
    }
    else
    {
        for (i = ctx->nr - 1; i; --i)
            a = _mm_aesdec_si128( a, _mm_loadu_si128( rk++ ) );
        a = _mm_aesdeclast_si128( a, _mm_loadu_si128( rk ) );
    }
    _mm_storeu_si128( (__m128i*)output, a );
 #else
    asm( "movdqu    (%3), %%xmm0    \n\t" // load input
         "movdqu    (%1), %%xmm1    \n\t" // load round key 0
         "pxor      %%xmm1, %%xmm0  \n\t" // round 0
@ -130,10 +163,70 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
         : "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
         : "memory", "cc", "xmm0", "xmm1" );
 #endif
    return( 0 );
 }
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
 /*
  * Carry-less multiplication of two 128-bit operands a = a1:a0 and b = b1:b0.
  *
  * The four 64x64-bit partial products are combined into a 256-bit result:
  * *r0 receives the low 128 bits and *r1 the high 128 bits.
  */
 static inline void clmul256( __m128i a, __m128i b, __m128i* r0, __m128i* r1 )
 {
    // products of the two low halves and of the two high halves
    const __m128i low  = _mm_clmulepi64_si128( a, b, 0x00 );
    const __m128i high = _mm_clmulepi64_si128( a, b, 0x11 );
    // sum (xor) of the two mixed products; it straddles the 128-bit boundary
    const __m128i mid  = _mm_xor_si128( _mm_clmulepi64_si128( a, b, 0x10 ),
                                        _mm_clmulepi64_si128( a, b, 0x01 ) );
    // the low half of mid lands in the upper qword of r0 ...
    *r0 = _mm_xor_si128( low, _mm_slli_si128( mid, 8 ) );
    // ... and its high half in the lower qword of r1
    *r1 = _mm_xor_si128( high, _mm_srli_si128( mid, 8 ) );
 }
 /*
  * Shift the 256-bit value a1:a0 left by one bit.
  *
  * The result is written as *s1:*s0.  Each 64-bit lane is shifted on its own
  * and the bit pushed out of a lane is re-inserted as bit 0 of the next
  * higher lane; the top bit of a1 is discarded.
  */
 static inline void sll256( __m128i a0, __m128i a1, __m128i* s0, __m128i* s1 )
 {
    // bit 63 of every lane, moved down to bit 0 of that lane
    const __m128i out0 = _mm_srli_epi64( a0, 63 );
    const __m128i out1 = _mm_srli_epi64( a1, 63 );
    __m128i upper = _mm_slli_epi64( a1, 1 );
    // carry from the top lane of a0 enters the bottom lane of a1
    upper = _mm_or_si128( upper, _mm_srli_si128( out0, 8 ) );
    // carry from the bottom lane of a1 enters its top lane
    upper = _mm_or_si128( upper, _mm_slli_si128( out1, 8 ) );
    // carry from the bottom lane of a0 enters its top lane
    *s0 = _mm_or_si128( _mm_slli_epi64( a0, 1 ), _mm_slli_si128( out0, 8 ) );
    *s1 = upper;
 }
 /*
  * Reduction step used by the GCM multiplication.
  *
  * x10 holds the low 128 bits (x1:x0) and x32 the high 128 bits (x3:x2) of
  * the 256-bit input.  The low half is folded into the high half through
  * the shift-and-xor steps numbered below and the 128-bit result is returned.
  */
 static inline __m128i reducemod128( __m128i x10, __m128i x32 )
 {
    // (1) per-lane left shifts of x1:x0 by 63, 62 and 57
    const __m128i sl63 = _mm_slli_epi64( x10, 63 );
    const __m128i sl62 = _mm_slli_epi64( x10, 62 );
    const __m128i sl57 = _mm_slli_epi64( x10, 57 );
    // (2) d:x0 = x1:x0 ^ [a^b^c:0] -- only the shifted x0 lane is used
    const __m128i abc = _mm_xor_si128( _mm_xor_si128( sl63, sl62 ), sl57 );
    const __m128i dx0 = _mm_xor_si128( x10, _mm_slli_si128( abc, 8 ) );
    // (3) 128-bit right shifts of d:x0 by 1, 2 and 7: the bits leaving the
    //     upper lane are carried into the top of the lower lane
    const __m128i in1 = _mm_srli_si128( _mm_slli_epi64( dx0, 63 ), 8 );
    const __m128i in2 = _mm_srli_si128( _mm_slli_epi64( dx0, 62 ), 8 );
    const __m128i in7 = _mm_srli_si128( _mm_slli_epi64( dx0, 57 ), 8 );
    const __m128i sr1 = _mm_or_si128( _mm_srli_epi64( dx0, 1 ), in1 );
    const __m128i sr2 = _mm_or_si128( _mm_srli_epi64( dx0, 2 ), in2 );
    const __m128i sr7 = _mm_or_si128( _mm_srli_epi64( dx0, 7 ), in7 );
    // (4) h = d:x0 ^ e ^ f ^ g
    const __m128i efg = _mm_xor_si128( sr1, _mm_xor_si128( sr2, sr7 ) );
    const __m128i h   = _mm_xor_si128( dx0, efg );
    // result is x3^h1 : x2^h0
    return _mm_xor_si128( x32, h );
 }
 #endif
 /*
 * GCM multiplication: c = a times b in GF(2^128)
 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
@ -142,6 +235,22 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
                     const unsigned char a[16],
                     const unsigned char b[16] )
 {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
    __m128i xa, xb, m0, m1, x10, x32, r;
    xa.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)a + 0) );
    xa.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)a + 1) );
    xb.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)b + 0) );
    xb.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)b + 1) );
    clmul256( xa, xb, &m0, &m1 );
    sll256( m0, m1, &x10, &x32 );
    r = reducemod128( x10, x32 );
    *((unsigned __int64*)c + 0) = _byteswap_uint64( r.m128i_u64[1] );
    *((unsigned __int64*)c + 1) = _byteswap_uint64( r.m128i_u64[0] );
 #else
    unsigned char aa[16], bb[16], cc[16];
    size_t i;
@ -242,6 +351,7 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
    /* Now byte-reverse the outputs */
    for( i = 0; i < 16; i++ )
        c[i] = cc[15 - i];
 #endif
    return;
 }
@ -258,22 +368,109 @@ void mbedtls_aesni_inverse_key( unsigned char *invkey,
    memcpy( ik, fk, 16 );
    for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
        _mm_storeu_si128( (__m128i*)ik, _mm_aesimc_si128( _mm_loadu_si128( (__m128i*)fk) ) );
 #else
        asm( "movdqu (%0), %%xmm0       \n\t"
             AESIMC  xmm0_xmm0         "\n\t"
             "movdqu %%xmm0, (%1)       \n\t"
             :
             : "r" (fk), "r" (ik)
             : "memory", "xmm0" );
 #endif
    memcpy( ik, fk, 16 );
 }
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
 /*
  * One step of the 128-bit key expansion.
  *
  * key is the previous round key and kg the matching AESKEYGENASSIST output.
  * Every 32-bit word of the result is the xor of all words of key up to and
  * including its own position, xored with the top word of kg.
  */
 inline static __m128i aes_key_128_assist( __m128i key, __m128i kg )
 {
    // broadcast the most significant word of kg to all four positions
    const __m128i top = _mm_shuffle_epi32( kg, _MM_SHUFFLE( 3, 3, 3, 3 ) );
    int pass;
    // three shift-by-one-word-and-xor passes build the running xor of words
    for( pass = 0; pass < 3; pass++ )
        key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
    return _mm_xor_si128( key, top );
 }
 // [AES-WP] Part of Fig. 25 page 32
 /*
  * One step of the 192-bit key expansion, updating *temp1 and *temp3 in place.
  *
  * *temp1 becomes the running xor of its own words xored with word 1 of kg;
  * *temp3 becomes (*temp3 ^ *temp3 shifted up by one word) xored with the
  * top word of the new *temp1.
  */
 inline static void aes_key_192_assist( __m128i* temp1, __m128i * temp3, __m128i kg )
 {
    const __m128i old1 = *temp1;
    // old1 ^ old1<<32 ^ old1<<64 ^ old1<<96: running xor of the four words
    const __m128i run1 = _mm_xor_si128(
        _mm_xor_si128( old1, _mm_slli_si128( old1, 4 ) ),
        _mm_xor_si128( _mm_slli_si128( old1, 8 ), _mm_slli_si128( old1, 12 ) ) );
    // 0x55 broadcasts word 1 of the key-generation-assist result
    const __m128i new1 = _mm_xor_si128( run1, _mm_shuffle_epi32( kg, 0x55 ) );
    __m128i new3;
    *temp1 = new1;
    // *temp3 is read only after *temp1 has been written, as in the reference flow
    new3 = *temp3;
    new3 = _mm_xor_si128( new3, _mm_slli_si128( new3, 4 ) );
    // 0xff broadcasts the top word of the freshly computed *temp1
    *temp3 = _mm_xor_si128( new3, _mm_shuffle_epi32( new1, 0xff ) );
 }
 // [AES-WP] Part of Fig. 26 page 34
 /*
  * First half of a 256-bit key expansion step, updating *temp1 in place.
  *
  * *temp1 becomes the running xor of its own words, xored with the top word
  * of kg (the AESKEYGENASSIST output supplied by the caller).
  */
 inline static void aes_key_256_assist_1( __m128i* temp1, __m128i kg )
 {
    const __m128i old = *temp1;
    // old ^ old<<32 ^ old<<64 ^ old<<96: running xor of the four words
    const __m128i run = _mm_xor_si128(
        _mm_xor_si128( old, _mm_slli_si128( old, 4 ) ),
        _mm_xor_si128( _mm_slli_si128( old, 8 ), _mm_slli_si128( old, 12 ) ) );
    // 0xff broadcasts the top word of kg
    *temp1 = _mm_xor_si128( run, _mm_shuffle_epi32( kg, 0xff ) );
 }
 /*
  * Second half of a 256-bit key expansion step, updating *temp3 in place.
  *
  * Word 2 of AESKEYGENASSIST( *temp1, 0 ) is broadcast and xored into the
  * running xor of the words of *temp3.  *temp1 is only read.
  */
 inline static void aes_key_256_assist_2( __m128i* temp1, __m128i* temp3 )
 {
    // 0xaa broadcasts word 2 of the key-generation-assist result
    const __m128i sub = _mm_shuffle_epi32( _mm_aeskeygenassist_si128( *temp1, 0x0 ), 0xaa );
    const __m128i old = *temp3;
    // old ^ old<<32 ^ old<<64 ^ old<<96: running xor of the four words
    const __m128i run = _mm_xor_si128(
        _mm_xor_si128( old, _mm_slli_si128( old, 4 ) ),
        _mm_xor_si128( _mm_slli_si128( old, 8 ), _mm_slli_si128( old, 12 ) ) );
    *temp3 = _mm_xor_si128( run, sub );
 }
 #endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 /*
 * Key expansion, 128-bit case
 */
 static void aesni_setkey_enc_128( unsigned char *rk,
                                   const unsigned char *key )
 {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
    // MSVC x64 path: expand the key with intrinsics instead of inline asm.
    // xrk walks the output buffer rk in 16-byte steps; k is the current round key.
    __m128i* xrk, k;
    xrk = (__m128i*)rk;
    // EXPAND_ROUND stores the current round key (unaligned store) and then
    // replaces k by the next round key derived with round constant rcon.
 #define EXPAND_ROUND(k, rcon) \
    _mm_storeu_si128( xrk++, k ); \
    k = aes_key_128_assist( k, _mm_aeskeygenassist_si128( k, rcon ) )
    // round key 0 is the 16 bytes read from key
    k = _mm_loadu_si128( (__m128i*)key );
    EXPAND_ROUND( k, 0x01 );
    EXPAND_ROUND( k, 0x02 );
    EXPAND_ROUND( k, 0x04 );
    EXPAND_ROUND( k, 0x08 );
    EXPAND_ROUND( k, 0x10 );
    EXPAND_ROUND( k, 0x20 );
    EXPAND_ROUND( k, 0x40 );
    EXPAND_ROUND( k, 0x80 );
    EXPAND_ROUND( k, 0x1b );
    EXPAND_ROUND( k, 0x36 );
    // store the last round key: 11 stores of 16 bytes are written to rk in total
    _mm_storeu_si128( xrk, k );
 #undef EXPAND_ROUND
 #else
    asm( "movdqu (%1), %%xmm0               \n\t" // copy the original key
         "movdqu %%xmm0, (%0)               \n\t" // as round key 0
         "jmp 2f                            \n\t" // skip auxiliary routine
@ -316,6 +513,7 @ static void aesni_setkey_enc_128( unsigned char *rk,
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
 #endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 /*
@ -324,6 +522,37 @@ static void aesni_setkey_enc_128( unsigned char *rk,
 static void aesni_setkey_enc_192( unsigned char *rk,
                                   const unsigned char *key )
 {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
    /*
     * MSVC x64 path: 192-bit key expansion with intrinsics.
     *
     * key supplies 24 bytes; rk receives 13 round keys of 16 bytes each.
     * temp1 carries four key words and the low half of temp3 the other two;
     * the upper half of temp3 never reaches the schedule.  Because the state
     * advances 24 bytes per step while round keys are 16 bytes wide, every
     * other step emits two round keys stitched together from halves.
     *
     * Round keys are written with unaligned stores because rk is a plain
     * byte pointer and carries no 16-byte alignment guarantee.
     */
    __m128i temp1, temp3, half;
    __m128i *key_schedule = (__m128i*)rk;
 #define STORE_RK( idx, val ) \
    _mm_storeu_si128( key_schedule + (idx), (val) )
    /* sel == 0: low(a) in the low half, low(b) in the high half;
     * sel == 1: high(a) in the low half, low(b) in the high half */
 #define JOIN_HALVES( a, b, sel ) \
    _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), _mm_castsi128_pd( b ), sel ) )
    temp1 = _mm_loadu_si128( (const __m128i*)key );
    /* only 8 key bytes remain after the first 16: load exactly those and
     * zero the upper half instead of reading past the end of the key */
    temp3 = _mm_loadl_epi64( (const __m128i*)(key + 16) );
    STORE_RK( 0, temp1 );
    half = temp3;   /* low half completes round key 1 after the next step */
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x1 ) );
    STORE_RK( 1, JOIN_HALVES( half, temp1, 0 ) );
    STORE_RK( 2, JOIN_HALVES( temp1, temp3, 1 ) );
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x2 ) );
    STORE_RK( 3, temp1 );
    half = temp3;
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x4 ) );
    STORE_RK( 4, JOIN_HALVES( half, temp1, 0 ) );
    STORE_RK( 5, JOIN_HALVES( temp1, temp3, 1 ) );
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x8 ) );
    STORE_RK( 6, temp1 );
    half = temp3;
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
    STORE_RK( 7, JOIN_HALVES( half, temp1, 0 ) );
    STORE_RK( 8, JOIN_HALVES( temp1, temp3, 1 ) );
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
    STORE_RK( 9, temp1 );
    half = temp3;
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
    STORE_RK( 10, JOIN_HALVES( half, temp1, 0 ) );
    STORE_RK( 11, JOIN_HALVES( temp1, temp3, 1 ) );
    aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x80 ) );
    STORE_RK( 12, temp1 );
 #undef JOIN_HALVES
 #undef STORE_RK
 #else
    asm( "movdqu (%1), %%xmm0   \n\t" // copy original round key
         "movdqu %%xmm0, (%0)   \n\t"
         "add $16, %0           \n\t"
@ -373,6 +602,7 @ static void aesni_setkey_enc_192( unsigned char *rk,
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
 #endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 /*
@ -381,6 +611,40 @@ static void aesni_setkey_enc_192( unsigned char *rk,
 static void aesni_setkey_enc_256( unsigned char *rk,
                                   const unsigned char *key )
 {
 #if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
    /*
     * MSVC x64 path: 256-bit key expansion with intrinsics.
     *
     * key supplies 32 bytes; rk receives 15 round keys of 16 bytes each.
     * temp1 and temp3 hold the two most recent round keys and are updated
     * alternately by the two assist helpers.
     *
     * Round keys are written with unaligned stores because rk is a plain
     * byte pointer and carries no 16-byte alignment guarantee.
     */
    __m128i temp1, temp3;
    __m128i *key_schedule = (__m128i*)rk;
    /* derive and store round keys idx and idx + 1 using round constant rcon */
 #define EXPAND_PAIR( idx, rcon )                                                  \
    do {                                                                          \
        aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, rcon ) ); \
        _mm_storeu_si128( key_schedule + (idx), temp1 );                          \
        aes_key_256_assist_2( &temp1, &temp3 );                                   \
        _mm_storeu_si128( key_schedule + (idx) + 1, temp3 );                      \
    } while( 0 )
    /* the first two round keys are the raw key material */
    temp1 = _mm_loadu_si128( (const __m128i*)key );
    temp3 = _mm_loadu_si128( (const __m128i*)(key + 16) );
    _mm_storeu_si128( key_schedule + 0, temp1 );
    _mm_storeu_si128( key_schedule + 1, temp3 );
    EXPAND_PAIR(  2, 0x01 );
    EXPAND_PAIR(  4, 0x02 );
    EXPAND_PAIR(  6, 0x04 );
    EXPAND_PAIR(  8, 0x08 );
    EXPAND_PAIR( 10, 0x10 );
    EXPAND_PAIR( 12, 0x20 );
    /* the last round key needs only the first half of a step */
    aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
    _mm_storeu_si128( key_schedule + 14, temp1 );
 #undef EXPAND_PAIR
 #else
    asm( "movdqu (%1), %%xmm0           \n\t"
         "movdqu %%xmm0, (%0)           \n\t"
         "add $16, %0                   \n\t"
@ -439,6 +703,7 @ static void aesni_setkey_enc_256( unsigned char *rk,
         :
         : "r" (rk), "r" (key)
         : "memory", "cc", "0" );
 #endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
 }
 /*
--- a/library/timing.c
+++ b/library/timing.c
@ -112,6 +112,23 @@ unsigned long mbedtls_timing_hardclock( void )
 #endif /* !HAVE_HARDCLOCK && MBEDTLS_HAVE_ASM &&
          __GNUC__ && ( __amd64__ || __x86_64__ ) */
 #if !defined(HAVE_HARDCLOCK) && defined(_MSC_VER) && defined(_M_X64)
 #define HAVE_HARDCLOCK
 /*
  * Cycle counter for MSVC x64 builds, read through the __rdtsc() intrinsic.
  *
  * Only the low 32 bits of the 64-bit counter are returned; under this guard
  * unsigned long is 32 bits wide, so the mask-and-cast keeps exactly those bits.
  */
 unsigned long mbedtls_timing_hardclock(void)
 {
    const unsigned __int64 ticks = __rdtsc();
    return (unsigned long)( ticks & 0xFFFFFFFFu );
 }
 #endif /* !HAVE_HARDCLOCK && _MSC_VER && _M_X64 */
 #if !defined(HAVE_HARDCLOCK) && defined(MBEDTLS_HAVE_ASM) &&  \
    defined(__GNUC__) && ( defined(__powerpc__) || defined(__ppc__) )