mirror of
https://github.com/yuzu-emu/mbedtls.git
synced 2024-11-23 03:05:43 +01:00
x64 intrinsics for msvc in bn_mul, timing and aesni modules
AES-NI implementation for MSVC x64 using intrinsics. Implement rdtsc timing function using intrinsics for x64. Use 128bit-result multiply on msvc/x64.
This commit is contained in:
parent
6e270b44b1
commit
00d92c2989
@ -42,6 +42,11 @@
|
|||||||
#define MBEDTLS_HAVE_X86_64
|
#define MBEDTLS_HAVE_X86_64
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) && defined(_M_X64) && \
|
||||||
|
! defined(MBEDTLS_HAVE_X86_64)
|
||||||
|
#define MBEDTLS_HAVE_X86_64
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(MBEDTLS_HAVE_X86_64)
|
#if defined(MBEDTLS_HAVE_X86_64)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -924,10 +924,30 @@
|
|||||||
__asm mov s, esi \
|
__asm mov s, esi \
|
||||||
|
|
||||||
#endif /* SSE2 */
|
#endif /* SSE2 */
|
||||||
#endif /* MSVC */
|
#endif /* (MSVC && _M_IX86) || __WATCOMC__ */
|
||||||
|
|
||||||
#endif /* MBEDTLS_HAVE_ASM */
|
#endif /* MBEDTLS_HAVE_ASM */
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) && defined(_M_X64)
|
||||||
|
|
||||||
|
#include <intrin.h>
|
||||||
|
|
||||||
|
#define MULADDC_INIT \
|
||||||
|
{ \
|
||||||
|
mbedtls_mpi_uint r0, r1; \
|
||||||
|
unsigned char carry;
|
||||||
|
|
||||||
|
#define MULADDC_CORE \
|
||||||
|
r0 = _umul128( *(s++), b, &r1 ); \
|
||||||
|
carry = _addcarry_u64( 0, r0, c, &r0 ); \
|
||||||
|
_addcarry_u64( carry, r1, 0, &r1 ); \
|
||||||
|
carry = _addcarry_u64( 0, r0, *d, &r0 ); \
|
||||||
|
_addcarry_u64( carry, r1, 0, &r1 ); \
|
||||||
|
c = r1; *(d++) = r0;
|
||||||
|
|
||||||
|
#define MULADDC_STOP \
|
||||||
|
}
|
||||||
|
#endif /* _MSC_VER && _M_X64 */
|
||||||
|
|
||||||
#if !defined(MULADDC_CORE)
|
#if !defined(MULADDC_CORE)
|
||||||
#if defined(MBEDTLS_HAVE_UDBL)
|
#if defined(MBEDTLS_HAVE_UDBL)
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@
|
|||||||
#error "MBEDTLS_HAVE_TIME_DATE without MBEDTLS_HAVE_TIME does not make sense"
|
#error "MBEDTLS_HAVE_TIME_DATE without MBEDTLS_HAVE_TIME does not make sense"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM)
|
#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM) && !(defined(_MSC_VER) && defined(_M_X64))
|
||||||
#error "MBEDTLS_AESNI_C defined, but not all prerequisites"
|
#error "MBEDTLS_AESNI_C defined, but not all prerequisites"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
265
library/aesni.c
265
library/aesni.c
@ -42,6 +42,11 @@
|
|||||||
|
|
||||||
#if defined(MBEDTLS_HAVE_X86_64)
|
#if defined(MBEDTLS_HAVE_X86_64)
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) && defined(_M_X64)
|
||||||
|
#define MBEDTLS_HAVE_MSVC_X64_INTRINSICS
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* AES-NI support detection routine
|
* AES-NI support detection routine
|
||||||
*/
|
*/
|
||||||
@ -52,11 +57,17 @@ int mbedtls_aesni_has_support( unsigned int what )
|
|||||||
|
|
||||||
if( ! done )
|
if( ! done )
|
||||||
{
|
{
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
int regs[4]; // eax, ebx, ecx, edx
|
||||||
|
__cpuid( regs, 1 );
|
||||||
|
c = regs[2];
|
||||||
|
#else
|
||||||
asm( "movl $1, %%eax \n\t"
|
asm( "movl $1, %%eax \n\t"
|
||||||
"cpuid \n\t"
|
"cpuid \n\t"
|
||||||
: "=c" (c)
|
: "=c" (c)
|
||||||
:
|
:
|
||||||
: "eax", "ebx", "edx" );
|
: "eax", "ebx", "edx" );
|
||||||
|
#endif
|
||||||
done = 1;
|
done = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,6 +108,28 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
|
|||||||
const unsigned char input[16],
|
const unsigned char input[16],
|
||||||
unsigned char output[16] )
|
unsigned char output[16] )
|
||||||
{
|
{
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
__m128i* rk, a;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
rk = (__m128i*)ctx->rk;
|
||||||
|
a = _mm_xor_si128( _mm_loadu_si128( (__m128i*)input ), _mm_loadu_si128( rk++ ) );
|
||||||
|
|
||||||
|
if (mode == MBEDTLS_AES_ENCRYPT)
|
||||||
|
{
|
||||||
|
for (i = ctx->nr - 1; i; --i)
|
||||||
|
a = _mm_aesenc_si128( a, _mm_loadu_si128( rk++ ) );
|
||||||
|
a = _mm_aesenclast_si128( a, _mm_loadu_si128( rk ) );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (i = ctx->nr - 1; i; --i)
|
||||||
|
a = _mm_aesdec_si128( a, _mm_loadu_si128( rk++ ) );
|
||||||
|
a = _mm_aesdeclast_si128( a, _mm_loadu_si128( rk ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm_storeu_si128( (__m128i*)output, a );
|
||||||
|
#else
|
||||||
asm( "movdqu (%3), %%xmm0 \n\t" // load input
|
asm( "movdqu (%3), %%xmm0 \n\t" // load input
|
||||||
"movdqu (%1), %%xmm1 \n\t" // load round key 0
|
"movdqu (%1), %%xmm1 \n\t" // load round key 0
|
||||||
"pxor %%xmm1, %%xmm0 \n\t" // round 0
|
"pxor %%xmm1, %%xmm0 \n\t" // round 0
|
||||||
@ -130,10 +163,70 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
|
|||||||
: "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
|
: "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
|
||||||
: "memory", "cc", "xmm0", "xmm1" );
|
: "memory", "cc", "xmm0", "xmm1" );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
return( 0 );
|
return( 0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
|
||||||
|
static inline void clmul256( __m128i a, __m128i b, __m128i* r0, __m128i* r1 )
|
||||||
|
{
|
||||||
|
__m128i c, d, e, f, ef;
|
||||||
|
c = _mm_clmulepi64_si128( a, b, 0x00 );
|
||||||
|
d = _mm_clmulepi64_si128( a, b, 0x11 );
|
||||||
|
e = _mm_clmulepi64_si128( a, b, 0x10 );
|
||||||
|
f = _mm_clmulepi64_si128( a, b, 0x01 );
|
||||||
|
|
||||||
|
// r0 = f0^e0^c1:c0 = c1:c0 ^ f0^e0:0
|
||||||
|
// r1 = d1:f1^e1^d0 = d1:d0 ^ 0:f1^e1
|
||||||
|
|
||||||
|
ef = _mm_xor_si128( e, f );
|
||||||
|
*r0 = _mm_xor_si128( c, _mm_slli_si128( ef, 8 ) );
|
||||||
|
*r1 = _mm_xor_si128( d, _mm_srli_si128( ef, 8 ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void sll256( __m128i a0, __m128i a1, __m128i* s0, __m128i* s1 )
|
||||||
|
{
|
||||||
|
__m128i l0, l1, r0, r1;
|
||||||
|
|
||||||
|
l0 = _mm_slli_epi64( a0, 1 );
|
||||||
|
l1 = _mm_slli_epi64( a1, 1 );
|
||||||
|
|
||||||
|
r0 = _mm_srli_epi64( a0, 63 );
|
||||||
|
r1 = _mm_srli_epi64( a1, 63 );
|
||||||
|
|
||||||
|
*s0 = _mm_or_si128( l0, _mm_slli_si128( r0, 8 ) );
|
||||||
|
*s1 = _mm_or_si128( _mm_or_si128( l1, _mm_srli_si128( r0, 8 ) ), _mm_slli_si128( r1, 8 ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __m128i reducemod128( __m128i x10, __m128i x32 )
|
||||||
|
{
|
||||||
|
__m128i a, b, c, dx0, e, f, g, h;
|
||||||
|
|
||||||
|
// (1) left shift x0 by 63, 62 and 57
|
||||||
|
a = _mm_slli_epi64( x10, 63 );
|
||||||
|
b = _mm_slli_epi64( x10, 62 );
|
||||||
|
c = _mm_slli_epi64( x10, 57 );
|
||||||
|
|
||||||
|
// (2) compute D xor'ing a, b, c and x1
|
||||||
|
// d:x0 = x1:x0 ^ [a^b^c:0]
|
||||||
|
dx0 = _mm_xor_si128( x10, _mm_slli_si128( _mm_xor_si128( _mm_xor_si128( a, b ), c ), 8 ) );
|
||||||
|
|
||||||
|
// (3) right shift [d:x0] by 1, 2, 7
|
||||||
|
e = _mm_or_si128( _mm_srli_epi64( dx0, 1 ), _mm_srli_si128( _mm_slli_epi64( dx0, 63 ), 8 ) );
|
||||||
|
f = _mm_or_si128( _mm_srli_epi64( dx0, 2 ), _mm_srli_si128( _mm_slli_epi64( dx0, 62 ), 8 ) );
|
||||||
|
g = _mm_or_si128( _mm_srli_epi64( dx0, 7 ), _mm_srli_si128( _mm_slli_epi64( dx0, 57 ), 8 ) );
|
||||||
|
|
||||||
|
// (4) compute h = d^e1^f1^g1 : x0^e0^f0^g0
|
||||||
|
h = _mm_xor_si128( dx0, _mm_xor_si128( e, _mm_xor_si128( f, g ) ) );
|
||||||
|
|
||||||
|
// result is x3^h1:x2^h0
|
||||||
|
return _mm_xor_si128( x32, h );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* GCM multiplication: c = a times b in GF(2^128)
|
* GCM multiplication: c = a times b in GF(2^128)
|
||||||
* Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
|
* Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
|
||||||
@ -142,6 +235,22 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
|
|||||||
const unsigned char a[16],
|
const unsigned char a[16],
|
||||||
const unsigned char b[16] )
|
const unsigned char b[16] )
|
||||||
{
|
{
|
||||||
|
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
__m128i xa, xb, m0, m1, x10, x32, r;
|
||||||
|
|
||||||
|
xa.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)a + 0) );
|
||||||
|
xa.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)a + 1) );
|
||||||
|
xb.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)b + 0) );
|
||||||
|
xb.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)b + 1) );
|
||||||
|
|
||||||
|
clmul256( xa, xb, &m0, &m1 );
|
||||||
|
sll256( m0, m1, &x10, &x32 );
|
||||||
|
r = reducemod128( x10, x32 );
|
||||||
|
|
||||||
|
*((unsigned __int64*)c + 0) = _byteswap_uint64( r.m128i_u64[1] );
|
||||||
|
*((unsigned __int64*)c + 1) = _byteswap_uint64( r.m128i_u64[0] );
|
||||||
|
#else
|
||||||
unsigned char aa[16], bb[16], cc[16];
|
unsigned char aa[16], bb[16], cc[16];
|
||||||
size_t i;
|
size_t i;
|
||||||
|
|
||||||
@ -242,6 +351,7 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
|
|||||||
/* Now byte-reverse the outputs */
|
/* Now byte-reverse the outputs */
|
||||||
for( i = 0; i < 16; i++ )
|
for( i = 0; i < 16; i++ )
|
||||||
c[i] = cc[15 - i];
|
c[i] = cc[15 - i];
|
||||||
|
#endif
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -258,22 +368,109 @@ void mbedtls_aesni_inverse_key( unsigned char *invkey,
|
|||||||
memcpy( ik, fk, 16 );
|
memcpy( ik, fk, 16 );
|
||||||
|
|
||||||
for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
|
for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
_mm_storeu_si128( (__m128i*)ik, _mm_aesimc_si128( _mm_loadu_si128( (__m128i*)fk) ) );
|
||||||
|
#else
|
||||||
asm( "movdqu (%0), %%xmm0 \n\t"
|
asm( "movdqu (%0), %%xmm0 \n\t"
|
||||||
AESIMC xmm0_xmm0 "\n\t"
|
AESIMC xmm0_xmm0 "\n\t"
|
||||||
"movdqu %%xmm0, (%1) \n\t"
|
"movdqu %%xmm0, (%1) \n\t"
|
||||||
:
|
:
|
||||||
: "r" (fk), "r" (ik)
|
: "r" (fk), "r" (ik)
|
||||||
: "memory", "xmm0" );
|
: "memory", "xmm0" );
|
||||||
|
#endif
|
||||||
|
|
||||||
memcpy( ik, fk, 16 );
|
memcpy( ik, fk, 16 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
inline static __m128i aes_key_128_assist( __m128i key, __m128i kg )
|
||||||
|
{
|
||||||
|
key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
|
||||||
|
key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
|
||||||
|
key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
|
||||||
|
kg = _mm_shuffle_epi32( kg, _MM_SHUFFLE( 3, 3, 3, 3 ) );
|
||||||
|
return _mm_xor_si128( key, kg );
|
||||||
|
}
|
||||||
|
|
||||||
|
// [AES-WP] Part of Fig. 25 page 32
|
||||||
|
inline static void aes_key_192_assist( __m128i* temp1, __m128i * temp3, __m128i kg )
|
||||||
|
{
|
||||||
|
__m128i temp4;
|
||||||
|
kg = _mm_shuffle_epi32( kg, 0x55 );
|
||||||
|
temp4 = _mm_slli_si128( *temp1, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, kg );
|
||||||
|
kg = _mm_shuffle_epi32( *temp1, 0xff );
|
||||||
|
temp4 = _mm_slli_si128( *temp3, 0x4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, temp4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, kg );
|
||||||
|
}
|
||||||
|
|
||||||
|
// [AES-WP] Part of Fig. 26 page 34
|
||||||
|
inline static void aes_key_256_assist_1( __m128i* temp1, __m128i kg )
|
||||||
|
{
|
||||||
|
__m128i temp4;
|
||||||
|
kg = _mm_shuffle_epi32( kg, 0xff );
|
||||||
|
temp4 = _mm_slli_si128( *temp1, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, temp4 );
|
||||||
|
*temp1 = _mm_xor_si128( *temp1, kg );
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static void aes_key_256_assist_2( __m128i* temp1, __m128i* temp3 )
|
||||||
|
{
|
||||||
|
__m128i temp2, temp4;
|
||||||
|
temp4 = _mm_aeskeygenassist_si128( *temp1, 0x0 );
|
||||||
|
temp2 = _mm_shuffle_epi32( temp4, 0xaa );
|
||||||
|
temp4 = _mm_slli_si128( *temp3, 0x4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, temp4 );
|
||||||
|
temp4 = _mm_slli_si128( temp4, 0x4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, temp4 );
|
||||||
|
*temp3 = _mm_xor_si128( *temp3, temp2 );
|
||||||
|
}
|
||||||
|
#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Key expansion, 128-bit case
|
* Key expansion, 128-bit case
|
||||||
*/
|
*/
|
||||||
static void aesni_setkey_enc_128( unsigned char *rk,
|
static void aesni_setkey_enc_128( unsigned char *rk,
|
||||||
const unsigned char *key )
|
const unsigned char *key )
|
||||||
{
|
{
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
__m128i* xrk, k;
|
||||||
|
|
||||||
|
xrk = (__m128i*)rk;
|
||||||
|
|
||||||
|
#define EXPAND_ROUND(k, rcon) \
|
||||||
|
_mm_storeu_si128( xrk++, k ); \
|
||||||
|
k = aes_key_128_assist( k, _mm_aeskeygenassist_si128( k, rcon ) )
|
||||||
|
|
||||||
|
k = _mm_loadu_si128( (__m128i*)key );
|
||||||
|
EXPAND_ROUND( k, 0x01 );
|
||||||
|
EXPAND_ROUND( k, 0x02 );
|
||||||
|
EXPAND_ROUND( k, 0x04 );
|
||||||
|
EXPAND_ROUND( k, 0x08 );
|
||||||
|
EXPAND_ROUND( k, 0x10 );
|
||||||
|
EXPAND_ROUND( k, 0x20 );
|
||||||
|
EXPAND_ROUND( k, 0x40 );
|
||||||
|
EXPAND_ROUND( k, 0x80 );
|
||||||
|
EXPAND_ROUND( k, 0x1b );
|
||||||
|
EXPAND_ROUND( k, 0x36 );
|
||||||
|
_mm_storeu_si128( xrk, k );
|
||||||
|
|
||||||
|
#undef EXPAND_ROUND
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
asm( "movdqu (%1), %%xmm0 \n\t" // copy the original key
|
asm( "movdqu (%1), %%xmm0 \n\t" // copy the original key
|
||||||
"movdqu %%xmm0, (%0) \n\t" // as round key 0
|
"movdqu %%xmm0, (%0) \n\t" // as round key 0
|
||||||
"jmp 2f \n\t" // skip auxiliary routine
|
"jmp 2f \n\t" // skip auxiliary routine
|
||||||
@ -316,6 +513,7 @@ static void aesni_setkey_enc_128( unsigned char *rk,
|
|||||||
:
|
:
|
||||||
: "r" (rk), "r" (key)
|
: "r" (rk), "r" (key)
|
||||||
: "memory", "cc", "0" );
|
: "memory", "cc", "0" );
|
||||||
|
#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -324,6 +522,37 @@ static void aesni_setkey_enc_128( unsigned char *rk,
|
|||||||
static void aesni_setkey_enc_192( unsigned char *rk,
|
static void aesni_setkey_enc_192( unsigned char *rk,
|
||||||
const unsigned char *key )
|
const unsigned char *key )
|
||||||
{
|
{
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
__m128i temp1, temp3;
|
||||||
|
__m128i *key_schedule = (__m128i*)rk;
|
||||||
|
temp1 = _mm_loadu_si128( (__m128i*)key );
|
||||||
|
temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
|
||||||
|
key_schedule[0] = temp1;
|
||||||
|
key_schedule[1] = temp3;
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128(temp3, 0x1) );
|
||||||
|
key_schedule[1] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[1] ), _mm_castsi128_pd( temp1 ), 0 ) );
|
||||||
|
key_schedule[2] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x2 ) );
|
||||||
|
key_schedule[3] = temp1;
|
||||||
|
key_schedule[4] = temp3;
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x4 ) );
|
||||||
|
key_schedule[4] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[4] ), _mm_castsi128_pd( temp1 ), 0 ) );
|
||||||
|
key_schedule[5] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x8 ) );
|
||||||
|
key_schedule[6] = temp1;
|
||||||
|
key_schedule[7] = temp3;
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
|
||||||
|
key_schedule[7] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[7] ), _mm_castsi128_pd( temp1 ), 0 ) );
|
||||||
|
key_schedule[8] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
|
||||||
|
key_schedule[9] = temp1;
|
||||||
|
key_schedule[10] = temp3;
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
|
||||||
|
key_schedule[10] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[10] ), _mm_castsi128_pd( temp1 ), 0 ) );
|
||||||
|
key_schedule[11] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
|
||||||
|
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x80 ) );
|
||||||
|
key_schedule[12] = temp1;
|
||||||
|
#else
|
||||||
asm( "movdqu (%1), %%xmm0 \n\t" // copy original round key
|
asm( "movdqu (%1), %%xmm0 \n\t" // copy original round key
|
||||||
"movdqu %%xmm0, (%0) \n\t"
|
"movdqu %%xmm0, (%0) \n\t"
|
||||||
"add $16, %0 \n\t"
|
"add $16, %0 \n\t"
|
||||||
@ -373,6 +602,7 @@ static void aesni_setkey_enc_192( unsigned char *rk,
|
|||||||
:
|
:
|
||||||
: "r" (rk), "r" (key)
|
: "r" (rk), "r" (key)
|
||||||
: "memory", "cc", "0" );
|
: "memory", "cc", "0" );
|
||||||
|
#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -381,6 +611,40 @@ static void aesni_setkey_enc_192( unsigned char *rk,
|
|||||||
static void aesni_setkey_enc_256( unsigned char *rk,
|
static void aesni_setkey_enc_256( unsigned char *rk,
|
||||||
const unsigned char *key )
|
const unsigned char *key )
|
||||||
{
|
{
|
||||||
|
#if defined(MBEDTLS_HAVE_MSVC_X64_INTRINSICS)
|
||||||
|
__m128i temp1, temp3;
|
||||||
|
__m128i *key_schedule = (__m128i*)rk;
|
||||||
|
temp1 = _mm_loadu_si128( (__m128i*)key );
|
||||||
|
temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
|
||||||
|
key_schedule[0] = temp1;
|
||||||
|
key_schedule[1] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x01 ) );
|
||||||
|
key_schedule[2] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[3] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x02 ) );
|
||||||
|
key_schedule[4] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[5] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x04 ) );
|
||||||
|
key_schedule[6] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[7] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x08 ) );
|
||||||
|
key_schedule[8] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[9] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
|
||||||
|
key_schedule[10] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[11] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
|
||||||
|
key_schedule[12] = temp1;
|
||||||
|
aes_key_256_assist_2( &temp1, &temp3 );
|
||||||
|
key_schedule[13] = temp3;
|
||||||
|
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
|
||||||
|
key_schedule[14] = temp1;
|
||||||
|
#else
|
||||||
asm( "movdqu (%1), %%xmm0 \n\t"
|
asm( "movdqu (%1), %%xmm0 \n\t"
|
||||||
"movdqu %%xmm0, (%0) \n\t"
|
"movdqu %%xmm0, (%0) \n\t"
|
||||||
"add $16, %0 \n\t"
|
"add $16, %0 \n\t"
|
||||||
@ -439,6 +703,7 @@ static void aesni_setkey_enc_256( unsigned char *rk,
|
|||||||
:
|
:
|
||||||
: "r" (rk), "r" (key)
|
: "r" (rk), "r" (key)
|
||||||
: "memory", "cc", "0" );
|
: "memory", "cc", "0" );
|
||||||
|
#endif /* MBEDTLS_HAVE_MSVC_X64_INTRINSICS */
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -112,6 +112,23 @@ unsigned long mbedtls_timing_hardclock( void )
|
|||||||
#endif /* !HAVE_HARDCLOCK && MBEDTLS_HAVE_ASM &&
|
#endif /* !HAVE_HARDCLOCK && MBEDTLS_HAVE_ASM &&
|
||||||
__GNUC__ && ( __amd64__ || __x86_64__ ) */
|
__GNUC__ && ( __amd64__ || __x86_64__ ) */
|
||||||
|
|
||||||
|
#if !defined(HAVE_HARDCLOCK) && defined(_MSC_VER) && defined(_M_X64)
|
||||||
|
|
||||||
|
#define HAVE_HARDCLOCK
|
||||||
|
|
||||||
|
unsigned long mbedtls_timing_hardclock(void)
|
||||||
|
{
|
||||||
|
union
|
||||||
|
{
|
||||||
|
unsigned __int64 u64;
|
||||||
|
struct { unsigned long lo, hi; } u32;
|
||||||
|
} tsc;
|
||||||
|
tsc.u64 = __rdtsc();
|
||||||
|
return tsc.u32.lo;
|
||||||
|
}
|
||||||
|
#endif /* !HAVE_HARDCLOCK && _MSC_VER && _M_X64 */
|
||||||
|
|
||||||
|
|
||||||
#if !defined(HAVE_HARDCLOCK) && defined(MBEDTLS_HAVE_ASM) && \
|
#if !defined(HAVE_HARDCLOCK) && defined(MBEDTLS_HAVE_ASM) && \
|
||||||
defined(__GNUC__) && ( defined(__powerpc__) || defined(__ppc__) )
|
defined(__GNUC__) && ( defined(__powerpc__) || defined(__ppc__) )
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user