From d1e7a45fdd59f2d1db081347c4d1ced835be5952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Tue, 22 Oct 2013 21:03:16 +0200 Subject: [PATCH 01/14] Rework ecp_mod_p192() On x86_64, this makes it 5x faster, and ecp_mul() 17% faster for this curve. The code is shorter too. --- library/ecp.c | 101 +++++++++++++------------------ tests/suites/test_suite_ecp.data | 14 ++++- 2 files changed, 56 insertions(+), 59 deletions(-) diff --git a/library/ecp.c b/library/ecp.c index d3880be55..d53d306a5 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -475,25 +475,36 @@ cleanup: } #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) -/* - * 192 bits in terms of t_uint - */ -#define P192_SIZE_INT ( 192 / CHAR_BIT / sizeof( t_uint ) ) -/* - * Table to get S1, S2, S3 of FIPS 186-3 D.2.1: - * -1 means let this chunk be 0 - * a positive value i means A_i. - */ -#define P192_CHUNKS 3 -#define P192_CHUNK_CHAR ( 64 / CHAR_BIT ) -#define P192_CHUNK_INT ( P192_CHUNK_CHAR / sizeof( t_uint ) ) +/* Add 64-bit chunks (dst += src) and update carry */ +static inline void add_64( t_uint *dst, t_uint *src, t_uint *carry ) +{ + unsigned char i; + t_uint c = 0; + for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ ) + { + *dst += c; c = ( *dst < c ); + *dst += *src; c += ( *dst < *src ); + } + *carry += c; +} -const signed char p192_tbl[][P192_CHUNKS] = { - { -1, 3, 3 }, /* S1 */ - { 4, 4, -1 }, /* S2 */ - { 5, 5, 5 }, /* S3 */ -}; +/* Add carry to a 64-bit chunk and update carry */ +static inline void carry64( t_uint *dst, t_uint *carry ) +{ + unsigned char i; + for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ ) + { + *dst += *carry; + *carry = ( *dst < *carry ); + } +} + +#define OFFSET ( 8 / sizeof( t_uint ) ) +#define A( i ) ( N->p + ( i ) * OFFSET ) +#define ADD( i ) add_64( p, A( i ), &c ) +#define NEXT p += OFFSET; carry64( p, &c ) +#define LAST p += OFFSET; *p = c; while( ++p < end ) *p = 0 /* * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1) @@ 
-501,53 +512,27 @@ const signed char p192_tbl[][P192_CHUNKS] = { static int ecp_mod_p192( mpi *N ) { int ret; - unsigned char i, j, offset; - signed char chunk; - mpi tmp, acc; - t_uint tmp_p[P192_SIZE_INT], acc_p[P192_SIZE_INT + 1]; + t_uint c = 0; + t_uint *p, *end; - tmp.s = 1; - tmp.n = sizeof( tmp_p ) / sizeof( tmp_p[0] ); - tmp.p = tmp_p; + /* Make sure we have the correct number of blocks */ + MPI_CHK( mpi_grow( N, 6 * OFFSET ) ); + p = N->p; + end = p + N->n; - acc.s = 1; - acc.n = sizeof( acc_p ) / sizeof( acc_p[0] ); - acc.p = acc_p; - - MPI_CHK( mpi_grow( N, P192_SIZE_INT * 2 ) ); - - /* - * acc = T - */ - memset( acc_p, 0, sizeof( acc_p ) ); - memcpy( acc_p, N->p, P192_CHUNK_CHAR * P192_CHUNKS ); - - for( i = 0; i < sizeof( p192_tbl ) / sizeof( p192_tbl[0] ); i++) - { - /* - * tmp = S_i - */ - memset( tmp_p, 0, sizeof( tmp_p ) ); - for( j = 0, offset = P192_CHUNKS - 1; j < P192_CHUNKS; j++, offset-- ) - { - chunk = p192_tbl[i][j]; - if( chunk >= 0 ) - memcpy( tmp_p + offset * P192_CHUNK_INT, - N->p + chunk * P192_CHUNK_INT, - P192_CHUNK_CHAR ); - } - - /* - * acc += tmp - */ - MPI_CHK( mpi_add_abs( &acc, &acc, &tmp ) ); - } - - MPI_CHK( mpi_copy( N, &acc ) ); + ADD( 3 ); ADD( 5 ); NEXT; // A0 += A3 + A5 + ADD( 3 ); ADD( 4 ); ADD( 5 ); NEXT; // A1 += A3 + A4 + A5 + ADD( 4 ); ADD( 5 ); LAST; // A2 += A4 + A5 cleanup: return( ret ); } + +#undef OFFSET +#undef A +#undef ADD +#undef NEXT +#undef LAST #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data index 9eb302b5b..4748ff98b 100644 --- a/tests/suites/test_suite_ecp.data +++ b/tests/suites/test_suite_ecp.data @@ -253,14 +253,26 @@ ECP gen keypair depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_gen_keypair:POLARSSL_ECP_DP_SECP192R1 +ECP mod p192 small (more than 192 bits, less limbs than 2 * 192 bits) +depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED 
+ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"0100000000000103010000000000010201000000000001010100000000000100" + ECP mod p192 readable depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED -ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"000000000000010500000000000001040000000000000103000000000000010200000000000001010000000000000100" +ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"010000000000010501000000000001040100000000000103010000000000010201000000000001010100000000000100" + +ECP mod p192 readable with carry +depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"FF00000000010500FF00000000010400FF00000000010300FF00000000010200FF00000000010100FF00000000010000" ECP mod p192 random depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"36CF96B45D706A0954D89E52CE5F38517A2270E0175849B6F3740151D238CCABEF921437E475881D83BB69E4AA258EBD" +ECP mod p192 (from a past failure case) +depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750" + ECP test vectors secp192r1 rfc 5114 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_test_vect:POLARSSL_ECP_DP_SECP192R1:"323FA3169D8E9C6593F59476BC142000AB5BE0E249C43426":"CD46489ECFD6C105E7B3D32566E2B122E249ABAADD870612":"68887B4877DF51DD4DC3D6FD11F0A26F8FD3844317916E9A":"631F95BB4A67632C9C476EEE9AB695AB240A0499307FCF62":"519A121680E0045466BA21DF2EEE47F5973B500577EF13D5":"FF613AB4D64CEE3A20875BDB10F953F6B30CA072C60AA57F":"AD420182633F8526BFE954ACDA376F05E5FF4F837F54FEBE":"4371545ED772A59741D0EDA32C671112B7FDDD51461FCF32" From c9e387ca9ebf8561b9a612ee2f75d02c3ab00276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Thu, 17 Oct 2013 17:15:35 +0200 Subject: [PATCH 02/14] Optimize ecp_modp() Makes it 22% faster, for a 5% gain on ecp_mul() --- library/ecp.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git 
a/library/ecp.c b/library/ecp.c index d53d306a5..78b05c426 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -451,6 +451,8 @@ int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt, /* * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi. * See the documentation of struct ecp_group. + * + * This function is in the critical loop for ecp_mul, so pay attention to perf. */ static int ecp_modp( mpi *N, const ecp_group *grp ) { @@ -459,16 +461,22 @@ static int ecp_modp( mpi *N, const ecp_group *grp ) if( grp->modp == NULL ) return( mpi_mod_mpi( N, N, &grp->P ) ); - if( mpi_cmp_int( N, 0 ) < 0 || mpi_msb( N ) > 2 * grp->pbits ) + /* N->s < 0 is a much faster test, which fails only if N is 0 */ + if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) || + mpi_msb( N ) > 2 * grp->pbits ) + { return( POLARSSL_ERR_ECP_BAD_INPUT_DATA ); + } MPI_CHK( grp->modp( N ) ); - while( mpi_cmp_int( N, 0 ) < 0 ) + /* N->s < 0 is a much faster test, which fails only if N is 0 */ + while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) MPI_CHK( mpi_add_mpi( N, N, &grp->P ) ); while( mpi_cmp_mpi( N, &grp->P ) >= 0 ) - MPI_CHK( mpi_sub_mpi( N, N, &grp->P ) ); + /* we know P, N and the result are positive */ + MPI_CHK( mpi_sub_abs( N, N, &grp->P ) ); cleanup: return( ret ); @@ -915,17 +923,20 @@ const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ) /* * Reduce a mpi mod p in-place, to use after mpi_sub_mpi + * N->s < 0 is a very fast test, which fails only if N is 0 */ #define MOD_SUB( N ) \ - while( mpi_cmp_int( &N, 0 ) < 0 ) \ + while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 ) \ MPI_CHK( mpi_add_mpi( &N, &N, &grp->P ) ) /* - * Reduce a mpi mod p in-place, to use after mpi_add_mpi and mpi_mul_int + * Reduce a mpi mod p in-place, to use after mpi_add_mpi and mpi_mul_int. + * We know P, N and the result are positive, so sub_abs is correct, and + * a bit faster. 
*/ #define MOD_ADD( N ) \ while( mpi_cmp_mpi( &N, &grp->P ) >= 0 ) \ - MPI_CHK( mpi_sub_mpi( &N, &N, &grp->P ) ) + MPI_CHK( mpi_sub_abs( &N, &N, &grp->P ) ) /* * Normalize jacobian coordinates so that Z == 0 || Z == 1 (GECC 3.2.1) From cc67aee9c8845896fcbe2a497b2b3360415773fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Fri, 18 Oct 2013 10:55:45 +0200 Subject: [PATCH 03/14] Make ecp_mod_p521 a bit faster --- library/ecp.c | 28 +++++++++++++++++++--------- tests/suites/test_suite_ecp.data | 16 ++++++++++++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/library/ecp.c b/library/ecp.c index 78b05c426..33081a03c 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -547,12 +547,12 @@ cleanup: /* * Size of p521 in terms of t_uint */ -#define P521_SIZE_INT ( 521 / CHAR_BIT / sizeof( t_uint ) + 1 ) +#define P521_SIZE_INT ( 521 / 8 / sizeof( t_uint ) + 1 ) /* * Bits to keep in the most significant t_uint */ -#if defined(POLARSS_HAVE_INT8) +#if defined(POLARSSL_HAVE_INT8) #define P521_MASK 0x01 #else #define P521_MASK 0x01FF @@ -560,26 +560,36 @@ cleanup: /* * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5) + * Write N as A1 + 2^521 A0, return A0 + A1 */ static int ecp_mod_p521( mpi *N ) { int ret; - t_uint Mp[P521_SIZE_INT]; + size_t i; mpi M; + t_uint Mp[P521_SIZE_INT+1]; + /* Worst case for the size of M is when sizeof( t_uint ) == 16: + * we need to hold bits 513 to 1056, which is 34 limbs, that is + * P521_SIZE_INT + 1. Otherwise P521_SIZE is enough. 
*/ if( N->n < P521_SIZE_INT ) return( 0 ); - memset( Mp, 0, P521_SIZE_INT * sizeof( t_uint ) ); - memcpy( Mp, N->p, P521_SIZE_INT * sizeof( t_uint ) ); - Mp[P521_SIZE_INT - 1] &= P521_MASK; - + /* M = A1 */ M.s = 1; - M.n = P521_SIZE_INT; + M.n = N->n - ( P521_SIZE_INT - 1 ); + if( M.n > P521_SIZE_INT + 1 ) + M.n = P521_SIZE_INT + 1; M.p = Mp; + memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) ); + MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) ); - MPI_CHK( mpi_shift_r( N, 521 ) ); + /* N = A0 */ + N->p[P521_SIZE_INT - 1] &= P521_MASK; + for( i = P521_SIZE_INT; i < N->n; i++ ) + N->p[i] = 0; + /* N = A0 + A1 */ MPI_CHK( mpi_add_abs( N, N, &M ) ); cleanup: diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data index 4748ff98b..c8ed20f6b 100644 --- a/tests/suites/test_suite_ecp.data +++ b/tests/suites/test_suite_ecp.data @@ -273,6 +273,22 @@ ECP mod p192 (from a past failure case) depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750" +ECP mod p521 very small +depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"01" + +ECP mod p521 small (522 bits) +depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"030000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000" + +ECP mod p521 readable +depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000" + +ECP mod p521 readable with carry +depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED 
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001" + ECP test vectors secp192r1 rfc 5114 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_test_vect:POLARSSL_ECP_DP_SECP192R1:"323FA3169D8E9C6593F59476BC142000AB5BE0E249C43426":"CD46489ECFD6C105E7B3D32566E2B122E249ABAADD870612":"68887B4877DF51DD4DC3D6FD11F0A26F8FD3844317916E9A":"631F95BB4A67632C9C476EEE9AB695AB240A0499307FCF62":"519A121680E0045466BA21DF2EEE47F5973B500577EF13D5":"FF613AB4D64CEE3A20875BDB10F953F6B30CA072C60AA57F":"AD420182633F8526BFE954ACDA376F05E5FF4F837F54FEBE":"4371545ED772A59741D0EDA32C671112B7FDDD51461FCF32" From e783f06f730f3f19851caaf78a8091780093b085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Mon, 21 Oct 2013 14:52:21 +0200 Subject: [PATCH 04/14] Start working on mod_p224 (Prototype, works only on 32-bit and little-endian 64-bit.) 
--- library/ecp.c | 88 ++++++++++++++++++++++++++++ tests/suites/test_suite_ecp.data | 16 +++++ tests/suites/test_suite_ecp.function | 3 +- 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/library/ecp.c b/library/ecp.c index 33081a03c..4eddcdcb8 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -543,6 +543,93 @@ cleanup: #undef LAST #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ +#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) + +/* For now, prototype version for 32-bit or little-endian 64 bits only */ + +static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *dst += src; + *carry += ( *dst < src ); +} + +static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *carry -= ( *dst < src ); + *dst -= src; +} + +#define A( i ) ( ((uint32_t *) N->p)[i] ) +#define ADD( i ) add32( p, A( i ), &c ); +#define SUB( i ) sub32( p, A( i ), &c ); + +#define NEXT \ + p++; \ + cc = c; \ + c = 0; \ + if( cc < 0 ) \ + sub32( p, -cc, &c ); \ + else \ + add32( p, cc, &c ); + +#define LAST \ + p++; \ + *p = c > 0 ? c : 0; /* see fix_negative */ \ + while( ++p < end ) \ + *p = 0; \ + if( c < 0 ) fix_negative( N, c, bits ); + +/* + * If the result is negative, we get it in the form c * 2^bits + N, + * with c negative and N positive (the c >= 0 case is handled by LAST). 
+ */ +static inline int fix_negative( mpi *N, signed char c, size_t bits ) +{ + int ret; + mpi C; + + mpi_init( &C ); + + MPI_CHK( mpi_lset( &C, c ) ); + MPI_CHK( mpi_shift_l( &C, bits ) ); + MPI_CHK( mpi_add_mpi( N, N, &C ) ); + +cleanup: + mpi_free( &C ); + + return( ret ); +} + +/* + * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2) + */ +static int ecp_mod_p224( mpi *N ) +{ + int ret; + signed char c, cc; + uint32_t *p, *end; + size_t bits = 224; + + /* Make sure we have the correct number of blocks */ + MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) ); + + /* Currently assuming 32-bit ints, or 64-bits little-endian */ + p = (uint32_t *) N->p; + end = (uint32_t *) (N->p + N->n); + + SUB( 7 ); SUB( 11 ); NEXT; // A0 += -A7 - A11 + SUB( 8 ); SUB( 12 ); NEXT; // A1 += -A8 - A12 + SUB( 9 ); SUB( 13 ); NEXT; // A2 += -A9 - A13 + SUB( 10 ); ADD( 7 ); ADD( 11 ); NEXT; // A3 += -A10 + A7 + A11 + SUB( 11 ); ADD( 8 ); ADD( 12 ); NEXT; // A4 += -A11 + A8 + A12 + SUB( 12 ); ADD( 9 ); ADD( 13 ); NEXT; // A5 += -A12 + A9 + A13 + SUB( 13 ); ADD( 10 ); LAST; // A6 += -A13 + A10 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */ + #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) /* * Size of p521 in terms of t_uint @@ -761,6 +848,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) case POLARSSL_ECP_DP_SECP224R1: + grp->modp = ecp_mod_p224; return( ecp_group_read_string( grp, 16, SECP224R1_P, SECP224R1_B, SECP224R1_GX, SECP224R1_GY, SECP224R1_N ) ); diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data index c8ed20f6b..2f5f4efc7 100644 --- a/tests/suites/test_suite_ecp.data +++ b/tests/suites/test_suite_ecp.data @@ -273,6 +273,22 @@ ECP mod p192 (from a past failure case) depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750" +ECP 
mod p224 readable without carry +depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000B0000000A0000000900000008000000070000FF060000FF050000FF040000FF03000FF0020000FF010000FF00" + +ECP mod p224 readable with negative carry +depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000B0000000A00000009000000080000000700000006000000050000000400000003000000020000000100000000" + +ECP mod p224 readable with positive carry +depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000BFFFFFF0AFFFFFF09FFFFFF08FFFFFF070000FF060000FF050000FF040000FF03000FF0020000FF010000FF00" + +ECP mod p224 readable with final negative carry +depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED +ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"FF00000D0000000C0000000B0000000A00000009000000080000000700000006000000050000000400000003000000020000000100000000" + ECP mod p521 very small depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"01" diff --git a/tests/suites/test_suite_ecp.function b/tests/suites/test_suite_ecp.function index 6981f47d3..4eb52596c 100644 --- a/tests/suites/test_suite_ecp.function +++ b/tests/suites/test_suite_ecp.function @@ -229,8 +229,9 @@ void ecp_fast_mod( int id, char *N_str ) mpi_init( &N ); mpi_init( &R ); ecp_group_init( &grp ); - TEST_ASSERT( ecp_use_known_dp( &grp, id ) == 0 ); TEST_ASSERT( mpi_read_string( &N, 16, N_str ) == 0 ); + TEST_ASSERT( ecp_use_known_dp( &grp, id ) == 0 ); + TEST_ASSERT( grp.modp != NULL ); /* * Store correct result before we touch N From a47e7058ea6086838fd265b40d64c8ceab24f224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Mon, 21 Oct 2013 17:51:45 +0200 Subject: [PATCH 05/14] mod_p224 now endian-neutral --- include/polarssl/bignum.h | 1 + library/ecp.c | 58 +++++++++++++++++++++++++-------------- 2 files changed, 38 insertions(+), 
21 deletions(-) diff --git a/include/polarssl/bignum.h b/include/polarssl/bignum.h index b1c43b75c..769e546d5 100644 --- a/include/polarssl/bignum.h +++ b/include/polarssl/bignum.h @@ -142,6 +142,7 @@ typedef uint32_t t_udbl; typedef unsigned int t_udbl __attribute__((mode(TI))); #define POLARSSL_HAVE_UDBL #else + #define POLARSSL_HAVE_INT32 typedef int32_t t_sint; typedef uint32_t t_uint; #if ( defined(_MSC_VER) && defined(_M_IX86) ) diff --git a/library/ecp.c b/library/ecp.c index 4eddcdcb8..b33a57fb0 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -545,8 +545,6 @@ cleanup: #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) -/* For now, prototype version for 32-bit or little-endian 64 bits only */ - static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) { *dst += src; @@ -559,24 +557,44 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) *dst -= src; } -#define A( i ) ( ((uint32_t *) N->p)[i] ) -#define ADD( i ) add32( p, A( i ), &c ); -#define SUB( i ) sub32( p, A( i ), &c ); +#if defined(POLARSSL_HAVE_INT16) || defined(POLARSSL_HAVE_INT8) +#error "Currently not supported, WIP" +#elif defined(POLARSSL_HAVE_INT32) +#define A( j ) N->p[j] +#define STORE32 N->p[i] = cur; +#else /* 64-bit */ +#define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] ) +#define STORE32 \ + if( i % 2 ) { \ + N->p[i/2] &= 0x00000000FFFFFFFF; \ + N->p[i/2] |= ((uint64_t) cur) << 32; \ + } else { \ + N->p[i/2] &= 0xFFFFFFFF00000000; \ + N->p[i/2] |= (uint64_t) cur; \ + } +#endif + +#define ADD( j ) add32( &cur, A( j ), &c ); +#define SUB( j ) sub32( &cur, A( j ), &c ); + +#define LOAD32 cur = A( i ); + +#define FIRST c = 0; i = 0; LOAD32; #define NEXT \ - p++; \ - cc = c; \ - c = 0; \ + STORE32; i++; LOAD32; \ + cc = c; c = 0; \ if( cc < 0 ) \ - sub32( p, -cc, &c ); \ + sub32( &cur, -cc, &c ); \ else \ - add32( p, cc, &c ); + add32( &cur, cc, &c ); -#define LAST \ - p++; \ - *p = c > 0 ? 
c : 0; /* see fix_negative */ \ - while( ++p < end ) \ - *p = 0; \ +#define LAST \ + STORE32; i++; \ + cur = c > 0 ? c : 0; STORE32; /* see fix_negative */ \ + cur = 0; \ + while( ++i < N->n * sizeof( t_uint ) / sizeof( uint32_t ) ) \ + STORE32; \ if( c < 0 ) fix_negative( N, c, bits ); /* @@ -607,16 +625,14 @@ static int ecp_mod_p224( mpi *N ) { int ret; signed char c, cc; - uint32_t *p, *end; + uint32_t cur; + size_t i; size_t bits = 224; - /* Make sure we have the correct number of blocks */ + /* Make sure we have enough blocks */ MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) ); - /* Currently assuming 32-bit ints, or 64-bits little-endian */ - p = (uint32_t *) N->p; - end = (uint32_t *) (N->p + N->n); - + FIRST; SUB( 7 ); SUB( 11 ); NEXT; // A0 += -A7 - A11 SUB( 8 ); SUB( 12 ); NEXT; // A1 += -A8 - A12 SUB( 9 ); SUB( 13 ); NEXT; // A2 += -A9 - A13 From 2a08c0debc268785a983d615282d9fcd629a2067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Tue, 22 Oct 2013 21:07:14 +0200 Subject: [PATCH 06/14] mod_p224 now working with 8-bit and 16-bit ints --- library/ecp.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/library/ecp.c b/library/ecp.c index b33a57fb0..21a231505 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -543,7 +543,7 @@ cleanup: #undef LAST #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ -#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) { @@ -557,12 +557,34 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) *dst -= src; } -#if defined(POLARSSL_HAVE_INT16) || defined(POLARSSL_HAVE_INT8) -#error "Currently not supported, WIP" +#if defined(POLARSSL_HAVE_INT8) + +#define MAX32 N->n / 4 +#define A( j ) (uint32_t)( N->p[4*j+0] ) | \ + ( N->p[4*j+1] << 8 ) | \ + ( N->p[4*j+2] << 16 ) | \ + ( N->p[4*j+3] << 
24 ) +#define STORE32 N->p[4*i+0] = (uint8_t)( cur ); \ + N->p[4*i+1] = (uint8_t)( cur >> 8 ); \ + N->p[4*i+2] = (uint8_t)( cur >> 16 ); \ + N->p[4*i+3] = (uint8_t)( cur >> 24 ); + +#elif defined(POLARSSL_HAVE_INT16) + +#define MAX32 N->n / 2 +#define A( j ) (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 ) +#define STORE32 N->p[2*i+0] = (uint16_t)( cur ); \ + N->p[2*i+1] = (uint16_t)( cur >> 16 ); + #elif defined(POLARSSL_HAVE_INT32) + +#define MAX32 N->n #define A( j ) N->p[j] #define STORE32 N->p[i] = cur; + #else /* 64-bit */ + +#define MAX32 N->n * 2 #define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] ) #define STORE32 \ if( i % 2 ) { \ @@ -572,6 +594,7 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) N->p[i/2] &= 0xFFFFFFFF00000000; \ N->p[i/2] |= (uint64_t) cur; \ } + #endif #define ADD( j ) add32( &cur, A( j ), &c ); @@ -587,14 +610,12 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) if( cc < 0 ) \ sub32( &cur, -cc, &c ); \ else \ - add32( &cur, cc, &c ); + add32( &cur, cc, &c ); \ -#define LAST \ - STORE32; i++; \ - cur = c > 0 ? c : 0; STORE32; /* see fix_negative */ \ - cur = 0; \ - while( ++i < N->n * sizeof( t_uint ) / sizeof( uint32_t ) ) \ - STORE32; \ +#define LAST \ + STORE32; i++; \ + cur = c > 0 ? 
c : 0; STORE32; \ + cur = 0; while( ++i < MAX32 ) { STORE32; } \ if( c < 0 ) fix_negative( N, c, bits ); /* From 210b458ddce00e43230e12342e306a09a1b51dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 14:03:00 +0200 Subject: [PATCH 07/14] Document and slightly reorganize mod_pXXX --- library/ecp.c | 164 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 111 insertions(+), 53 deletions(-) diff --git a/library/ecp.c b/library/ecp.c index 21a231505..0f21e2e73 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -483,9 +483,20 @@ cleanup: } #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) +/* + * Compared to the way things are presented in FIPS 186-3 D.2, + * we proceed in columns, from right (least significant chunk) to left, + * adding chunks to N in place, and keeping a carry for the next chunk. + * This avoids moving things around in memory, and uselessly adding zeros, + * compared to the more straightforward, line-oriented approach. + * + * For this prime we need to handle data in chunks of 64 bits. + * Since this is always a multiple of our basic t_uint, we can + * use a t_uint * to designate such a chunk, and small loops to handle it. 
+ */ /* Add 64-bit chunks (dst += src) and update carry */ -static inline void add_64( t_uint *dst, t_uint *src, t_uint *carry ) +static inline void add64( t_uint *dst, t_uint *src, t_uint *carry ) { unsigned char i; t_uint c = 0; @@ -508,11 +519,11 @@ static inline void carry64( t_uint *dst, t_uint *carry ) } } -#define OFFSET ( 8 / sizeof( t_uint ) ) -#define A( i ) ( N->p + ( i ) * OFFSET ) -#define ADD( i ) add_64( p, A( i ), &c ) -#define NEXT p += OFFSET; carry64( p, &c ) -#define LAST p += OFFSET; *p = c; while( ++p < end ) *p = 0 +#define WIDTH 8 / sizeof( t_uint ) +#define A( i ) N->p + i * WIDTH +#define ADD( i ) add64( p, A( i ), &c ) +#define NEXT p += WIDTH; carry64( p, &c ) +#define LAST p += WIDTH; *p = c; while( ++p < end ) *p = 0 /* * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1) @@ -523,8 +534,9 @@ static int ecp_mod_p192( mpi *N ) t_uint c = 0; t_uint *p, *end; - /* Make sure we have the correct number of blocks */ - MPI_CHK( mpi_grow( N, 6 * OFFSET ) ); + /* Make sure we have enough blocks so that A(5) is legal */ + MPI_CHK( mpi_grow( N, 6 * WIDTH ) ); + p = N->p; end = p + N->n; @@ -536,28 +548,35 @@ cleanup: return( ret ); } -#undef OFFSET +#undef WIDTH #undef A #undef ADD #undef NEXT #undef LAST #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ -#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) +/* + * The reader is advised to first understand ecp_mod_p192() since the same + * general structure is used here, but with additional complications: + * (1) chunks of 32 bits, and (2) subtractions. + */ -static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) -{ - *dst += src; - *carry += ( *dst < src ); -} +/* + * For these primes, we need to handle data in chunks of 32 bits. 
+ * This makes it more complicated if we use 64 bits limbs in MPI, + * which prevents us from using a uniform access method as for p192. + * + * So, we define a mini abstraction layer to access 32 bit chunks, + * load them in 'cur' for work, and store them back from 'cur' when done. + * + * While at it, also define the size of N in terms of 32-bit chunks. + */ +#define LOAD32 cur = A( i ); -static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) -{ - *carry -= ( *dst < src ); - *dst -= src; -} - -#if defined(POLARSSL_HAVE_INT8) +#if defined(POLARSSL_HAVE_INT8) /* 8 bit */ #define MAX32 N->n / 4 #define A( j ) (uint32_t)( N->p[4*j+0] ) | \ @@ -569,20 +588,20 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) N->p[4*i+2] = (uint8_t)( cur >> 16 ); \ N->p[4*i+3] = (uint8_t)( cur >> 24 ); -#elif defined(POLARSSL_HAVE_INT16) +#elif defined(POLARSSL_HAVE_INT16) /* 16 bit */ #define MAX32 N->n / 2 #define A( j ) (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 ) #define STORE32 N->p[2*i+0] = (uint16_t)( cur ); \ N->p[2*i+1] = (uint16_t)( cur >> 16 ); -#elif defined(POLARSSL_HAVE_INT32) +#elif defined(POLARSSL_HAVE_INT32) /* 32 bit */ #define MAX32 N->n #define A( j ) N->p[j] #define STORE32 N->p[i] = cur; -#else /* 64-bit */ +#else /* 64-bit */ #define MAX32 N->n * 2 #define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] ) @@ -595,14 +614,37 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) N->p[i/2] |= (uint64_t) cur; \ } -#endif +#endif /* sizeof( t_uint ) */ + +/* + * Helpers for addition and subtraction of chunks, with signed carry. 
+ */ +static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *dst += src; + *carry += ( *dst < src ); +} + +static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *carry -= ( *dst < src ); + *dst -= src; +} #define ADD( j ) add32( &cur, A( j ), &c ); #define SUB( j ) sub32( &cur, A( j ), &c ); -#define LOAD32 cur = A( i ); - -#define FIRST c = 0; i = 0; LOAD32; +/* + * Helpers for the main 'loop' + */ +#define INIT( b ) \ + int ret; \ + signed char c = 0, cc; \ + uint32_t cur; \ + size_t i = 0, bits = b; \ + \ + MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \ + LOAD32; #define NEXT \ STORE32; i++; LOAD32; \ @@ -638,22 +680,18 @@ cleanup: return( ret ); } +#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED || + POLARSSL_ECP_DP_SECP256R1_ENABLED || + POLARSSL_ECP_DP_SECP384R1_ENABLED */ +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) /* * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2) */ static int ecp_mod_p224( mpi *N ) { - int ret; - signed char c, cc; - uint32_t cur; - size_t i; - size_t bits = 224; + INIT( 224 ); - /* Make sure we have enough blocks */ - MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) ); - - FIRST; SUB( 7 ); SUB( 11 ); NEXT; // A0 += -A7 - A11 SUB( 8 ); SUB( 12 ); NEXT; // A1 += -A8 - A12 SUB( 9 ); SUB( 13 ); NEXT; // A2 += -A9 - A13 @@ -667,15 +705,32 @@ cleanup: } #endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */ +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) + +#undef A +#undef LOAD32 +#undef STORE32 +#undef MAX32 +#undef INIT +#undef NEXT +#undef LAST + +#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED || + POLARSSL_ECP_DP_SECP256R1_ENABLED || + POLARSSL_ECP_DP_SECP384R1_ENABLED */ + #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) /* - * Size of p521 in terms of t_uint + * Here we have a real Mersenne prime, so things are more straightforward. 
+ * However, things are aligned on a 'weird' boundary (521 bits). */ -#define P521_SIZE_INT ( 521 / 8 / sizeof( t_uint ) + 1 ) -/* - * Bits to keep in the most significant t_uint - */ +/* Size of p521 in terms of t_uint */ +#define P521_WIDTH ( 521 / 8 / sizeof( t_uint ) + 1 ) + +/* Bits to keep in the most significant t_uint */ #if defined(POLARSSL_HAVE_INT8) #define P521_MASK 0x01 #else @@ -691,26 +746,26 @@ static int ecp_mod_p521( mpi *N ) int ret; size_t i; mpi M; - t_uint Mp[P521_SIZE_INT+1]; - /* Worst case for the size of M is when sizeof( t_uint ) == 16: + t_uint Mp[P521_WIDTH + 1]; + /* Worst case for the size of M is when t_uint is 16 bits: * we need to hold bits 513 to 1056, which is 34 limbs, that is - * P521_SIZE_INT + 1. Otherwise P521_SIZE is enough. */ + * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */ - if( N->n < P521_SIZE_INT ) + if( N->n < P521_WIDTH ) return( 0 ); /* M = A1 */ M.s = 1; - M.n = N->n - ( P521_SIZE_INT - 1 ); - if( M.n > P521_SIZE_INT + 1 ) - M.n = P521_SIZE_INT + 1; + M.n = N->n - ( P521_WIDTH - 1 ); + if( M.n > P521_WIDTH + 1 ) + M.n = P521_WIDTH + 1; M.p = Mp; - memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) ); + memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) ); MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) ); /* N = A0 */ - N->p[P521_SIZE_INT - 1] &= P521_MASK; - for( i = P521_SIZE_INT; i < N->n; i++ ) + N->p[P521_WIDTH - 1] &= P521_MASK; + for( i = P521_WIDTH; i < N->n; i++ ) N->p[i] = 0; /* N = A0 + A1 */ @@ -719,6 +774,9 @@ static int ecp_mod_p521( mpi *N ) cleanup: return( ret ); } + +#undef P521_WIDTH +#undef P521_MASK #endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */ /* From ec655c908cbe4c5cbcd8cf484601d6b22211efa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 14:50:39 +0200 Subject: [PATCH 08/14] Add mod_p256 --- library/ecp.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git 
a/library/ecp.c b/library/ecp.c index 0f21e2e73..3fdd34e60 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -705,6 +705,43 @@ cleanup: } #endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */ +#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) +/* + * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3) + */ +static int ecp_mod_p256( mpi *N ) +{ + INIT( 256 ); + + ADD( 8 ); ADD( 9 ); + SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 ); NEXT; // A0 + + ADD( 9 ); ADD( 10 ); + SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A1 + + ADD( 10 ); ADD( 11 ); + SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A2 + + ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 ); + SUB( 15 ); SUB( 8 ); SUB( 9 ); NEXT; // A3 + + ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 ); + SUB( 9 ); SUB( 10 ); NEXT; // A4 + + ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 ); + SUB( 10 ); SUB( 11 ); NEXT; // A5 + + ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 ); + SUB( 8 ); SUB( 9 ); NEXT; // A6 + + ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 ); + SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 ); LAST; // A7 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */ + #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) @@ -951,6 +988,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) case POLARSSL_ECP_DP_SECP256R1: + grp->modp = ecp_mod_p256; return( ecp_group_read_string( grp, 16, SECP256R1_P, SECP256R1_B, SECP256R1_GX, SECP256R1_GY, SECP256R1_N ) ); From 0f9149cb0a579d8ee009e4fc23c81302a0347d24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 15:06:37 +0200 Subject: [PATCH 09/14] Add mod_p384 --- library/ecp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/library/ecp.c b/library/ecp.c index 3fdd34e60..f359f0bae 100644 --- 
a/library/ecp.c +++ b/library/ecp.c @@ -742,6 +742,55 @@ cleanup: } #endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */ +#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) +/* + * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4) + */ +static int ecp_mod_p384( mpi *N ) +{ + INIT( 384 ); + + ADD( 12 ); ADD( 21 ); ADD( 20 ); + SUB( 23 ); NEXT; // A0 + + ADD( 13 ); ADD( 22 ); ADD( 23 ); + SUB( 12 ); SUB( 20 ); NEXT; // A2 + + ADD( 14 ); ADD( 23 ); + SUB( 13 ); SUB( 21 ); NEXT; // A2 + + ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 ); + SUB( 14 ); SUB( 22 ); SUB( 23 ); NEXT; // A3 + + ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 ); + SUB( 15 ); SUB( 23 ); SUB( 23 ); NEXT; // A4 + + ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 ); + SUB( 16 ); NEXT; // A5 + + ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 ); + SUB( 17 ); NEXT; // A6 + + ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 ); + SUB( 18 ); NEXT; // A7 + + ADD( 20 ); ADD( 17 ); ADD( 16 ); + SUB( 19 ); NEXT; // A8 + + ADD( 21 ); ADD( 18 ); ADD( 17 ); + SUB( 20 ); NEXT; // A9 + + ADD( 22 ); ADD( 19 ); ADD( 18 ); + SUB( 21 ); NEXT; // A10 + + ADD( 23 ); ADD( 20 ); ADD( 19 ); + SUB( 22 ); LAST; // A11 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */ + #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) @@ -996,6 +1045,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) case POLARSSL_ECP_DP_SECP384R1: + grp->modp = ecp_mod_p384; return( ecp_group_read_string( grp, 16, SECP384R1_P, SECP384R1_B, SECP384R1_GX, SECP384R1_GY, SECP384R1_N ) ); From c04c530a98bb99b9030650a27fd78ee2ce3dc00c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 16:11:52 +0200 Subject: [PATCH 10/14] Make NIST curves optimisation an option --- 
include/polarssl/config.h | 11 +++++++++++ library/ecp.c | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/polarssl/config.h b/include/polarssl/config.h index 4973ea454..d231b42c8 100644 --- a/include/polarssl/config.h +++ b/include/polarssl/config.h @@ -259,6 +259,17 @@ #define POLARSSL_ECP_DP_BP384R1_ENABLED #define POLARSSL_ECP_DP_BP512R1_ENABLED +/** + * \def POLARSSL_ECP_NIST_OPTIM + * + * Enable specific 'modulo p' routines for each NIST prime. + * Depending on the prime and architecture, makes operations 4 to 8 times + * faster on the corresponding curve. + * + * Comment this macro to disable NIST curves optimisation. + */ +#define POLARSSL_ECP_NIST_OPTIM + /** * \def POLARSSL_KEY_EXCHANGE_PSK_ENABLED * diff --git a/library/ecp.c b/library/ecp.c index f359f0bae..b144d16ad 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -482,6 +482,8 @@ cleanup: return( ret ); } +#if defined(POLARSSL_ECP_NIST_OPTIM) + #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) /* * Compared to the way things are presented in FIPS 186-3 D.2, @@ -865,6 +867,8 @@ cleanup: #undef P521_MASK #endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */ +#endif /* POLARSSL_ECP_NIST_OPTIM */ + /* * Domain parameters for secp192r1 */ @@ -1021,7 +1025,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) { #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) case POLARSSL_ECP_DP_SECP192R1: +#if defined(POLARSSL_ECP_NIST_OPTIM) grp->modp = ecp_mod_p192; +#endif return( ecp_group_read_string( grp, 16, SECP192R1_P, SECP192R1_B, SECP192R1_GX, SECP192R1_GY, SECP192R1_N ) ); @@ -1029,7 +1035,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) case POLARSSL_ECP_DP_SECP224R1: +#if defined(POLARSSL_ECP_NIST_OPTIM) grp->modp = ecp_mod_p224; +#endif return( ecp_group_read_string( grp, 16, SECP224R1_P, SECP224R1_B, SECP224R1_GX, SECP224R1_GY, SECP224R1_N ) ); @@ -1037,7 +1045,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if 
defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) case POLARSSL_ECP_DP_SECP256R1: +#if defined(POLARSSL_ECP_NIST_OPTIM) grp->modp = ecp_mod_p256; +#endif return( ecp_group_read_string( grp, 16, SECP256R1_P, SECP256R1_B, SECP256R1_GX, SECP256R1_GY, SECP256R1_N ) ); @@ -1045,7 +1055,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) case POLARSSL_ECP_DP_SECP384R1: +#if defined(POLARSSL_ECP_NIST_OPTIM) grp->modp = ecp_mod_p384; +#endif return( ecp_group_read_string( grp, 16, SECP384R1_P, SECP384R1_B, SECP384R1_GX, SECP384R1_GY, SECP384R1_N ) ); @@ -1053,7 +1065,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id ) #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) case POLARSSL_ECP_DP_SECP521R1: +#if defined(POLARSSL_ECP_NIST_OPTIM) grp->modp = ecp_mod_p521; +#endif return( ecp_group_read_string( grp, 16, SECP521R1_P, SECP521R1_B, SECP521R1_GX, SECP521R1_GY, SECP521R1_N ) ); From 5779cbe5821f028f2cddad54c49a9aa564d2d300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 20:17:00 +0200 Subject: [PATCH 11/14] Make mod_p{224,256,384} a bit faster Speedup is roughly 25%, giving a 6% speedup on ecp_mul() for these curves.
--- include/polarssl/bignum.h | 2 ++ library/ecp.c | 24 +++++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/polarssl/bignum.h b/include/polarssl/bignum.h index 769e546d5..eae15e04d 100644 --- a/include/polarssl/bignum.h +++ b/include/polarssl/bignum.h @@ -128,6 +128,7 @@ typedef uint32_t t_udbl; #define POLARSSL_HAVE_UDBL #else #if ( defined(_MSC_VER) && defined(_M_AMD64) ) + #define POLARSSL_HAVE_INT64 typedef int64_t t_sint; typedef uint64_t t_uint; #else @@ -137,6 +138,7 @@ typedef uint32_t t_udbl; defined(__ia64__) || defined(__alpha__) || \ (defined(__sparc__) && defined(__arch64__)) || \ defined(__s390x__) ) ) + #define POLARSSL_HAVE_INT64 typedef int64_t t_sint; typedef uint64_t t_uint; typedef unsigned int t_udbl __attribute__((mode(TI))); diff --git a/library/ecp.c b/library/ecp.c index b144d16ad..a408f2ba2 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -663,22 +663,32 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) if( c < 0 ) fix_negative( N, c, bits ); /* - * If the result is negative, we get it in the form c * 2^192 + N, - * with c negative and N positive (the c >= 0 case is handled by LAST). 
+ * If the result is negative, we get it in the form + * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits' */ static inline int fix_negative( mpi *N, signed char c, size_t bits ) { int ret; mpi C; + t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ]; - mpi_init( &C ); + /* C = - c * 2^(bits + 32) */ + C.s = 1; + C.n = bits / 8 / sizeof( t_uint ) + 1; + C.p = Cp; + memset( Cp, 0, C.n * sizeof( t_uint ) ); +#if defined(POLARSSL_HAVE_INT64) + if( bits == 224 ) + Cp[ C.n - 1 ] = ((t_uint) -c) << 32; + else +#endif + Cp[ C.n - 1 ] = (t_uint) -c; - MPI_CHK( mpi_lset( &C, c ) ); - MPI_CHK( mpi_shift_l( &C, bits ) ); - MPI_CHK( mpi_add_mpi( N, N, &C ) ); + /* N = - ( C - N ) */ + MPI_CHK( mpi_sub_abs( N, &C, N ) ); + N->s = -1; cleanup: - mpi_free( &C ); return( ret ); } From cae6f3ed45099ca5084d91836a4eb5d90c0e425c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 20:19:57 +0200 Subject: [PATCH 12/14] Reorganize code in ecp.c --- include/polarssl/ecp.h | 178 +++---- library/ecp.c | 1006 ++++++++++++++++++++-------------------- 2 files changed, 593 insertions(+), 591 deletions(-) diff --git a/include/polarssl/ecp.h b/include/polarssl/ecp.h index 7940b3219..02f6f9349 100644 --- a/include/polarssl/ecp.h +++ b/include/polarssl/ecp.h @@ -186,6 +186,24 @@ ecp_keypair; */ const ecp_curve_info *ecp_curve_list( void ); +/** + * \brief Get curve information from an internal group identifier + * + * \param grp_id A POLARSSL_ECP_DP_XXX value + * + * \return The associated curve information or NULL + */ +const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ); + +/** + * \brief Get curve information from a TLS NamedCurve value + * + * \param tls_id A TLS NamedCurve value + * + * \return The associated curve information or NULL + */ +const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id ); + /** * \brief Initialize a point (as zero) */ @@ -216,25 +234,6 @@ void ecp_group_free( ecp_group
*grp ); */ void ecp_keypair_free( ecp_keypair *key ); -/** - * \brief Set a point to zero - * - * \param pt Destination point - * - * \return 0 if successful, - * POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed - */ -int ecp_set_zero( ecp_point *pt ); - -/** - * \brief Tell if a point is zero - * - * \param pt Point to test - * - * \return 1 if point is zero, 0 otherwise - */ -int ecp_is_zero( ecp_point *pt ); - /** * \brief Copy the contents of point Q into P * @@ -257,6 +256,25 @@ int ecp_copy( ecp_point *P, const ecp_point *Q ); */ int ecp_group_copy( ecp_group *dst, const ecp_group *src ); +/** + * \brief Set a point to zero + * + * \param pt Destination point + * + * \return 0 if successful, + * POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed + */ +int ecp_set_zero( ecp_point *pt ); + +/** + * \brief Tell if a point is zero + * + * \param pt Point to test + * + * \return 1 if point is zero, 0 otherwise + */ +int ecp_is_zero( ecp_point *pt ); + /** * \brief Import a non-zero point from two ASCII strings * @@ -270,25 +288,6 @@ int ecp_group_copy( ecp_group *dst, const ecp_group *src ); int ecp_point_read_string( ecp_point *P, int radix, const char *x, const char *y ); -/** - * \brief Import an ECP group from null-terminated ASCII strings - * - * \param grp Destination group - * \param radix Input numeric base - * \param p Prime modulus of the base field - * \param b Constant term in the equation - * \param gx The generator's X coordinate - * \param gy The generator's Y coordinate - * \param n The generator's order - * - * \return 0 if successful, or a POLARSSL_ERR_MPI_XXX error code - * - * \note Sets all fields except modp. 
- */ -int ecp_group_read_string( ecp_group *grp, int radix, - const char *p, const char *b, - const char *gx, const char *gy, const char *n); - /** * \brief Export a point into unsigned binary data * @@ -326,6 +325,58 @@ int ecp_point_write_binary( const ecp_group *grp, const ecp_point *P, int ecp_point_read_binary( const ecp_group *grp, ecp_point *P, const unsigned char *buf, size_t ilen ); +/** + * \brief Import a point from a TLS ECPoint record + * + * \param grp ECP group used + * \param pt Destination point + * \param buf Address of the pointer to the start of the input buffer + * \param len Buffer length + * + * \return 0 if successful, + * POLARSSL_ERR_MPI_XXX if initialization failed + * POLARSSL_ERR_ECP_BAD_INPUT_DATA if input is invalid + */ +int ecp_tls_read_point( const ecp_group *grp, ecp_point *pt, + const unsigned char **buf, size_t len ); + +/** + * \brief Export a point as a TLS ECPoint record + * + * \param grp ECP group used + * \param pt Point to export + * \param format Export format + * \param olen length of data written + * \param buf Buffer to write to + * \param blen Buffer length + * + * \return 0 if successful, + * or POLARSSL_ERR_ECP_BAD_INPUT_DATA + * or POLARSSL_ERR_ECP_BUFFER_TOO_SMALL + */ +int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt, + int format, size_t *olen, + unsigned char *buf, size_t blen ); + +/** + * \brief Import an ECP group from null-terminated ASCII strings + * + * \param grp Destination group + * \param radix Input numeric base + * \param p Prime modulus of the base field + * \param b Constant term in the equation + * \param gx The generator's X coordinate + * \param gy The generator's Y coordinate + * \param n The generator's order + * + * \return 0 if successful, or a POLARSSL_ERR_MPI_XXX error code + * + * \note Sets all fields except modp.
+ */ +int ecp_group_read_string( ecp_group *grp, int radix, + const char *p, const char *b, + const char *gx, const char *gy, const char *n); + /** * \brief Set a group using well-known domain parameters * @@ -368,57 +419,6 @@ int ecp_tls_read_group( ecp_group *grp, const unsigned char **buf, size_t len ); int ecp_tls_write_group( const ecp_group *grp, size_t *olen, unsigned char *buf, size_t blen ); -/** - * \brief Get curve information from an internal group identifier - * - * \param grp_id A POLARSSL_ECP_DP_XXX value - * - * \return The associated curve information or NULL - */ -const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ); - -/** - * \brief Get curve information from a TLS NamedCurve value - * - * \param grp_id A POLARSSL_ECP_DP_XXX value - * - * \return The associated curve information or NULL - */ -const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id ); - -/** - * \brief Import a point from a TLS ECPoint record - * - * \param grp ECP group used - * \param pt Destination point - * \param buf $(Start of input buffer) - * \param len Buffer length - * - * \return O if successful, - * POLARSSL_ERR_MPI_XXX if initialization failed - * POLARSSL_ERR_ECP_BAD_INPUT_DATA if input is invalid - */ -int ecp_tls_read_point( const ecp_group *grp, ecp_point *pt, - const unsigned char **buf, size_t len ); - -/** - * \brief Export a point as a TLS ECPoint record - * - * \param grp ECP group used - * \param pt Point to export - * \param format Export format - * \param olen length of data written - * \param buf Buffer to write to - * \param blen Buffer length - * - * \return 0 if successful, - * or POLARSSL_ERR_ECP_BAD_INPUT_DATA - * or POLARSSL_ERR_ECP_BUFFER_TOO_SMALL - */ -int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt, - int format, size_t *olen, - unsigned char *buf, size_t blen ); - /** * \brief Addition: R = P + Q * diff --git a/library/ecp.c b/library/ecp.c index a408f2ba2..995f956b3 100644 --- a/library/ecp.c 
+++ b/library/ecp.c @@ -111,6 +111,42 @@ const ecp_curve_info *ecp_curve_list( void ) return ecp_supported_curves; } +/* + * Get the curve info for the internal identifier + */ +const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ) +{ + const ecp_curve_info *curve_info; + + for( curve_info = ecp_curve_list(); + curve_info->grp_id != POLARSSL_ECP_DP_NONE; + curve_info++ ) + { + if( curve_info->grp_id == grp_id ) + return( curve_info ); + } + + return( NULL ); +} + +/* + * Get the curve info from the TLS identifier + */ +const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id ) +{ + const ecp_curve_info *curve_info; + + for( curve_info = ecp_curve_list(); + curve_info->grp_id != POLARSSL_ECP_DP_NONE; + curve_info++ ) + { + if( curve_info->tls_id == tls_id ) + return( curve_info ); + } + + return( NULL ); +} + /* * Initialize (the components of) a point */ @@ -200,6 +236,29 @@ void ecp_keypair_free( ecp_keypair *key ) ecp_point_free( &key->Q ); } +/* + * Copy the contents of a point + */ +int ecp_copy( ecp_point *P, const ecp_point *Q ) +{ + int ret; + + MPI_CHK( mpi_copy( &P->X, &Q->X ) ); + MPI_CHK( mpi_copy( &P->Y, &Q->Y ) ); + MPI_CHK( mpi_copy( &P->Z, &Q->Z ) ); + +cleanup: + return( ret ); +} + +/* + * Copy the contents of a group object + */ +int ecp_group_copy( ecp_group *dst, const ecp_group *src ) +{ + return ecp_use_known_dp( dst, src->id ); +} + /* * Set point to zero */ @@ -223,29 +282,6 @@ int ecp_is_zero( ecp_point *pt ) return( mpi_cmp_int( &pt->Z, 0 ) == 0 ); } -/* - * Copy the contents of Q into P - */ -int ecp_copy( ecp_point *P, const ecp_point *Q ) -{ - int ret; - - MPI_CHK( mpi_copy( &P->X, &Q->X ) ); - MPI_CHK( mpi_copy( &P->Y, &Q->Y ) ); - MPI_CHK( mpi_copy( &P->Z, &Q->Z ) ); - -cleanup: - return( ret ); -} - -/* - * Copy the contents of a group object - */ -int ecp_group_copy( ecp_group *dst, const ecp_group *src ) -{ - return ecp_use_known_dp( dst, src->id ); -} - /* * Import a non-zero point from ASCII strings */
@@ -262,50 +298,6 @@ cleanup: return( ret ); } -/* - * Import an ECP group from ASCII strings, general case (A used) - */ -static int ecp_group_read_string_gen( ecp_group *grp, int radix, - const char *p, const char *a, const char *b, - const char *gx, const char *gy, const char *n) -{ - int ret; - - MPI_CHK( mpi_read_string( &grp->P, radix, p ) ); - MPI_CHK( mpi_read_string( &grp->A, radix, a ) ); - MPI_CHK( mpi_read_string( &grp->B, radix, b ) ); - MPI_CHK( ecp_point_read_string( &grp->G, radix, gx, gy ) ); - MPI_CHK( mpi_read_string( &grp->N, radix, n ) ); - - grp->pbits = mpi_msb( &grp->P ); - grp->nbits = mpi_msb( &grp->N ); - -cleanup: - if( ret != 0 ) - ecp_group_free( grp ); - - return( ret ); -} - -/* - * Import an ECP group from ASCII strings, case A == -3 - */ -int ecp_group_read_string( ecp_group *grp, int radix, - const char *p, const char *b, - const char *gx, const char *gy, const char *n) -{ - int ret; - - MPI_CHK( ecp_group_read_string_gen( grp, radix, p, "00", b, gx, gy, n ) ); - MPI_CHK( mpi_add_int( &grp->A, &grp->P, -3 ) ); - -cleanup: - if( ret != 0 ) - ecp_group_free( grp ); - - return( ret ); -} - /* * Export a point into unsigned binary data (SEC1 2.3.3) */ @@ -449,435 +441,48 @@ int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt, } /* - * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi. - * See the documentation of struct ecp_group. - * - * This function is in the critial loop for ecp_mul, so pay attention to perf. 
+ * Import an ECP group from ASCII strings, general case (A used) */ -static int ecp_modp( mpi *N, const ecp_group *grp ) +static int ecp_group_read_string_gen( ecp_group *grp, int radix, + const char *p, const char *a, const char *b, + const char *gx, const char *gy, const char *n) { int ret; - if( grp->modp == NULL ) - return( mpi_mod_mpi( N, N, &grp->P ) ); + MPI_CHK( mpi_read_string( &grp->P, radix, p ) ); + MPI_CHK( mpi_read_string( &grp->A, radix, a ) ); + MPI_CHK( mpi_read_string( &grp->B, radix, b ) ); + MPI_CHK( ecp_point_read_string( &grp->G, radix, gx, gy ) ); + MPI_CHK( mpi_read_string( &grp->N, radix, n ) ); - /* N->s < 0 is a much faster test, which fails only if N is 0 */ - if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) || - mpi_msb( N ) > 2 * grp->pbits ) - { - return( POLARSSL_ERR_ECP_BAD_INPUT_DATA ); - } - - MPI_CHK( grp->modp( N ) ); - - /* N->s < 0 is a much faster test, which fails only if N is 0 */ - while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) - MPI_CHK( mpi_add_mpi( N, N, &grp->P ) ); - - while( mpi_cmp_mpi( N, &grp->P ) >= 0 ) - /* we known P, N and the result are positive */ - MPI_CHK( mpi_sub_abs( N, N, &grp->P ) ); + grp->pbits = mpi_msb( &grp->P ); + grp->nbits = mpi_msb( &grp->N ); cleanup: + if( ret != 0 ) + ecp_group_free( grp ); + return( ret ); } -#if defined(POLARSSL_ECP_NIST_OPTIM) - -#if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) /* - * Compared to the way things are presented in FIPS 186-3 D.2, - * we proceed in columns, from right (least significant chunk) to left, - * adding chunks to N in place, and keeping a carry for the next chunk. - * This avoids moving things around in memory, and uselessly adding zeros, - * compared to the more straightforward, line-oriented approach. - * - * For this prime we need to handle data in chunks of 64 bits. - * Since this is always a multiple of our basic t_uint, we can - * use a t_uint * to designate such a chunk, and small loops to handle it. 
+ * Import an ECP group from ASCII strings, case A == -3 */ - -/* Add 64-bit chunks (dst += src) and update carry */ -static inline void add64( t_uint *dst, t_uint *src, t_uint *carry ) -{ - unsigned char i; - t_uint c = 0; - for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ ) - { - *dst += c; c = ( *dst < c ); - *dst += *src; c += ( *dst < *src ); - } - *carry += c; -} - -/* Add carry to a 64-bit chunk and update carry */ -static inline void carry64( t_uint *dst, t_uint *carry ) -{ - unsigned char i; - for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ ) - { - *dst += *carry; - *carry = ( *dst < *carry ); - } -} - -#define WIDTH 8 / sizeof( t_uint ) -#define A( i ) N->p + i * WIDTH -#define ADD( i ) add64( p, A( i ), &c ) -#define NEXT p += WIDTH; carry64( p, &c ) -#define LAST p += WIDTH; *p = c; while( ++p < end ) *p = 0 - -/* - * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1) - */ -static int ecp_mod_p192( mpi *N ) +int ecp_group_read_string( ecp_group *grp, int radix, + const char *p, const char *b, + const char *gx, const char *gy, const char *n) { int ret; - t_uint c = 0; - t_uint *p, *end; - /* Make sure we have enough blocks so that A(5) is legal */ - MPI_CHK( mpi_grow( N, 6 * WIDTH ) ); - - p = N->p; - end = p + N->n; - - ADD( 3 ); ADD( 5 ); NEXT; // A0 += A3 + A5 - ADD( 3 ); ADD( 4 ); ADD( 5 ); NEXT; // A1 += A3 + A4 + A5 - ADD( 4 ); ADD( 5 ); LAST; // A2 += A4 + A5 - -cleanup: - return( ret ); -} - -#undef WIDTH -#undef A -#undef ADD -#undef NEXT -#undef LAST -#endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ - defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ - defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) -/* - * The reader is advised to first understand ecp_mod_p192() since the same - * general structure is used here, but with additional complications: - * (1) chunks of 32 bits, and (2) subtractions. - */ - -/* - * For these primes, we need to handle data in chunks of 32 bits. 
- * This makes it more complicated if we use 64 bits limbs in MPI, - * which prevents us from using a uniform access method as for p192. - * - * So, we define a mini abstraction layer to access 32 bit chunks, - * load them in 'cur' for work, and store them back from 'cur' when done. - * - * While at it, also define the size of N in terms of 32-bit chunks. - */ -#define LOAD32 cur = A( i ); - -#if defined(POLARSSL_HAVE_INT8) /* 8 bit */ - -#define MAX32 N->n / 4 -#define A( j ) (uint32_t)( N->p[4*j+0] ) | \ - ( N->p[4*j+1] << 8 ) | \ - ( N->p[4*j+2] << 16 ) | \ - ( N->p[4*j+3] << 24 ) -#define STORE32 N->p[4*i+0] = (uint8_t)( cur ); \ - N->p[4*i+1] = (uint8_t)( cur >> 8 ); \ - N->p[4*i+2] = (uint8_t)( cur >> 16 ); \ - N->p[4*i+3] = (uint8_t)( cur >> 24 ); - -#elif defined(POLARSSL_HAVE_INT16) /* 16 bit */ - -#define MAX32 N->n / 2 -#define A( j ) (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 ) -#define STORE32 N->p[2*i+0] = (uint16_t)( cur ); \ - N->p[2*i+1] = (uint16_t)( cur >> 16 ); - -#elif defined(POLARSSL_HAVE_INT32) /* 32 bit */ - -#define MAX32 N->n -#define A( j ) N->p[j] -#define STORE32 N->p[i] = cur; - -#else /* 64-bit */ - -#define MAX32 N->n * 2 -#define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] ) -#define STORE32 \ - if( i % 2 ) { \ - N->p[i/2] &= 0x00000000FFFFFFFF; \ - N->p[i/2] |= ((uint64_t) cur) << 32; \ - } else { \ - N->p[i/2] &= 0xFFFFFFFF00000000; \ - N->p[i/2] |= (uint64_t) cur; \ - } - -#endif /* sizeof( t_uint ) */ - -/* - * Helpers for addition and subtraction of chunks, with signed carry. 
- */ -static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) -{ - *dst += src; - *carry += ( *dst < src ); -} - -static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) -{ - *carry -= ( *dst < src ); - *dst -= src; -} - -#define ADD( j ) add32( &cur, A( j ), &c ); -#define SUB( j ) sub32( &cur, A( j ), &c ); - -/* - * Helpers for the main 'loop' - */ -#define INIT( b ) \ - int ret; \ - signed char c = 0, cc; \ - uint32_t cur; \ - size_t i = 0, bits = b; \ - \ - MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \ - LOAD32; - -#define NEXT \ - STORE32; i++; LOAD32; \ - cc = c; c = 0; \ - if( cc < 0 ) \ - sub32( &cur, -cc, &c ); \ - else \ - add32( &cur, cc, &c ); \ - -#define LAST \ - STORE32; i++; \ - cur = c > 0 ? c : 0; STORE32; \ - cur = 0; while( ++i < MAX32 ) { STORE32; } \ - if( c < 0 ) fix_negative( N, c, bits ); - -/* - * If the result is negative, we get it in the form - * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits' - */ -static inline int fix_negative( mpi *N, signed char c, size_t bits ) -{ - int ret; - mpi C; - t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ]; - - /* C = - c * 2^(bits + 32) */ - C.s = 1; - C.n = bits / 8 / sizeof( t_uint ) + 1; - C.p = Cp; - memset( Cp, 0, C.n * sizeof( t_uint ) ); -#if defined(POLARSSL_HAVE_INT64) - if( bits == 224 ) - Cp[ C.n - 1 ] = ((t_uint) -c) << 32; - else -#endif - Cp[ C.n - 1 ] = (t_uint) -c; - - /* N = - ( C - N ) */ - MPI_CHK( mpi_sub_abs( N, &C, N ) ); - N->s = -1; + MPI_CHK( ecp_group_read_string_gen( grp, radix, p, "00", b, gx, gy, n ) ); + MPI_CHK( mpi_add_int( &grp->A, &grp->P, -3 ) ); cleanup: + if( ret != 0 ) + ecp_group_free( grp ); return( ret ); } -#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED || - POLARSSL_ECP_DP_SECP256R1_ENABLED || - POLARSSL_ECP_DP_SECP384R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) -/* - * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2) - */ -static int ecp_mod_p224( mpi *N ) -{ - INIT( 
224 ); - - SUB( 7 ); SUB( 11 ); NEXT; // A0 += -A7 - A11 - SUB( 8 ); SUB( 12 ); NEXT; // A1 += -A8 - A12 - SUB( 9 ); SUB( 13 ); NEXT; // A2 += -A9 - A13 - SUB( 10 ); ADD( 7 ); ADD( 11 ); NEXT; // A3 += -A10 + A7 + A11 - SUB( 11 ); ADD( 8 ); ADD( 12 ); NEXT; // A4 += -A11 + A8 + A12 - SUB( 12 ); ADD( 9 ); ADD( 13 ); NEXT; // A5 += -A12 + A9 + A13 - SUB( 13 ); ADD( 10 ); LAST; // A6 += -A13 + A10 - -cleanup: - return( ret ); -} -#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) -/* - * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3) - */ -static int ecp_mod_p256( mpi *N ) -{ - INIT( 256 ); - - ADD( 8 ); ADD( 9 ); - SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 ); NEXT; // A0 - - ADD( 9 ); ADD( 10 ); - SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A1 - - ADD( 10 ); ADD( 11 ); - SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A2 - - ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 ); - SUB( 15 ); SUB( 8 ); SUB( 9 ); NEXT; // A3 - - ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 ); - SUB( 9 ); SUB( 10 ); NEXT; // A4 - - ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 ); - SUB( 10 ); SUB( 11 ); NEXT; // A5 - - ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 ); - SUB( 8 ); SUB( 9 ); NEXT; // A6 - - ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 ); - SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 ); LAST; // A7 - -cleanup: - return( ret ); -} -#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) -/* - * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4) - */ -static int ecp_mod_p384( mpi *N ) -{ - INIT( 384 ); - - ADD( 12 ); ADD( 21 ); ADD( 20 ); - SUB( 23 ); NEXT; // A0 - - ADD( 13 ); ADD( 22 ); ADD( 23 ); - SUB( 12 ); SUB( 20 ); NEXT; // A2 - - ADD( 14 ); ADD( 23 ); - SUB( 13 ); SUB( 21 ); NEXT; // A2 - - ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 ); - SUB( 14 ); SUB( 22 ); SUB( 23 ); NEXT; // A3 - - ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); 
ADD( 22 ); - SUB( 15 ); SUB( 23 ); SUB( 23 ); NEXT; // A4 - - ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 ); - SUB( 16 ); NEXT; // A5 - - ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 ); - SUB( 17 ); NEXT; // A6 - - ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 ); - SUB( 18 ); NEXT; // A7 - - ADD( 20 ); ADD( 17 ); ADD( 16 ); - SUB( 19 ); NEXT; // A8 - - ADD( 21 ); ADD( 18 ); ADD( 17 ); - SUB( 20 ); NEXT; // A9 - - ADD( 22 ); ADD( 19 ); ADD( 18 ); - SUB( 21 ); NEXT; // A10 - - ADD( 23 ); ADD( 20 ); ADD( 19 ); - SUB( 22 ); LAST; // A11 - -cleanup: - return( ret ); -} -#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ - defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ - defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) - -#undef A -#undef LOAD32 -#undef STORE32 -#undef MAX32 -#undef INIT -#undef NEXT -#undef LAST - -#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED || - POLARSSL_ECP_DP_SECP256R1_ENABLED || - POLARSSL_ECP_DP_SECP384R1_ENABLED */ - -#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) -/* - * Here we have a real Mersenne prime, so things are more straightforward. - * However, things are aligned on a 'weird' boundary (521 bits). - */ - -/* Size of p521 in terms of t_uint */ -#define P521_WIDTH ( 521 / 8 / sizeof( t_uint ) + 1 ) - -/* Bits to keep in the most significant t_uint */ -#if defined(POLARSSL_HAVE_INT8) -#define P521_MASK 0x01 -#else -#define P521_MASK 0x01FF -#endif - -/* - * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5) - * Write N as A1 + 2^521 A0, return A0 + A1 - */ -static int ecp_mod_p521( mpi *N ) -{ - int ret; - size_t i; - mpi M; - t_uint Mp[P521_WIDTH + 1]; - /* Worst case for the size of M is when t_uint is 16 bits: - * we need to hold bits 513 to 1056, which is 34 limbs, that is - * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. 
*/ - - if( N->n < P521_WIDTH ) - return( 0 ); - - /* M = A1 */ - M.s = 1; - M.n = N->n - ( P521_WIDTH - 1 ); - if( M.n > P521_WIDTH + 1 ) - M.n = P521_WIDTH + 1; - M.p = Mp; - memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) ); - MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) ); - - /* N = A0 */ - N->p[P521_WIDTH - 1] &= P521_MASK; - for( i = P521_WIDTH; i < N->n; i++ ) - N->p[i] = 0; - - /* N = A0 + A1 */ - MPI_CHK( mpi_add_abs( N, N, &M ) ); - -cleanup: - return( ret ); -} - -#undef P521_WIDTH -#undef P521_MASK -#endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */ - -#endif /* POLARSSL_ECP_NIST_OPTIM */ /* * Domain parameters for secp192r1 @@ -1024,6 +629,15 @@ cleanup: "AADD9DB8DBE9C48B3FD4E6AE33C9FC07CB308DB3B3C9D20ED6639CCA703308" \ "70553E5C414CA92619418661197FAC10471DB1D381085DDADDB58796829CA90069" +#if defined(POLARSSL_ECP_NIST_OPTIM) +/* Forward declarations */ +static int ecp_mod_p192( mpi * ); +static int ecp_mod_p224( mpi * ); +static int ecp_mod_p256( mpi * ); +static int ecp_mod_p384( mpi * ); +static int ecp_mod_p521( mpi * ); +#endif + /* * Set a group using well-known domain parameters */ @@ -1176,39 +790,37 @@ int ecp_tls_write_group( const ecp_group *grp, size_t *olen, } /* - * Get the curve info from the TLS identifier + * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi. + * See the documentation of struct ecp_group. + * + * This function is in the critical loop for ecp_mul, so pay attention to perf.
*/ -const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id ) +static int ecp_modp( mpi *N, const ecp_group *grp ) { - const ecp_curve_info *curve_info; + int ret; - for( curve_info = ecp_curve_list(); - curve_info->grp_id != POLARSSL_ECP_DP_NONE; - curve_info++ ) + if( grp->modp == NULL ) + return( mpi_mod_mpi( N, N, &grp->P ) ); + + /* N->s < 0 is a much faster test, which fails only if N is 0 */ + if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) || + mpi_msb( N ) > 2 * grp->pbits ) { - if( curve_info->tls_id == tls_id ) - return( curve_info ); + return( POLARSSL_ERR_ECP_BAD_INPUT_DATA ); } - return( NULL ); -} + MPI_CHK( grp->modp( N ) ); -/* - * Get the curve info for the internal identifer - */ -const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ) -{ - const ecp_curve_info *curve_info; + /* N->s < 0 is a much faster test, which fails only if N is 0 */ + while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) + MPI_CHK( mpi_add_mpi( N, N, &grp->P ) ); - for( curve_info = ecp_curve_list(); - curve_info->grp_id != POLARSSL_ECP_DP_NONE; - curve_info++ ) - { - if( curve_info->grp_id == grp_id ) - return( curve_info ); - } + while( mpi_cmp_mpi( N, &grp->P ) >= 0 ) + /* we know P, N and the result are positive */ + MPI_CHK( mpi_sub_abs( N, N, &grp->P ) ); - return( NULL ); +cleanup: + return( ret ); } /* @@ -1231,7 +843,7 @@ const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id ) * N->s < 0 is a very fast test, which fails only if N is 0 */ #define MOD_SUB( N ) \ - while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 ) \ + while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 ) \ MPI_CHK( mpi_add_mpi( &N, &N, &grp->P ) ) /* @@ -1418,7 +1030,7 @@ cleanup: } /* - * Addition or subtraction: R = P + Q or R = P + Q, + * Addition or subtraction: R = P + Q or R = P - Q, * mixed affine-Jacobian coordinates (GECC 3.22) * * The coordinates of Q must be normalized (= affine), @@ -1968,6 +1580,396 @@ int ecp_gen_keypair( ecp_group *grp, mpi *d, ecp_point *Q, return(
ecp_mul( grp, Q, d, &grp->G, f_rng, p_rng ) ); } +#if defined(POLARSSL_ECP_NIST_OPTIM) + +#if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) +/* + * Compared to the way things are presented in FIPS 186-3 D.2, + * we proceed in columns, from right (least significant chunk) to left, + * adding chunks to N in place, and keeping a carry for the next chunk. + * This avoids moving things around in memory, and uselessly adding zeros, + * compared to the more straightforward, line-oriented approach. + * + * For this prime we need to handle data in chunks of 64 bits. + * Since this is always a multiple of our basic t_uint, we can + * use a t_uint * to designate such a chunk, and small loops to handle it. + */ + +/* Add 64-bit chunks (dst += src) and update carry */ +static inline void add64( t_uint *dst, t_uint *src, t_uint *carry ) +{ + unsigned char i; + t_uint c = 0; + for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ ) + { + *dst += c; c = ( *dst < c ); + *dst += *src; c += ( *dst < *src ); + } + *carry += c; +} + +/* Add carry to a 64-bit chunk and update carry */ +static inline void carry64( t_uint *dst, t_uint *carry ) +{ + unsigned char i; + for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ ) + { + *dst += *carry; + *carry = ( *dst < *carry ); + } +} + +#define WIDTH ( 8 / sizeof( t_uint ) ) +#define A( i ) ( N->p + ( i ) * WIDTH ) +#define ADD( i ) add64( p, A( i ), &c ) +#define NEXT p += WIDTH; carry64( p, &c ) +#define LAST p += WIDTH; *p = c; while( ++p < end ) *p = 0 + +/* + * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1) + */ +static int ecp_mod_p192( mpi *N ) +{ + int ret; + t_uint c = 0; + t_uint *p, *end; + + /* Make sure we have enough blocks so that A(5) is legal */ + MPI_CHK( mpi_grow( N, 6 * WIDTH ) ); + + p = N->p; + end = p + N->n; + + ADD( 3 ); ADD( 5 ); NEXT; // A0 += A3 + A5 + ADD( 3 ); ADD( 4 ); ADD( 5 ); NEXT; // A1 += A3 + A4 + A5 + ADD( 4 ); ADD( 5 ); LAST; // A2 += A4 + A5 + +cleanup: + return( ret ); +} + +#undef WIDTH +#undef A +#undef ADD 
+#undef NEXT +#undef LAST +#endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */ + +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) || \ + defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) +/* + * The reader is advised to first understand ecp_mod_p192() since the same + * general structure is used here, but with additional complications: + * (1) chunks of 32 bits, and (2) subtractions. + */ + +/* + * For these primes, we need to handle data in chunks of 32 bits. + * This makes it more complicated if we use 64-bit limbs in MPI, + * which prevents us from using a uniform access method as for p192. + * + * So, we define a mini abstraction layer to access 32-bit chunks, + * load them in 'cur' for work, and store them back from 'cur' when done. + * + * While at it, also define the size of N in terms of 32-bit chunks. + */ +#define LOAD32 cur = A( i ); + +#if defined(POLARSSL_HAVE_INT8) /* 8 bit */ + +#define MAX32 N->n / 4 +#define A( j ) (uint32_t)( N->p[4*j+0] ) | \ + ( (uint32_t)( N->p[4*j+1] ) << 8 ) | \ + ( (uint32_t)( N->p[4*j+2] ) << 16 ) | \ + ( (uint32_t)( N->p[4*j+3] ) << 24 ) +#define STORE32 N->p[4*i+0] = (uint8_t)( cur ); \ + N->p[4*i+1] = (uint8_t)( cur >> 8 ); \ + N->p[4*i+2] = (uint8_t)( cur >> 16 ); \ + N->p[4*i+3] = (uint8_t)( cur >> 24 ); + +#elif defined(POLARSSL_HAVE_INT16) /* 16 bit */ + +#define MAX32 N->n / 2 +#define A( j ) (uint32_t)( N->p[2*j] ) | ( (uint32_t)( N->p[2*j+1] ) << 16 ) +#define STORE32 N->p[2*i+0] = (uint16_t)( cur ); \ + N->p[2*i+1] = (uint16_t)( cur >> 16 ); + +#elif defined(POLARSSL_HAVE_INT32) /* 32 bit */ + +#define MAX32 N->n +#define A( j ) N->p[j] +#define STORE32 N->p[i] = cur; + +#else /* 64-bit */ + +#define MAX32 N->n * 2 +#define A( j ) j % 2 ? 
(uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] ) +#define STORE32 \ + if( i % 2 ) { \ + N->p[i/2] &= 0x00000000FFFFFFFF; \ + N->p[i/2] |= ((uint64_t) cur) << 32; \ + } else { \ + N->p[i/2] &= 0xFFFFFFFF00000000; \ + N->p[i/2] |= (uint64_t) cur; \ + } + +#endif /* sizeof( t_uint ) */ + +/* + * Helpers for addition and subtraction of chunks, with signed carry. + */ +static inline void add32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *dst += src; + *carry += ( *dst < src ); +} + +static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) +{ + *carry -= ( *dst < src ); + *dst -= src; +} + +#define ADD( j ) add32( &cur, A( j ), &c ); +#define SUB( j ) sub32( &cur, A( j ), &c ); + +/* + * Helpers for the main 'loop' + */ +#define INIT( b ) \ + int ret; \ + signed char c = 0, cc; \ + uint32_t cur; \ + size_t i = 0, bits = b; \ + \ + MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \ + LOAD32; + +#define NEXT \ + STORE32; i++; LOAD32; \ + cc = c; c = 0; \ + if( cc < 0 ) \ + sub32( &cur, -cc, &c ); \ + else \ + add32( &cur, cc, &c ); \ + +#define LAST \ + STORE32; i++; \ + cur = c > 0 ? 
c : 0; STORE32; \ + cur = 0; while( ++i < MAX32 ) { STORE32; } \ + if( c < 0 ) fix_negative( N, c, bits ); + +/* + * If the result is negative, we get it in the form + * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits' + */ +static inline int fix_negative( mpi *N, signed char c, size_t bits ) +{ + int ret; + mpi C; + t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ]; + + /* C = - c * 2^(bits + 32) */ + C.s = 1; + C.n = bits / 8 / sizeof( t_uint ) + 1; + C.p = Cp; + memset( Cp, 0, C.n * sizeof( t_uint ) ); +#if defined(POLARSSL_HAVE_INT64) + if( bits == 224 ) + Cp[ C.n - 1 ] = ((t_uint) -c) << 32; + else +#endif + Cp[ C.n - 1 ] = (t_uint) -c; + + /* N = - ( C - N ) */ + MPI_CHK( mpi_sub_abs( N, &C, N ) ); + N->s = -1; + +cleanup: + + return( ret ); +} + +#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) +/* + * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2) + */ +static int ecp_mod_p224( mpi *N ) +{ + INIT( 224 ); + + SUB( 7 ); SUB( 11 ); NEXT; // A0 += -A7 - A11 + SUB( 8 ); SUB( 12 ); NEXT; // A1 += -A8 - A12 + SUB( 9 ); SUB( 13 ); NEXT; // A2 += -A9 - A13 + SUB( 10 ); ADD( 7 ); ADD( 11 ); NEXT; // A3 += -A10 + A7 + A11 + SUB( 11 ); ADD( 8 ); ADD( 12 ); NEXT; // A4 += -A11 + A8 + A12 + SUB( 12 ); ADD( 9 ); ADD( 13 ); NEXT; // A5 += -A12 + A9 + A13 + SUB( 13 ); ADD( 10 ); LAST; // A6 += -A13 + A10 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */ + +#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) +/* + * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3) + */ +static int ecp_mod_p256( mpi *N ) +{ + INIT( 256 ); + + ADD( 8 ); ADD( 9 ); + SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 ); NEXT; // A0 + + ADD( 9 ); ADD( 10 ); + SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A1 + + ADD( 10 ); ADD( 11 ); + SUB( 13 ); SUB( 14 ); SUB( 15 ); NEXT; // A2 + + ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 ); + SUB( 15 ); SUB( 8 ); SUB( 9 ); NEXT; // A3 + + ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 ); + SUB( 9 ); 
SUB( 10 ); NEXT; // A4 + + ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 ); + SUB( 10 ); SUB( 11 ); NEXT; // A5 + + ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 ); + SUB( 8 ); SUB( 9 ); NEXT; // A6 + + ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 ); + SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 ); LAST; // A7 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */ + +#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED) +/* + * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4) + */ +static int ecp_mod_p384( mpi *N ) +{ + INIT( 384 ); + + ADD( 12 ); ADD( 21 ); ADD( 20 ); + SUB( 23 ); NEXT; // A0 + + ADD( 13 ); ADD( 22 ); ADD( 23 ); + SUB( 12 ); SUB( 20 ); NEXT; // A1 + + ADD( 14 ); ADD( 23 ); + SUB( 13 ); SUB( 21 ); NEXT; // A2 + + ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 ); + SUB( 14 ); SUB( 22 ); SUB( 23 ); NEXT; // A3 + + ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 ); + SUB( 15 ); SUB( 23 ); SUB( 23 ); NEXT; // A4 + + ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 ); + SUB( 16 ); NEXT; // A5 + + ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 ); + SUB( 17 ); NEXT; // A6 + + ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 ); + SUB( 18 ); NEXT; // A7 + + ADD( 20 ); ADD( 17 ); ADD( 16 ); + SUB( 19 ); NEXT; // A8 + + ADD( 21 ); ADD( 18 ); ADD( 17 ); + SUB( 20 ); NEXT; // A9 + + ADD( 22 ); ADD( 19 ); ADD( 18 ); + SUB( 21 ); NEXT; // A10 + + ADD( 23 ); ADD( 20 ); ADD( 19 ); + SUB( 22 ); LAST; // A11 + +cleanup: + return( ret ); +} +#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */ + +#undef A +#undef LOAD32 +#undef STORE32 +#undef MAX32 +#undef INIT +#undef NEXT +#undef LAST + +#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED || + POLARSSL_ECP_DP_SECP256R1_ENABLED || + POLARSSL_ECP_DP_SECP384R1_ENABLED */ + +#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED) +/* + * Here we have an actual Mersenne prime, so things are more straightforward. 
+ * However, chunks are aligned on a 'weird' boundary (521 bits). + */ + +/* Size of p521 in terms of t_uint */ +#define P521_WIDTH ( 521 / 8 / sizeof( t_uint ) + 1 ) + +/* Bits to keep in the most significant t_uint */ +#if defined(POLARSSL_HAVE_INT8) +#define P521_MASK 0x01 +#else +#define P521_MASK 0x01FF +#endif + +/* + * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5) + * Write N as A1 + 2^521 A0, return A0 + A1 + */ +static int ecp_mod_p521( mpi *N ) +{ + int ret; + size_t i; + mpi M; + t_uint Mp[P521_WIDTH + 1]; + /* Worst case for the size of M is when t_uint is 16 bits: + * we need to hold bits 513 to 1056, which is 34 limbs, that is + * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */ + + if( N->n < P521_WIDTH ) + return( 0 ); + + /* M = A1 */ + M.s = 1; + M.n = N->n - ( P521_WIDTH - 1 ); + if( M.n > P521_WIDTH + 1 ) + M.n = P521_WIDTH + 1; + M.p = Mp; + memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) ); + MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) ); + + /* N = A0 */ + N->p[P521_WIDTH - 1] &= P521_MASK; + for( i = P521_WIDTH; i < N->n; i++ ) + N->p[i] = 0; + + /* N = A0 + A1 */ + MPI_CHK( mpi_add_abs( N, N, &M ) ); + +cleanup: + return( ret ); +} + +#undef P521_WIDTH +#undef P521_MASK +#endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */ + +#endif /* POLARSSL_ECP_NIST_OPTIM */ + #if defined(POLARSSL_SELF_TEST) /* From b21c81fb41caf007dd24b6fb0ad9f0a27b6dc56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 20:45:04 +0200 Subject: [PATCH 13/14] Use less memory in fix_negative() --- library/ecp.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/library/ecp.c b/library/ecp.c index 995f956b3..64d4e0339 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -1736,12 +1736,20 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) /* * Helpers for the main 'loop' + * (see fix_negative for the motivation of C) */ 
#define INIT( b ) \ int ret; \ signed char c = 0, cc; \ uint32_t cur; \ size_t i = 0, bits = b; \ + mpi C; \ + t_uint Cp[ b / 8 / sizeof( t_uint) + 1 ]; \ + \ + C.s = 1; \ + C.n = b / 8 / sizeof( t_uint) + 1; \ + C.p = Cp; \ + memset( Cp, 0, C.n * sizeof( t_uint ) ); \ \ MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \ LOAD32; @@ -1758,32 +1766,28 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry ) STORE32; i++; \ cur = c > 0 ? c : 0; STORE32; \ cur = 0; while( ++i < MAX32 ) { STORE32; } \ - if( c < 0 ) fix_negative( N, c, bits ); + if( c < 0 ) MPI_CHK( fix_negative( N, c, &C, bits ) ); /* * If the result is negative, we get it in the form * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits' */ -static inline int fix_negative( mpi *N, signed char c, size_t bits ) +static inline int fix_negative( mpi *N, signed char c, mpi *C, size_t bits ) { int ret; - mpi C; - t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ]; /* C = - c * 2^(bits + 32) */ - C.s = 1; - C.n = bits / 8 / sizeof( t_uint ) + 1; - C.p = Cp; - memset( Cp, 0, C.n * sizeof( t_uint ) ); -#if defined(POLARSSL_HAVE_INT64) +#if !defined(POLARSSL_HAVE_INT64) + ((void) bits); +#else if( bits == 224 ) - Cp[ C.n - 1 ] = ((t_uint) -c) << 32; + C->p[ C->n - 1 ] = ((t_uint) -c) << 32; else #endif - Cp[ C.n - 1 ] = (t_uint) -c; + C->p[ C->n - 1 ] = (t_uint) -c; /* N = - ( C - N ) */ - MPI_CHK( mpi_sub_abs( N, &C, N ) ); + MPI_CHK( mpi_sub_abs( N, C, N ) ); N->s = -1; cleanup: From 9fcceac943006b6e6b3a8a5b3ba9a9463d04090c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= Date: Wed, 23 Oct 2013 20:56:12 +0200 Subject: [PATCH 14/14] Add a comment about modules coupling --- library/ecp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/library/ecp.c b/library/ecp.c index 64d4e0339..bedb67506 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -1581,6 +1581,15 @@ int ecp_gen_keypair( ecp_group *grp, mpi *d, ecp_point *Q, } #if 
defined(POLARSSL_ECP_NIST_OPTIM) +/* + * Fast reduction modulo the primes used by the NIST curves. + * + * These functions are critical for speed, but not needed for correct + * operation. So, we make the choice to heavily rely on the internals of our + * bignum library, which creates a tight coupling between these functions and + * our MPI implementation. However, the coupling between the ECP module and + * MPI remains loose, since these functions can be deactivated at will. + */ #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED) /*