diff --git a/ChangeLog b/ChangeLog index 3078c58e2..030d720ad 100644 --- a/ChangeLog +++ b/ChangeLog @@ -19,6 +19,7 @@ Changes * More constant-time checks in the RSA module * Split off curves from ecp.c into ecp_curves.c * Curves are now stored fully in ROM + * Memory usage optimizations in ECP module Bugfix * Fixed bug in mpi_set_bit() on platforms where t_uint is wider than int diff --git a/include/polarssl/config.h b/include/polarssl/config.h index 283e294a7..dfd107ab6 100644 --- a/include/polarssl/config.h +++ b/include/polarssl/config.h @@ -1890,6 +1890,7 @@ // #define POLARSSL_ECP_MAX_BITS 521 /**< Maximum bit size of groups */ #define POLARSSL_ECP_WINDOW_SIZE 6 /**< Maximum window size used */ +#define POLARSSL_ECP_FIXED_POINT_OPTIM 1 /**< Enable fixed-point speed-up */ // Entropy options // diff --git a/include/polarssl/ecp.h b/include/polarssl/ecp.h index eea4c6dc3..1e9e73786 100644 --- a/include/polarssl/ecp.h +++ b/include/polarssl/ecp.h @@ -178,11 +178,33 @@ ecp_keypair; * Minimum value: 2. Maximum value: 7. * * Result is an array of at most ( 1 << ( POLARSSL_ECP_WINDOW_SIZE - 1 ) ) - * points used for point multiplication. + * points used for point multiplication. This value is directly tied to EC + * peak memory usage, so decreasing it by one should roughly cut memory usage + * by two (if large curves are in use). * - * Reduction in size may reduce speed for big curves. + * Reduction in size may reduce speed, but larger curves are impacted first. + * Sample performances (in ECDHE handshakes/s, with FIXED_POINT_OPTIM = 1): + * w-size: 6 5 4 3 2 + * 521 145 141 135 120 97 + * 384 214 209 198 177 146 + * 256 320 320 303 262 226 + * 224 475 475 453 398 342 + * 192 640 640 633 587 476 */ #define POLARSSL_ECP_WINDOW_SIZE 6 /**< Maximum window size used */ + +/* + * Trade memory for speed on fixed-point multiplication. 
+ * + * This speeds up repeated multiplication of the generator (that is, the + * multiplication in ECDSA signatures, and half of the multiplications in + * ECDSA verification and ECDHE) by a factor of roughly 3 to 4. + * + * The cost is increasing EC peak memory usage by a factor of roughly 2. + * + * Change this value to 0 to reduce peak memory usage. + */ +#define POLARSSL_ECP_FIXED_POINT_OPTIM 1 /**< Enable fixed-point speed-up */ #endif /* diff --git a/library/ecp.c b/library/ecp.c index c7bd8c289..8b34bf375 100644 --- a/library/ecp.c +++ b/library/ecp.c @@ -791,7 +791,16 @@ static int ecp_normalize_jac_many( const ecp_group *grp, MPI_CHK( mpi_mul_mpi( &T[i]->X, &T[i]->X, &ZZi ) ); MOD_MUL( T[i]->X ); MPI_CHK( mpi_mul_mpi( &T[i]->Y, &T[i]->Y, &ZZi ) ); MOD_MUL( T[i]->Y ); MPI_CHK( mpi_mul_mpi( &T[i]->Y, &T[i]->Y, &Zi ) ); MOD_MUL( T[i]->Y ); - MPI_CHK( mpi_lset( &T[i]->Z, 1 ) ); + + /* + * Post-processing: reclaim some memory by shrinking coordinates + * - not storing Z (always 1) + * - shrinking other coordinates, but still keeping the same number of + * limbs as P, as otherwise they would likely be regrown right away. + */ + MPI_CHK( mpi_shrink( &T[i]->X, grp->P.n ) ); + MPI_CHK( mpi_shrink( &T[i]->Y, grp->P.n ) ); + mpi_free( &T[i]->Z ); if( i == 0 ) break; @@ -915,6 +924,8 @@ cleanup: * due to the choice of precomputed points in the modified comb method. * So branches for these cases do not leak secret information. * + * We accept Q->Z being unset (saving memory in tables) as meaning 1. 
+ * * Cost: 1A := 8M + 3S */ static int ecp_add_mixed( const ecp_group *grp, ecp_point *R, @@ -933,13 +944,13 @@ static int ecp_add_mixed( const ecp_group *grp, ecp_point *R, if( mpi_cmp_int( &P->Z, 0 ) == 0 ) return( ecp_copy( R, Q ) ); - if( mpi_cmp_int( &Q->Z, 0 ) == 0 ) + if( Q->Z.p != NULL && mpi_cmp_int( &Q->Z, 0 ) == 0 ) return( ecp_copy( R, P ) ); /* * Make sure Q coordinates are normalized */ - if( mpi_cmp_int( &Q->Z, 1 ) != 0 ) + if( Q->Z.p != NULL && mpi_cmp_int( &Q->Z, 1 ) != 0 ) return( POLARSSL_ERR_ECP_BAD_INPUT_DATA ); mpi_init( &T1 ); mpi_init( &T2 ); mpi_init( &T3 ); mpi_init( &T4 ); @@ -1025,7 +1036,7 @@ int ecp_sub( const ecp_group *grp, ecp_point *R, return( POLARSSL_ERR_ECP_FEATURE_UNAVAILABLE ); /* mQ = - Q */ - ecp_copy( &mQ, Q ); + MPI_CHK( ecp_copy( &mQ, Q ) ); if( mpi_cmp_int( &mQ.Y, 0 ) != 0 ) MPI_CHK( mpi_sub_mpi( &mQ.Y, &grp->P, &mQ.Y ) ); @@ -1184,7 +1195,7 @@ static int ecp_precompute_comb( const ecp_group *grp, TT[k++] = cur; } - ecp_normalize_jac_many( grp, TT, k ); + MPI_CHK( ecp_normalize_jac_many( grp, TT, k ) ); /* * Compute the remaining ones using the minimal number of additions @@ -1196,25 +1207,12 @@ static int ecp_precompute_comb( const ecp_group *grp, j = i; while( j-- ) { - ecp_add_mixed( grp, &T[i + j], &T[j], &T[i] ); + MPI_CHK( ecp_add_mixed( grp, &T[i + j], &T[j], &T[i] ) ); TT[k++] = &T[i + j]; } } - ecp_normalize_jac_many( grp, TT, k ); - - /* - * Post-precessing: reclaim some memory by - * - not storing Z (always 1) - * - shrinking other coordinates - * Keep the same number of limbs as P to avoid re-growing on next use. 
- */ - for( i = 0; i < ( 1U << (w-1) ); i++ ) - { - mpi_free( &T[i].Z ); - mpi_shrink( &T[i].X, grp->P.n ); - mpi_shrink( &T[i].Y, grp->P.n ); - } + MPI_CHK( ecp_normalize_jac_many( grp, TT, k ) ); cleanup: return( ret ); @@ -1240,9 +1238,6 @@ static int ecp_select_comb( const ecp_group *grp, ecp_point *R, MPI_CHK( mpi_safe_cond_assign( &R->Y, &T[j].Y, j == ii ) ); } - /* The Z coordinate is always 1 */ - MPI_CHK( mpi_lset( &R->Z, 1 ) ); - /* Safely invert result if i is "negative" */ MPI_CHK( ecp_safe_invert_jac( grp, R, i >> 7 ) ); @@ -1271,6 +1266,7 @@ static int ecp_mul_comb_core( const ecp_group *grp, ecp_point *R, /* Start with a non-zero point and randomize its coordinates */ i = d; MPI_CHK( ecp_select_comb( grp, R, T, t_len, x[i] ) ); + MPI_CHK( mpi_lset( &R->Z, 1 ) ); if( f_rng != 0 ) MPI_CHK( ecp_randomize_jac( grp, R, f_rng, p_rng ) ); @@ -1319,12 +1315,17 @@ static int ecp_mul_comb( ecp_group *grp, ecp_point *R, /* * If P == G, pre-compute a bit more, since this may be re-used later. - * Just adding one ups the cost of the first mul by at most 3%. + * Just adding one avoids upping the cost of the first mul too much, + * and the memory cost too. */ +#if POLARSSL_ECP_FIXED_POINT_OPTIM == 1 p_eq_g = ( mpi_cmp_mpi( &P->Y, &grp->G.Y ) == 0 && mpi_cmp_mpi( &P->X, &grp->G.X ) == 0 ); if( p_eq_g ) w++; +#else + p_eq_g = 0; +#endif /* * Make sure w is within bounds. 
diff --git a/library/memory_buffer_alloc.c b/library/memory_buffer_alloc.c index 7ec6498de..1038c85a9 100644 --- a/library/memory_buffer_alloc.c +++ b/library/memory_buffer_alloc.c @@ -77,6 +77,7 @@ typedef struct size_t total_used; size_t maximum_used; size_t header_count; + size_t maximum_header_count; #endif #if defined(POLARSSL_THREADING_C) threading_mutex_t mutex; @@ -335,6 +336,8 @@ static void *buffer_alloc_malloc( size_t len ) #if defined(POLARSSL_MEMORY_DEBUG) heap.header_count++; + if( heap.header_count > heap.maximum_header_count ) + heap.maximum_header_count = heap.header_count; heap.total_used += cur->size; if( heap.total_used > heap.maximum_used) heap.maximum_used = heap.total_used; @@ -484,8 +487,11 @@ int memory_buffer_alloc_verify() void memory_buffer_alloc_status() { fprintf( stderr, - "Current use: %u blocks / %u bytes, max: %u bytes, malloc / free: %u / %u\n", - heap.header_count, heap.total_used, heap.maximum_used, + "Current use: %u blocks / %u bytes, max: %u blocks / %u bytes (total %u bytes), malloc / free: %u / %u\n", + heap.header_count, heap.total_used, + heap.maximum_header_count, heap.maximum_used, + heap.maximum_header_count * sizeof( memory_header ) + + heap.maximum_used, heap.malloc_count, heap.free_count ); if( heap.first->next == NULL )