Merged ECP memory usage optimizations

2024-11-22 11:05:40 +01:00 · 2013-12-31 10:32:50 +01:00 · 2013-12-31 10:32:50 +01:00 · c73879139e
commit c73879139e
parent 53e1513fea 26bc1c0f5d
5 changed files with 59 additions and 28 deletions
--- a/1
+++ b/1
@ -19,6 +19,7 @@ Changes
   * More constant-time checks in the RSA module
   * Split off curves from ecp.c into ecp_curves.c
   * Curves are now stored fully in ROM
+   * Memory usage optimizations in ECP module

 Bugfix
   * Fixed bug in mpi_set_bit() on platforms where t_uint is wider than int
--- a/include/polarssl/config.h
+++ b/include/polarssl/config.h
@ -1890,6 +1890,7 @@
 //
 #define POLARSSL_ECP_MAX_BITS             521 /**< Maximum bit size of groups */
 #define POLARSSL_ECP_WINDOW_SIZE            6 /**< Maximum window size used */
+#define POLARSSL_ECP_FIXED_POINT_OPTIM      1 /**< Enable fixed-point speed-up */

 // Entropy options
 //
--- a/include/polarssl/ecp.h
+++ b/include/polarssl/ecp.h
@ -178,11 +178,33 @@ ecp_keypair;
 * Minimum value: 2. Maximum value: 7.
 *
 * Result is an array of at most ( 1 << ( POLARSSL_ECP_WINDOW_SIZE - 1 ) )
- * points used for point multiplication.
+ * points used for point multiplication. This value is directly tied to EC
+ * peak memory usage, so decreasing it by one should roughly cut memory usage
+ * by two (if large curves are in use).
 *
- * Reduction in size may reduce speed for big curves.
+ * Reduction in size may reduce speed, but larger curves are impacted first.
+ * Sample performances (in ECDHE handshakes/s, with FIXED_POINT_OPTIM = 1):
+ *      w-size:     6       5       4       3       2
+ *      521       145     141     135     120      97
+ *      384       214     209     198     177     146
+ *      256       320     320     303     262     226
+ *      224       475     475     453     398     342
+ *      192       640     640     633     587     476
 */
 #define POLARSSL_ECP_WINDOW_SIZE    6   /**< Maximum window size used */
+
+/*
+ * Trade memory for speed on fixed-point multiplication.
+ *
+ * This speeds up repeated multiplication of the generator (that is, the
+ * multiplication in ECDSA signatures, and half of the multiplications in
+ * ECDSA verification and ECDHE) by a factor roughly 3 to 4.
+ *
+ * The cost is increasing EC peak memory usage by a factor roughly 2.
+ *
+ * Change this value to 0 to reduce peak memory usage.
+ */
+#define POLARSSL_ECP_FIXED_POINT_OPTIM  1   /**< Enable fixed-point speed-up */
 #endif

 /*
--- a/library/ecp.c
+++ b/library/ecp.c
@ -791,7 +791,16 @@ static int ecp_normalize_jac_many( const ecp_group *grp,
        MPI_CHK( mpi_mul_mpi( &T[i]->X, &T[i]->X, &ZZi ) ); MOD_MUL( T[i]->X );
        MPI_CHK( mpi_mul_mpi( &T[i]->Y, &T[i]->Y, &ZZi ) ); MOD_MUL( T[i]->Y );
        MPI_CHK( mpi_mul_mpi( &T[i]->Y, &T[i]->Y, &Zi  ) ); MOD_MUL( T[i]->Y );
-        MPI_CHK( mpi_lset( &T[i]->Z, 1 ) );
+
+        /*
+         * Post-precessing: reclaim some memory by shrinking coordinates
+         * - not storing Z (always 1)
+         * - shrinking other coordinates, but still keeping the same number of
+         *   limbs as P, as otherwise it will too likely be regrown too fast.
+         */
+        MPI_CHK( mpi_shrink( &T[i]->X, grp->P.n ) );
+        MPI_CHK( mpi_shrink( &T[i]->Y, grp->P.n ) );
+        mpi_free( &T[i]->Z );

        if( i == 0 )
            break;
@ -915,6 +924,8 @@ cleanup:
 *   due to the choice of precomputed points in the modified comb method.
 * So branches for these cases do not leak secret information.
 *
+ * We accept Q->Z being unset (saving memory in tables) as meaning 1.
+ *
 * Cost: 1A := 8M + 3S
 */
 static int ecp_add_mixed( const ecp_group *grp, ecp_point *R,
@ -933,13 +944,13 @@ static int ecp_add_mixed( const ecp_group *grp, ecp_point *R,
    if( mpi_cmp_int( &P->Z, 0 ) == 0 )
        return( ecp_copy( R, Q ) );

-    if( mpi_cmp_int( &Q->Z, 0 ) == 0 )
+    if( Q->Z.p != NULL && mpi_cmp_int( &Q->Z, 0 ) == 0 )
        return( ecp_copy( R, P ) );

    /*
     * Make sure Q coordinates are normalized
     */
-    if( mpi_cmp_int( &Q->Z, 1 ) != 0 )
+    if( Q->Z.p != NULL && mpi_cmp_int( &Q->Z, 1 ) != 0 )
        return( POLARSSL_ERR_ECP_BAD_INPUT_DATA );

    mpi_init( &T1 ); mpi_init( &T2 ); mpi_init( &T3 ); mpi_init( &T4 );
@ -1025,7 +1036,7 @@ int ecp_sub( const ecp_group *grp, ecp_point *R,
        return( POLARSSL_ERR_ECP_FEATURE_UNAVAILABLE );

    /* mQ = - Q */
-    ecp_copy( &mQ, Q );
+    MPI_CHK( ecp_copy( &mQ, Q ) );
    if( mpi_cmp_int( &mQ.Y, 0 ) != 0 )
        MPI_CHK( mpi_sub_mpi( &mQ.Y, &grp->P, &mQ.Y ) );

@ -1184,7 +1195,7 @@ static int ecp_precompute_comb( const ecp_group *grp,
        TT[k++] = cur;
    }

-    ecp_normalize_jac_many( grp, TT, k );
+    MPI_CHK( ecp_normalize_jac_many( grp, TT, k ) );

    /*
     * Compute the remaining ones using the minimal number of additions
@ -1196,25 +1207,12 @@ static int ecp_precompute_comb( const ecp_group *grp,
        j = i;
        while( j-- )
        {
-            ecp_add_mixed( grp, &T[i + j], &T[j], &T[i] );
+            MPI_CHK( ecp_add_mixed( grp, &T[i + j], &T[j], &T[i] ) );
            TT[k++] = &T[i + j];
        }
    }

-    ecp_normalize_jac_many( grp, TT, k );
-
-    /*
-     * Post-precessing: reclaim some memory by
-     * - not storing Z (always 1)
-     * - shrinking other coordinates
-     * Keep the same number of limbs as P to avoid re-growing on next use.
-     */
-    for( i = 0; i < ( 1U << (w-1) ); i++ )
-    {
-        mpi_free( &T[i].Z );
-        mpi_shrink( &T[i].X, grp->P.n );
-        mpi_shrink( &T[i].Y, grp->P.n );
-    }
+    MPI_CHK( ecp_normalize_jac_many( grp, TT, k ) );

 cleanup:
    return( ret );
@ -1240,9 +1238,6 @@ static int ecp_select_comb( const ecp_group *grp, ecp_point *R,
        MPI_CHK( mpi_safe_cond_assign( &R->Y, &T[j].Y, j == ii ) );
    }

-    /* The Z coordinate is always 1 */
-    MPI_CHK( mpi_lset( &R->Z, 1 ) );
-
    /* Safely invert result if i is "negative" */
    MPI_CHK( ecp_safe_invert_jac( grp, R, i >> 7 ) );

@ -1271,6 +1266,7 @@ static int ecp_mul_comb_core( const ecp_group *grp, ecp_point *R,
    /* Start with a non-zero point and randomize its coordinates */
    i = d;
    MPI_CHK( ecp_select_comb( grp, R, T, t_len, x[i] ) );
+    MPI_CHK( mpi_lset( &R->Z, 1 ) );
    if( f_rng != 0 )
        MPI_CHK( ecp_randomize_jac( grp, R, f_rng, p_rng ) );

@ -1319,12 +1315,17 @@ static int ecp_mul_comb( ecp_group *grp, ecp_point *R,

    /*
     * If P == G, pre-compute a bit more, since this may be re-used later.
-     * Just adding one ups the cost of the first mul by at most 3%.
+     * Just adding one avoids upping the cost of the first mul too much,
+     * and the memory cost too.
     */
+#if POLARSSL_ECP_FIXED_POINT_OPTIM == 1
    p_eq_g = ( mpi_cmp_mpi( &P->Y, &grp->G.Y ) == 0 &&
               mpi_cmp_mpi( &P->X, &grp->G.X ) == 0 );
    if( p_eq_g )
        w++;
+#else
+    p_eq_g = 0;
+#endif

    /*
     * Make sure w is within bounds.
--- a/library/memory_buffer_alloc.c
+++ b/library/memory_buffer_alloc.c
@ -77,6 +77,7 @@ typedef struct
    size_t          total_used;
    size_t          maximum_used;
    size_t          header_count;
+    size_t          maximum_header_count;
 #endif
 #if defined(POLARSSL_THREADING_C)
    threading_mutex_t   mutex;
@ -335,6 +336,8 @@ static void *buffer_alloc_malloc( size_t len )

 #if defined(POLARSSL_MEMORY_DEBUG)
    heap.header_count++;
+    if( heap.header_count > heap.maximum_header_count )
+        heap.maximum_header_count = heap.header_count;
    heap.total_used += cur->size;
    if( heap.total_used > heap.maximum_used)
        heap.maximum_used = heap.total_used;
@ -484,8 +487,11 @@ int memory_buffer_alloc_verify()
 void memory_buffer_alloc_status()
 {
    fprintf( stderr,
-             "Current use: %u blocks / %u bytes, max: %u bytes, malloc / free: %u / %u\n",
-             heap.header_count, heap.total_used, heap.maximum_used,
+             "Current use: %u blocks / %u bytes, max: %u blocks / %u bytes (total %u bytes), malloc / free: %u / %u\n",
+             heap.header_count, heap.total_used,
+             heap.maximum_header_count, heap.maximum_used,
+             heap.maximum_header_count * sizeof( memory_header )
+             + heap.maximum_used,
             heap.malloc_count, heap.free_count );

    if( heap.first->next == NULL )