From d1e7a45fdd59f2d1db081347c4d1ced835be5952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Tue, 22 Oct 2013 21:03:16 +0200
Subject: [PATCH 01/14] Rework ecp_mod_p192()

On x86_64, this makes it 5x faster, and ecp_mul() 17% faster for this curve.
The code is shorter too.
---
 library/ecp.c                    | 101 +++++++++++++------------------
 tests/suites/test_suite_ecp.data |  14 ++++-
 2 files changed, 56 insertions(+), 59 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index d3880be55..d53d306a5 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -475,25 +475,36 @@ cleanup:
 }
 
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
-/*
- * 192 bits in terms of t_uint
- */
-#define P192_SIZE_INT   ( 192 / CHAR_BIT / sizeof( t_uint ) )
 
-/*
- * Table to get S1, S2, S3 of FIPS 186-3 D.2.1:
- * -1 means let this chunk be 0
- * a positive value i means A_i.
- */
-#define P192_CHUNKS         3
-#define P192_CHUNK_CHAR     ( 64 / CHAR_BIT )
-#define P192_CHUNK_INT      ( P192_CHUNK_CHAR / sizeof( t_uint ) )
+/* Add 64-bit chunks (dst += src) and update carry */
+static inline void add_64( t_uint *dst, t_uint *src, t_uint *carry )
+{
+    unsigned char i;
+    t_uint c = 0;
+    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ )
+    {
+        *dst += c;      c  = ( *dst < c );
+        *dst += *src;   c += ( *dst < *src );
+    }
+    *carry += c;
+}
 
-const signed char p192_tbl[][P192_CHUNKS] = {
-    { -1,   3,  3   }, /* S1 */
-    { 4,    4,  -1  }, /* S2 */
-    { 5,    5,  5   }, /* S3 */
-};
+/* Add carry to a 64-bit chunk and update carry */
+static inline void carry64( t_uint *dst, t_uint *carry )
+{
+    unsigned char i;
+    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ )
+    {
+        *dst += *carry;
+        *carry  = ( *dst < *carry );
+    }
+}
+
+#define OFFSET      ( 8 / sizeof( t_uint ) )
+#define A( i )      ( N->p + ( i ) * OFFSET )
+#define ADD( i )    add_64( p, A( i ), &c )
+#define NEXT        p += OFFSET; carry64( p, &c )
+#define LAST        p += OFFSET; *p = c; while( ++p < end ) *p = 0
 
 /*
  * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1)
@@ -501,53 +512,27 @@ const signed char p192_tbl[][P192_CHUNKS] = {
 static int ecp_mod_p192( mpi *N )
 {
     int ret;
-    unsigned char i, j, offset;
-    signed char chunk;
-    mpi tmp, acc;
-    t_uint tmp_p[P192_SIZE_INT], acc_p[P192_SIZE_INT + 1];
+    t_uint c = 0;
+    t_uint *p, *end;
 
-    tmp.s = 1;
-    tmp.n = sizeof( tmp_p ) / sizeof( tmp_p[0] );
-    tmp.p = tmp_p;
+    /* Make sure we have the correct number of blocks */
+    MPI_CHK( mpi_grow( N, 6 * OFFSET ) );
+    p = N->p;
+    end = p + N->n;
 
-    acc.s = 1;
-    acc.n = sizeof( acc_p ) / sizeof( acc_p[0] );
-    acc.p = acc_p;
-
-    MPI_CHK( mpi_grow( N, P192_SIZE_INT * 2 ) );
-
-    /*
-     * acc = T
-     */
-    memset( acc_p, 0, sizeof( acc_p ) );
-    memcpy( acc_p, N->p, P192_CHUNK_CHAR * P192_CHUNKS );
-
-    for( i = 0; i < sizeof( p192_tbl ) / sizeof( p192_tbl[0] ); i++)
-    {
-        /*
-         * tmp = S_i
-         */
-        memset( tmp_p, 0, sizeof( tmp_p ) );
-        for( j = 0, offset = P192_CHUNKS - 1; j < P192_CHUNKS; j++, offset-- )
-        {
-            chunk = p192_tbl[i][j];
-            if( chunk >= 0 )
-                memcpy( tmp_p + offset * P192_CHUNK_INT,
-                        N->p + chunk * P192_CHUNK_INT,
-                        P192_CHUNK_CHAR );
-        }
-
-        /*
-         * acc += tmp
-         */
-        MPI_CHK( mpi_add_abs( &acc, &acc, &tmp ) );
-    }
-
-    MPI_CHK( mpi_copy( N, &acc ) );
+    ADD( 3 ); ADD( 5 );             NEXT; // A0 += A3 + A5
+    ADD( 3 ); ADD( 4 ); ADD( 5 );   NEXT; // A1 += A3 + A4 + A5
+    ADD( 4 ); ADD( 5 );             LAST; // A2 += A4 + A5
 
 cleanup:
     return( ret );
 }
+
+#undef OFFSET
+#undef A
+#undef ADD
+#undef NEXT
+#undef LAST
 #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
 
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data
index 9eb302b5b..4748ff98b 100644
--- a/tests/suites/test_suite_ecp.data
+++ b/tests/suites/test_suite_ecp.data
@@ -253,14 +253,26 @@ ECP gen keypair
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_gen_keypair:POLARSSL_ECP_DP_SECP192R1
 
+ECP mod p192 small (more than 192 bits, less limbs than 2 * 192 bits)
+depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"0100000000000103010000000000010201000000000001010100000000000100"
+
 ECP mod p192 readable
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
-ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"000000000000010500000000000001040000000000000103000000000000010200000000000001010000000000000100"
+ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"010000000000010501000000000001040100000000000103010000000000010201000000000001010100000000000100"
+
+ECP mod p192 readable with carry
+depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"FF00000000010500FF00000000010400FF00000000010300FF00000000010200FF00000000010100FF00000000010000"
 
 ECP mod p192 random
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"36CF96B45D706A0954D89E52CE5F38517A2270E0175849B6F3740151D238CCABEF921437E475881D83BB69E4AA258EBD"
 
+ECP mod p192 (from a past failure case)
+depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750"
+
 ECP test vectors secp192r1 rfc 5114
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_test_vect:POLARSSL_ECP_DP_SECP192R1:"323FA3169D8E9C6593F59476BC142000AB5BE0E249C43426":"CD46489ECFD6C105E7B3D32566E2B122E249ABAADD870612":"68887B4877DF51DD4DC3D6FD11F0A26F8FD3844317916E9A":"631F95BB4A67632C9C476EEE9AB695AB240A0499307FCF62":"519A121680E0045466BA21DF2EEE47F5973B500577EF13D5":"FF613AB4D64CEE3A20875BDB10F953F6B30CA072C60AA57F":"AD420182633F8526BFE954ACDA376F05E5FF4F837F54FEBE":"4371545ED772A59741D0EDA32C671112B7FDDD51461FCF32"

From c9e387ca9ebf8561b9a612ee2f75d02c3ab00276 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Thu, 17 Oct 2013 17:15:35 +0200
Subject: [PATCH 02/14] Optimize ecp_modp()

Makes it 22% faster, for a 5% gain on ecp_mul()
---
 library/ecp.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index d53d306a5..78b05c426 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -451,6 +451,8 @@ int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt,
 /*
  * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi.
  * See the documentation of struct ecp_group.
+ *
+ * This function is in the critical loop for ecp_mul, so pay attention to perf.
  */
 static int ecp_modp( mpi *N, const ecp_group *grp )
 {
@@ -459,16 +461,22 @@ static int ecp_modp( mpi *N, const ecp_group *grp )
     if( grp->modp == NULL )
         return( mpi_mod_mpi( N, N, &grp->P ) );
 
-    if( mpi_cmp_int( N, 0 ) < 0 || mpi_msb( N ) > 2 * grp->pbits )
+    /* N->s < 0 is a much faster test, which fails only if N is 0 */
+    if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) ||
+        mpi_msb( N ) > 2 * grp->pbits )
+    {
         return( POLARSSL_ERR_ECP_BAD_INPUT_DATA );
+    }
 
     MPI_CHK( grp->modp( N ) );
 
-    while( mpi_cmp_int( N, 0 ) < 0 )
+    /* N->s < 0 is a much faster test, which fails only if N is 0 */
+    while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 )
         MPI_CHK( mpi_add_mpi( N, N, &grp->P ) );
 
     while( mpi_cmp_mpi( N, &grp->P ) >= 0 )
-        MPI_CHK( mpi_sub_mpi( N, N, &grp->P ) );
+        /* we know P, N and the result are positive */
+        MPI_CHK( mpi_sub_abs( N, N, &grp->P ) );
 
 cleanup:
     return( ret );
@@ -915,17 +923,20 @@ const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id )
 
 /*
  * Reduce a mpi mod p in-place, to use after mpi_sub_mpi
+ * N->s < 0 is a very fast test, which fails only if N is 0
  */
 #define MOD_SUB( N )                                \
-    while( mpi_cmp_int( &N, 0 ) < 0 )               \
+    while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 )               \
         MPI_CHK( mpi_add_mpi( &N, &N, &grp->P ) )
 
 /*
- * Reduce a mpi mod p in-place, to use after mpi_add_mpi and mpi_mul_int
+ * Reduce a mpi mod p in-place, to use after mpi_add_mpi and mpi_mul_int.
+ * We know P, N and the result are positive, so sub_abs is correct, and
+ * a bit faster.
  */
 #define MOD_ADD( N )                                \
     while( mpi_cmp_mpi( &N, &grp->P ) >= 0 )        \
-        MPI_CHK( mpi_sub_mpi( &N, &N, &grp->P ) )
+        MPI_CHK( mpi_sub_abs( &N, &N, &grp->P ) )
 
 /*
  * Normalize jacobian coordinates so that Z == 0 || Z == 1  (GECC 3.2.1)

From cc67aee9c8845896fcbe2a497b2b3360415773fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Fri, 18 Oct 2013 10:55:45 +0200
Subject: [PATCH 03/14] Make ecp_mod_p521 a bit faster

---
 library/ecp.c                    | 28 +++++++++++++++++++---------
 tests/suites/test_suite_ecp.data | 16 ++++++++++++++++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index 78b05c426..33081a03c 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -547,12 +547,12 @@ cleanup:
 /*
  * Size of p521 in terms of t_uint
  */
-#define P521_SIZE_INT   ( 521 / CHAR_BIT / sizeof( t_uint ) + 1 )
+#define P521_SIZE_INT   ( 521 / 8 / sizeof( t_uint ) + 1 )
 
 /*
  * Bits to keep in the most significant t_uint
  */
-#if defined(POLARSS_HAVE_INT8)
+#if defined(POLARSSL_HAVE_INT8)
 #define P521_MASK       0x01
 #else
 #define P521_MASK       0x01FF
@@ -560,26 +560,36 @@ cleanup:
 
 /*
  * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5)
+ * Write N as A1 + 2^521 A0, return A0 + A1
  */
 static int ecp_mod_p521( mpi *N )
 {
     int ret;
-    t_uint Mp[P521_SIZE_INT];
+    size_t i;
     mpi M;
+    t_uint Mp[P521_SIZE_INT+1];
+    /* Worst case for the size of M is when sizeof( t_uint ) == 16:
+     * we need to hold bits 513 to 1056, which is 34 limbs, that is
+     * P521_SIZE_INT + 1. Otherwise P521_SIZE is enough. */
 
     if( N->n < P521_SIZE_INT )
         return( 0 );
 
-    memset( Mp, 0, P521_SIZE_INT * sizeof( t_uint ) );
-    memcpy( Mp, N->p, P521_SIZE_INT * sizeof( t_uint ) );
-    Mp[P521_SIZE_INT - 1] &= P521_MASK;
-
+    /* M = A1 */
     M.s = 1;
-    M.n = P521_SIZE_INT;
+    M.n = N->n - ( P521_SIZE_INT - 1 );
+    if( M.n > P521_SIZE_INT + 1 )
+        M.n = P521_SIZE_INT + 1;
     M.p = Mp;
+    memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) );
+    MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
 
-    MPI_CHK( mpi_shift_r( N, 521 ) );
+    /* N = A0 */
+    N->p[P521_SIZE_INT - 1] &= P521_MASK;
+    for( i = P521_SIZE_INT; i < N->n; i++ )
+        N->p[i] = 0;
 
+    /* N = A0 + A1 */
     MPI_CHK( mpi_add_abs( N, N, &M ) );
 
 cleanup:
diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data
index 4748ff98b..c8ed20f6b 100644
--- a/tests/suites/test_suite_ecp.data
+++ b/tests/suites/test_suite_ecp.data
@@ -273,6 +273,22 @@ ECP mod p192 (from a past failure case)
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750"
 
+ECP mod p521 very small
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"01"
+
+ECP mod p521 small (522 bits)
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"030000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+
+ECP mod p521 readable
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+
+ECP mod p521 readable with carry
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001"
+
 ECP test vectors secp192r1 rfc 5114
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_test_vect:POLARSSL_ECP_DP_SECP192R1:"323FA3169D8E9C6593F59476BC142000AB5BE0E249C43426":"CD46489ECFD6C105E7B3D32566E2B122E249ABAADD870612":"68887B4877DF51DD4DC3D6FD11F0A26F8FD3844317916E9A":"631F95BB4A67632C9C476EEE9AB695AB240A0499307FCF62":"519A121680E0045466BA21DF2EEE47F5973B500577EF13D5":"FF613AB4D64CEE3A20875BDB10F953F6B30CA072C60AA57F":"AD420182633F8526BFE954ACDA376F05E5FF4F837F54FEBE":"4371545ED772A59741D0EDA32C671112B7FDDD51461FCF32"

From e783f06f730f3f19851caaf78a8091780093b085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Mon, 21 Oct 2013 14:52:21 +0200
Subject: [PATCH 04/14] Start working on mod_p224

(Prototype, works only on 32-bit and little-endian 64-bit.)
---
 library/ecp.c                        | 88 ++++++++++++++++++++++++++++
 tests/suites/test_suite_ecp.data     | 16 +++++
 tests/suites/test_suite_ecp.function |  3 +-
 3 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/library/ecp.c b/library/ecp.c
index 33081a03c..4eddcdcb8 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -543,6 +543,93 @@ cleanup:
 #undef LAST
 #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
+
+/* For now, prototype version for 32-bit or little-endian 64 bits only */
+
+static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *dst += src;
+    *carry += ( *dst < src );
+}
+
+static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *carry -= ( *dst < src );
+    *dst -= src;
+}
+
+#define A( i )      ( ((uint32_t *) N->p)[i] )
+#define ADD( i )    add32( p, A( i ), &c );
+#define SUB( i )    sub32( p, A( i ), &c );
+
+#define NEXT                    \
+    p++;                        \
+    cc = c;                     \
+    c = 0;                      \
+    if( cc < 0 )                \
+        sub32( p, -cc, &c );    \
+    else                        \
+        add32( p, cc, &c );
+
+#define LAST                                    \
+    p++;                                        \
+    *p = c > 0 ? c : 0; /* see fix_negative */  \
+    while( ++p < end )                          \
+        *p = 0;                                 \
+    if( c < 0 ) fix_negative( N, c, bits );
+
+/*
+ * If the result is negative, we get it in the form c * 2^192 + N,
+ * with c negative and N positive (the c >= 0 case is handled by LAST).
+ */
+static inline int fix_negative( mpi *N, signed char c, size_t bits )
+{
+    int ret;
+    mpi C;
+
+    mpi_init( &C );
+
+    MPI_CHK( mpi_lset( &C, c ) );
+    MPI_CHK( mpi_shift_l( &C, bits ) );
+    MPI_CHK( mpi_add_mpi( N, N, &C ) );
+
+cleanup:
+    mpi_free( &C );
+
+    return( ret );
+}
+
+/*
+ * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
+ */
+static int ecp_mod_p224( mpi *N )
+{
+    int ret;
+    signed char c, cc;
+    uint32_t *p, *end;
+    size_t bits = 224;
+
+    /* Make sure we have the correct number of blocks */
+    MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) );
+
+    /* Currently assuming 32-bit ints, or 64-bits little-endian */
+    p = (uint32_t *) N->p;
+    end = (uint32_t *) (N->p + N->n);
+
+    SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
+    SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
+    SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
+    SUB( 10 ); ADD(  7 ); ADD( 11 );    NEXT; // A3 += -A10 + A7 + A11
+    SUB( 11 ); ADD(  8 ); ADD( 12 );    NEXT; // A4 += -A11 + A8 + A12
+    SUB( 12 ); ADD(  9 ); ADD( 13 );    NEXT; // A5 += -A12 + A9 + A13
+    SUB( 13 ); ADD( 10 );               LAST; // A6 += -A13 + A10
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
+
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
 /*
  * Size of p521 in terms of t_uint
@@ -761,6 +848,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
         case POLARSSL_ECP_DP_SECP224R1:
+            grp->modp = ecp_mod_p224;
             return( ecp_group_read_string( grp, 16,
                         SECP224R1_P, SECP224R1_B,
                         SECP224R1_GX, SECP224R1_GY, SECP224R1_N ) );
diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data
index c8ed20f6b..2f5f4efc7 100644
--- a/tests/suites/test_suite_ecp.data
+++ b/tests/suites/test_suite_ecp.data
@@ -273,6 +273,22 @@ ECP mod p192 (from a past failure case)
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750"
 
+ECP mod p224 readable without carry
+depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000B0000000A0000000900000008000000070000FF060000FF050000FF040000FF03000FF0020000FF010000FF00"
+
+ECP mod p224 readable with negative carry
+depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000B0000000A00000009000000080000000700000006000000050000000400000003000000020000000100000000"
+
+ECP mod p224 readable with positive carry
+depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"0000000D0000000C0000000BFFFFFF0AFFFFFF09FFFFFF08FFFFFF070000FF060000FF050000FF040000FF03000FF0020000FF010000FF00"
+
+ECP mod p224 readable with final negative carry
+depends_on:POLARSSL_ECP_DP_SECP224R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP224R1:"FF00000D0000000C0000000B0000000A00000009000000080000000700000006000000050000000400000003000000020000000100000000"
+
 ECP mod p521 very small
 depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
 ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"01"
diff --git a/tests/suites/test_suite_ecp.function b/tests/suites/test_suite_ecp.function
index 6981f47d3..4eb52596c 100644
--- a/tests/suites/test_suite_ecp.function
+++ b/tests/suites/test_suite_ecp.function
@@ -229,8 +229,9 @@ void ecp_fast_mod( int id, char *N_str )
     mpi_init( &N ); mpi_init( &R );
     ecp_group_init( &grp );
 
-    TEST_ASSERT( ecp_use_known_dp( &grp, id ) == 0 );
     TEST_ASSERT( mpi_read_string( &N, 16, N_str ) == 0 );
+    TEST_ASSERT( ecp_use_known_dp( &grp, id ) == 0 );
+    TEST_ASSERT( grp.modp != NULL );
 
     /*
      * Store correct result before we touch N

From a47e7058ea6086838fd265b40d64c8ceab24f224 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Mon, 21 Oct 2013 17:51:45 +0200
Subject: [PATCH 05/14] mod_p224 now endian-neutral

---
 include/polarssl/bignum.h |  1 +
 library/ecp.c             | 58 +++++++++++++++++++++++++--------------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/include/polarssl/bignum.h b/include/polarssl/bignum.h
index b1c43b75c..769e546d5 100644
--- a/include/polarssl/bignum.h
+++ b/include/polarssl/bignum.h
@@ -142,6 +142,7 @@ typedef uint32_t t_udbl;
        typedef unsigned int t_udbl __attribute__((mode(TI)));
        #define POLARSSL_HAVE_UDBL
     #else
+       #define POLARSSL_HAVE_INT32
        typedef  int32_t t_sint;
        typedef uint32_t t_uint;
        #if ( defined(_MSC_VER) && defined(_M_IX86) )
diff --git a/library/ecp.c b/library/ecp.c
index 4eddcdcb8..b33a57fb0 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -545,8 +545,6 @@ cleanup:
 
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
 
-/* For now, prototype version for 32-bit or little-endian 64 bits only */
-
 static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
 {
     *dst += src;
@@ -559,24 +557,44 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     *dst -= src;
 }
 
-#define A( i )      ( ((uint32_t *) N->p)[i] )
-#define ADD( i )    add32( p, A( i ), &c );
-#define SUB( i )    sub32( p, A( i ), &c );
+#if defined(POLARSSL_HAVE_INT16) || defined(POLARSSL_HAVE_INT8)
+#error "Currently not supported, WIP"
+#elif defined(POLARSSL_HAVE_INT32)
+#define A( j )      N->p[j]
+#define STORE32     N->p[i] = cur;
+#else /* 64-bit */
+#define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
+#define STORE32                                   \
+    if( i % 2 ) {                                 \
+        N->p[i/2] &= 0x00000000FFFFFFFF;          \
+        N->p[i/2] |= ((uint64_t) cur) << 32;      \
+    } else {                                      \
+        N->p[i/2] &= 0xFFFFFFFF00000000;          \
+        N->p[i/2] |= (uint64_t) cur;              \
+    }
+#endif
+
+#define ADD( j )    add32( &cur, A( j ), &c );
+#define SUB( j )    sub32( &cur, A( j ), &c );
+
+#define LOAD32      cur = A( i );
+
+#define FIRST       c = 0; i = 0; LOAD32;
 
 #define NEXT                    \
-    p++;                        \
-    cc = c;                     \
-    c = 0;                      \
+    STORE32; i++; LOAD32;       \
+    cc = c; c = 0;              \
     if( cc < 0 )                \
-        sub32( p, -cc, &c );    \
+        sub32( &cur, -cc, &c ); \
     else                        \
-        add32( p, cc, &c );
+        add32( &cur, cc, &c );
 
-#define LAST                                    \
-    p++;                                        \
-    *p = c > 0 ? c : 0; /* see fix_negative */  \
-    while( ++p < end )                          \
-        *p = 0;                                 \
+#define LAST                                                    \
+    STORE32; i++;                                               \
+    cur = c > 0 ? c : 0; STORE32; /* see fix_negative */        \
+    cur = 0;                                                    \
+    while( ++i < N->n * sizeof( t_uint ) / sizeof( uint32_t ) ) \
+        STORE32;                                                \
     if( c < 0 ) fix_negative( N, c, bits );
 
 /*
@@ -607,16 +625,14 @@ static int ecp_mod_p224( mpi *N )
 {
     int ret;
     signed char c, cc;
-    uint32_t *p, *end;
+    uint32_t cur;
+    size_t i;
     size_t bits = 224;
 
-    /* Make sure we have the correct number of blocks */
+    /* Make sure we have enough blocks */
     MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) );
 
-    /* Currently assuming 32-bit ints, or 64-bits little-endian */
-    p = (uint32_t *) N->p;
-    end = (uint32_t *) (N->p + N->n);
-
+    FIRST;
     SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
     SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
     SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13

From 2a08c0debc268785a983d615282d9fcd629a2067 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Tue, 22 Oct 2013 21:07:14 +0200
Subject: [PATCH 06/14] mod_p224 now working with 8-bit and 16-bit ints

---
 library/ecp.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index b33a57fb0..21a231505 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -543,7 +543,7 @@ cleanup:
 #undef LAST
 #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
 
-#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
 
 static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
 {
@@ -557,12 +557,34 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     *dst -= src;
 }
 
-#if defined(POLARSSL_HAVE_INT16) || defined(POLARSSL_HAVE_INT8)
-#error "Currently not supported, WIP"
+#if defined(POLARSSL_HAVE_INT8)
+
+#define MAX32       N->n / 4
+#define A( j )      (uint32_t)( N->p[4*j+0]       ) |  \
+                              ( N->p[4*j+1] << 8  ) |  \
+                              ( N->p[4*j+2] << 16 ) |  \
+                              ( N->p[4*j+3] << 24 )
+#define STORE32     N->p[4*i+0] = (uint8_t)( cur       );   \
+                    N->p[4*i+1] = (uint8_t)( cur >> 8  );   \
+                    N->p[4*i+2] = (uint8_t)( cur >> 16 );   \
+                    N->p[4*i+3] = (uint8_t)( cur >> 24 );
+
+#elif defined(POLARSSL_HAVE_INT16)
+
+#define MAX32       N->n / 2
+#define A( j )      (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 )
+#define STORE32     N->p[2*i+0] = (uint16_t)( cur       );  \
+                    N->p[2*i+1] = (uint16_t)( cur >> 16 );
+
 #elif defined(POLARSSL_HAVE_INT32)
+
+#define MAX32       N->n
 #define A( j )      N->p[j]
 #define STORE32     N->p[i] = cur;
+
 #else /* 64-bit */
+
+#define MAX32       N->n * 2
 #define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
 #define STORE32                                   \
     if( i % 2 ) {                                 \
@@ -572,6 +594,7 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
         N->p[i/2] &= 0xFFFFFFFF00000000;          \
         N->p[i/2] |= (uint64_t) cur;              \
     }
+
 #endif
 
 #define ADD( j )    add32( &cur, A( j ), &c );
@@ -587,14 +610,12 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     if( cc < 0 )                \
         sub32( &cur, -cc, &c ); \
     else                        \
-        add32( &cur, cc, &c );
+        add32( &cur, cc, &c );  \
 
-#define LAST                                                    \
-    STORE32; i++;                                               \
-    cur = c > 0 ? c : 0; STORE32; /* see fix_negative */        \
-    cur = 0;                                                    \
-    while( ++i < N->n * sizeof( t_uint ) / sizeof( uint32_t ) ) \
-        STORE32;                                                \
+#define LAST                                    \
+    STORE32; i++;                               \
+    cur = c > 0 ? c : 0; STORE32;               \
+    cur = 0; while( ++i < MAX32 ) { STORE32; }  \
     if( c < 0 ) fix_negative( N, c, bits );
 
 /*

From 210b458ddce00e43230e12342e306a09a1b51dc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 14:03:00 +0200
Subject: [PATCH 07/14] Document and slightly reorganize mod_pXXX

---
 library/ecp.c | 164 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 111 insertions(+), 53 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index 21a231505..0f21e2e73 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -483,9 +483,20 @@ cleanup:
 }
 
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
+/*
+ * Compared to the way things are presented in FIPS 186-3 D.2,
+ * we proceed in columns, from right (least significant chunk) to left,
+ * adding chunks to N in place, and keeping a carry for the next chunk.
+ * This avoids moving things around in memory, and uselessly adding zeros,
+ * compared to the more straightforward, line-oriented approach.
+ *
+ * For this prime we need to handle data in chunks of 64 bits.
+ * Since this is always a multiple of our basic t_uint, we can
+ * use a t_uint * to designate such a chunk, and small loops to handle it.
+ */
 
 /* Add 64-bit chunks (dst += src) and update carry */
-static inline void add_64( t_uint *dst, t_uint *src, t_uint *carry )
+static inline void add64( t_uint *dst, t_uint *src, t_uint *carry )
 {
     unsigned char i;
     t_uint c = 0;
@@ -508,11 +519,11 @@ static inline void carry64( t_uint *dst, t_uint *carry )
     }
 }
 
-#define OFFSET      ( 8 / sizeof( t_uint ) )
-#define A( i )      ( N->p + ( i ) * OFFSET )
-#define ADD( i )    add_64( p, A( i ), &c )
-#define NEXT        p += OFFSET; carry64( p, &c )
-#define LAST        p += OFFSET; *p = c; while( ++p < end ) *p = 0
+#define WIDTH       8 / sizeof( t_uint )
+#define A( i )      N->p + i * WIDTH
+#define ADD( i )    add64( p, A( i ), &c )
+#define NEXT        p += WIDTH; carry64( p, &c )
+#define LAST        p += WIDTH; *p = c; while( ++p < end ) *p = 0
 
 /*
  * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1)
@@ -523,8 +534,9 @@ static int ecp_mod_p192( mpi *N )
     t_uint c = 0;
     t_uint *p, *end;
 
-    /* Make sure we have the correct number of blocks */
-    MPI_CHK( mpi_grow( N, 6 * OFFSET ) );
+    /* Make sure we have enough blocks so that A(5) is legal */
+    MPI_CHK( mpi_grow( N, 6 * WIDTH ) );
+
     p = N->p;
     end = p + N->n;
 
@@ -536,28 +548,35 @@ cleanup:
     return( ret );
 }
 
-#undef OFFSET
+#undef WIDTH
 #undef A
 #undef ADD
 #undef NEXT
 #undef LAST
 #endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
 
-#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+/*
+ * The reader is advised to first understand ecp_mod_p192() since the same
+ * general structure is used here, but with additional complications:
+ * (1) chunks of 32 bits, and (2) subtractions.
+ */
 
-static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *dst += src;
-    *carry += ( *dst < src );
-}
+/*
+ * For these primes, we need to handle data in chunks of 32 bits.
+ * This makes it more complicated if we use 64 bits limbs in MPI,
+ * which prevents us from using a uniform access method as for p192.
+ *
+ * So, we define a mini abstraction layer to access 32 bit chunks,
+ * load them in 'cur' for work, and store them back from 'cur' when done.
+ *
+ * While at it, also define the size of N in terms of 32-bit chunks.
+ */
+#define LOAD32      cur = A( i );
 
-static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *carry -= ( *dst < src );
-    *dst -= src;
-}
-
-#if defined(POLARSSL_HAVE_INT8)
+#if defined(POLARSSL_HAVE_INT8)     /* 8 bit */
 
 #define MAX32       N->n / 4
 #define A( j )      (uint32_t)( N->p[4*j+0]       ) |  \
@@ -569,20 +588,20 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
                     N->p[4*i+2] = (uint8_t)( cur >> 16 );   \
                     N->p[4*i+3] = (uint8_t)( cur >> 24 );
 
-#elif defined(POLARSSL_HAVE_INT16)
+#elif defined(POLARSSL_HAVE_INT16)  /* 16 bit */
 
 #define MAX32       N->n / 2
 #define A( j )      (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 )
 #define STORE32     N->p[2*i+0] = (uint16_t)( cur       );  \
                     N->p[2*i+1] = (uint16_t)( cur >> 16 );
 
-#elif defined(POLARSSL_HAVE_INT32)
+#elif defined(POLARSSL_HAVE_INT32)  /* 32 bit */
 
 #define MAX32       N->n
 #define A( j )      N->p[j]
 #define STORE32     N->p[i] = cur;
 
-#else /* 64-bit */
+#else                               /* 64-bit */
 
 #define MAX32       N->n * 2
 #define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
@@ -595,14 +614,37 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
         N->p[i/2] |= (uint64_t) cur;              \
     }
 
-#endif
+#endif /* sizeof( t_uint ) */
+
+/*
+ * Helpers for addition and subtraction of chunks, with signed carry.
+ */
+static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *dst += src;
+    *carry += ( *dst < src );
+}
+
+static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *carry -= ( *dst < src );
+    *dst -= src;
+}
 
 #define ADD( j )    add32( &cur, A( j ), &c );
 #define SUB( j )    sub32( &cur, A( j ), &c );
 
-#define LOAD32      cur = A( i );
-
-#define FIRST       c = 0; i = 0; LOAD32;
+/*
+ * Helpers for the main 'loop'
+ */
+#define INIT( b )                                           \
+    int ret;                                                \
+    signed char c = 0, cc;                                  \
+    uint32_t cur;                                           \
+    size_t i = 0, bits = b;                                 \
+                                                            \
+    MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \
+    LOAD32;
 
 #define NEXT                    \
     STORE32; i++; LOAD32;       \
@@ -638,22 +680,18 @@ cleanup:
 
     return( ret );
 }
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP384R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
 /*
  * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
  */
 static int ecp_mod_p224( mpi *N )
 {
-    int ret;
-    signed char c, cc;
-    uint32_t cur;
-    size_t i;
-    size_t bits = 224;
+    INIT( 224 );
 
-    /* Make sure we have enough blocks */
-    MPI_CHK( mpi_grow( N, bits * 2 / 8 / sizeof( t_uint ) ) );
-
-    FIRST;
     SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
     SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
     SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
@@ -667,15 +705,32 @@ cleanup:
 }
 #endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+
+#undef A
+#undef LOAD32
+#undef STORE32
+#undef MAX32
+#undef INIT
+#undef NEXT
+#undef LAST
+
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP384R1_ENABLED */
+
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
 /*
- * Size of p521 in terms of t_uint
+ * Here we have a real Mersenne prime, so things are more straightforward.
+ * However, things are aligned on a 'weird' boundary (521 bits).
  */
-#define P521_SIZE_INT   ( 521 / 8 / sizeof( t_uint ) + 1 )
 
-/*
- * Bits to keep in the most significant t_uint
- */
+/* Size of p521 in terms of t_uint */
+#define P521_WIDTH      ( 521 / 8 / sizeof( t_uint ) + 1 )
+
+/* Bits to keep in the most significant t_uint */
 #if defined(POLARSSL_HAVE_INT8)
 #define P521_MASK       0x01
 #else
@@ -691,26 +746,26 @@ static int ecp_mod_p521( mpi *N )
     int ret;
     size_t i;
     mpi M;
-    t_uint Mp[P521_SIZE_INT+1];
-    /* Worst case for the size of M is when sizeof( t_uint ) == 16:
+    t_uint Mp[P521_WIDTH + 1];
+    /* Worst case for the size of M is when t_uint is 16 bits:
      * we need to hold bits 513 to 1056, which is 34 limbs, that is
-     * P521_SIZE_INT + 1. Otherwise P521_SIZE is enough. */
+     * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */
 
-    if( N->n < P521_SIZE_INT )
+    if( N->n < P521_WIDTH )
         return( 0 );
 
     /* M = A1 */
     M.s = 1;
-    M.n = N->n - ( P521_SIZE_INT - 1 );
-    if( M.n > P521_SIZE_INT + 1 )
-        M.n = P521_SIZE_INT + 1;
+    M.n = N->n - ( P521_WIDTH - 1 );
+    if( M.n > P521_WIDTH + 1 )
+        M.n = P521_WIDTH + 1;
     M.p = Mp;
-    memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) );
+    memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) );
     MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
 
     /* N = A0 */
-    N->p[P521_SIZE_INT - 1] &= P521_MASK;
-    for( i = P521_SIZE_INT; i < N->n; i++ )
+    N->p[P521_WIDTH - 1] &= P521_MASK;
+    for( i = P521_WIDTH; i < N->n; i++ )
         N->p[i] = 0;
 
     /* N = A0 + A1 */
@@ -719,6 +774,9 @@ static int ecp_mod_p521( mpi *N )
 cleanup:
     return( ret );
 }
+
+#undef P521_WIDTH
+#undef P521_MASK
 #endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */
 
 /*

From ec655c908cbe4c5cbcd8cf484601d6b22211efa9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 14:50:39 +0200
Subject: [PATCH 08/14] Add mod_p256

---
 library/ecp.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/library/ecp.c b/library/ecp.c
index 0f21e2e73..3fdd34e60 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -705,6 +705,43 @@ cleanup:
 }
 #endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3)
+ */
+static int ecp_mod_p256( mpi *N )
+{
+    INIT( 256 );
+
+    ADD(  8 ); ADD(  9 );
+    SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 );             NEXT; // A0
+
+    ADD(  9 ); ADD( 10 );
+    SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 );             NEXT; // A1
+
+    ADD( 10 ); ADD( 11 );
+    SUB( 13 ); SUB( 14 ); SUB( 15 );                        NEXT; // A2
+
+    ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 );
+    SUB( 15 ); SUB(  8 ); SUB(  9 );                        NEXT; // A3
+
+    ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 );
+    SUB(  9 ); SUB( 10 );                                   NEXT; // A4
+
+    ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 );
+    SUB( 10 ); SUB( 11 );                                   NEXT; // A5
+
+    ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 );
+    SUB(  8 ); SUB(  9 );                                   NEXT; // A6
+
+    ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 );
+    SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 );             LAST; // A7
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */
+
 #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
     defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
     defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
@@ -951,6 +988,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED)
         case POLARSSL_ECP_DP_SECP256R1:
+            grp->modp = ecp_mod_p256;
             return( ecp_group_read_string( grp, 16,
                         SECP256R1_P, SECP256R1_B,
                         SECP256R1_GX, SECP256R1_GY, SECP256R1_N ) );

From 0f9149cb0a579d8ee009e4fc23c81302a0347d24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 15:06:37 +0200
Subject: [PATCH 09/14] Add mod_p384

---
 library/ecp.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/library/ecp.c b/library/ecp.c
index 3fdd34e60..f359f0bae 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -742,6 +742,55 @@ cleanup:
 }
 #endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */
 
+#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4)
+ */
+static int ecp_mod_p384( mpi *N )
+{
+    INIT( 384 );
+
+    ADD( 12 ); ADD( 21 ); ADD( 20 );
+    SUB( 23 );                                              NEXT; // A0
+
+    ADD( 13 ); ADD( 22 ); ADD( 23 );
+    SUB( 12 ); SUB( 20 );                                   NEXT; // A1
+
+    ADD( 14 ); ADD( 23 );
+    SUB( 13 ); SUB( 21 );                                   NEXT; // A2
+
+    ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 );
+    SUB( 14 ); SUB( 22 ); SUB( 23 );                        NEXT; // A3
+
+    ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 );
+    SUB( 15 ); SUB( 23 ); SUB( 23 );                        NEXT; // A4
+
+    ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 );
+    SUB( 16 );                                              NEXT; // A5
+
+    ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 );
+    SUB( 17 );                                              NEXT; // A6
+
+    ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 );
+    SUB( 18 );                                              NEXT; // A7
+
+    ADD( 20 ); ADD( 17 ); ADD( 16 );
+    SUB( 19 );                                              NEXT; // A8
+
+    ADD( 21 ); ADD( 18 ); ADD( 17 );
+    SUB( 20 );                                              NEXT; // A9
+
+    ADD( 22 ); ADD( 19 ); ADD( 18 );
+    SUB( 21 );                                              NEXT; // A10
+
+    ADD( 23 ); ADD( 20 ); ADD( 19 );
+    SUB( 22 );                                              LAST; // A11
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */
+
 #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
     defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
     defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
@@ -996,6 +1045,7 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
         case POLARSSL_ECP_DP_SECP384R1:
+            grp->modp = ecp_mod_p384;
             return( ecp_group_read_string( grp, 16,
                         SECP384R1_P, SECP384R1_B,
                         SECP384R1_GX, SECP384R1_GY, SECP384R1_N ) );

From c04c530a98bb99b9030650a27fd78ee2ce3dc00c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 16:11:52 +0200
Subject: [PATCH 10/14] Make NIST curves optimisation an option

---
 include/polarssl/config.h | 11 +++++++++++
 library/ecp.c             | 14 ++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/include/polarssl/config.h b/include/polarssl/config.h
index 4973ea454..d231b42c8 100644
--- a/include/polarssl/config.h
+++ b/include/polarssl/config.h
@@ -259,6 +259,17 @@
 #define POLARSSL_ECP_DP_BP384R1_ENABLED
 #define POLARSSL_ECP_DP_BP512R1_ENABLED
 
+/**
+ * \def POLARSSL_ECP_NIST_OPTIM
+ *
+ * Enable specific 'modulo p' routines for each NIST prime.
+ * Depending on the prime and architecture, makes operations 4 to 8 times
+ * faster on the corresponding curve.
+ *
+ * Comment this macro to disable NIST curves optimisation.
+ */
+#define POLARSSL_ECP_NIST_OPTIM
+
 /**
  * \def POLARSSL_KEY_EXCHANGE_PSK_ENABLED
  *
diff --git a/library/ecp.c b/library/ecp.c
index f359f0bae..b144d16ad 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -482,6 +482,8 @@ cleanup:
     return( ret );
 }
 
+#if defined(POLARSSL_ECP_NIST_OPTIM)
+
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
 /*
  * Compared to the way things are presented in FIPS 186-3 D.2,
@@ -865,6 +867,8 @@ cleanup:
 #undef P521_MASK
 #endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */
 
+#endif /* POLARSSL_ECP_NIST_OPTIM */
+
 /*
  * Domain parameters for secp192r1
  */
@@ -1021,7 +1025,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
     {
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
         case POLARSSL_ECP_DP_SECP192R1:
+#if defined(POLARSSL_ECP_NIST_OPTIM)
             grp->modp = ecp_mod_p192;
+#endif
             return( ecp_group_read_string( grp, 16,
                         SECP192R1_P, SECP192R1_B,
                         SECP192R1_GX, SECP192R1_GY, SECP192R1_N ) );
@@ -1029,7 +1035,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
         case POLARSSL_ECP_DP_SECP224R1:
+#if defined(POLARSSL_ECP_NIST_OPTIM)
             grp->modp = ecp_mod_p224;
+#endif
             return( ecp_group_read_string( grp, 16,
                         SECP224R1_P, SECP224R1_B,
                         SECP224R1_GX, SECP224R1_GY, SECP224R1_N ) );
@@ -1037,7 +1045,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED)
         case POLARSSL_ECP_DP_SECP256R1:
+#if defined(POLARSSL_ECP_NIST_OPTIM)
             grp->modp = ecp_mod_p256;
+#endif
             return( ecp_group_read_string( grp, 16,
                         SECP256R1_P, SECP256R1_B,
                         SECP256R1_GX, SECP256R1_GY, SECP256R1_N ) );
@@ -1045,7 +1055,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
         case POLARSSL_ECP_DP_SECP384R1:
+#if defined(POLARSSL_ECP_NIST_OPTIM)
             grp->modp = ecp_mod_p384;
+#endif
             return( ecp_group_read_string( grp, 16,
                         SECP384R1_P, SECP384R1_B,
                         SECP384R1_GX, SECP384R1_GY, SECP384R1_N ) );
@@ -1053,7 +1065,9 @@ int ecp_use_known_dp( ecp_group *grp, ecp_group_id id )
 
 #if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
         case POLARSSL_ECP_DP_SECP521R1:
+#if defined(POLARSSL_ECP_NIST_OPTIM)
             grp->modp = ecp_mod_p521;
+#endif
             return( ecp_group_read_string( grp, 16,
                         SECP521R1_P, SECP521R1_B,
                         SECP521R1_GX, SECP521R1_GY, SECP521R1_N ) );

From 5779cbe5821f028f2cddad54c49a9aa564d2d300 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 20:17:00 +0200
Subject: [PATCH 11/14] Make mod_p{224,256,384} a bit faster

Speedup is roughly 25%, giving a 6% speedup on ecp_mul() for these curves.
---
 include/polarssl/bignum.h |  2 ++
 library/ecp.c             | 24 +++++++++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/polarssl/bignum.h b/include/polarssl/bignum.h
index 769e546d5..eae15e04d 100644
--- a/include/polarssl/bignum.h
+++ b/include/polarssl/bignum.h
@@ -128,6 +128,7 @@ typedef uint32_t t_udbl;
 #define POLARSSL_HAVE_UDBL
 #else
   #if ( defined(_MSC_VER) && defined(_M_AMD64) )
+    #define POLARSSL_HAVE_INT64
     typedef  int64_t t_sint;
     typedef uint64_t t_uint;
   #else
@@ -137,6 +138,7 @@ typedef uint32_t t_udbl;
           defined(__ia64__)  || defined(__alpha__)     || \
           (defined(__sparc__) && defined(__arch64__))  || \
           defined(__s390x__) ) )
+       #define POLARSSL_HAVE_INT64
        typedef  int64_t t_sint;
        typedef uint64_t t_uint;
        typedef unsigned int t_udbl __attribute__((mode(TI)));
diff --git a/library/ecp.c b/library/ecp.c
index b144d16ad..a408f2ba2 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -663,22 +663,32 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     if( c < 0 ) fix_negative( N, c, bits );
 
 /*
- * If the result is negative, we get it in the form c * 2^192 + N,
- * with c negative and N positive (the c >= 0 case is handled by LAST).
+ * If the result is negative, we get it in the form
+ * c * 2^bits + N, with c negative and N positive shorter than 'bits'
  */
 static inline int fix_negative( mpi *N, signed char c, size_t bits )
 {
     int ret;
     mpi C;
+    t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ];
 
-    mpi_init( &C );
+    /* C = - c * 2^bits */
+    C.s = 1;
+    C.n = bits / 8 / sizeof( t_uint ) + 1;
+    C.p = Cp;
+    memset( Cp, 0, C.n * sizeof( t_uint ) );
+#if defined(POLARSSL_HAVE_INT64)
+    if( bits == 224 )
+        Cp[ C.n - 1 ] = ((t_uint) -c) << 32;
+    else
+#endif
+        Cp[ C.n - 1 ] = (t_uint) -c;
 
-    MPI_CHK( mpi_lset( &C, c ) );
-    MPI_CHK( mpi_shift_l( &C, bits ) );
-    MPI_CHK( mpi_add_mpi( N, N, &C ) );
+    /* N = - ( C - N ) */
+    MPI_CHK( mpi_sub_abs( N, &C, N ) );
+    N->s = -1;
 
 cleanup:
-    mpi_free( &C );
 
     return( ret );
 }

From cae6f3ed45099ca5084d91836a4eb5d90c0e425c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 20:19:57 +0200
Subject: [PATCH 12/14] Reorganize code in ecp.c

---
 include/polarssl/ecp.h |  178 +++----
 library/ecp.c          | 1006 ++++++++++++++++++++--------------------
 2 files changed, 593 insertions(+), 591 deletions(-)

diff --git a/include/polarssl/ecp.h b/include/polarssl/ecp.h
index 7940b3219..02f6f9349 100644
--- a/include/polarssl/ecp.h
+++ b/include/polarssl/ecp.h
@@ -186,6 +186,24 @@ ecp_keypair;
  */
 const ecp_curve_info *ecp_curve_list( void );
 
+/**
+ * \brief           Get curve information from an internal group identifier
+ *
+ * \param grp_id    A POLARSSL_ECP_DP_XXX value
+ *
+ * \return          The associated curve information or NULL
+ */
+const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id );
+
+/**
+ * \brief           Get curve information from a TLS NamedCurve value
+ *
+ * \param tls_id    A TLS NamedCurve value
+ *
+ * \return          The associated curve information or NULL
+ */
+const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id );
+
 /**
  * \brief           Initialize a point (as zero)
  */
@@ -216,25 +234,6 @@ void ecp_group_free( ecp_group *grp );
  */
 void ecp_keypair_free( ecp_keypair *key );
 
-/**
- * \brief           Set a point to zero
- *
- * \param pt        Destination point
- *
- * \return          0 if successful,
- *                  POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
- */
-int ecp_set_zero( ecp_point *pt );
-
-/**
- * \brief           Tell if a point is zero
- *
- * \param pt        Point to test
- *
- * \return          1 if point is zero, 0 otherwise
- */
-int ecp_is_zero( ecp_point *pt );
-
 /**
  * \brief           Copy the contents of point Q into P
  *
@@ -257,6 +256,25 @@ int ecp_copy( ecp_point *P, const ecp_point *Q );
  */
 int ecp_group_copy( ecp_group *dst, const ecp_group *src );
 
+/**
+ * \brief           Set a point to zero
+ *
+ * \param pt        Destination point
+ *
+ * \return          0 if successful,
+ *                  POLARSSL_ERR_MPI_MALLOC_FAILED if memory allocation failed
+ */
+int ecp_set_zero( ecp_point *pt );
+
+/**
+ * \brief           Tell if a point is zero
+ *
+ * \param pt        Point to test
+ *
+ * \return          1 if point is zero, 0 otherwise
+ */
+int ecp_is_zero( ecp_point *pt );
+
 /**
  * \brief           Import a non-zero point from two ASCII strings
  *
@@ -270,25 +288,6 @@ int ecp_group_copy( ecp_group *dst, const ecp_group *src );
 int ecp_point_read_string( ecp_point *P, int radix,
                            const char *x, const char *y );
 
-/**
- * \brief           Import an ECP group from null-terminated ASCII strings
- *
- * \param grp       Destination group
- * \param radix     Input numeric base
- * \param p         Prime modulus of the base field
- * \param b         Constant term in the equation
- * \param gx        The generator's X coordinate
- * \param gy        The generator's Y coordinate
- * \param n         The generator's order
- *
- * \return          0 if successful, or a POLARSSL_ERR_MPI_XXX error code
- *
- * \note            Sets all fields except modp.
- */
-int ecp_group_read_string( ecp_group *grp, int radix,
-                           const char *p, const char *b,
-                           const char *gx, const char *gy, const char *n);
-
 /**
  * \brief           Export a point into unsigned binary data
  *
@@ -326,6 +325,58 @@ int ecp_point_write_binary( const ecp_group *grp, const ecp_point *P,
 int ecp_point_read_binary( const ecp_group *grp, ecp_point *P,
                            const unsigned char *buf, size_t ilen );
 
+/**
+ * \brief           Import a point from a TLS ECPoint record
+ *
+ * \param grp       ECP group used
+ * \param pt        Destination point
+ * \param buf       Address of the pointer to the start of the input buffer
+ * \param len       Buffer length
+ *
+ * \return          0 if successful,
+ *                  POLARSSL_ERR_MPI_XXX if initialization failed
+ *                  POLARSSL_ERR_ECP_BAD_INPUT_DATA if input is invalid
+ */
+int ecp_tls_read_point( const ecp_group *grp, ecp_point *pt,
+                        const unsigned char **buf, size_t len );
+
+/**
+ * \brief           Export a point as a TLS ECPoint record
+ *
+ * \param grp       ECP group used
+ * \param pt        Point to export
+ * \param format    Export format
+ * \param olen      length of data written
+ * \param buf       Buffer to write to
+ * \param blen      Buffer length
+ *
+ * \return          0 if successful,
+ *                  or POLARSSL_ERR_ECP_BAD_INPUT_DATA
+ *                  or POLARSSL_ERR_ECP_BUFFER_TOO_SMALL
+ */
+int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt,
+                         int format, size_t *olen,
+                         unsigned char *buf, size_t blen );
+
+/**
+ * \brief           Import an ECP group from null-terminated ASCII strings
+ *
+ * \param grp       Destination group
+ * \param radix     Input numeric base
+ * \param p         Prime modulus of the base field
+ * \param b         Constant term in the equation
+ * \param gx        The generator's X coordinate
+ * \param gy        The generator's Y coordinate
+ * \param n         The generator's order
+ *
+ * \return          0 if successful, or a POLARSSL_ERR_MPI_XXX error code
+ *
+ * \note            Sets all fields except modp.
+ */
+int ecp_group_read_string( ecp_group *grp, int radix,
+                           const char *p, const char *b,
+                           const char *gx, const char *gy, const char *n);
+
 /**
  * \brief           Set a group using well-known domain parameters
  *
@@ -368,57 +419,6 @@ int ecp_tls_read_group( ecp_group *grp, const unsigned char **buf, size_t len );
 int ecp_tls_write_group( const ecp_group *grp, size_t *olen,
                          unsigned char *buf, size_t blen );
 
-/**
- * \brief           Get curve information from an internal group identifier
- *
- * \param grp_id    A POLARSSL_ECP_DP_XXX value
- *
- * \return          The associated curve information or NULL
- */
-const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id );
-
-/**
- * \brief           Get curve information from a TLS NamedCurve value
- *
- * \param grp_id    A POLARSSL_ECP_DP_XXX value
- *
- * \return          The associated curve information or NULL
- */
-const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id );
-
-/**
- * \brief           Import a point from a TLS ECPoint record
- *
- * \param grp       ECP group used
- * \param pt        Destination point
- * \param buf       $(Start of input buffer)
- * \param len       Buffer length
- *
- * \return          O if successful,
- *                  POLARSSL_ERR_MPI_XXX if initialization failed
- *                  POLARSSL_ERR_ECP_BAD_INPUT_DATA if input is invalid
- */
-int ecp_tls_read_point( const ecp_group *grp, ecp_point *pt,
-                        const unsigned char **buf, size_t len );
-
-/**
- * \brief           Export a point as a TLS ECPoint record
- *
- * \param grp       ECP group used
- * \param pt        Point to export
- * \param format    Export format
- * \param olen      length of data written
- * \param buf       Buffer to write to
- * \param blen      Buffer length
- *
- * \return          0 if successful,
- *                  or POLARSSL_ERR_ECP_BAD_INPUT_DATA
- *                  or POLARSSL_ERR_ECP_BUFFER_TOO_SMALL
- */
-int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt,
-                         int format, size_t *olen,
-                         unsigned char *buf, size_t blen );
-
 /**
  * \brief           Addition: R = P + Q
  *
diff --git a/library/ecp.c b/library/ecp.c
index a408f2ba2..995f956b3 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -111,6 +111,42 @@ const ecp_curve_info *ecp_curve_list( void )
     return ecp_supported_curves;
 }
 
+/*
+ * Get the curve info for the internal identifier
+ */
+const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id )
+{
+    const ecp_curve_info *curve_info;
+
+    for( curve_info = ecp_curve_list();
+         curve_info->grp_id != POLARSSL_ECP_DP_NONE;
+         curve_info++ )
+    {
+        if( curve_info->grp_id == grp_id )
+            return( curve_info );
+    }
+
+    return( NULL );
+}
+
+/*
+ * Get the curve info from the TLS identifier
+ */
+const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id )
+{
+    const ecp_curve_info *curve_info;
+
+    for( curve_info = ecp_curve_list();
+         curve_info->grp_id != POLARSSL_ECP_DP_NONE;
+         curve_info++ )
+    {
+        if( curve_info->tls_id == tls_id )
+            return( curve_info );
+    }
+
+    return( NULL );
+}
+
 /*
  * Initialize (the components of) a point
  */
@@ -200,6 +236,29 @@ void ecp_keypair_free( ecp_keypair *key )
     ecp_point_free( &key->Q );
 }
 
+/*
+ * Copy the contents of a point
+ */
+int ecp_copy( ecp_point *P, const ecp_point *Q )
+{
+    int ret;
+
+    MPI_CHK( mpi_copy( &P->X, &Q->X ) );
+    MPI_CHK( mpi_copy( &P->Y, &Q->Y ) );
+    MPI_CHK( mpi_copy( &P->Z, &Q->Z ) );
+
+cleanup:
+    return( ret );
+}
+
+/*
+ * Copy the contents of a group object
+ */
+int ecp_group_copy( ecp_group *dst, const ecp_group *src )
+{
+    return ecp_use_known_dp( dst, src->id );
+}
+
 /*
  * Set point to zero
  */
@@ -223,29 +282,6 @@ int ecp_is_zero( ecp_point *pt )
     return( mpi_cmp_int( &pt->Z, 0 ) == 0 );
 }
 
-/*
- * Copy the contents of Q into P
- */
-int ecp_copy( ecp_point *P, const ecp_point *Q )
-{
-    int ret;
-
-    MPI_CHK( mpi_copy( &P->X, &Q->X ) );
-    MPI_CHK( mpi_copy( &P->Y, &Q->Y ) );
-    MPI_CHK( mpi_copy( &P->Z, &Q->Z ) );
-
-cleanup:
-    return( ret );
-}
-
-/*
- * Copy the contents of a group object
- */
-int ecp_group_copy( ecp_group *dst, const ecp_group *src )
-{
-    return ecp_use_known_dp( dst, src->id );
-}
-
 /*
  * Import a non-zero point from ASCII strings
  */
@@ -262,50 +298,6 @@ cleanup:
     return( ret );
 }
 
-/*
- * Import an ECP group from ASCII strings, general case (A used)
- */
-static int ecp_group_read_string_gen( ecp_group *grp, int radix,
-                           const char *p, const char *a, const char *b,
-                           const char *gx, const char *gy, const char *n)
-{
-    int ret;
-
-    MPI_CHK( mpi_read_string( &grp->P, radix, p ) );
-    MPI_CHK( mpi_read_string( &grp->A, radix, a ) );
-    MPI_CHK( mpi_read_string( &grp->B, radix, b ) );
-    MPI_CHK( ecp_point_read_string( &grp->G, radix, gx, gy ) );
-    MPI_CHK( mpi_read_string( &grp->N, radix, n ) );
-
-    grp->pbits = mpi_msb( &grp->P );
-    grp->nbits = mpi_msb( &grp->N );
-
-cleanup:
-    if( ret != 0 )
-        ecp_group_free( grp );
-
-    return( ret );
-}
-
-/*
- * Import an ECP group from ASCII strings, case A == -3
- */
-int ecp_group_read_string( ecp_group *grp, int radix,
-                           const char *p, const char *b,
-                           const char *gx, const char *gy, const char *n)
-{
-    int ret;
-
-    MPI_CHK( ecp_group_read_string_gen( grp, radix, p, "00", b, gx, gy, n ) );
-    MPI_CHK( mpi_add_int( &grp->A, &grp->P, -3 ) );
-
-cleanup:
-    if( ret != 0 )
-        ecp_group_free( grp );
-
-    return( ret );
-}
-
 /*
  * Export a point into unsigned binary data (SEC1 2.3.3)
  */
@@ -449,435 +441,48 @@ int ecp_tls_write_point( const ecp_group *grp, const ecp_point *pt,
 }
 
 /*
- * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi.
- * See the documentation of struct ecp_group.
- *
- * This function is in the critial loop for ecp_mul, so pay attention to perf.
+ * Import an ECP group from ASCII strings, general case (A used)
  */
-static int ecp_modp( mpi *N, const ecp_group *grp )
+static int ecp_group_read_string_gen( ecp_group *grp, int radix,
+                           const char *p, const char *a, const char *b,
+                           const char *gx, const char *gy, const char *n)
 {
     int ret;
 
-    if( grp->modp == NULL )
-        return( mpi_mod_mpi( N, N, &grp->P ) );
+    MPI_CHK( mpi_read_string( &grp->P, radix, p ) );
+    MPI_CHK( mpi_read_string( &grp->A, radix, a ) );
+    MPI_CHK( mpi_read_string( &grp->B, radix, b ) );
+    MPI_CHK( ecp_point_read_string( &grp->G, radix, gx, gy ) );
+    MPI_CHK( mpi_read_string( &grp->N, radix, n ) );
 
-    /* N->s < 0 is a much faster test, which fails only if N is 0 */
-    if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) ||
-        mpi_msb( N ) > 2 * grp->pbits )
-    {
-        return( POLARSSL_ERR_ECP_BAD_INPUT_DATA );
-    }
-
-    MPI_CHK( grp->modp( N ) );
-
-    /* N->s < 0 is a much faster test, which fails only if N is 0 */
-    while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 )
-        MPI_CHK( mpi_add_mpi( N, N, &grp->P ) );
-
-    while( mpi_cmp_mpi( N, &grp->P ) >= 0 )
-        /* we known P, N and the result are positive */
-        MPI_CHK( mpi_sub_abs( N, N, &grp->P ) );
+    grp->pbits = mpi_msb( &grp->P );
+    grp->nbits = mpi_msb( &grp->N );
 
 cleanup:
+    if( ret != 0 )
+        ecp_group_free( grp );
+
     return( ret );
 }
 
-#if defined(POLARSSL_ECP_NIST_OPTIM)
-
-#if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
 /*
- * Compared to the way things are presented in FIPS 186-3 D.2,
- * we proceed in columns, from right (least significant chunk) to left,
- * adding chunks to N in place, and keeping a carry for the next chunk.
- * This avoids moving things around in memory, and uselessly adding zeros,
- * compared to the more straightforward, line-oriented approach.
- *
- * For this prime we need to handle data in chunks of 64 bits.
- * Since this is always a multiple of our basic t_uint, we can
- * use a t_uint * to designate such a chunk, and small loops to handle it.
+ * Import an ECP group from ASCII strings, case A == -3
  */
-
-/* Add 64-bit chunks (dst += src) and update carry */
-static inline void add64( t_uint *dst, t_uint *src, t_uint *carry )
-{
-    unsigned char i;
-    t_uint c = 0;
-    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ )
-    {
-        *dst += c;      c  = ( *dst < c );
-        *dst += *src;   c += ( *dst < *src );
-    }
-    *carry += c;
-}
-
-/* Add carry to a 64-bit chunk and update carry */
-static inline void carry64( t_uint *dst, t_uint *carry )
-{
-    unsigned char i;
-    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ )
-    {
-        *dst += *carry;
-        *carry  = ( *dst < *carry );
-    }
-}
-
-#define WIDTH       8 / sizeof( t_uint )
-#define A( i )      N->p + i * WIDTH
-#define ADD( i )    add64( p, A( i ), &c )
-#define NEXT        p += WIDTH; carry64( p, &c )
-#define LAST        p += WIDTH; *p = c; while( ++p < end ) *p = 0
-
-/*
- * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1)
- */
-static int ecp_mod_p192( mpi *N )
+int ecp_group_read_string( ecp_group *grp, int radix,
+                           const char *p, const char *b,
+                           const char *gx, const char *gy, const char *n)
 {
     int ret;
-    t_uint c = 0;
-    t_uint *p, *end;
 
-    /* Make sure we have enough blocks so that A(5) is legal */
-    MPI_CHK( mpi_grow( N, 6 * WIDTH ) );
-
-    p = N->p;
-    end = p + N->n;
-
-    ADD( 3 ); ADD( 5 );             NEXT; // A0 += A3 + A5
-    ADD( 3 ); ADD( 4 ); ADD( 5 );   NEXT; // A1 += A3 + A4 + A5
-    ADD( 4 ); ADD( 5 );             LAST; // A2 += A4 + A5
-
-cleanup:
-    return( ret );
-}
-
-#undef WIDTH
-#undef A
-#undef ADD
-#undef NEXT
-#undef LAST
-#endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
-    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
-    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
-/*
- * The reader is advised to first understand ecp_mod_p192() since the same
- * general structure is used here, but with additional complications:
- * (1) chunks of 32 bits, and (2) subtractions.
- */
-
-/*
- * For these primes, we need to handle data in chunks of 32 bits.
- * This makes it more complicated if we use 64 bits limbs in MPI,
- * which prevents us from using a uniform access method as for p192.
- *
- * So, we define a mini abstraction layer to access 32 bit chunks,
- * load them in 'cur' for work, and store them back from 'cur' when done.
- *
- * While at it, also define the size of N in terms of 32-bit chunks.
- */
-#define LOAD32      cur = A( i );
-
-#if defined(POLARSSL_HAVE_INT8)     /* 8 bit */
-
-#define MAX32       N->n / 4
-#define A( j )      (uint32_t)( N->p[4*j+0]       ) |  \
-                              ( N->p[4*j+1] << 8  ) |  \
-                              ( N->p[4*j+2] << 16 ) |  \
-                              ( N->p[4*j+3] << 24 )
-#define STORE32     N->p[4*i+0] = (uint8_t)( cur       );   \
-                    N->p[4*i+1] = (uint8_t)( cur >> 8  );   \
-                    N->p[4*i+2] = (uint8_t)( cur >> 16 );   \
-                    N->p[4*i+3] = (uint8_t)( cur >> 24 );
-
-#elif defined(POLARSSL_HAVE_INT16)  /* 16 bit */
-
-#define MAX32       N->n / 2
-#define A( j )      (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 )
-#define STORE32     N->p[2*i+0] = (uint16_t)( cur       );  \
-                    N->p[2*i+1] = (uint16_t)( cur >> 16 );
-
-#elif defined(POLARSSL_HAVE_INT32)  /* 32 bit */
-
-#define MAX32       N->n
-#define A( j )      N->p[j]
-#define STORE32     N->p[i] = cur;
-
-#else                               /* 64-bit */
-
-#define MAX32       N->n * 2
-#define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
-#define STORE32                                   \
-    if( i % 2 ) {                                 \
-        N->p[i/2] &= 0x00000000FFFFFFFF;          \
-        N->p[i/2] |= ((uint64_t) cur) << 32;      \
-    } else {                                      \
-        N->p[i/2] &= 0xFFFFFFFF00000000;          \
-        N->p[i/2] |= (uint64_t) cur;              \
-    }
-
-#endif /* sizeof( t_uint ) */
-
-/*
- * Helpers for addition and subtraction of chunks, with signed carry.
- */
-static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *dst += src;
-    *carry += ( *dst < src );
-}
-
-static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
-{
-    *carry -= ( *dst < src );
-    *dst -= src;
-}
-
-#define ADD( j )    add32( &cur, A( j ), &c );
-#define SUB( j )    sub32( &cur, A( j ), &c );
-
-/*
- * Helpers for the main 'loop'
- */
-#define INIT( b )                                           \
-    int ret;                                                \
-    signed char c = 0, cc;                                  \
-    uint32_t cur;                                           \
-    size_t i = 0, bits = b;                                 \
-                                                            \
-    MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \
-    LOAD32;
-
-#define NEXT                    \
-    STORE32; i++; LOAD32;       \
-    cc = c; c = 0;              \
-    if( cc < 0 )                \
-        sub32( &cur, -cc, &c ); \
-    else                        \
-        add32( &cur, cc, &c );  \
-
-#define LAST                                    \
-    STORE32; i++;                               \
-    cur = c > 0 ? c : 0; STORE32;               \
-    cur = 0; while( ++i < MAX32 ) { STORE32; }  \
-    if( c < 0 ) fix_negative( N, c, bits );
-
-/*
- * If the result is negative, we get it in the form
- * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits'
- */
-static inline int fix_negative( mpi *N, signed char c, size_t bits )
-{
-    int ret;
-    mpi C;
-    t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ];
-
-    /* C = - c * 2^(bits + 32) */
-    C.s = 1;
-    C.n = bits / 8 / sizeof( t_uint ) + 1;
-    C.p = Cp;
-    memset( Cp, 0, C.n * sizeof( t_uint ) );
-#if defined(POLARSSL_HAVE_INT64)
-    if( bits == 224 )
-        Cp[ C.n - 1 ] = ((t_uint) -c) << 32;
-    else
-#endif
-        Cp[ C.n - 1 ] = (t_uint) -c;
-
-    /* N = - ( C - N ) */
-    MPI_CHK( mpi_sub_abs( N, &C, N ) );
-    N->s = -1;
+    MPI_CHK( ecp_group_read_string_gen( grp, radix, p, "00", b, gx, gy, n ) );
+    MPI_CHK( mpi_add_int( &grp->A, &grp->P, -3 ) );
 
 cleanup:
+    if( ret != 0 )
+        ecp_group_free( grp );
 
     return( ret );
 }
-#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
-          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
-          POLARSSL_ECP_DP_SECP384R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
-/*
- * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
- */
-static int ecp_mod_p224( mpi *N )
-{
-    INIT( 224 );
-
-    SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
-    SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
-    SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
-    SUB( 10 ); ADD(  7 ); ADD( 11 );    NEXT; // A3 += -A10 + A7 + A11
-    SUB( 11 ); ADD(  8 ); ADD( 12 );    NEXT; // A4 += -A11 + A8 + A12
-    SUB( 12 ); ADD(  9 ); ADD( 13 );    NEXT; // A5 += -A12 + A9 + A13
-    SUB( 13 ); ADD( 10 );               LAST; // A6 += -A13 + A10
-
-cleanup:
-    return( ret );
-}
-#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED)
-/*
- * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3)
- */
-static int ecp_mod_p256( mpi *N )
-{
-    INIT( 256 );
-
-    ADD(  8 ); ADD(  9 );
-    SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 );             NEXT; // A0
-
-    ADD(  9 ); ADD( 10 );
-    SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 );             NEXT; // A1
-
-    ADD( 10 ); ADD( 11 );
-    SUB( 13 ); SUB( 14 ); SUB( 15 );                        NEXT; // A2
-
-    ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 );
-    SUB( 15 ); SUB(  8 ); SUB(  9 );                        NEXT; // A3
-
-    ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 );
-    SUB(  9 ); SUB( 10 );                                   NEXT; // A4
-
-    ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 );
-    SUB( 10 ); SUB( 11 );                                   NEXT; // A5
-
-    ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 );
-    SUB(  8 ); SUB(  9 );                                   NEXT; // A6
-
-    ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 );
-    SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 );             LAST; // A7
-
-cleanup:
-    return( ret );
-}
-#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
-/*
- * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4)
- */
-static int ecp_mod_p384( mpi *N )
-{
-    INIT( 384 );
-
-    ADD( 12 ); ADD( 21 ); ADD( 20 );
-    SUB( 23 );                                              NEXT; // A0
-
-    ADD( 13 ); ADD( 22 ); ADD( 23 );
-    SUB( 12 ); SUB( 20 );                                   NEXT; // A2
-
-    ADD( 14 ); ADD( 23 );
-    SUB( 13 ); SUB( 21 );                                   NEXT; // A2
-
-    ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 );
-    SUB( 14 ); SUB( 22 ); SUB( 23 );                        NEXT; // A3
-
-    ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 );
-    SUB( 15 ); SUB( 23 ); SUB( 23 );                        NEXT; // A4
-
-    ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 );
-    SUB( 16 );                                              NEXT; // A5
-
-    ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 );
-    SUB( 17 );                                              NEXT; // A6
-
-    ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 );
-    SUB( 18 );                                              NEXT; // A7
-
-    ADD( 20 ); ADD( 17 ); ADD( 16 );
-    SUB( 19 );                                              NEXT; // A8
-
-    ADD( 21 ); ADD( 18 ); ADD( 17 );
-    SUB( 20 );                                              NEXT; // A9
-
-    ADD( 22 ); ADD( 19 ); ADD( 18 );
-    SUB( 21 );                                              NEXT; // A10
-
-    ADD( 23 ); ADD( 20 ); ADD( 19 );
-    SUB( 22 );                                              LAST; // A11
-
-cleanup:
-    return( ret );
-}
-#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
-    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
-    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
-
-#undef A
-#undef LOAD32
-#undef STORE32
-#undef MAX32
-#undef INIT
-#undef NEXT
-#undef LAST
-
-#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
-          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
-          POLARSSL_ECP_DP_SECP384R1_ENABLED */
-
-#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
-/*
- * Here we have a real Mersenne prime, so things are more straightforward.
- * However, things are aligned on a 'weird' boundary (521 bits).
- */
-
-/* Size of p521 in terms of t_uint */
-#define P521_WIDTH      ( 521 / 8 / sizeof( t_uint ) + 1 )
-
-/* Bits to keep in the most significant t_uint */
-#if defined(POLARSSL_HAVE_INT8)
-#define P521_MASK       0x01
-#else
-#define P521_MASK       0x01FF
-#endif
-
-/*
- * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5)
- * Write N as A1 + 2^521 A0, return A0 + A1
- */
-static int ecp_mod_p521( mpi *N )
-{
-    int ret;
-    size_t i;
-    mpi M;
-    t_uint Mp[P521_WIDTH + 1];
-    /* Worst case for the size of M is when t_uint is 16 bits:
-     * we need to hold bits 513 to 1056, which is 34 limbs, that is
-     * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */
-
-    if( N->n < P521_WIDTH )
-        return( 0 );
-
-    /* M = A1 */
-    M.s = 1;
-    M.n = N->n - ( P521_WIDTH - 1 );
-    if( M.n > P521_WIDTH + 1 )
-        M.n = P521_WIDTH + 1;
-    M.p = Mp;
-    memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) );
-    MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
-
-    /* N = A0 */
-    N->p[P521_WIDTH - 1] &= P521_MASK;
-    for( i = P521_WIDTH; i < N->n; i++ )
-        N->p[i] = 0;
-
-    /* N = A0 + A1 */
-    MPI_CHK( mpi_add_abs( N, N, &M ) );
-
-cleanup:
-    return( ret );
-}
-
-#undef P521_WIDTH
-#undef P521_MASK
-#endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */
-
-#endif /* POLARSSL_ECP_NIST_OPTIM */
 
 /*
  * Domain parameters for secp192r1
@@ -1024,6 +629,15 @@ cleanup:
     "AADD9DB8DBE9C48B3FD4E6AE33C9FC07CB308DB3B3C9D20ED6639CCA703308" \
     "70553E5C414CA92619418661197FAC10471DB1D381085DDADDB58796829CA90069"
 
+#if defined(POLARSSL_ECP_NIST_OPTIM)
+/* Forward declarations */
+static int ecp_mod_p192( mpi * );
+static int ecp_mod_p224( mpi * );
+static int ecp_mod_p256( mpi * );
+static int ecp_mod_p384( mpi * );
+static int ecp_mod_p521( mpi * );
+#endif
+
 /*
  * Set a group using well-known domain parameters
  */
@@ -1176,39 +790,37 @@ int ecp_tls_write_group( const ecp_group *grp, size_t *olen,
 }
 
 /*
- * Get the curve info from the TLS identifier
+ * Wrapper around fast quasi-modp functions, with fall-back to mpi_mod_mpi.
+ * See the documentation of struct ecp_group.
+ *
+ * This function is in the critical loop for ecp_mul, so pay attention to perf.
  */
-const ecp_curve_info *ecp_curve_info_from_tls_id( uint16_t tls_id )
+static int ecp_modp( mpi *N, const ecp_group *grp )
 {
-    const ecp_curve_info *curve_info;
+    int ret;
 
-    for( curve_info = ecp_curve_list();
-         curve_info->grp_id != POLARSSL_ECP_DP_NONE;
-         curve_info++ )
+    if( grp->modp == NULL )
+        return( mpi_mod_mpi( N, N, &grp->P ) );
+
+    /* N->s < 0 is a much faster test, which fails only if N is 0 */
+    if( ( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 ) ||
+        mpi_msb( N ) > 2 * grp->pbits )
     {
-        if( curve_info->tls_id == tls_id )
-            return( curve_info );
+        return( POLARSSL_ERR_ECP_BAD_INPUT_DATA );
     }
 
-    return( NULL );
-}
+    MPI_CHK( grp->modp( N ) );
 
-/*
- * Get the curve info for the internal identifer
- */
-const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id )
-{
-    const ecp_curve_info *curve_info;
+    /* N->s < 0 is a much faster test, which fails only if N is 0 */
+    while( N->s < 0 && mpi_cmp_int( N, 0 ) != 0 )
+        MPI_CHK( mpi_add_mpi( N, N, &grp->P ) );
 
-    for( curve_info = ecp_curve_list();
-         curve_info->grp_id != POLARSSL_ECP_DP_NONE;
-         curve_info++ )
-    {
-        if( curve_info->grp_id == grp_id )
-            return( curve_info );
-    }
+    while( mpi_cmp_mpi( N, &grp->P ) >= 0 )
+        /* we know P, N and the result are positive */
+        MPI_CHK( mpi_sub_abs( N, N, &grp->P ) );
 
-    return( NULL );
+cleanup:
+    return( ret );
 }
 
 /*
@@ -1231,7 +843,7 @@ const ecp_curve_info *ecp_curve_info_from_grp_id( ecp_group_id grp_id )
  * N->s < 0 is a very fast test, which fails only if N is 0
  */
 #define MOD_SUB( N )                                \
-    while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 )               \
+    while( N.s < 0 && mpi_cmp_int( &N, 0 ) != 0 )   \
         MPI_CHK( mpi_add_mpi( &N, &N, &grp->P ) )
 
 /*
@@ -1418,7 +1030,7 @@ cleanup:
 }
 
 /*
- * Addition or subtraction: R = P + Q or R = P + Q,
+ * Addition or subtraction: R = P + Q or R = P - Q,
  * mixed affine-Jacobian coordinates (GECC 3.22)
  *
  * The coordinates of Q must be normalized (= affine),
@@ -1968,6 +1580,396 @@ int ecp_gen_keypair( ecp_group *grp, mpi *d, ecp_point *Q,
     return( ecp_mul( grp, Q, d, &grp->G, f_rng, p_rng ) );
 }
 
+#if defined(POLARSSL_ECP_NIST_OPTIM)
+
+#if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
+/*
+ * Compared to the way things are presented in FIPS 186-3 D.2,
+ * we proceed in columns, from right (least significant chunk) to left,
+ * adding chunks to N in place, and keeping a carry for the next chunk.
+ * This avoids moving things around in memory, and uselessly adding zeros,
+ * compared to the more straightforward, line-oriented approach.
+ *
+ * For this prime we need to handle data in chunks of 64 bits.
+ * Since this is always a multiple of our basic t_uint, we can
+ * use a t_uint * to designate such a chunk, and small loops to handle it.
+ */
+
+/* Add 64-bit chunks (dst += src) and update carry */
+static inline void add64( t_uint *dst, t_uint *src, t_uint *carry )
+{
+    unsigned char i;
+    t_uint c = 0;
+    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++, src++ )
+    {
+        *dst += c;      c  = ( *dst < c );
+        *dst += *src;   c += ( *dst < *src );
+    }
+    *carry += c;
+}
+
+/* Add carry to a 64-bit chunk and update carry */
+static inline void carry64( t_uint *dst, t_uint *carry )
+{
+    unsigned char i;
+    for( i = 0; i < 8 / sizeof( t_uint ); i++, dst++ )
+    {
+        *dst += *carry;
+        *carry  = ( *dst < *carry );
+    }
+}
+
+#define WIDTH       8 / sizeof( t_uint )
+#define A( i )      N->p + i * WIDTH
+#define ADD( i )    add64( p, A( i ), &c )
+#define NEXT        p += WIDTH; carry64( p, &c )
+#define LAST        p += WIDTH; *p = c; while( ++p < end ) *p = 0
+
+/*
+ * Fast quasi-reduction modulo p192 (FIPS 186-3 D.2.1)
+ */
+static int ecp_mod_p192( mpi *N )
+{
+    int ret;
+    t_uint c = 0;
+    t_uint *p, *end;
+
+    /* Make sure we have enough blocks so that A(5) is legal */
+    MPI_CHK( mpi_grow( N, 6 * WIDTH ) );
+
+    p = N->p;
+    end = p + N->n;
+
+    ADD( 3 ); ADD( 5 );             NEXT; // A0 += A3 + A5
+    ADD( 3 ); ADD( 4 ); ADD( 5 );   NEXT; // A1 += A3 + A4 + A5
+    ADD( 4 ); ADD( 5 );             LAST; // A2 += A4 + A5
+
+cleanup:
+    return( ret );
+}
+
+#undef WIDTH
+#undef A
+#undef ADD
+#undef NEXT
+#undef LAST
+#endif /* POLARSSL_ECP_DP_SECP192R1_ENABLED */
+
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+/*
+ * The reader is advised to first understand ecp_mod_p192() since the same
+ * general structure is used here, but with additional complications:
+ * (1) chunks of 32 bits, and (2) subtractions.
+ */
+
+/*
+ * For these primes, we need to handle data in chunks of 32 bits.
+ * This makes it more complicated if we use 64 bits limbs in MPI,
+ * which prevents us from using a uniform access method as for p192.
+ *
+ * So, we define a mini abstraction layer to access 32 bit chunks,
+ * load them in 'cur' for work, and store them back from 'cur' when done.
+ *
+ * While at it, also define the size of N in terms of 32-bit chunks.
+ */
+#define LOAD32      cur = A( i );
+
+#if defined(POLARSSL_HAVE_INT8)     /* 8 bit */
+
+#define MAX32       N->n / 4
+#define A( j )      (uint32_t)( N->p[4*j+0]       ) |  \
+                              ( N->p[4*j+1] << 8  ) |  \
+                              ( N->p[4*j+2] << 16 ) |  \
+                              ( N->p[4*j+3] << 24 )
+#define STORE32     N->p[4*i+0] = (uint8_t)( cur       );   \
+                    N->p[4*i+1] = (uint8_t)( cur >> 8  );   \
+                    N->p[4*i+2] = (uint8_t)( cur >> 16 );   \
+                    N->p[4*i+3] = (uint8_t)( cur >> 24 );
+
+#elif defined(POLARSSL_HAVE_INT16)  /* 16 bit */
+
+#define MAX32       N->n / 2
+#define A( j )      (uint32_t)( N->p[2*j] ) | ( N->p[2*j+1] << 16 )
+#define STORE32     N->p[2*i+0] = (uint16_t)( cur       );  \
+                    N->p[2*i+1] = (uint16_t)( cur >> 16 );
+
+#elif defined(POLARSSL_HAVE_INT32)  /* 32 bit */
+
+#define MAX32       N->n
+#define A( j )      N->p[j]
+#define STORE32     N->p[i] = cur;
+
+#else                               /* 64-bit */
+
+#define MAX32       N->n * 2
+#define A( j ) j % 2 ? (uint32_t)( N->p[j/2] >> 32 ) : (uint32_t)( N->p[j/2] )
+#define STORE32                                   \
+    if( i % 2 ) {                                 \
+        N->p[i/2] &= 0x00000000FFFFFFFF;          \
+        N->p[i/2] |= ((uint64_t) cur) << 32;      \
+    } else {                                      \
+        N->p[i/2] &= 0xFFFFFFFF00000000;          \
+        N->p[i/2] |= (uint64_t) cur;              \
+    }
+
+#endif /* sizeof( t_uint ) */
+
+/*
+ * Helpers for addition and subtraction of chunks, with signed carry.
+ */
+static inline void add32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *dst += src;
+    *carry += ( *dst < src );
+}
+
+static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
+{
+    *carry -= ( *dst < src );
+    *dst -= src;
+}
+
+#define ADD( j )    add32( &cur, A( j ), &c );
+#define SUB( j )    sub32( &cur, A( j ), &c );
+
+/*
+ * Helpers for the main 'loop'
+ */
+#define INIT( b )                                           \
+    int ret;                                                \
+    signed char c = 0, cc;                                  \
+    uint32_t cur;                                           \
+    size_t i = 0, bits = b;                                 \
+                                                            \
+    MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \
+    LOAD32;
+
+#define NEXT                    \
+    STORE32; i++; LOAD32;       \
+    cc = c; c = 0;              \
+    if( cc < 0 )                \
+        sub32( &cur, -cc, &c ); \
+    else                        \
+        add32( &cur, cc, &c );  \
+
+#define LAST                                    \
+    STORE32; i++;                               \
+    cur = c > 0 ? c : 0; STORE32;               \
+    cur = 0; while( ++i < MAX32 ) { STORE32; }  \
+    if( c < 0 ) fix_negative( N, c, bits );
+
+/*
+ * If the result is negative, we get it in the form
+ * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits'
+ */
+static inline int fix_negative( mpi *N, signed char c, size_t bits )
+{
+    int ret;
+    mpi C;
+    t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ];
+
+    /* C = - c * 2^(bits + 32) */
+    C.s = 1;
+    C.n = bits / 8 / sizeof( t_uint ) + 1;
+    C.p = Cp;
+    memset( Cp, 0, C.n * sizeof( t_uint ) );
+#if defined(POLARSSL_HAVE_INT64)
+    if( bits == 224 )
+        Cp[ C.n - 1 ] = ((t_uint) -c) << 32;
+    else
+#endif
+        Cp[ C.n - 1 ] = (t_uint) -c;
+
+    /* N = - ( C - N ) */
+    MPI_CHK( mpi_sub_abs( N, &C, N ) );
+    N->s = -1;
+
+cleanup:
+
+    return( ret );
+}
+
+#if defined(POLARSSL_ECP_DP_SECP224R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
+ */
+static int ecp_mod_p224( mpi *N )
+{
+    INIT( 224 );
+
+    SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
+    SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
+    SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
+    SUB( 10 ); ADD(  7 ); ADD( 11 );    NEXT; // A3 += -A10 + A7 + A11
+    SUB( 11 ); ADD(  8 ); ADD( 12 );    NEXT; // A4 += -A11 + A8 + A12
+    SUB( 12 ); ADD(  9 ); ADD( 13 );    NEXT; // A5 += -A12 + A9 + A13
+    SUB( 13 ); ADD( 10 );               LAST; // A6 += -A13 + A10
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED */
+
+#if defined(POLARSSL_ECP_DP_SECP256R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3)
+ */
+static int ecp_mod_p256( mpi *N )
+{
+    INIT( 256 );
+
+    ADD(  8 ); ADD(  9 );
+    SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 );             NEXT; // A0
+
+    ADD(  9 ); ADD( 10 );
+    SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 );             NEXT; // A1
+
+    ADD( 10 ); ADD( 11 );
+    SUB( 13 ); SUB( 14 ); SUB( 15 );                        NEXT; // A2
+
+    ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 );
+    SUB( 15 ); SUB(  8 ); SUB(  9 );                        NEXT; // A3
+
+    ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 );
+    SUB(  9 ); SUB( 10 );                                   NEXT; // A4
+
+    ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 );
+    SUB( 10 ); SUB( 11 );                                   NEXT; // A5
+
+    ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 );
+    SUB(  8 ); SUB(  9 );                                   NEXT; // A6
+
+    ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 );
+    SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 );             LAST; // A7
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP256R1_ENABLED */
+
+#if defined(POLARSSL_ECP_DP_SECP384R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4)
+ */
+static int ecp_mod_p384( mpi *N )
+{
+    INIT( 384 );
+
+    ADD( 12 ); ADD( 21 ); ADD( 20 );
+    SUB( 23 );                                              NEXT; // A0
+
+    ADD( 13 ); ADD( 22 ); ADD( 23 );
+    SUB( 12 ); SUB( 20 );                                   NEXT; // A1
+
+    ADD( 14 ); ADD( 23 );
+    SUB( 13 ); SUB( 21 );                                   NEXT; // A2
+
+    ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 );
+    SUB( 14 ); SUB( 22 ); SUB( 23 );                        NEXT; // A3
+
+    ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 );
+    SUB( 15 ); SUB( 23 ); SUB( 23 );                        NEXT; // A4
+
+    ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 );
+    SUB( 16 );                                              NEXT; // A5
+
+    ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 );
+    SUB( 17 );                                              NEXT; // A6
+
+    ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 );
+    SUB( 18 );                                              NEXT; // A7
+
+    ADD( 20 ); ADD( 17 ); ADD( 16 );
+    SUB( 19 );                                              NEXT; // A8
+
+    ADD( 21 ); ADD( 18 ); ADD( 17 );
+    SUB( 20 );                                              NEXT; // A9
+
+    ADD( 22 ); ADD( 19 ); ADD( 18 );
+    SUB( 21 );                                              NEXT; // A10
+
+    ADD( 23 ); ADD( 20 ); ADD( 19 );
+    SUB( 22 );                                              LAST; // A11
+
+cleanup:
+    return( ret );
+}
+#endif /* POLARSSL_ECP_DP_SECP384R1_ENABLED */
+
+#undef A
+#undef LOAD32
+#undef STORE32
+#undef MAX32
+#undef INIT
+#undef NEXT
+#undef LAST
+
+#endif /* POLARSSL_ECP_DP_SECP224R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP256R1_ENABLED ||
+          POLARSSL_ECP_DP_SECP384R1_ENABLED */
+
+#if defined(POLARSSL_ECP_DP_SECP521R1_ENABLED)
+/*
+ * Here we have an actual Mersenne prime, so things are more straightforward.
+ * However, chunks are aligned on a 'weird' boundary (521 bits).
+ */
+
+/* Size of p521 in terms of t_uint */
+#define P521_WIDTH      ( 521 / 8 / sizeof( t_uint ) + 1 )
+
+/* Bits to keep in the most significant t_uint */
+#if defined(POLARSSL_HAVE_INT8)
+#define P521_MASK       0x01
+#else
+#define P521_MASK       0x01FF
+#endif
+
+/*
+ * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5)
+ * Write N as A0 + 2^521 A1, return A0 + A1
+ */
+static int ecp_mod_p521( mpi *N )
+{
+    int ret;
+    size_t i;
+    mpi M;
+    t_uint Mp[P521_WIDTH + 1];
+    /* Worst case for the size of M is when t_uint is 16 bits:
+     * we need to hold bits 513 to 1056, which is 34 limbs, that is
+     * P521_WIDTH + 1. Otherwise P521_WIDTH is enough. */
+
+    if( N->n < P521_WIDTH )
+        return( 0 );
+
+    /* M = A1 */
+    M.s = 1;
+    M.n = N->n - ( P521_WIDTH - 1 );
+    if( M.n > P521_WIDTH + 1 )
+        M.n = P521_WIDTH + 1;
+    M.p = Mp;
+    memcpy( Mp, N->p + P521_WIDTH - 1, M.n * sizeof( t_uint ) );
+    MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
+
+    /* N = A0 */
+    N->p[P521_WIDTH - 1] &= P521_MASK;
+    for( i = P521_WIDTH; i < N->n; i++ )
+        N->p[i] = 0;
+
+    /* N = A0 + A1 */
+    MPI_CHK( mpi_add_abs( N, N, &M ) );
+
+cleanup:
+    return( ret );
+}
+
+#undef P521_WIDTH
+#undef P521_MASK
+#endif /* POLARSSL_ECP_DP_SECP521R1_ENABLED */
+
+#endif /* POLARSSL_ECP_NIST_OPTIM */
+
 #if defined(POLARSSL_SELF_TEST)
 
 /*

From b21c81fb41caf007dd24b6fb0ad9f0a27b6dc56b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 20:45:04 +0200
Subject: [PATCH 13/14] Use less memory in fix_negative()

---
 library/ecp.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/library/ecp.c b/library/ecp.c
index 995f956b3..64d4e0339 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -1736,12 +1736,20 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
 
 /*
  * Helpers for the main 'loop'
+ * (see fix_negative for the motivation of C)
  */
 #define INIT( b )                                           \
     int ret;                                                \
     signed char c = 0, cc;                                  \
     uint32_t cur;                                           \
     size_t i = 0, bits = b;                                 \
+    mpi C;                                                  \
+    t_uint Cp[ b / 8 / sizeof( t_uint) + 1 ];               \
+                                                            \
+    C.s = 1;                                                \
+    C.n = b / 8 / sizeof( t_uint) + 1;                      \
+    C.p = Cp;                                               \
+    memset( Cp, 0, C.n * sizeof( t_uint ) );                \
                                                             \
     MPI_CHK( mpi_grow( N, b * 2 / 8 / sizeof( t_uint ) ) ); \
     LOAD32;
@@ -1758,32 +1766,28 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     STORE32; i++;                               \
     cur = c > 0 ? c : 0; STORE32;               \
     cur = 0; while( ++i < MAX32 ) { STORE32; }  \
-    if( c < 0 ) fix_negative( N, c, bits );
+    if( c < 0 ) fix_negative( N, c, &C, bits );
 
 /*
  * If the result is negative, we get it in the form
  * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits'
  */
-static inline int fix_negative( mpi *N, signed char c, size_t bits )
+static inline int fix_negative( mpi *N, signed char c, mpi *C, size_t bits )
 {
     int ret;
-    mpi C;
-    t_uint Cp[ 384 / 8 / sizeof( t_uint) + 1 ];
 
     /* C = - c * 2^(bits + 32) */
-    C.s = 1;
-    C.n = bits / 8 / sizeof( t_uint ) + 1;
-    C.p = Cp;
-    memset( Cp, 0, C.n * sizeof( t_uint ) );
-#if defined(POLARSSL_HAVE_INT64)
+#if !defined(POLARSSL_HAVE_INT64)
+    ((void) bits);
+#else
     if( bits == 224 )
-        Cp[ C.n - 1 ] = ((t_uint) -c) << 32;
+        C->p[ C->n - 1 ] = ((t_uint) -c) << 32;
     else
 #endif
-        Cp[ C.n - 1 ] = (t_uint) -c;
+        C->p[ C->n - 1 ] = (t_uint) -c;
 
     /* N = - ( C - N ) */
-    MPI_CHK( mpi_sub_abs( N, &C, N ) );
+    MPI_CHK( mpi_sub_abs( N, C, N ) );
     N->s = -1;
 
 cleanup:

From 9fcceac943006b6e6b3a8a5b3ba9a9463d04090c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20P=C3=A9gouri=C3=A9-Gonnard?= <mpg@elzevir.fr>
Date: Wed, 23 Oct 2013 20:56:12 +0200
Subject: [PATCH 14/14] Add a comment about modules coupling

---
 library/ecp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/library/ecp.c b/library/ecp.c
index 64d4e0339..bedb67506 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -1581,6 +1581,15 @@ int ecp_gen_keypair( ecp_group *grp, mpi *d, ecp_point *Q,
 }
 
 #if defined(POLARSSL_ECP_NIST_OPTIM)
+/*
+ * Fast reduction modulo the primes used by the NIST curves.
+ *
+ * These functions are critical for speed, but not needed for correct
+ * operations. So, we make the choice to heavily rely on the internals of our
+ * bignum library, which creates a tight coupling between these functions and
+ * our MPI implementation.  However, the coupling between the ECP module and
+ * MPI remains loose, since these functions can be deactivated at will.
+ */
 
 #if defined(POLARSSL_ECP_DP_SECP192R1_ENABLED)
 /*