Perf: rewrite of ecp_double_jac

- Improve optimization for special case A == -3.
- Add optimization for special case A == 0.
- Use alternative base formula, saving several additions.
- Reduce temp variables to 4 (from 6).
This commit is contained in:
Peter Dettman 2015-02-07 14:43:51 +07:00
parent 82788fb63b
commit ce661b2cb8

View File

@ -912,70 +912,86 @@ cleanup:
/*
 * Point doubling R = 2 P, Jacobian coordinates
 *
 * Based on http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-1998-cmo-2 .
 *
 * We follow the variable naming fairly closely. The formula variations that trade a MUL for a SQR
 * (plus a few ADDs) aren't useful as our bignum implementation doesn't distinguish squaring.
 *
 * Standard optimizations are applied when curve parameter A is one of { 0, -3 }.
 *
 * Cost: 1D := 3M + 4S          (A ==  0)
 *             4M + 4S          (A == -3)
 *             3M + 6S + 1a     otherwise
 *
 * grp: curve group; grp->A.p == NULL is treated as A == -3.
 * R:   receives the doubled point. It is written only after the last read
 *      of P, so R and P may refer to the same point.
 * P:   point to double.
 *
 * Returns the value left in ret by the MPI_CHK() steps (the macro is defined
 * outside this function; presumably 0 on success and the failing call's
 * error code otherwise).
 *
 * Only four temporaries (M, S, T, U) are used; each is reused once its
 * previous value is no longer needed, so statement order matters.
 */
static int ecp_double_jac( const ecp_group *grp, ecp_point *R,
                           const ecp_point *P )
{
    int ret;
    mpi M, S, T, U;

#if defined(POLARSSL_SELF_TEST)
    dbl_count++;
#endif

    mpi_init( &M ); mpi_init( &S ); mpi_init( &T ); mpi_init( &U );

    /* Special case for A = -3 */
    if( grp->A.p == NULL )
    {
        /* M = 3(X + Z^2)(X - Z^2) */
        MPI_CHK( mpi_mul_mpi( &S,  &P->Z,  &P->Z   ) ); MOD_MUL( S );
        MPI_CHK( mpi_add_mpi( &T,  &P->X,  &S      ) ); MOD_ADD( T );
        MPI_CHK( mpi_sub_mpi( &U,  &P->X,  &S      ) ); MOD_SUB( U );
        MPI_CHK( mpi_mul_mpi( &S,  &T,     &U      ) ); MOD_MUL( S );
        MPI_CHK( mpi_mul_int( &M,  &S,     3       ) ); MOD_ADD( M );
    }
    else
    {
        /* M = 3.X^2 */
        MPI_CHK( mpi_mul_mpi( &S,  &P->X,  &P->X   ) ); MOD_MUL( S );
        MPI_CHK( mpi_mul_int( &M,  &S,     3       ) ); MOD_ADD( M );

        /* Optimize away for "koblitz" curves with A = 0 */
        if( mpi_cmp_int( &grp->A, 0 ) != 0 )
        {
            /* M += A.Z^4 */
            MPI_CHK( mpi_mul_mpi( &S,  &P->Z,  &P->Z   ) ); MOD_MUL( S );
            MPI_CHK( mpi_mul_mpi( &T,  &S,     &S      ) ); MOD_MUL( T );
            MPI_CHK( mpi_mul_mpi( &S,  &T,     &grp->A ) ); MOD_MUL( S );
            MPI_CHK( mpi_add_mpi( &M,  &M,     &S      ) ); MOD_ADD( M );
        }
    }

    /* S = 4.X.Y^2  (T holds 2.Y^2 afterwards and is reused for U below) */
    MPI_CHK( mpi_mul_mpi( &T,  &P->Y,  &P->Y   ) ); MOD_MUL( T );
    MPI_CHK( mpi_shift_l( &T,  1               ) ); MOD_ADD( T );
    MPI_CHK( mpi_mul_mpi( &S,  &P->X,  &T      ) ); MOD_MUL( S );
    MPI_CHK( mpi_shift_l( &S,  1               ) ); MOD_ADD( S );

    /* U = 8.Y^4  (= 2.(2.Y^2)^2) */
    MPI_CHK( mpi_mul_mpi( &U,  &T,     &T      ) ); MOD_MUL( U );
    MPI_CHK( mpi_shift_l( &U,  1               ) ); MOD_ADD( U );

    /* T = M^2 - 2.S  (new X coordinate) */
    MPI_CHK( mpi_mul_mpi( &T,  &M,     &M      ) ); MOD_MUL( T );
    MPI_CHK( mpi_sub_mpi( &T,  &T,     &S      ) ); MOD_SUB( T );
    MPI_CHK( mpi_sub_mpi( &T,  &T,     &S      ) ); MOD_SUB( T );

    /* S = M(S - T) - U  (new Y coordinate) */
    MPI_CHK( mpi_sub_mpi( &S,  &S,     &T      ) ); MOD_SUB( S );
    MPI_CHK( mpi_mul_mpi( &S,  &S,     &M      ) ); MOD_MUL( S );
    MPI_CHK( mpi_sub_mpi( &S,  &S,     &U      ) ); MOD_SUB( S );

    /* U = 2.Y.Z  (new Z coordinate) */
    MPI_CHK( mpi_mul_mpi( &U,  &P->Y,  &P->Z   ) ); MOD_MUL( U );
    MPI_CHK( mpi_shift_l( &U,  1               ) ); MOD_ADD( U );

    /* Store the result only now: every read of P is done, so R == P is safe */
    MPI_CHK( mpi_copy( &R->X, &T ) );
    MPI_CHK( mpi_copy( &R->Y, &S ) );
    MPI_CHK( mpi_copy( &R->Z, &U ) );

cleanup:
    mpi_free( &M ); mpi_free( &S ); mpi_free( &T ); mpi_free( &U );

    return( ret );
}