From 16b1bd89326ece62712c8ecc1142a41bd257d443 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno
Date: Mon, 21 May 2018 22:01:21 +0200
Subject: [PATCH] bn_mul.h: add ARM DSP optimized MULADDC code

The Cortex M4, M7 MCUs and the Cortex A CPUs support the ARM DSP
instructions, and especially the umaal instruction which greatly
speeds up MULADDC code. In addition the patch switches the ASM
constraints to registers instead of memory, giving the opportunity
for the compiler to load them the best way.

The optimized code is only enabled for ARMv6 and later: umaal was
introduced with ARMv6, while ARMv5TE toolchains already define
__ARM_FEATURE_DSP to 1, so checking that macro alone would break the
build on ARMv5TE.

The speed improvement is variable depending on the crypto operation
and the CPU. Here are the results on a Cortex M4, a Cortex M7 and a
Cortex A8. All tests have been done with GCC 6.3 using -O2. RSA uses
an RSA-4096 key. ECDSA uses a secp256r1 curve EC key pair.

                 +--------+--------+--------+
                 |   M4   |   M7   |   A8   |
+----------------+--------+--------+--------+
| ECDSA signing  |  +6.3% |  +7.9% |  +4.1% |
+----------------+--------+--------+--------+
| RSA signing    | +43.7% | +68.3% | +26.3% |
+----------------+--------+--------+--------+
| RSA encryption |  +3.4% |  +9.7% |  +3.6% |
+----------------+--------+--------+--------+
| RSA decryption | +43.0% | +67.8% | +22.8% |
+----------------+--------+--------+--------+

I ran the whole testsuite on the Cortex A8 Linux environment, and it
all passes.
---
 include/mbedtls/bn_mul.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index 354c1cc1a..b631ad278 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -630,6 +630,23 @@
            "r6", "r7", "r8", "r9", "cc"         \
          );
 
+#elif (__ARM_ARCH >= 6) && defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)
+
+#define MULADDC_INIT                            \
+    asm(
+
+#define MULADDC_CORE                            \
+            "ldr r0, [%0], #4 \n\t"             \
+            "ldr r1, [%1] \n\t"                 \
+            "umaal r1, %2, %3, r0 \n\t"         \
+            "str r1, [%1], #4 \n\t"
+
+#define MULADDC_STOP                            \
+         : "=r" (s), "=r" (d), "=r" (c)         \
+         : "r" (b), "0" (s), "1" (d), "2" (c)   \
+         : "r0", "r1", "memory"                 \
+         );
+
 #else
 
 #define MULADDC_INIT                            \