Skip to content

Commit a4be322

Browse files
authored
Merge pull request #9356 from SparkiDev/sp_asm_add_sub_p384_arm
SP ASM ARM32/Thumb2: inline asm for add and subs
2 parents 85bfc49 + 5052169 commit a4be322

File tree

2 files changed

+682
-36
lines changed

2 files changed

+682
-36
lines changed

wolfcrypt/src/sp_arm32.c

Lines changed: 351 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -91860,10 +91860,83 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_add_12(sp_digit* r,
9186091860
register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
9186191861
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
9186291862

91863-
sp_digit o;
91864-
91865-
o = sp_384_add_12(r, a, b);
91866-
sp_384_cond_sub_12(r, r, m, 0 - o);
91863+
__asm__ __volatile__ (
91864+
"mov r3, #0\n\t"
91865+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
91866+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
91867+
"adds r8, r8, r4\n\t"
91868+
"adcs r9, r9, r5\n\t"
91869+
"adcs r10, r10, r6\n\t"
91870+
"adcs r11, r11, r7\n\t"
91871+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91872+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
91873+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
91874+
"adcs r8, r8, r4\n\t"
91875+
"adcs r9, r9, r5\n\t"
91876+
"adcs r10, r10, r6\n\t"
91877+
"adcs r11, r11, r7\n\t"
91878+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91879+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
91880+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
91881+
"adcs r8, r8, r4\n\t"
91882+
"adcs r9, r9, r5\n\t"
91883+
"adcs r10, r10, r6\n\t"
91884+
"adcs r11, r11, r7\n\t"
91885+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91886+
"adc r3, r3, #0\n\t"
91887+
"sub %[r], %[r], #48\n\t"
91888+
"rsb r3, r3, #0\n\t"
91889+
"lsr r12, r3, #1\n\t"
91890+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91891+
"subs r8, r8, r3\n\t"
91892+
"sbcs r9, r9, #0\n\t"
91893+
"sbcs r10, r10, #0\n\t"
91894+
"sbcs r11, r11, r3\n\t"
91895+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91896+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91897+
"sbcs r8, r8, r12, LSL #1\n\t"
91898+
"sbcs r9, r9, r3\n\t"
91899+
"sbcs r10, r10, r3\n\t"
91900+
"sbcs r11, r11, r3\n\t"
91901+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91902+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91903+
"sbcs r8, r8, r3\n\t"
91904+
"sbcs r9, r9, r3\n\t"
91905+
"sbcs r10, r10, r3\n\t"
91906+
"sbcs r11, r11, r3\n\t"
91907+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91908+
"sbc %[b], %[b], %[b]\n\t"
91909+
"sub %[r], %[r], #48\n\t"
91910+
"sub r3, r3, %[b]\n\t"
91911+
"lsr r12, r3, #1\n\t"
91912+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91913+
"subs r8, r8, r3\n\t"
91914+
"sbcs r9, r9, #0\n\t"
91915+
"sbcs r10, r10, #0\n\t"
91916+
"sbcs r11, r11, r3\n\t"
91917+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91918+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91919+
"sbcs r8, r8, r12, LSL #1\n\t"
91920+
"sbcs r9, r9, r3\n\t"
91921+
"sbcs r10, r10, r3\n\t"
91922+
"sbcs r11, r11, r3\n\t"
91923+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91924+
"ldm %[r], {r8, r9, r10, r11}\n\t"
91925+
"sbcs r8, r8, r3\n\t"
91926+
"sbcs r9, r9, r3\n\t"
91927+
"sbcs r10, r10, r3\n\t"
91928+
"sbc r11, r11, r3\n\t"
91929+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
91930+
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
91931+
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
91932+
:
91933+
#else
91934+
:
91935+
: [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
91936+
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
91937+
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
91938+
"r12"
91939+
);
9186791940
}
9186891941

9186991942
/* Double a Montgomery form number (r = (a + a) % m).
@@ -91886,10 +91959,73 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_dbl_12(sp_digit* r,
9188691959
register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
9188791960
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
9188891961

91889-
sp_digit o;
91890-
91891-
o = sp_384_add_12(r, a, a);
91892-
sp_384_cond_sub_12(r, r, m, 0 - o);
91962+
__asm__ __volatile__ (
91963+
"mov r2, #0\n\t"
91964+
"ldm %[a]!, {r4, r5, r6, r7, r8, r9}\n\t"
91965+
"adds r4, r4, r4\n\t"
91966+
"adcs r5, r5, r5\n\t"
91967+
"adcs r6, r6, r6\n\t"
91968+
"adcs r7, r7, r7\n\t"
91969+
"adcs r8, r8, r8\n\t"
91970+
"adcs r9, r9, r9\n\t"
91971+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
91972+
"ldm %[a]!, {r4, r5, r6, r7, r8, r9}\n\t"
91973+
"adcs r4, r4, r4\n\t"
91974+
"adcs r5, r5, r5\n\t"
91975+
"adcs r6, r6, r6\n\t"
91976+
"adcs r7, r7, r7\n\t"
91977+
"adcs r8, r8, r8\n\t"
91978+
"adcs r9, r9, r9\n\t"
91979+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
91980+
"adc r2, r2, #0\n\t"
91981+
"sub %[r], %[r], #48\n\t"
91982+
"rsb r2, r2, #0\n\t"
91983+
"lsr r3, r2, #1\n\t"
91984+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
91985+
"subs r4, r4, r2\n\t"
91986+
"sbcs r5, r5, #0\n\t"
91987+
"sbcs r6, r6, #0\n\t"
91988+
"sbcs r7, r7, r2\n\t"
91989+
"sbcs r8, r8, r3, LSL #1\n\t"
91990+
"sbcs r9, r9, r2\n\t"
91991+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
91992+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
91993+
"sbcs r4, r4, r2\n\t"
91994+
"sbcs r5, r5, r2\n\t"
91995+
"sbcs r6, r6, r2\n\t"
91996+
"sbcs r7, r7, r2\n\t"
91997+
"sbcs r8, r8, r2\n\t"
91998+
"sbcs r9, r9, r2\n\t"
91999+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92000+
"sbc %[a], %[a], %[a]\n\t"
92001+
"sub %[r], %[r], #48\n\t"
92002+
"sub r2, r2, %[a]\n\t"
92003+
"lsr r3, r2, #1\n\t"
92004+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92005+
"subs r4, r4, r2\n\t"
92006+
"sbcs r5, r5, #0\n\t"
92007+
"sbcs r6, r6, #0\n\t"
92008+
"sbcs r7, r7, r2\n\t"
92009+
"sbcs r8, r8, r3, LSL #1\n\t"
92010+
"sbcs r9, r9, r2\n\t"
92011+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92012+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92013+
"sbcs r4, r4, r2\n\t"
92014+
"sbcs r5, r5, r2\n\t"
92015+
"sbcs r6, r6, r2\n\t"
92016+
"sbcs r7, r7, r2\n\t"
92017+
"sbcs r8, r8, r2\n\t"
92018+
"sbc r9, r9, r2\n\t"
92019+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92020+
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
92021+
: [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
92022+
:
92023+
#else
92024+
:
92025+
: [r] "r" (r), [a] "r" (a), [m] "r" (m)
92026+
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
92027+
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r3"
92028+
);
9189392029
}
9189492030

9189592031
/* Triple a Montgomery form number (r = (a + a + a) % m).
@@ -91912,12 +92048,138 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_tpl_12(sp_digit* r,
9191292048
register const sp_digit* m asm ("r2") = (const sp_digit*)m_p;
9191392049
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
9191492050

91915-
sp_digit o;
91916-
91917-
o = sp_384_add_12(r, a, a);
91918-
sp_384_cond_sub_12(r, r, m, 0 - o);
91919-
o = sp_384_add_12(r, r, a);
91920-
sp_384_cond_sub_12(r, r, m, 0 - o);
92051+
__asm__ __volatile__ (
92052+
"mov r2, #0\n\t"
92053+
"ldm %[a]!, {r4, r5, r6, r7, r8, r9}\n\t"
92054+
"adds r4, r4, r4\n\t"
92055+
"adcs r5, r5, r5\n\t"
92056+
"adcs r6, r6, r6\n\t"
92057+
"adcs r7, r7, r7\n\t"
92058+
"adcs r8, r8, r8\n\t"
92059+
"adcs r9, r9, r9\n\t"
92060+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92061+
"ldm %[a]!, {r4, r5, r6, r7, r8, r9}\n\t"
92062+
"adcs r4, r4, r4\n\t"
92063+
"adcs r5, r5, r5\n\t"
92064+
"adcs r6, r6, r6\n\t"
92065+
"adcs r7, r7, r7\n\t"
92066+
"adcs r8, r8, r8\n\t"
92067+
"adcs r9, r9, r9\n\t"
92068+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92069+
"adc r2, r2, #0\n\t"
92070+
"sub %[r], %[r], #48\n\t"
92071+
"rsb r2, r2, #0\n\t"
92072+
"lsr r3, r2, #1\n\t"
92073+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92074+
"subs r4, r4, r2\n\t"
92075+
"sbcs r5, r5, #0\n\t"
92076+
"sbcs r6, r6, #0\n\t"
92077+
"sbcs r7, r7, r2\n\t"
92078+
"sbcs r8, r8, r3, LSL #1\n\t"
92079+
"sbcs r9, r9, r2\n\t"
92080+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92081+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92082+
"sbcs r4, r4, r2\n\t"
92083+
"sbcs r5, r5, r2\n\t"
92084+
"sbcs r6, r6, r2\n\t"
92085+
"sbcs r7, r7, r2\n\t"
92086+
"sbcs r8, r8, r2\n\t"
92087+
"sbcs r9, r9, r2\n\t"
92088+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92089+
"sbc r12, r12, r12\n\t"
92090+
"sub %[r], %[r], #48\n\t"
92091+
"sub r2, r2, r12\n\t"
92092+
"lsr r3, r2, #1\n\t"
92093+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92094+
"subs r4, r4, r2\n\t"
92095+
"sbcs r5, r5, #0\n\t"
92096+
"sbcs r6, r6, #0\n\t"
92097+
"sbcs r7, r7, r2\n\t"
92098+
"sbcs r8, r8, r3, LSL #1\n\t"
92099+
"sbcs r9, r9, r2\n\t"
92100+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92101+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92102+
"sbcs r4, r4, r2\n\t"
92103+
"sbcs r5, r5, r2\n\t"
92104+
"sbcs r6, r6, r2\n\t"
92105+
"sbcs r7, r7, r2\n\t"
92106+
"sbcs r8, r8, r2\n\t"
92107+
"sbc r9, r9, r2\n\t"
92108+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92109+
"sub %[r], %[r], #48\n\t"
92110+
"sub %[a], %[a], #48\n\t"
92111+
"mov r2, #0\n\t"
92112+
"ldm %[a]!, {r4, r5, r6, r7}\n\t"
92113+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92114+
"adds r8, r8, r4\n\t"
92115+
"adcs r9, r9, r5\n\t"
92116+
"adcs r10, r10, r6\n\t"
92117+
"adcs r11, r11, r7\n\t"
92118+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92119+
"ldm %[a]!, {r4, r5, r6, r7}\n\t"
92120+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92121+
"adcs r8, r8, r4\n\t"
92122+
"adcs r9, r9, r5\n\t"
92123+
"adcs r10, r10, r6\n\t"
92124+
"adcs r11, r11, r7\n\t"
92125+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92126+
"ldm %[a]!, {r4, r5, r6, r7}\n\t"
92127+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92128+
"adcs r8, r8, r4\n\t"
92129+
"adcs r9, r9, r5\n\t"
92130+
"adcs r10, r10, r6\n\t"
92131+
"adcs r11, r11, r7\n\t"
92132+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92133+
"adc r2, r2, #0\n\t"
92134+
"sub %[r], %[r], #48\n\t"
92135+
"rsb r2, r2, #0\n\t"
92136+
"lsr r3, r2, #1\n\t"
92137+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92138+
"subs r4, r4, r2\n\t"
92139+
"sbcs r5, r5, #0\n\t"
92140+
"sbcs r6, r6, #0\n\t"
92141+
"sbcs r7, r7, r2\n\t"
92142+
"sbcs r8, r8, r3, LSL #1\n\t"
92143+
"sbcs r9, r9, r2\n\t"
92144+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92145+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92146+
"sbcs r4, r4, r2\n\t"
92147+
"sbcs r5, r5, r2\n\t"
92148+
"sbcs r6, r6, r2\n\t"
92149+
"sbcs r7, r7, r2\n\t"
92150+
"sbcs r8, r8, r2\n\t"
92151+
"sbcs r9, r9, r2\n\t"
92152+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92153+
"sbc r12, r12, r12\n\t"
92154+
"sub %[r], %[r], #48\n\t"
92155+
"sub r2, r2, r12\n\t"
92156+
"lsr r3, r2, #1\n\t"
92157+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92158+
"subs r4, r4, r2\n\t"
92159+
"sbcs r5, r5, #0\n\t"
92160+
"sbcs r6, r6, #0\n\t"
92161+
"sbcs r7, r7, r2\n\t"
92162+
"sbcs r8, r8, r3, LSL #1\n\t"
92163+
"sbcs r9, r9, r2\n\t"
92164+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92165+
"ldm %[r], {r4, r5, r6, r7, r8, r9}\n\t"
92166+
"sbcs r4, r4, r2\n\t"
92167+
"sbcs r5, r5, r2\n\t"
92168+
"sbcs r6, r6, r2\n\t"
92169+
"sbcs r7, r7, r2\n\t"
92170+
"sbcs r8, r8, r2\n\t"
92171+
"sbc r9, r9, r2\n\t"
92172+
"stm %[r]!, {r4, r5, r6, r7, r8, r9}\n\t"
92173+
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
92174+
: [r] "+r" (r), [a] "+r" (a), [m] "+r" (m)
92175+
:
92176+
#else
92177+
:
92178+
: [r] "r" (r), [a] "r" (a), [m] "r" (m)
92179+
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
92180+
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
92181+
"r3", "r12"
92182+
);
9192192183
}
9192292184

9192392185
#ifdef WOLFSSL_SP_SMALL
@@ -92185,10 +92447,81 @@ WC_OMIT_FRAME_POINTER static void sp_384_mont_sub_12(sp_digit* r,
9218592447
register const sp_digit* m asm ("r3") = (const sp_digit*)m_p;
9218692448
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
9218792449

92188-
sp_digit o;
92189-
92190-
o = sp_384_sub_12(r, a, b);
92191-
sp_384_cond_add_12(r, r, m, o);
92450+
__asm__ __volatile__ (
92451+
"mov r3, #0\n\t"
92452+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
92453+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
92454+
"subs r8, r8, r4\n\t"
92455+
"sbcs r9, r9, r5\n\t"
92456+
"sbcs r10, r10, r6\n\t"
92457+
"sbcs r11, r11, r7\n\t"
92458+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92459+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
92460+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
92461+
"sbcs r8, r8, r4\n\t"
92462+
"sbcs r9, r9, r5\n\t"
92463+
"sbcs r10, r10, r6\n\t"
92464+
"sbcs r11, r11, r7\n\t"
92465+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92466+
"ldm %[a]!, {r8, r9, r10, r11}\n\t"
92467+
"ldm %[b]!, {r4, r5, r6, r7}\n\t"
92468+
"sbcs r8, r8, r4\n\t"
92469+
"sbcs r9, r9, r5\n\t"
92470+
"sbcs r10, r10, r6\n\t"
92471+
"sbcs r11, r11, r7\n\t"
92472+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92473+
"sbc r3, r3, #0\n\t"
92474+
"sub %[r], %[r], #48\n\t"
92475+
"lsr r12, r3, #1\n\t"
92476+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92477+
"adds r8, r8, r3\n\t"
92478+
"adcs r9, r9, #0\n\t"
92479+
"adcs r10, r10, #0\n\t"
92480+
"adcs r11, r11, r3\n\t"
92481+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92482+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92483+
"adcs r8, r8, r12, LSL #1\n\t"
92484+
"adcs r9, r9, r3\n\t"
92485+
"adcs r10, r10, r3\n\t"
92486+
"adcs r11, r11, r3\n\t"
92487+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92488+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92489+
"adcs r8, r8, r3\n\t"
92490+
"adcs r9, r9, r3\n\t"
92491+
"adcs r10, r10, r3\n\t"
92492+
"adcs r11, r11, r3\n\t"
92493+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92494+
"adc r3, r3, #0\n\t"
92495+
"sub %[r], %[r], #48\n\t"
92496+
"lsr r12, r3, #1\n\t"
92497+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92498+
"adds r8, r8, r3\n\t"
92499+
"adcs r9, r9, #0\n\t"
92500+
"adcs r10, r10, #0\n\t"
92501+
"adcs r11, r11, r3\n\t"
92502+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92503+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92504+
"adcs r8, r8, r12, LSL #1\n\t"
92505+
"adcs r9, r9, r3\n\t"
92506+
"adcs r10, r10, r3\n\t"
92507+
"adcs r11, r11, r3\n\t"
92508+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92509+
"ldm %[r], {r8, r9, r10, r11}\n\t"
92510+
"adcs r8, r8, r3\n\t"
92511+
"adcs r9, r9, r3\n\t"
92512+
"adcs r10, r10, r3\n\t"
92513+
"adc r11, r11, r3\n\t"
92514+
"stm %[r]!, {r8, r9, r10, r11}\n\t"
92515+
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
92516+
: [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
92517+
:
92518+
#else
92519+
:
92520+
: [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
92521+
#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */
92522+
: "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
92523+
"r12"
92524+
);
9219292525
}
9219392526

9219492527
#ifdef WOLFSSL_SP_SMALL

0 commit comments

Comments
 (0)