diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c
index 6afa2893..b92b6e6c 100644
--- a/src/crypto/slow-hash.c
+++ b/src/crypto/slow-hash.c
@@ -979,34 +979,31 @@ STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r)
     r[1] = lo;
 }
 #else /* ARM32 */
-/* Can work as inline, but actually runs slower. Keep it separate */
 #define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c)
-void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r)
+STATIC void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r)
 {
-  uint32_t t0, t1;
+  uint32_t t0, t1, t2=0, t3=0;
 __asm__ __volatile__(
   "umull %[t0], %[t1], %[a], %[b]\n\t"
-  "str %[t0], [%[r], #8]\n\t"
+  "str %[t0], %[ll]\n\t"
 
   // accumulating with 0 can never overflow/carry
-  "mov %[t0], #0\n\t"
+  "eor %[t0], %[t0]\n\t"
   "umlal %[t1], %[t0], %[a], %[B]\n\t"
 
-  "mov %[a], #0\n\t"
-  "umlal %[t1], %[a], %[A], %[b]\n\t"
-  "str %[t1], [%[r], #12]\n\t"
+  "umlal %[t1], %[t2], %[A], %[b]\n\t"
+  "str %[t1], %[lh]\n\t"
 
-  "mov %[b], #0\n\t"
-  "umlal %[t0], %[b], %[A], %[B]\n\t"
+  "umlal %[t0], %[t3], %[A], %[B]\n\t"
 
   // final add may have a carry
-  "adds %[t0], %[t0], %[a]\n\t"
-  "adc %[t1], %[b], #0\n\t"
+  "adds %[t0], %[t0], %[t2]\n\t"
+  "adc %[t1], %[t3], #0\n\t"
 
-  "str %[t0], [%[r]]\n\t"
-  "str %[t1], [%[r], #4]\n\t"
-  : [t0]"=&r"(t0), [t1]"=&r"(t1), "=m"(r[0]), "=m"(r[1]), "=m"(r[2]), "=m"(r[3])
-  : [A]"r"(aa[1]), [a]"r"(aa[0]), [B]"r"(bb[1]), [b]"r"(bb[0]), [r]"r"(r)
+  "str %[t0], %[hl]\n\t"
+  "str %[t1], %[hh]\n\t"
+  : [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"+r"(t2), [t3]"+r"(t3), [hl]"=m"(r[0]), [hh]"=m"(r[1]), [ll]"=m"(r[2]), [lh]"=m"(r[3])
+  : [A]"r"(aa[1]), [a]"r"(aa[0]), [B]"r"(bb[1]), [b]"r"(bb[0])
 : "cc");
 }
 #endif /* !aarch64 */
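
For reference, below is a minimal portable C sketch of the same 64x64->128-bit schoolbook multiply the asm performs, using the same word layout as the patch (aa[0]/bb[0] = low 32 bits; r[0..1] = high 64 bits, r[2..3] = low 64 bits). This is not part of the patch; cn_mul128_ref is a hypothetical name for illustration only.

#include <stdint.h>

/* Portable equivalent of the ARM32 cn_mul128 above (illustrative only). */
static void cn_mul128_ref(const uint32_t *aa, const uint32_t *bb, uint32_t *r)
{
  uint64_t ll = (uint64_t)aa[0] * bb[0];  /* low  x low  -> umull          */
  uint64_t lh = (uint64_t)aa[0] * bb[1];  /* low  x high -> first  umlal   */
  uint64_t hl = (uint64_t)aa[1] * bb[0];  /* high x low  -> second umlal   */
  uint64_t hh = (uint64_t)aa[1] * bb[1];  /* high x high -> third  umlal   */

  /* Middle column: at most three 32-bit terms, so it cannot overflow
   * 64 bits -- the same reason the asm can umlal into a zeroed register
   * without losing a carry. */
  uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;
  uint64_t lo  = (mid << 32) | (uint32_t)ll;
  uint64_t hi  = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);

  /* Same output layout as the asm's %[hl]/%[hh]/%[ll]/%[lh] stores. */
  r[0] = (uint32_t)hi;
  r[1] = (uint32_t)(hi >> 32);
  r[2] = (uint32_t)lo;
  r[3] = (uint32_t)(lo >> 32);
}

The patch's key change is replacing the pointer-offset stores ("str %[t0], [%[r], #8]") with named "=m" memory operands, which removes the %[r] input register and lets the function be declared STATIC (and thus potentially inlined) without the asm clobbering its own operands via the old "mov %[a], #0" trick.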