//*******************************************************************************************
// SIDH: an efficient supersingular isogeny cryptography library
//
// Abstract: field arithmetic in x64 assembly for P751 on Linux 
//*******************************************************************************************  

.intel_syntax noprefix 

// Format function and variable names for Mac OS X
#if defined(__APPLE__)
    #define fmt(f)    _oqs_kem_sike_##f
#else
    #define fmt(f)    oqs_kem_sike_##f
#endif

// Registers that are used for parameter passing:
#define reg_p1  rdi
#define reg_p2  rsi
#define reg_p3  rdx


.text
//***********************************************************************
//  Field addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//
//  Inputs:   reg_p1 (rdi) -> a, reg_p2 (rsi) -> b, 12 64-bit words each
//  Output:   reg_p3 (rdx) -> c, 12 64-bit words
//  Method:   t = a + b - p751x2 is computed first; if that subtraction
//            borrows, p751x2 is added back. The add-back is selected with
//            an all-ones/all-zeros mask, so the routine contains no branch.
//  Clobbers: rax, rcx, rsi, r8-r11, flags (r12-r15 are saved and restored)
//*********************************************************************** 
.global fmt(fpadd751_asm)
fmt(fpadd751_asm):
  // r12-r15 are used as limb accumulators below, so preserve them
  push   r12
  push   r13
  push   r14
  push   r15
  
  // r8-r15, rcx <- words 0..8 of a + b (one carry chain)
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  mov    r14, [reg_p1+48]
  mov    r15, [reg_p1+56] 
  mov    rcx, [reg_p1+64]
  add    r8, [reg_p2] 
  adc    r9, [reg_p2+8] 
  adc    r10, [reg_p2+16] 
  adc    r11, [reg_p2+24] 
  adc    r12, [reg_p2+32] 
  adc    r13, [reg_p2+40] 
  adc    r14, [reg_p2+48] 
  adc    r15, [reg_p2+56]
  adc    rcx, [reg_p2+64] 
  // Words 9..11 of the sum go through rax and are parked in c; the carry
  // chain continues across the mov instructions because mov leaves CF intact
  mov    rax, [reg_p1+72]
  adc    rax, [reg_p2+72] 
  mov    [reg_p3+72], rax
  mov    rax, [reg_p1+80]
  adc    rax, [reg_p2+80] 
  mov    [reg_p3+80], rax
  mov    rax, [reg_p1+88]
  adc    rax, [reg_p2+88] 
  mov    [reg_p3+88], rax

  // Subtract p751x2 from the sum (one borrow chain over all 12 words).
  // The word loaded from p751x2+8 is used for limbs 1..4.
  // NOTE(review): this relies on words 1..4 of p751x2 being identical -
  // confirm against the definition of the constant, which is not in this file.
  mov    rax, [rip+fmt(p751x2)]
  sub    r8, rax
  mov    rax, [rip+fmt(p751x2)+8]
  sbb    r9, rax
  sbb    r10, rax
  sbb    r11, rax
  sbb    r12, rax
  mov    rax, [rip+fmt(p751x2)+40]
  sbb    r13, rax
  mov    rax, [rip+fmt(p751x2)+48]
  sbb    r14, rax
  mov    rax, [rip+fmt(p751x2)+56]
  sbb    r15, rax
  mov    rax, [rip+fmt(p751x2)+64]
  sbb    rcx, rax
  // Words 0..8 of the difference are stored to c
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14
  mov    [reg_p3+56], r15
  mov    [reg_p3+64], rcx
  // Words 9..11 of the sum are reloaded from c and the borrow chain finishes
  mov    r8, [reg_p3+72]
  mov    r9, [reg_p3+80]
  mov    r10, [reg_p3+88]
  mov    rax, [rip+fmt(p751x2)+72]
  sbb    r8, rax
  mov    rax, [rip+fmt(p751x2)+80]
  sbb    r9, rax
  mov    rax, [rip+fmt(p751x2)+88]
  sbb    r10, rax
  mov    [reg_p3+72], r8
  mov    [reg_p3+80], r9
  mov    [reg_p3+88], r10
  // rax <- 0 - borrow: all ones if the subtraction borrowed, zero otherwise.
  // mov (not xor) is used to zero rax so that the borrow in CF survives.
  mov    rax, 0
  sbb    rax, 0
  
  // rsi, r8-r15 <- p751x2 AND mask (r8 holds the word shared by limbs 1..4).
  // reg_p2 (rsi) is no longer needed and is reused as a temporary here.
  mov    rsi, [rip+fmt(p751x2)]
  and    rsi, rax
  mov    r8, [rip+fmt(p751x2)+8]
  and    r8, rax
  mov    r9, [rip+fmt(p751x2)+40]
  and    r9, rax
  mov    r10, [rip+fmt(p751x2)+48]
  and    r10, rax
  mov    r11, [rip+fmt(p751x2)+56]
  and    r11, rax
  mov    r12, [rip+fmt(p751x2)+64]
  and    r12, rax
  mov    r13, [rip+fmt(p751x2)+72]
  and    r13, rax
  mov    r14, [rip+fmt(p751x2)+80]
  and    r14, rax
  mov    r15, [rip+fmt(p751x2)+88]
  and    r15, rax
  
  // c <- c + (p751x2 AND mask): adds p751x2 back when the subtraction
  // borrowed and adds zero otherwise
  add    rsi, [reg_p3]  
  mov    [reg_p3], rsi
  mov    rax, [reg_p3+8]
  adc    rax, r8 
  mov    [reg_p3+8], rax  
  mov    rax, [reg_p3+16]
  adc    rax, r8 
  mov    [reg_p3+16], rax
  mov    rax, [reg_p3+24]  
  adc    rax, r8 
  mov    [reg_p3+24], rax 
  mov    rax, [reg_p3+32]  
  adc    rax, r8 
  mov    [reg_p3+32], rax 
  adc    r9, [reg_p3+40]
  adc    r10, [reg_p3+48]
  adc    r11, [reg_p3+56]
  adc    r12, [reg_p3+64]
  adc    r13, [reg_p3+72] 
  adc    r14, [reg_p3+80]
  adc    r15, [reg_p3+88]
  mov    [reg_p3+40], r9 
  mov    [reg_p3+48], r10 
  mov    [reg_p3+56], r11 
  mov    [reg_p3+64], r12  
  mov    [reg_p3+72], r13 
  mov    [reg_p3+80], r14 
  mov    [reg_p3+88], r15 

  // Restore callee-saved registers in reverse push order
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret


//***********************************************************************
//  Field subtraction
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
//
//  Inputs:   reg_p1 (rdi) -> a, reg_p2 (rsi) -> b, 12 64-bit words each
//  Output:   reg_p3 (rdx) -> c, 12 64-bit words
//  Method:   c = a - b is computed over 12 words; if the subtraction
//            borrows, p751x2 is added to the result. The addition is
//            selected with an all-ones/all-zeros mask, so the routine
//            contains no branch.
//  Clobbers: rax, rcx, rsi, r8-r11, flags (r12-r15 are saved and restored)
//*********************************************************************** 
.global fmt(fpsub751_asm)
fmt(fpsub751_asm):
  // r12-r15 are used as limb accumulators below, so preserve them
  push   r12
  push   r13
  push   r14
  push   r15
  
  // r8-r15, rcx <- words 0..8 of a - b (one borrow chain)
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  mov    r14, [reg_p1+48]
  mov    r15, [reg_p1+56] 
  mov    rcx, [reg_p1+64]
  sub    r8, [reg_p2] 
  sbb    r9, [reg_p2+8] 
  sbb    r10, [reg_p2+16] 
  sbb    r11, [reg_p2+24] 
  sbb    r12, [reg_p2+32] 
  sbb    r13, [reg_p2+40] 
  sbb    r14, [reg_p2+48] 
  sbb    r15, [reg_p2+56]
  sbb    rcx, [reg_p2+64] 
  // Store words 0..8; the stores do not disturb the borrow held in CF
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12
  mov    [reg_p3+40], r13
  mov    [reg_p3+48], r14
  mov    [reg_p3+56], r15
  mov    [reg_p3+64], rcx
  // Words 9..11 go through rax, continuing the same borrow chain
  mov    rax, [reg_p1+72]
  sbb    rax, [reg_p2+72] 
  mov    [reg_p3+72], rax
  mov    rax, [reg_p1+80]
  sbb    rax, [reg_p2+80] 
  mov    [reg_p3+80], rax
  mov    rax, [reg_p1+88]
  sbb    rax, [reg_p2+88] 
  mov    [reg_p3+88], rax
  // rax <- 0 - borrow: all ones if a - b borrowed, zero otherwise.
  // mov (not xor) is used to zero rax so that the borrow in CF survives.
  mov    rax, 0
  sbb    rax, 0
  
  // rsi, r8-r15 <- p751x2 AND mask. The word from p751x2+8 (in r8) is used
  // for limbs 1..4.
  // NOTE(review): this relies on words 1..4 of p751x2 being identical -
  // confirm against the definition of the constant, which is not in this file.
  mov    rsi, [rip+fmt(p751x2)]
  and    rsi, rax
  mov    r8, [rip+fmt(p751x2)+8]
  and    r8, rax
  mov    r9, [rip+fmt(p751x2)+40]
  and    r9, rax
  mov    r10, [rip+fmt(p751x2)+48]
  and    r10, rax
  mov    r11, [rip+fmt(p751x2)+56]
  and    r11, rax
  mov    r12, [rip+fmt(p751x2)+64]
  and    r12, rax
  mov    r13, [rip+fmt(p751x2)+72]
  and    r13, rax
  mov    r14, [rip+fmt(p751x2)+80]
  and    r14, rax
  mov    r15, [rip+fmt(p751x2)+88]
  and    r15, rax
  
  // c <- c + (p751x2 AND mask): adds p751x2 when a - b borrowed and adds
  // zero otherwise
  mov    rax, [reg_p3]
  add    rax, rsi  
  mov    [reg_p3], rax
  mov    rax, [reg_p3+8]
  adc    rax, r8 
  mov    [reg_p3+8], rax  
  mov    rax, [reg_p3+16]
  adc    rax, r8 
  mov    [reg_p3+16], rax  
  mov    rax, [reg_p3+24]  
  adc    rax, r8 
  mov    [reg_p3+24], rax 
  mov    rax, [reg_p3+32]  
  adc    rax, r8 
  mov    [reg_p3+32], rax 
  adc    r9, [reg_p3+40] 
  adc    r10, [reg_p3+48] 
  adc    r11, [reg_p3+56]
  adc    r12, [reg_p3+64] 
  adc    r13, [reg_p3+72]
  adc    r14, [reg_p3+80]
  adc    r15, [reg_p3+88]
  mov    [reg_p3+40], r9 
  mov    [reg_p3+48], r10
  mov    [reg_p3+56], r11 
  mov    [reg_p3+64], r12
  mov    [reg_p3+72], r13 
  mov    [reg_p3+80], r14  
  mov    [reg_p3+88], r15 
  
  // Restore callee-saved registers in reverse push order
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret 


///////////////////////////////////////////////////////////////// MACRO
// SUB751_PX P0
// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + P0
//
// P0 is the symbol of a 12-word constant, read RIP-relative. The constant is
// always added (there is no mask), and a, b, c are 12 64-bit words each.
// The work is done in two 6-word halves. Because the subtraction chain and
// the addition chain are interleaved, the borrow of the low-half subtraction
// is parked in al and the carry of the low-half addition in cl; each is put
// back into CF with bt immediately before the matching high-half chain.
// The word at P0+8 is used for limbs 1..4.
// NOTE(review): this relies on words 1..4 of the constant being identical -
// confirm against the constant definitions, which are not in this file.
//
// Clobbers: al, cl, r8-r11, flags (r12-r15 are saved and restored).
// The expansion does not end with ret; the caller of the macro supplies it.
.macro SUB751_PX  P0 
  push   r12
  push   r13
  push   r14
  push   r15
  
  // r8-r13 <- words 0..5 of a - b
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    r12, [reg_p1+32]
  mov    r13, [reg_p1+40]
  sub    r8, [reg_p2] 
  sbb    r9, [reg_p2+8] 
  sbb    r10, [reg_p2+16] 
  sbb    r11, [reg_p2+24] 
  sbb    r12, [reg_p2+32] 
  sbb    r13, [reg_p2+40]
  // al <- borrow out of the low half (the add below would overwrite CF)
  setc   al

  // r8-r13 <- (a - b) + P0, words 0..5; r15 holds the P0+8 word for limbs 1..4
  mov    r14, [rip+\P0]
  mov    r15, [rip+\P0+8]
  add    r8, r14  
  adc    r9, r15  
  adc    r10, r15 
  adc    r11, r15 
  adc    r12, r15   
  mov    r14, [rip+\P0+40]
  adc    r13, r14   
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9 
  mov    [reg_p3+16], r10 
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], r12 
  mov    [reg_p3+40], r13
  // cl <- carry out of the low-half addition (mov above left CF untouched)
  setc   cl

  // CF <- saved borrow (bit 0 of rax is bit 0 of al), then words 6..11 of a - b
  bt     rax, 0 
  mov    r8, [reg_p1+48]
  mov    r9, [reg_p1+56]
  mov    r10, [reg_p1+64]
  mov    r11, [reg_p1+72]
  mov    r12, [reg_p1+80]
  mov    r13, [reg_p1+88]
  sbb    r8, [reg_p2+48] 
  sbb    r9, [reg_p2+56] 
  sbb    r10, [reg_p2+64] 
  sbb    r11, [reg_p2+72] 
  sbb    r12, [reg_p2+80] 
  sbb    r13, [reg_p2+88] 

  // CF <- saved carry (bit 0 of rcx is bit 0 of cl), then add words 6..11 of P0
  bt     rcx, 0
  mov    r14, [rip+\P0+48]
  mov    r15, [rip+\P0+56]
  adc    r8, r14  
  adc    r9, r15  
  mov    r14, [rip+\P0+64]
  mov    r15, [rip+\P0+72]
  adc    r10, r14 
  adc    r11, r15    
  mov    r14, [rip+\P0+80]
  mov    r15, [rip+\P0+88]
  adc    r12, r14 
  adc    r13, r15  
  mov    [reg_p3+48], r8
  mov    [reg_p3+56], r9 
  mov    [reg_p3+64], r10 
  mov    [reg_p3+72], r11
  mov    [reg_p3+80], r12 
  mov    [reg_p3+88], r13
  
  // Restore callee-saved registers in reverse push order
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  .endm


//***********************************************************************
//  Multiprecision subtraction with correction with 2*p751
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p751
//
//  The body is the SUB751_PX expansion with the p751x2 constant. The macro
//  saves and restores r12-r15 itself, so only the ret is added here.
//*********************************************************************** 
.global fmt(mp_sub751_p2_asm)
fmt(mp_sub751_p2_asm):

  SUB751_PX  fmt(p751x2)
  ret 


//***********************************************************************
//  Multiprecision subtraction with correction with 4*p751
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p751
//
//  The body is the SUB751_PX expansion with the p751x4 constant. The macro
//  saves and restores r12-r15 itself, so only the ret is added here.
//*********************************************************************** 
.global fmt(mp_sub751_p4_asm)
fmt(mp_sub751_p4_asm):

  SUB751_PX  fmt(p751x4)
  ret 


#ifdef _MULX_

/////////////////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication
// Inputs:  memory pointers M0 and M1
// Outputs: memory pointer C
// Temps:   stack space for two 64-bit values (case w/o _ADX_), regs T0:T7
///////////////////////////////////////////////////////////////////////////
#ifdef _ADX_

// MUL384_SCHOOL, MULX + ADX variant.
// C[0..11] <- M0[0..5] * M1[0..5], a 6x6-word schoolbook product.
//   M0, M1, C : bracketed memory operands such as [rsp+48]; a displacement is
//               prefixed textually, so 8 followed by M1 names the next word
//   S         : accepted for signature compatibility, not used in this variant
//   T0..T7    : eight distinct 64-bit scratch registers
// Each row multiplies one word of M0 (in rdx, the implicit mulx source) by the
// six words of M1 and folds the result into a six-word running window held in
// the T registers. Two carry chains run side by side: adcx propagates through
// CF and adox through OF. xor rax, rax at the start of every row clears both
// flags and leaves rax = 0 for absorbing the final carries.
// C is written word by word while M0 and M1 are still being read, so C must
// not overlap either input.
// Clobbers: rax, rdx, T0..T7, flags.
.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 
    // Row 0: M0[0] * M1[0..5]; window afterwards = T0,T2,T4,T1,T5,T3
    mov    rdx, \M0
    mulx   \T0, \T1, \M1    
    mulx   \T2, \T3, 8\M1
    mov    \C, \T1             // C0_final 
    xor    rax, rax
    mulx   \T4, \T5, 16\M1 
    adox   \T0, \T3               
    adox   \T2, \T5     
    mulx   \T1, \T3, 24\M1
    adox   \T4, \T3         
    mulx   \T5, \T6, 32\M1 
    adox   \T1, \T6        
    mulx   \T3, \T7, 40\M1    
    adox   \T5, \T7       
    adox   \T3, rax        
    
    // Row 1: M0[1] * M1[0..5]; window afterwards = T2,T4,T0,T1,T3,T5
    mov    rdx, 8\M0 
    mulx   \T6, \T7, \M1 
    xor    rax, rax
    adcx   \T0, \T7 
    mov    8\C, \T0            // C1_final 
    adcx   \T2, \T6     
    mulx   \T6, \T7, 8\M1
    adox   \T2, \T7 
    adcx   \T4, \T6        
    mulx   \T0, \T6, 16\M1 
    adox   \T4, \T6  
    adcx   \T0, \T1     
    mulx   \T1, \T7, 24\M1   
    adcx   \T1, \T5  
    mulx   \T5, \T6, 32\M1     
    adcx   \T3, \T5   
    // rdx is overwritten with the last low word; the multiplier is done
    mulx   \T5, rdx, 40\M1
    adcx   \T5, rax 
        
    // Finish the OF chain with the low words that were held back
    adox   \T0, \T7  
    adox   \T1, \T6  
    adox   \T3, rdx  
    adox   \T5, rax         
    
    // Row 2: M0[2] * M1[0..5]; window afterwards = T4,T0,T1,T3,T5,T2
    mov    rdx, 16\M0 
    mulx   \T6, \T7, \M1
    xor    rax, rax 
    adcx   \T2, \T7 
    mov    16\C, \T2           // C2_final 
    adcx   \T4, \T6     
    mulx   \T6, \T7, 8\M1
    adox   \T4, \T7 
    adcx   \T0, \T6        
    mulx   \T2, \T6, 16\M1
    adox   \T0, \T6 
    adcx   \T1, \T2     
    mulx   \T2, \T7, 24\M1   
    adcx   \T3, \T2  
    mulx   \T2, \T6, 32\M1     
    adcx   \T5, \T2   
    mulx   \T2, rdx, 40\M1     
    adcx   \T2, rax 
         
    adox   \T1, \T7  
    adox   \T3, \T6  
    adox   \T5, rdx 
    adox   \T2, rax           
    
    // Row 3: M0[3] * M1[0..5]; window afterwards = T0,T1,T3,T5,T2,T4
    mov    rdx, 24\M0 
    mulx   \T6, \T7, \M1
    xor    rax, rax 
    adcx   \T4, \T7 
    mov    24\C, \T4           // C3_final 
    adcx   \T0, \T6     
    mulx   \T6, \T7, 8\M1
    adox   \T0, \T7
    adcx   \T1, \T6        
    mulx   \T4, \T6, 16\M1
    adox   \T1, \T6  
    adcx   \T3, \T4     
    mulx   \T4, \T7, 24\M1   
    adcx   \T5, \T4  
    mulx   \T4, \T6, 32\M1     
    adcx   \T2, \T4   
    mulx   \T4, rdx, 40\M1     
    adcx   \T4, rax
        
    adox   \T3, \T7  
    adox   \T5, \T6  
    adox   \T2, rdx  
    adox   \T4, rax         
    
    // Row 4: M0[4] * M1[0..5]; window afterwards = T1,T3,T5,T2,T4,T0
    mov    rdx, 32\M0 
    mulx   \T6, \T7, \M1 
    xor    rax, rax
    adcx   \T0, \T7 
    mov    32\C, \T0           // C4_final 
    adcx   \T1, \T6     
    mulx   \T6, \T7, 8\M1
    adox   \T1, \T7 
    adcx   \T3, \T6        
    mulx   \T0, \T6, 16\M1 
    adox   \T3, \T6 
    adcx   \T5, \T0     
    mulx   \T0, \T7, 24\M1   
    adcx   \T2, \T0  
    mulx   \T0, \T6, 32\M1     
    adcx   \T4, \T0   
    mulx   \T0, rdx, 40\M1     
    adcx   \T0, rax 
         
    adox   \T5, \T7  
    adox   \T2, \T6  
    adox   \T4, rdx  
    adox   \T0, rax           
    
    // Row 5: M0[5] * M1[0..5]; window afterwards = T3,T5,T2,T4,T0,T1
    mov    rdx, 40\M0 
    mulx   \T6, \T7, \M1 
    xor    rax, rax
    adcx   \T1, \T7 
    mov    40\C, \T1           // C5_final 
    adcx   \T3, \T6     
    mulx   \T6, \T7, 8\M1
    adox   \T3, \T7 
    adcx   \T5, \T6        
    mulx   \T1, \T6, 16\M1
    adox   \T5, \T6 
    adcx   \T2, \T1     
    mulx   \T1, \T7, 24\M1   
    adcx   \T4, \T1  
    mulx   \T1, \T6, 32\M1     
    adcx   \T0, \T1   
    mulx   \T1, rdx, 40\M1     
    adcx   \T1, rax 
         
    adox   \T2, \T7 
    adox   \T4, \T6 
    adox   \T0, rdx 
    adox   \T1, rax 
    // The remaining window is the upper half of the product, C6..C11
    mov    48\C, \T3 
    mov    56\C, \T5 
    mov    64\C, \T2 
    mov    72\C, \T4
    mov    80\C, \T0 
    mov    88\C, \T1 
.endm

#else

// MUL384_SCHOOL, MULX-only variant (no ADX).
// C[0..11] <- M0[0..5] * M1[0..5], a 6x6-word schoolbook product.
//   M0, M1, C : bracketed memory operands such as [rsp+48]; a displacement is
//               prefixed textually, so 8 followed by M1 names the next word
//   S         : memory operand for two 64-bit scratch words (S and 8 + S)
//   T0..T7    : eight distinct 64-bit scratch registers
// Each row multiplies one word of M0 (in rdx, the implicit mulx source) by the
// six words of M1 and folds the result into a six-word running window held in
// the T registers. Only CF is available, so every row from row 1 on runs two
// passes: the first adc chain adds the high words while the low words of the
// second and third partial products are parked in S and 8 + S (T6 and T7 are
// reused by the later mulx of the same row); the second chain, started with
// add, then folds in all held-back low words.
// rax is zeroed by xor before each second pass and must still be zero at the
// end of the following first pass, where it absorbs the last carry.
// C is written word by word while M0 and M1 are still being read, so C must
// not overlap either input.
// Clobbers: rax, rdx, T0..T7, the two words at S, flags.
.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 
    // Row 0: M0[0] * M1[0..5]; window afterwards = T0,T2,T4,T1,T5,T3
    mov    rdx, \M0
    mulx   \T0, \T1, \M1    
    mulx   \T2, \T3, 8\M1
    mov    \C, \T1             // C0_final 
    xor    rax, rax
    mulx   \T4, \T5, 16\M1 
    add    \T0, \T3               
    adc    \T2, \T5     
    mulx   \T1, \T3, 24\M1
    adc    \T4, \T3         
    mulx   \T5, \T6, 32\M1 
    adc    \T1, \T6        
    mulx   \T3, \T7, 40\M1    
    adc    \T5, \T7       
    adc    \T3, rax        
    
    // Row 1: M0[1] * M1[0..5]; window afterwards = T2,T4,T0,T1,T3,T5
    mov    rdx, 8\M0 
    mulx   \T6, \T7, \M1 
    add    \T0, \T7 
    mov    8\C, \T0            // C1_final 
    adc    \T2, \T6     
    mulx   \T6, \T7, 8\M1
    mov    \S, \T7             // store T7
    adc    \T4, \T6        
    mulx   \T0, \T6, 16\M1   
    mov    8\S, \T6            // store T6 
    adc    \T0, \T1     
    mulx   \T1, \T7, 24\M1   
    adc    \T1, \T5  
    mulx   \T5, \T6, 32\M1     
    adc    \T3, \T5   
    // rdx is overwritten with the last low word; the multiplier is done
    mulx   \T5, rdx, 40\M1
    adc    \T5, rax 
        
    // Second pass: add the held-back low words (two from S, three in regs)
    xor    rax, rax
    add    \T2, \S 
    adc    \T4, 8\S  
    adc    \T0, \T7  
    adc    \T1, \T6  
    adc    \T3, rdx  
    adc    \T5, rax         
    
    // Row 2: M0[2] * M1[0..5]; window afterwards = T4,T0,T1,T3,T5,T2
    mov    rdx, 16\M0 
    mulx   \T6, \T7, \M1 
    add    \T2, \T7 
    mov    16\C, \T2           // C2_final 
    adc    \T4, \T6     
    mulx   \T6, \T7, 8\M1
    mov    \S, \T7             // store T7
    adc    \T0, \T6        
    mulx   \T2, \T6, 16\M1   
    mov    8\S, \T6            // store T6 
    adc    \T1, \T2     
    mulx   \T2, \T7, 24\M1   
    adc    \T3, \T2  
    mulx   \T2, \T6, 32\M1     
    adc    \T5, \T2   
    mulx   \T2, rdx, 40\M1     
    adc    \T2, rax 
        
    xor    rax, rax
    add    \T4, \S 
    adc    \T0, 8\S  
    adc    \T1, \T7  
    adc    \T3, \T6  
    adc    \T5, rdx 
    adc    \T2, rax           
    
    // Row 3: M0[3] * M1[0..5]; window afterwards = T0,T1,T3,T5,T2,T4
    mov    rdx, 24\M0 
    mulx   \T6, \T7, \M1 
    add    \T4, \T7 
    mov    24\C, \T4           // C3_final 
    adc    \T0, \T6     
    mulx   \T6, \T7, 8\M1
    mov    \S, \T7             // store T7
    adc    \T1, \T6        
    mulx   \T4, \T6, 16\M1   
    mov    8\S, \T6            // store T6 
    adc    \T3, \T4     
    mulx   \T4, \T7, 24\M1   
    adc    \T5, \T4  
    mulx   \T4, \T6, 32\M1     
    adc    \T2, \T4   
    mulx   \T4, rdx, 40\M1     
    adc    \T4, rax
        
    xor    rax, rax
    add    \T0, \S 
    adc    \T1, 8\S  
    adc    \T3, \T7  
    adc    \T5, \T6  
    adc    \T2, rdx  
    adc    \T4, rax         
    
    // Row 4: M0[4] * M1[0..5]; window afterwards = T1,T3,T5,T2,T4,T0
    mov    rdx, 32\M0 
    mulx   \T6, \T7, \M1 
    add    \T0, \T7 
    mov    32\C, \T0           // C4_final 
    adc    \T1, \T6     
    mulx   \T6, \T7, 8\M1
    mov    \S, \T7             // store T7
    adc    \T3, \T6        
    mulx   \T0, \T6, 16\M1   
    mov    8\S, \T6            // store T6 
    adc    \T5, \T0     
    mulx   \T0, \T7, 24\M1   
    adc    \T2, \T0  
    mulx   \T0, \T6, 32\M1     
    adc    \T4, \T0   
    mulx   \T0, rdx, 40\M1     
    adc    \T0, rax 
        
    xor    rax, rax
    add    \T1, \S 
    adc    \T3, 8\S  
    adc    \T5, \T7  
    adc    \T2, \T6  
    adc    \T4, rdx  
    adc    \T0, rax           
    
    // Row 5: M0[5] * M1[0..5]; window afterwards = T3,T5,T2,T4,T0,T1
    mov    rdx, 40\M0 
    mulx   \T6, \T7, \M1 
    add    \T1, \T7 
    mov    40\C, \T1           // C5_final 
    adc    \T3, \T6     
    mulx   \T6, \T7, 8\M1
    mov    \S, \T7             // store T7
    adc    \T5, \T6        
    mulx   \T1, \T6, 16\M1   
    mov    8\S, \T6            // store T6 
    adc    \T2, \T1     
    mulx   \T1, \T7, 24\M1   
    adc    \T4, \T1  
    mulx   \T1, \T6, 32\M1     
    adc    \T0, \T1   
    mulx   \T1, rdx, 40\M1     
    adc    \T1, rax 
        
    // Last second pass; an immediate 0 is used, so rax is not re-zeroed here
    add    \T3, \S 
    adc    \T5, 8\S  
    adc    \T2, \T7 
    adc    \T4, \T6 
    adc    \T0, rdx 
    adc    \T1, 0 
    // The remaining window is the upper half of the product, C6..C11
    mov    48\C, \T3 
    mov    56\C, \T5 
    mov    64\C, \T2 
    mov    72\C, \T4
    mov    80\C, \T0 
    mov    88\C, \T1 
.endm

#endif


//*****************************************************************************
//  751-bit multiplication using Karatsuba (one level), schoolbook (two levels)
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
//
//  Inputs:  reg_p1 (rdi) -> a, reg_p2 (rsi) -> b, 12 64-bit words each
//  Output:  reg_p3 (rdx) -> c, 24 64-bit words (the pointer is moved to rcx
//           because rdx is the implicit mulx operand)
//  With a = AH*2^384 + AL and b = BH*2^384 + BL, three 6x6-word products are
//  formed with MUL384_SCHOOL: ALxBL, AHxBH and (AH+AL)x(BH+BL). The carry bit
//  of each 6-word sum is turned into a mask and handled by adding the masked
//  other sum to the upper half of the middle product.
//  Stack frame (152 bytes):
//    [rsp    .. rsp+47 ]  AH+AL          (later words 0..5  of AHxBH)
//    [rsp+48 .. rsp+95 ]  BH+BL          (later words 6..11 of AHxBH)
//    [rsp+96 .. rsp+127]  words 0..3 of the masked-sum correction
//    [rsp+128.. rsp+143]  scratch words S for MUL384_SCHOOL
//  c is used as scratch and is written before a and b are read for the last
//  time, so c must not overlap a or b.
//  Clobbers: rax, rcx, rdx, rdi, r8-r11, flags
//            (rbx, rbp, r12-r15 are saved and restored)
//***************************************************************************** 
.global fmt(mul751_asm)
fmt(mul751_asm):    
    push   r12
    push   r13 
    push   r14 
    push   r15
    mov    rcx, reg_p3 

    // [rsp] <- AH + AL, rax <- mask
    // (rax = 0 - carry: all ones if the 6-word sum overflowed, else zero)
    xor    rax, rax
    mov    r8, [reg_p1]
    mov    r9, [reg_p1+8]
    mov    r10, [reg_p1+16]
    mov    r11, [reg_p1+24] 
    mov    r12, [reg_p1+32] 
    mov    r13, [reg_p1+40] 
    push   rbx 
    push   rbp
    sub    rsp, 152
    add    r8, [reg_p1+48]
    adc    r9, [reg_p1+56]
    adc    r10, [reg_p1+64]
    adc    r11, [reg_p1+72]
    adc    r12, [reg_p1+80]
    adc    r13, [reg_p1+88]
    sbb    rax, 0
    mov    [rsp], r8
    mov    [rsp+8], r9
    mov    [rsp+16], r10
    mov    [rsp+24], r11
    mov    [rsp+32], r12
    mov    [rsp+40], r13

    // [rsp+48] <- BH + BL, rdx <- mask
    // (r10-r13 keep words 2..5 of AH+AL; this sum uses rbx, rbp, r14, r15)
    xor    rdx, rdx
    mov    r8, [reg_p2]
    mov    r9, [reg_p2+8]
    mov    rbx, [reg_p2+16]
    mov    rbp, [reg_p2+24] 
    mov    r14, [reg_p2+32]     
    mov    r15, [reg_p2+40]     
    add    r8, [reg_p2+48]
    adc    r9, [reg_p2+56]
    adc    rbx, [reg_p2+64]
    adc    rbp, [reg_p2+72]
    adc    r14, [reg_p2+80]
    adc    r15, [reg_p2+88]
    sbb    rdx, 0
    mov    [rsp+48], r8
    mov    [rsp+56], r9
    mov    [rsp+64], rbx
    mov    [rsp+72], rbp
    mov    [rsp+80], r14     
    mov    [rsp+88], r15     
    
    // r8,r9,rbx,rbp,r14,r15 <- (BH + BL) masked with the AH+AL carry mask;
    // words 0..1 are parked in [rcx], [rcx+8] because r8, r9 are needed next
    and    r8, rax
    and    r9, rax
    and    rbx, rax
    and    rbp, rax
    and    r14, rax     
    and    r15, rax     
    mov    [rcx], r8
    mov    [rcx+8], r9

    // r8-r13 <- (AH + AL) masked with the BH+BL carry mask
    // (words 0..1 are reloaded; r10-r13 still hold words 2..5)
    mov    r8, [rsp]
    mov    r9, [rsp+8]
    and    r8, rdx
    and    r9, rdx
    and    r10, rdx
    and    r11, rdx
    and    r12, rdx
    and    r13, rdx

    // [rsp+96..120], r12, r13 <- masked (AH + AL) + masked (BH + BL)
    // Words 4..5 stay in r12, r13; the MUL384_SCHOOL invocations below are
    // not given those registers, so the values survive until the final step
    mov    rax, [rcx]
    mov    rdx, [rcx+8]
    add    r8, rax
    adc    r9, rdx
    adc    r10, rbx
    adc    r11, rbp
    adc    r12, r14         
    adc    r13, r15         
    mov    [rsp+96], r8
    mov    [rsp+104], r9
    mov    [rsp+112], r10
    mov    [rsp+120], r11

    // [rcx] <- AL x BL
    MUL384_SCHOOL  [reg_p1], [reg_p2], [rcx], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15     // Result C0-C5 

    // [rcx+96] <- (AH+AL) x (BH+BL), low part 
    MUL384_SCHOOL  [rsp], [rsp+48], [rcx+96], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15

    // [rsp] <- AH x BH 
    // (overwrites the AH+AL and BH+BL sums, which are no longer needed)
    MUL384_SCHOOL  [reg_p1+48], [reg_p2+48], [rsp], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15
    
    // r8-r13 <- (AH+AL) x (BH+BL), final step
    // (masked-sum correction plus the upper six words of the middle product)
    mov    r8, [rsp+96]
    mov    r9, [rsp+104]
    mov    r10, [rsp+112]
    mov    r11, [rsp+120]
    mov    rax, [rcx+144]
    add    r8, rax
    mov    rax, [rcx+152]
    adc    r9, rax
    mov    rax, [rcx+160]
    adc    r10, rax
    mov    rax, [rcx+168]
    adc    r11, rax
    mov    rax, [rcx+176]
    adc    r12, rax
    mov    rax, [rcx+184]
    adc    r13, rax
    
    // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL
    // (reg_p1 is no longer needed, so rdi is reused as an accumulator)
    mov    rdi, [rcx+96]
    sub    rdi, [rcx]
    mov    rdx, [rcx+104]
    sbb    rdx, [rcx+8]
    mov    rbx, [rcx+112]
    sbb    rbx, [rcx+16]
    mov    rbp, [rcx+120]
    sbb    rbp, [rcx+24]
    mov    r14, [rcx+128]     
    sbb    r14, [rcx+32]   
    mov    r15, [rcx+136]     
    sbb    r15, [rcx+40]     
    sbb    r8, [rcx+48]
    sbb    r9, [rcx+56]
    sbb    r10, [rcx+64]
    sbb    r11, [rcx+72]
    sbb    r12, [rcx+80]
    sbb    r13, [rcx+88]
    
    // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    sub    rdi, [rsp]
    sbb    rdx, [rsp+8]
    sbb    rbx, [rsp+16]
    sbb    rbp, [rsp+24]
    sbb    r14, [rsp+32]     
    sbb    r15, [rsp+40]   
    sbb    r8, [rsp+48]
    sbb    r9, [rsp+56]
    sbb    r10, [rsp+64]
    sbb    r11, [rsp+72]
    sbb    r12, [rsp+80]
    sbb    r13, [rsp+88]
    
    // Add the middle term at word offset 6: its low half goes onto the upper
    // half of ALxBL, its high half onto the low half of AHxBH
    mov    rax, [rcx+48]
    add    rax, rdi
    mov    [rcx+48], rax    // Result C6-C11
    mov    rax, [rcx+56]
    adc    rax, rdx
    mov    [rcx+56], rax 
    mov    rax, [rcx+64]
    adc    rax, rbx
    mov    [rcx+64], rax 
    mov    rax, [rcx+72]
    adc    rax, rbp
    mov    [rcx+72], rax 
    mov    rax, [rcx+80]
    adc    rax, r14           
    mov    [rcx+80], rax 
    mov    rax, [rcx+88]
    adc    rax, r15             
    mov    [rcx+88], rax
    mov    rax, [rsp]
    adc    r8, rax 
    mov    [rcx+96], r8    // Result C12-C17
    mov    rax, [rsp+8]
    adc    r9, rax
    mov    [rcx+104], r9 
    mov    rax, [rsp+16]
    adc    r10, rax
    mov    [rcx+112], r10 
    mov    rax, [rsp+24]
    adc    r11, rax
    mov    [rcx+120], r11 
    mov    rax, [rsp+32]
    adc    r12, rax
    mov    [rcx+128], r12 
    mov    rax, [rsp+40]
    adc    r13, rax
    mov    [rcx+136], r13
    // Upper half of AHxBH plus the carry propagated from the chain above
    mov    r8, [rsp+48]
    mov    r9, [rsp+56]
    mov    r10, [rsp+64]
    mov    r11, [rsp+72]
    mov    r12, [rsp+80]
    mov    r13, [rsp+88]
    adc    r8, 0
    adc    r9, 0
    adc    r10, 0
    adc    r11, 0
    adc    r12, 0
    adc    r13, 0
    add    rsp, 152   
    mov    [rcx+144], r8    // Result C18-C23
    mov    [rcx+152], r9 
    mov    [rcx+160], r10 
    mov    [rcx+168], r11 
    mov    [rcx+176], r12 
    mov    [rcx+184], r13 
     
    // Restore callee-saved registers in reverse push order
    pop    rbp  
    pop    rbx
    pop    r15
    pop    r14
    pop    r13
    pop    r12
    ret

#else

//***********************************************************************
//  Integer multiplication
//  Based on Karatsuba method
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
//  NOTE: a=c or b=c are not allowed
//***********************************************************************
.global fmt(mul751_asm)
fmt(mul751_asm):
  push   r12
  push   r13
  push   r14
  mov    rcx, reg_p3
  
  // rcx[0-5] <- AH+AL
  xor    rax, rax
  mov    r8, [reg_p1+48]
  mov    r9, [reg_p1+56]
  mov    r10, [reg_p1+64]
  mov    r11, [reg_p1+72]
  mov    r12, [reg_p1+80]
  mov    r13, [reg_p1+88]
  add    r8, [reg_p1] 
  adc    r9, [reg_p1+8] 
  adc    r10, [reg_p1+16] 
  adc    r11, [reg_p1+24] 
  adc    r12, [reg_p1+32] 
  adc    r13, [reg_p1+40] 
  push   r15  
  mov    [rcx], r8
  mov    [rcx+8], r9
  mov    [rcx+16], r10
  mov    [rcx+24], r11
  mov    [rcx+32], r12
  mov    [rcx+40], r13
  sbb    rax, 0 
  sub    rsp, 96           // Allocating space in stack
       
  // rcx[6-11] <- BH+BL
  xor    rdx, rdx
  mov    r8, [reg_p2+48]
  mov    r9, [reg_p2+56]
  mov    r10, [reg_p2+64]
  mov    r11, [reg_p2+72]
  mov    r12, [reg_p2+80]
  mov    r13, [reg_p2+88]
  add    r8, [reg_p2] 
  adc    r9, [reg_p2+8] 
  adc    r10, [reg_p2+16] 
  adc    r11, [reg_p2+24] 
  adc    r12, [reg_p2+32] 
  adc    r13, [reg_p2+40] 
  mov    [rcx+48], r8
  mov    [rcx+56], r9
  mov    [rcx+64], r10
  mov    [rcx+72], r11
  mov    [rcx+80], r12
  mov    [rcx+88], r13
  sbb    rdx, 0 
  mov    [rsp+80], rax
  mov    [rsp+88], rdx
  
  // (rsp[0-8],r10,r8,r9) <- (AH+AL)*(BH+BL)
  mov    r11, [rcx]
  mov    rax, r8 
  mul    r11
  mov    [rsp], rax        // c0
  mov    r14, rdx
  
  xor    r15, r15
  mov    rax, r9
  mul    r11
  xor    r9, r9
  add    r14, rax
  adc    r9, rdx
  
  mov    r12, [rcx+8] 
  mov    rax, r8 
  mul    r12
  add    r14, rax
  mov    [rsp+8], r14      // c1 
  adc    r9, rdx
  adc    r15, 0
  
  xor    r8, r8
  mov    rax, r10 
  mul    r11
  add    r9, rax
  mov    r13, [rcx+48] 
  adc    r15, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+16] 
  mul    r13
  add    r9, rax
  adc    r15, rdx 
  mov    rax, [rcx+56] 
  adc    r8, 0
  
  mul    r12
  add    r9, rax
  mov    [rsp+16], r9      // c2 
  adc    r15, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [rcx+72] 
  mul    r11
  add    r15, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [rcx+24] 
  mul    r13
  add    r15, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, r10 
  mul    r12
  add    r15, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    r14, [rcx+16] 
  mov    rax, [rcx+56] 
  mul    r14
  add    r15, rax
  mov    [rsp+24], r15     // c3 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [rcx+80] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [rcx+64] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    r15, [rcx+48] 
  mov    rax, [rcx+32] 
  mul    r15
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [rcx+72] 
  mul    r12
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    r13, [rcx+24] 
  mov    rax, [rcx+56] 
  mul    r13
  add    r8, rax
  mov    [rsp+32], r8      // c4 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [rcx+88] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+64] 
  mul    r13
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+72] 
  mul    r14
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+40] 
  mul    r15
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+80] 
  mul    r12
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r15, [rcx+32] 
  mov    rax, [rcx+56] 
  mul    r15
  add    r9, rax
  mov    [rsp+40], r9      // c5 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [rcx+64] 
  mul    r15
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [rcx+88] 
  mul    r12
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [rcx+80] 
  mul    r14
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    r11, [rcx+40] 
  mov    rax, [rcx+56] 
  mul    r11
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [rcx+72] 
  mul    r13
  add    r10, rax
  mov    [rsp+48], r10     // c6 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [rcx+88] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [rcx+64] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [rcx+80]
  mul    r13
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [rcx+72] 
  mul    r15
  add    r8, rax
  mov    [rsp+56], r8      // c7 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [rcx+72] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+80] 
  mul    r15
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [rcx+88] 
  mul    r13
  add    r9, rax
  mov    [rsp+64], r9      // c8 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [rcx+88]
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0

  mov    rax, [rcx+80] 
  mul    r11
  add    r10, rax          // c9 
  adc    r8, rdx
  adc    r9, 0

  mov    rax, [rcx+88] 
  mul    r11
  add    r8, rax           // c10 
  adc    r9, rdx           // c11 
  
  mov    rax, [rsp+88]
  mov    rdx, [rcx]
  and    r12, rax
  and    r14, rax
  and    rdx, rax
  and    r13, rax
  and    r15, rax
  and    r11, rax
  mov    rax, [rsp+48]
  add    rdx, rax
  mov    rax, [rsp+56]
  adc    r12, rax
  mov    rax, [rsp+64]
  adc    r14, rax
  adc    r13, r10
  adc    r15, r8
  adc    r11, r9
  mov    rax, [rsp+80]
  mov    [rsp+48], rdx
  mov    [rsp+56], r12
  mov    [rsp+64], r14
  mov    [rsp+72], r13
  mov    [rsp+80], r15
  mov    [rsp+88], r11
  
  mov    r8, [rcx+48]
  mov    r9, [rcx+56]
  mov    r10, [rcx+64]
  mov    r11, [rcx+72]
  mov    r12, [rcx+80]
  mov    r13, [rcx+88]
  and    r8, rax
  and    r9, rax
  and    r10, rax
  and    r11, rax
  and    r12, rax
  and    r13, rax
  mov    rax, [rsp+48]
  add    r8, rax
  mov    rax, [rsp+56]
  adc    r9, rax
  mov    rax, [rsp+64]
  adc    r10, rax
  mov    rax, [rsp+72]
  adc    r11, rax
  mov    rax, [rsp+80]
  adc    r12, rax
  mov    rax, [rsp+88]
  adc    r13, rax
  mov    [rsp+48], r8
  mov    [rsp+56], r9
  mov    [rsp+72], r11
  
  // rcx[0-11] <- AL*BL
  mov    r11, [reg_p1]
  mov    rax, [reg_p2] 
  mul    r11
  xor    r9, r9
  mov    [rcx], rax        // c0
  mov    [rsp+64], r10
  mov    r8, rdx

  mov    rax, [reg_p2+8]
  mul    r11
  xor    r10, r10
  add    r8, rax
  mov    [rsp+80], r12
  adc    r9, rdx

  mov    r12, [reg_p1+8] 
  mov    rax, [reg_p2] 
  mul    r12
  add    r8, rax
  mov    [rcx+8], r8       // c1 
  adc    r9, rdx
  mov    [rsp+88], r13
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+16] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r13, [reg_p2] 
  mov    rax, [reg_p1+16] 
  mul    r13
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+8] 
  mul    r12
  add    r9, rax
  mov    [rcx+16], r9      // c2 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [reg_p2+24] 
  mul    r11
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p1+24] 
  mul    r13
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+16] 
  mul    r12
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    r14, [reg_p1+16] 
  mov    rax, [reg_p2+8] 
  mul    r14
  add    r10, rax
  mov    [rcx+24], r10     // c3 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [reg_p2+32] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+16] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p1+32] 
  mul    r13
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+24] 
  mul    r12
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    r13, [reg_p1+24] 
  mov    rax, [reg_p2+8] 
  mul    r13
  add    r8, rax
  mov    [rcx+32], r8      // c4 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+40] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+16] 
  mul    r13
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+24] 
  mul    r14
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r11, [reg_p1+40] 
  mov    rax, [reg_p2] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+32] 
  mul    r12
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r15, [reg_p1+32] 
  mov    rax, [reg_p2+8] 
  mul    r15
  add    r9, rax
  mov    [rcx+40], r9      // c5 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [reg_p2+16] 
  mul    r15
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+40] 
  mul    r12
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+32] 
  mul    r14
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+8] 
  mul    r11
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+24] 
  mul    r13
  add    r10, rax
  mov    [rcx+48], r10     // c6 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [reg_p2+40] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+16] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+32]
  mul    r13
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+24] 
  mul    r15
  add    r8, rax
  mov    [rcx+56], r8      // c7 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+24] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+32] 
  mul    r15
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+40] 
  mul    r13
  add    r9, rax
  mov    [rcx+64], r9     // c8 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [reg_p2+40]
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0

  mov    rax, [reg_p2+32] 
  mul    r11
  add    r10, rax
  mov    [rcx+72], r10     // c9 
  adc    r8, rdx
  adc    r9, 0

  mov    rax, [reg_p2+40] 
  mul    r11
  add    r8, rax
  mov    [rcx+80], r8      // c10 
  adc    r9, rdx   
  mov    [rcx+88], r9      // c11 

  // rcx[12-23] <- AH*BH
  mov    r11, [reg_p1+48]
  mov    rax, [reg_p2+48] 
  mul    r11
  xor    r9, r9
  mov    [rcx+96], rax       // c0
  mov    r8, rdx

  mov    rax, [reg_p2+56]
  mul    r11
  xor    r10, r10
  add    r8, rax
  adc    r9, rdx

  mov    r12, [reg_p1+56] 
  mov    rax, [reg_p2+48] 
  mul    r12
  add    r8, rax
  mov    [rcx+104], r8      // c1 
  adc    r9, rdx
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+64] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r13, [reg_p2+48] 
  mov    rax, [reg_p1+64] 
  mul    r13
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+56] 
  mul    r12
  add    r9, rax
  mov    [rcx+112], r9     // c2 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [reg_p2+72] 
  mul    r11
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p1+72] 
  mul    r13
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+64] 
  mul    r12
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    r14, [reg_p1+64] 
  mov    rax, [reg_p2+56] 
  mul    r14
  add    r10, rax
  mov    [rcx+120], r10    // c3 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [reg_p2+80] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+64] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    r15, [reg_p1+80] 
  mov    rax, r13 
  mul    r15
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+72] 
  mul    r12
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    r13, [reg_p1+72] 
  mov    rax, [reg_p2+56] 
  mul    r13
  add    r8, rax
  mov    [rcx+128], r8     // c4 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+88] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+64] 
  mul    r13
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+72] 
  mul    r14
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    r11, [reg_p1+88] 
  mov    rax, [reg_p2+48] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+80] 
  mul    r12
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+56] 
  mul    r15
  add    r9, rax
  mov    [rcx+136], r9     // c5 
  adc    r10, rdx 
  adc    r8, 0
  
  xor    r9, r9
  mov    rax, [reg_p2+64] 
  mul    r15
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+88] 
  mul    r12
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+80] 
  mul    r14
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+56] 
  mul    r11
  add    r10, rax
  adc    r8, rdx 
  adc    r9, 0
  
  mov    rax, [reg_p2+72] 
  mul    r13
  add    r10, rax
  mov    [rcx+144], r10    // c6 
  adc    r8, rdx 
  adc    r9, 0
  
  xor    r10, r10
  mov    rax, [reg_p2+88] 
  mul    r14
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+64] 
  mul    r11
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+80]
  mul    r13
  add    r8, rax
  adc    r9, rdx 
  adc    r10, 0
  
  mov    rax, [reg_p2+72] 
  mul    r15
  add    r8, rax
  mov    [rcx+152], r8     // c7 
  adc    r9, rdx 
  adc    r10, 0
  
  xor    r8, r8
  mov    rax, [reg_p2+72] 
  mul    r11
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+80] 
  mul    r15
  add    r9, rax
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+88] 
  mul    r13
  add    r9, rax
  mov    [rcx+160], r9     // c8 
  adc    r10, rdx 
  adc    r8, 0
  
  mov    rax, [reg_p2+88]
  mul    r15
  add    r10, rax
  adc    r8, rdx

  mov    rax, [reg_p2+80] 
  mul    r11
  add    r10, rax
  mov    [rcx+168], r10     // c9 
  adc    r8, rdx

  mov    rax, [reg_p2+88] 
  mul    r11
  add    r8, rax
  mov    [rcx+176], r8      // c10 
  adc    rdx, 0   
  mov    [rcx+184], rdx     // c11  
      
  // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL 
  mov    r8,  [rsp]
  sub    r8,  [rcx] 
  mov    r9,  [rsp+8]
  sbb    r9,  [rcx+8]
  mov    r10, [rsp+16]
  sbb    r10, [rcx+16]
  mov    r11, [rsp+24]
  sbb    r11, [rcx+24] 
  mov    r12, [rsp+32]
  sbb    r12, [rcx+32]
  mov    r13, [rsp+40]
  sbb    r13, [rcx+40] 
  mov    r14, [rsp+48]
  sbb    r14, [rcx+48] 
  mov    r15, [rsp+56]
  sbb    r15, [rcx+56] 
  mov    rax, [rsp+64]
  sbb    rax, [rcx+64]
  mov    rdx, [rsp+72]
  sbb    rdx, [rcx+72] 
  mov    rdi, [rsp+80]
  sbb    rdi, [rcx+80] 
  mov    rsi, [rsp+88]
  sbb    rsi, [rcx+88] 
  mov    [rsp], rsi
      
  // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
  mov    rsi, [rcx+96]
  sub    r8,  rsi 
  mov    rsi, [rcx+104]
  sbb    r9,  rsi
  mov    rsi, [rcx+112]
  sbb    r10, rsi
  mov    rsi, [rcx+120]
  sbb    r11, rsi 
  mov    rsi, [rcx+128]
  sbb    r12, rsi
  mov    rsi, [rcx+136]
  sbb    r13, rsi
  mov    rsi, [rcx+144]
  sbb    r14, rsi 
  mov    rsi, [rcx+152]
  sbb    r15, rsi 
  mov    rsi, [rcx+160]
  sbb    rax, rsi
  mov    rsi, [rcx+168]
  sbb    rdx, rsi
  mov    rsi, [rcx+176] 
  sbb    rdi, rsi
  mov    rsi, [rsp] 
  sbb    rsi, [rcx+184]
      
  // Final result
  add    r8,  [rcx+48] 
  mov    [rcx+48], r8
  adc    r9,  [rcx+56]
  mov    [rcx+56], r9
  adc    r10, [rcx+64]
  mov    [rcx+64], r10
  adc    r11, [rcx+72]
  mov    [rcx+72], r11
  adc    r12, [rcx+80]
  mov    [rcx+80], r12
  adc    r13, [rcx+88]
  mov    [rcx+88], r13
  adc    r14, [rcx+96] 
  mov    [rcx+96], r14
  adc    r15, [rcx+104] 
  mov    [rcx+104], r15
  adc    rax, [rcx+112]
  mov    [rcx+112], rax
  adc    rdx, [rcx+120]
  mov    [rcx+120], rdx
  adc    rdi, [rcx+128]
  mov    [rcx+128], rdi
  adc    rsi, [rcx+136]
  mov    [rcx+136], rsi  
  mov    rax, [rcx+144]
  adc    rax, 0
  mov    [rcx+144], rax
  mov    rax, [rcx+152]
  adc    rax, 0
  mov    [rcx+152], rax
  mov    rax, [rcx+160]
  adc    rax, 0
  mov    [rcx+160], rax
  mov    rax, [rcx+168]
  adc    rax, 0
  mov    [rcx+168], rax
  mov    rax, [rcx+176]
  adc    rax, 0
  mov    [rcx+176], rax
  mov    rax, [rcx+184]
  adc    rax, 0
  mov    [rcx+184], rax
    
  add    rsp, 96           // Restoring space in stack
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret

#endif


#ifdef _MULX_

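///////////////////////////////////////////////////////////////// MACRO
// Schoolbook integer multiplication (4 x 7 64-bit words, i.e. 256 x 448 bits)
// Inputs:  memory operands M0 (4 words) and M1 (7 words)
// Outputs: memory locations C, C+8, C+16 (three lowest words), and the
//          eight upper words in regs T7, T0, T1, T3, T6, T5, T2, T4 (low to high)
// Temps:   regs T8:T10, rax and rdx; the non-ADX variant also uses
//          memory locations C+32 and C+40 as scratch
/////////////////////////////////////////////////////////////////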
#ifdef _ADX_

// ADX variant of the 4 x 7 word schoolbook multiplication.
// mulx leaves the flags untouched, so two carry chains are kept alive at the
// same time: adcx propagates through CF and adox through OF. Each
// xor rax, rax zeroes rax and clears both CF and OF before a new row.
// Result on exit, low to high: C, C+8, C+16, then T7, T0, T1, T3, T6, T5, T2, T4.
// rax, rdx, T8, T9 and T10 are clobbered.
.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 
    // Row 0: M0[0] x M1[0..6]. Words 1..7 of the running sum end up in
    // T0, T2, T4, T1, T5, T3, T6 (each high half is added to the next low half).
    mov    rdx, \M0
    mulx   \T0, \T1, \M1    
    mulx   \T2, \T3, 8\M1
    mov    \C, \T1             // C0_final 
    xor    rax, rax
    mulx   \T4, \T5, 16\M1 
    adox   \T0, \T3               
    adox   \T2, \T5     
    mulx   \T1, \T3, 24\M1
    adox   \T4, \T3         
    mulx   \T5, \T6, 32\M1 
    adox   \T1, \T6        
    mulx   \T3, \T7, 40\M1    
    adox   \T5, \T7          
    mulx   \T6, \T8, 48\M1    
    adox   \T3, \T8          
    adox   \T6, rax            // fold the last carry into the top word
    
    // Row 1: add M0[1] x M1[0..6] one word higher.
    // Words 2..8 end up in T2, T4, T0, T1, T3, T6, T5.
    mov    rdx, 8\M0 
    mulx   \T8, \T7, \M1 
    xor    rax, rax 
    adcx   \T0, \T7 
    mov    8\C, \T0            // C1_final 
    adcx   \T2, \T8     
    mulx   \T7, \T8, 8\M1
    adox   \T2, \T8 
    adcx   \T4, \T7        
    mulx   \T0, \T8, 16\M1  
    adox   \T4, \T8  
    adcx   \T0, \T1            // T0 (new high half) absorbs the old word held in T1
    mulx   \T1, \T7, 24\M1   
    adcx   \T1, \T5            // T1 (new high half) absorbs the old word held in T5
    mulx   \T5, \T8, 32\M1     
    adcx   \T3, \T5   
    mulx   \T5, \T9, 40\M1    
    adcx   \T6, \T5   
    mulx   \T5, rdx, 48\M1     // low half overwrites the multiplier (last use in this row)
    adcx   \T5, rax            // end of the CF chain
        
    // Remaining low halves go in through the OF chain
    adox   \T0, \T7  
    adox   \T1, \T8  
    adox   \T3, \T9  
    adox   \T6, rdx    
    adox   \T5, rax            // end of the OF chain
    
    // Row 2: add M0[2] x M1[0..6] two words higher.
    // Words 3..9 end up in T4, T0, T1, T3, T6, T5, T2.
    mov    rdx, 16\M0 
    mulx   \T8, \T7, \M1 
    xor    rax, rax 
    adcx   \T2, \T7 
    mov    16\C, \T2           // C2_final 
    adcx   \T4, \T8     
    mulx   \T8, \T7, 8\M1
    adox   \T4, \T7 
    adcx   \T0, \T8        
    mulx   \T2, \T8, 16\M1 
    adox   \T0, \T8 
    adcx   \T1, \T2     
    mulx   \T2, \T7, 24\M1   
    adcx   \T3, \T2  
    mulx   \T2, \T8, 32\M1     
    adcx   \T6, \T2  
    mulx   \T2, \T9, 40\M1    
    adcx   \T5, \T2   
    mulx   \T2, rdx, 48\M1     // low half overwrites the multiplier (last use in this row)
    adcx   \T2, rax            // end of the CF chain; T2 becomes the new top word
         
    // Remaining low halves go in through the OF chain
    adox   \T1, \T7  
    adox   \T3, \T8   
    adox   \T6, \T9  
    adox   \T5, rdx 
    adox   \T2, rax            // end of the OF chain
    
    // Row 3: add M0[3] x M1[0..6] three words higher.
    // Word 3 is left in T7 (not stored); words 4..10 end up in
    // T0, T1, T3, T6, T5, T2, T4.
    mov    rdx, 24\M0 
    mulx   \T8, \T7, \M1
    xor    rax, rax
    adcx   \T7, \T4            // T7 = word 3 of the product
    adcx   \T0, \T8                 
    mulx   \T8, \T10, 8\M1
    adox   \T0, \T10 
    adcx   \T1, \T8        
    mulx   \T4, \T8, 16\M1
    adox   \T1, \T8  
    adcx   \T3, \T4     
    mulx   \T4, \T10, 24\M1   
    adcx   \T6, \T4  
    mulx   \T4, \T8, 32\M1     
    adcx   \T5, \T4  
    mulx   \T4, \T9, 40\M1    
    adcx   \T2, \T4   
    mulx   \T4, rdx, 48\M1     // low half overwrites the multiplier (last use)
    adcx   \T4, rax            // end of the CF chain; T4 becomes the top word
        
    // Remaining low halves go in through the OF chain
    adox   \T3, \T10  
    adox   \T6, \T8   
    adox   \T5, \T9  
    adox   \T2, rdx 
    adox   \T4, rax            // end of the OF chain
.endm

#else

// Non-ADX variant of the 4 x 7 word schoolbook multiplication.
// Only the single CF carry chain is available, so each of rows 1-3 is done in
// two passes: the high halves are added while the multiplications are issued
// (mulx does not touch the flags), the low halves are parked (two of them in
// memory at C+32 and C+40, the others in T7/T8/T9/T10/rdx) and added afterwards.
// Result on exit, low to high: C, C+8, C+16, then T7, T0, T1, T3, T6, T5, T2, T4.
// rax, rdx, T8, T9, T10 and memory at C+32, C+40 are clobbered.
.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 
    // Row 0: M0[0] x M1[0..6]. Words 1..7 of the running sum end up in
    // T0, T2, T4, T1, T5, T3, T6.
    mov    rdx, \M0
    mulx   \T0, \T1, \M1    
    mulx   \T2, \T3, 8\M1
    mov    \C, \T1             // C0_final 
    xor    rax, rax            // rax = 0, used as the zero addend for final carries
    mulx   \T4, \T5, 16\M1 
    add    \T0, \T3               
    adc    \T2, \T5     
    mulx   \T1, \T3, 24\M1
    adc    \T4, \T3         
    mulx   \T5, \T6, 32\M1 
    adc    \T1, \T6        
    mulx   \T3, \T7, 40\M1    
    adc    \T5, \T7          
    mulx   \T6, \T8, 48\M1    
    adc    \T3, \T8          
    adc    \T6, rax   
    
    // Row 1, pass 1: add the high halves of M0[1] x M1[0..6].
    mov    rdx, 8\M0 
    mulx   \T8, \T7, \M1 
    add    \T0, \T7 
    mov    8\C, \T0            // C1_final 
    adc    \T2, \T8     
    mulx   \T7, \T8, 8\M1
    mov    32\C, \T8           // park low half of M0[1] x M1[1]
    adc    \T4, \T7        
    mulx   \T0, \T8, 16\M1   
    mov    40\C, \T8           // park low half of M0[1] x M1[2]
    adc    \T0, \T1            // T0 (new high half) absorbs the old word held in T1
    mulx   \T1, \T7, 24\M1   
    adc    \T1, \T5            // T1 (new high half) absorbs the old word held in T5
    mulx   \T5, \T8, 32\M1     
    adc    \T3, \T5   
    mulx   \T5, \T9, 40\M1    
    adc    \T6, \T5   
    mulx   \T5, rdx, 48\M1     // low half overwrites the multiplier (last use in this row)
    adc    \T5, rax 
        
    // Row 1, pass 2: add the parked low halves.
    // Words 2..8 end up in T2, T4, T0, T1, T3, T6, T5.
    xor    rax, rax
    add    \T2, 32\C 
    adc    \T4, 40\C  
    adc    \T0, \T7  
    adc    \T1, \T8  
    adc    \T3, \T9  
    adc    \T6, rdx    
    adc    \T5, rax        
    
    // Row 2, pass 1: add the high halves of M0[2] x M1[0..6].
    mov    rdx, 16\M0 
    mulx   \T8, \T7, \M1
    add    \T2, \T7 
    mov    16\C, \T2           // C2_final 
    adc    \T4, \T8     
    mulx   \T8, \T7, 8\M1
    mov    32\C, \T7           // park low half of M0[2] x M1[1]
    adc    \T0, \T8        
    mulx   \T2, \T8, 16\M1   
    mov    40\C, \T8           // park low half of M0[2] x M1[2]
    adc    \T1, \T2     
    mulx   \T2, \T7, 24\M1   
    adc    \T3, \T2  
    mulx   \T2, \T8, 32\M1     
    adc    \T6, \T2  
    mulx   \T2, \T9, 40\M1    
    adc    \T5, \T2   
    mulx   \T2, rdx, 48\M1     // low half overwrites the multiplier (last use in this row)
    adc    \T2, rax 
        
    // Row 2, pass 2: add the parked low halves.
    // Words 3..9 end up in T4, T0, T1, T3, T6, T5, T2.
    xor    rax, rax
    add    \T4, 32\C 
    adc    \T0, 40\C  
    adc    \T1, \T7  
    adc    \T3, \T8   
    adc    \T6, \T9  
    adc    \T5, rdx 
    adc    \T2, rax        
    
    // Row 3, pass 1: add the high halves of M0[3] x M1[0..6].
    mov    rdx, 24\M0 
    mulx   \T8, \T7, \M1
    add    \T7, \T4            // T7 = word 3 of the product (left in a register)
    adc    \T0, \T8                 
    mulx   \T8, \T10, 8\M1
    mov    32\C, \T10          // park low half of M0[3] x M1[1]
    adc    \T1, \T8        
    mulx   \T4, \T8, 16\M1   
    mov    40\C, \T8           // park low half of M0[3] x M1[2]
    adc    \T3, \T4     
    mulx   \T4, \T10, 24\M1   
    adc    \T6, \T4  
    mulx   \T4, \T8, 32\M1     
    adc    \T5, \T4  
    mulx   \T4, \T9, 40\M1    
    adc    \T2, \T4   
    mulx   \T4, rdx, 48\M1     // low half overwrites the multiplier (last use)
    adc    \T4, rax 
        
    // Row 3, pass 2: add the parked low halves.
    // Words 4..10 end up in T0, T1, T3, T6, T5, T2, T4.
    xor    rax, rax
    add    \T0, 32\C 
    adc    \T1, 40\C  
    adc    \T3, \T10  
    adc    \T6, \T8   
    adc    \T5, \T9  
    adc    \T2, rdx 
    adc    \T4, rax      
.endm

#endif

  
//**************************************************************************************
//  Montgomery reduction
//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015  
//  Operation: c [reg_p2] = a [reg_p1]
//  NOTE: a=c is not allowed
//  NOTE: the input a (24 words) is modified: intermediate sums are written back
//        to a[5..23]. Parts of c are used as scratch before the result is stored.
//
//  Structure: three passes, each multiplying four words of a by the seven words
//  of p751p1 starting at word 5 (offset +40) with MUL256x448_SCHOOL and adding
//  the 11-word product back into a, 5 words above the multiplier words.
//  The macro leaves product words 0-2 in [reg_p2+48..64] and words 3-10 in
//  rbp, r8, r9, r10, r11, r12, r13, r14.
//************************************************************************************** 
.global fmt(rdc751_asm)
fmt(rdc751_asm):
    // Save the callee-saved registers used below
    push   rbx
    push   rbp
    push   r12
    push   r13 
    push   r14 
    push   r15  

    // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 
    MUL256x448_SCHOOL [reg_p1], [rip+fmt(p751p1)+40], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15     

    // Pass 1 accumulation: a[5..15] += product, carry rippled up to a[23]
    xor    r15, r15                // r15 = 0, will receive a[16] plus the carry
    mov    rax, [reg_p2+48]
    mov    rdx, [reg_p2+56]
    mov    rbx, [reg_p2+64]
    add    rax, [reg_p1+40]  
    adc    rdx, [reg_p1+48]  
    adc    rbx, [reg_p1+56]
    mov    [reg_p1+40], rax        // mov does not change CF, the chain continues below
    mov    [reg_p1+48], rdx 
    mov    [reg_p1+56], rbx  
    adc    rbp, [reg_p1+64]
    adc    r8, [reg_p1+72]  
    adc    r9, [reg_p1+80]  
    adc    r10, [reg_p1+88]   
    adc    r11, [reg_p1+96]   
    adc    r12, [reg_p1+104]   
    adc    r13, [reg_p1+112]   
    adc    r14, [reg_p1+120]  
    adc    r15, [reg_p1+128]
    mov    [reg_p1+64], rbp   
    mov    [reg_p1+72], r8  
    mov    [reg_p1+80], r9  
    mov    [reg_p1+88], r10  
    mov    [reg_p1+96], r11  
    mov    [reg_p1+104], r12  
    mov    [reg_p1+112], r13  
    mov    [reg_p1+120], r14
    mov    [reg_p1+128], r15   
    // Ripple the carry out of a[16] through a[17..23]
    mov    r8, [reg_p1+136]  
    mov    r9, [reg_p1+144]  
    mov    r10, [reg_p1+152]
    mov    r11, [reg_p1+160]
    mov    r12, [reg_p1+168]
    mov    r13, [reg_p1+176]
    mov    r14, [reg_p1+184] 
    adc    r8, 0
    adc    r9, 0
    adc    r10, 0
    adc    r11, 0
    adc    r12, 0
    adc    r13, 0
    adc    r14, 0  
    mov    [reg_p1+136], r8  
    mov    [reg_p1+144], r9  
    mov    [reg_p1+152], r10  
    mov    [reg_p1+160], r11  
    mov    [reg_p1+168], r12  
    mov    [reg_p1+176], r13  
    mov    [reg_p1+184], r14

    // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 
    // (a[5..7] were updated by pass 1)
    MUL256x448_SCHOOL [reg_p1+32], [rip+fmt(p751p1)+40], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 

    // Pass 2 accumulation: a[9..19] += product, carry rippled up to a[23].
    // The word at position 12 is already final and goes straight to c[0].
    xor    r15, r15                // r15 = 0, will receive a[20] plus the carry
    mov    rax, [reg_p2+48]
    mov    rdx, [reg_p2+56]
    mov    rbx, [reg_p2+64]
    add    rax, [reg_p1+72]  
    adc    rdx, [reg_p1+80]  
    adc    rbx, [reg_p1+88]
    mov    [reg_p1+72], rax
    mov    [reg_p1+80], rdx 
    mov    [reg_p1+88], rbx
    adc    rbp, [reg_p1+96]
    adc    r8, [reg_p1+104]  
    adc    r9, [reg_p1+112]  
    adc    r10, [reg_p1+120]   
    adc    r11, [reg_p1+128]  
    adc    r12, [reg_p1+136]   
    adc    r13, [reg_p1+144]   
    adc    r14, [reg_p1+152]  
    adc    r15, [reg_p1+160]
    mov    [reg_p2], rbp       // Final result c0    
    mov    [reg_p1+104], r8  
    mov    [reg_p1+112], r9  
    mov    [reg_p1+120], r10  
    mov    [reg_p1+128], r11   
    mov    [reg_p1+136], r12  
    mov    [reg_p1+144], r13  
    mov    [reg_p1+152], r14
    mov    [reg_p1+160], r15
    // Ripple the carry out of a[20] through a[21..23]
    mov    r12, [reg_p1+168]
    mov    r13, [reg_p1+176]
    mov    r14, [reg_p1+184] 
    adc    r12, 0
    adc    r13, 0
    adc    r14, 0   
    mov    [reg_p1+168], r12  
    mov    [reg_p1+176], r13  
    mov    [reg_p1+184], r14 

    // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 
    // (a[8..11] were updated by passes 1 and 2)
    MUL256x448_SCHOOL [reg_p1+64], [rip+fmt(p751p1)+40], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 

    // Final result c1:c11
    // c[1..11] = a[13..23] + product; the carry out of the top word is not kept
    mov    rax, [reg_p2+48]
    mov    rdx, [reg_p2+56]
    mov    rbx, [reg_p2+64] 
    add    rax, [reg_p1+104] 
    adc    rdx, [reg_p1+112] 
    adc    rbx, [reg_p1+120]
    mov    [reg_p2+8], rax
    mov    [reg_p2+16], rdx
    mov    [reg_p2+24], rbx
    adc    rbp, [reg_p1+128] 
    adc    r8, [reg_p1+136]  
    adc    r9, [reg_p1+144] 
    adc    r10, [reg_p1+152]
    adc    r11, [reg_p1+160]
    adc    r12, [reg_p1+168]
    adc    r13, [reg_p1+176]
    adc    r14, [reg_p1+184]
    mov    [reg_p2+32], rbp
    mov    [reg_p2+40], r8
    mov    [reg_p2+48], r9
    mov    [reg_p2+56], r10
    mov    [reg_p2+64], r11
    mov    [reg_p2+72], r12
    mov    [reg_p2+80], r13
    mov    [reg_p2+88], r14 

    // Restore callee-saved registers in reverse order
    pop    r15
    pop    r14
    pop    r13
    pop    r12
    pop    rbp
    pop    rbx
   ret

  #else
  
//***********************************************************************
//  Montgomery reduction
//  Based on comba method
//  Operation: c [reg_p2] = a [reg_p1]
//  NOTE: a=c is not allowed
//
//  Notation used in the comments below:
//    a[i]     word i of the 24-word input (a0..a4 are also used as multipliers)
//    Pk       word k of p751p1; only P5..P11 are referenced here (presumably
//             P0..P4 are zero; the constant is defined outside this routine)
//    t5..t11  low words of columns 5..11; stored temporarily in c[5..11] and
//             reloaded as multipliers for the later columns
//    c0..c11  final result words, i.e. the low words of columns 12..23
//  Each column sum is accumulated in a rotating three-register window
//  (r8, r9, r10): the register that held the previous low word is zeroed
//  and becomes the new top word.
//  Multiplier registers: r11 = a0 then t6, r12 = a1 then t7, r13 = a2 then t8,
//  r14 = a3 then t9, r15 = a4 then t10, rcx = t5 then t11.
//  The input a is only read by this version.
//*********************************************************************** 
.global fmt(rdc751_asm)
fmt(rdc751_asm):
  // Save the callee-saved registers used below
  push   r12
  push   r13 
  push   r14 
  push   r15 

  // Column 5: a0*P5 + a[5]
  mov    r11, [reg_p1]
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r11
  xor    r8, r8
  add    rax, [reg_p1+40]
  mov    [reg_p2+40], rax    // t5
  adc    r8, rdx
  
  // Column 6: a0*P6 + a1*P5 + a[6]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r11
  xor    r10, r10
  add    r8, rax
  adc    r9, rdx

  mov    r12, [reg_p1+8]
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+48]
  mov    [reg_p2+48], r8    // t6
  adc    r9, 0
  adc    r10, 0
  
  // Column 7: a0*P7 + a1*P6 + a2*P5 + a[7]
  xor    r8, r8
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r11
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r12
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    r13, [reg_p1+16]
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  add    r9, [reg_p1+56]
  mov    [reg_p2+56], r9    // t7
  adc    r10, 0
  adc    r8, 0
  
  // Column 8: a0*P8 + a1*P7 + a2*P6 + a3*P5 + a[8]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r11
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r12
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r13
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    r14, [reg_p1+24]
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  add    r10, [reg_p1+64]
  mov    [reg_p2+64], r10   // t8
  adc    r8, 0
  adc    r9, 0
  
  // Column 9: a0*P9 + a1*P8 + a2*P7 + a3*P6 + a4*P5 + a[9]
  xor    r10, r10
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r11
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r13
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r14
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    r15, [reg_p1+32]
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+72]
  mov    [reg_p2+72], r8    // t9
  adc    r9, 0
  adc    r10, 0
  
  // Column 10: a0*P10 + a1*P9 + a2*P8 + a3*P7 + a4*P6 + t5*P5 + a[10]
  xor    r8, r8
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r11
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r12
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r14
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r15
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rcx, [reg_p2+40]    // rcx = t5
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  add    r9, [reg_p1+80]
  mov    [reg_p2+80], r9    // t10
  adc    r10, 0
  adc    r8, 0
  
  // Column 11: a0*P11 + a1*P10 + a2*P9 + a3*P8 + a4*P7 + t5*P6 + t6*P5 + a[11]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r11
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r12
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r13
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    rcx
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    r11, [reg_p2+48]    // r11 = t6 (a0 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r11
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  add    r10, [reg_p1+88]
  mov    [reg_p2+88], r10    // t11
  adc    r8, 0
  adc    r9, 0
  
  // Column 12: a1*P11 + a2*P10 + a3*P9 + a4*P8 + t5*P7 + t6*P6 + t7*P5 + a[12]
  xor    r10, r10
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r13
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r14
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    rcx
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r11
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    r12, [reg_p2+56]    // r12 = t7 (a1 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+96]
  mov    [reg_p2], r8        // c0 (final)
  adc    r9, 0
  adc    r10, 0
  
  // Column 13: a2*P11 + a3*P10 + a4*P9 + t5*P8 + t6*P7 + t7*P6 + t8*P5 + a[13]
  xor    r8, r8
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0

  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r14
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0

  mov    rax, [rip+fmt(p751p1)+72]
  mul    r15
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0

  mov    rax, [rip+fmt(p751p1)+64]
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0

  mov    rax, [rip+fmt(p751p1)+56]
  mul    r11
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0

  mov    rax, [rip+fmt(p751p1)+48]
  mul    r12
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    r13, [reg_p2+64]    // r13 = t8 (a2 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40]
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  add    r9, [reg_p1+104]
  mov    [reg_p2+8], r9      // c1 (final)
  adc    r10, 0
  adc    r8, 0
  
  // Column 14: a3*P11 + a4*P10 + t5*P9 + t6*P8 + t7*P7 + t8*P6 + t9*P5 + a[14]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    rcx
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r11
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r12
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r13
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    r14, [reg_p2+72]    // r14 = t9 (a3 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  add    r10, [reg_p1+112]
  mov    [reg_p2+16], r10    // c2 (final)
  adc    r8, 0
  adc    r9, 0
  
  // Column 15: a4*P11 + t5*P10 + t6*P9 + t7*P8 + t8*P7 + t9*P6 + t10*P5 + a[15]
  xor    r10, r10
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    rcx
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r11
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r13
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r14
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    r15, [reg_p2+80]    // r15 = t10 (a4 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+120]
  mov    [reg_p2+24], r8     // c3 (final)
  adc    r9, 0
  adc    r10, 0
  
  // Column 16: t5*P11 + t6*P10 + t7*P9 + t8*P8 + t9*P7 + t10*P6 + t11*P5 + a[16]
  xor    r8, r8
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r11
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r12
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r14
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    r15
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rcx, [reg_p2+88]    // rcx = t11 (t5 is no longer needed)
  mov    rax, [rip+fmt(p751p1)+40] 
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  add    r9, [reg_p1+128]
  mov    [reg_p2+32], r9     // c4 (final)
  adc    r10, 0
  adc    r8, 0
  
  // Column 17: t6*P11 + t7*P10 + t8*P9 + t9*P8 + t10*P7 + t11*P6 + a[17]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r11
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r12
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r13
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+48] 
  mul    rcx
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  add    r10, [reg_p1+136]
  mov    [reg_p2+40], r10    // c5 (final, overwrites the temporary t5)
  adc    r8, 0
  adc    r9, 0
  
  // Column 18: t7*P11 + t8*P10 + t9*P9 + t10*P8 + t11*P7 + a[18]
  xor    r10, r10
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r12
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r13
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r14
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  
  mov    rax, [rip+fmt(p751p1)+56] 
  mul    rcx
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+144]
  mov    [reg_p2+48], r8     // c6 (final, overwrites the temporary t6)
  adc    r9, 0
  adc    r10, 0
  
  // Column 19: t8*P11 + t9*P10 + t10*P9 + t11*P8 + a[19]
  xor    r8, r8
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r13
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r14
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    r15
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  
  mov    rax, [rip+fmt(p751p1)+64] 
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  adc    r8, 0
  add    r9, [reg_p1+152]
  mov    [reg_p2+56], r9     // c7 (final, overwrites the temporary t7)
  adc    r10, 0
  adc    r8, 0
  
  // Column 20: t9*P11 + t10*P10 + t11*P9 + a[20]
  xor    r9, r9
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r14
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+80] 
  mul    r15
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  
  mov    rax, [rip+fmt(p751p1)+72] 
  mul    rcx
  add    r10, rax
  adc    r8, rdx
  adc    r9, 0
  add    r10, [reg_p1+160]
  mov    [reg_p2+64], r10    // c8 (final, overwrites the temporary t8)
  adc    r8, 0
  adc    r9, 0
  
  // Column 21: t10*P11 + t11*P10 + a[21]
  xor    r10, r10
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    r15
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0

  mov    rax, [rip+fmt(p751p1)+80] 
  mul    rcx
  add    r8, rax
  adc    r9, rdx
  adc    r10, 0
  add    r8, [reg_p1+168]    // + a[21]
  mov    [reg_p2+72], r8     // c9 (final, overwrites the temporary t9)
  adc    r9, 0
  adc    r10, 0
  
  // Column 22: t11*P11 + a[22]; only two accumulator words (r9, r10) remain
  mov    rax, [rip+fmt(p751p1)+88] 
  mul    rcx
  add    r9, rax
  adc    r10, rdx
  add    r9, [reg_p1+176]    // + a[22]
  mov    [reg_p2+80], r9     // c10 (final)
  adc    r10, 0  
  // Column 23: remaining carry word + a[23]; any carry out of this add is not kept
  add    r10, [reg_p1+184]   // + a[23]
  mov    [reg_p2+88], r10    // c11 (final)

  // Restore callee-saved registers in reverse order
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret

  #endif


//***********************************************************************
//  751-bit multiprecision addition
//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
//  All three operands are 12 x 64-bit words. The sum is processed in two
//  groups of six words linked by the carry flag (mov does not alter CF).
//  No reduction is performed and the carry out of the top word is not stored.
//  Only caller-saved registers are used: r8-r11, rax, rcx.
//*********************************************************************** 
.global fmt(mp_add751_asm)
fmt(mp_add751_asm):  
  // Words 0-5: fetch each word of a and add the matching word of b right away
  mov    r8, [reg_p1]
  add    r8, [reg_p2]
  mov    r9, [reg_p1+8]
  adc    r9, [reg_p2+8]
  mov    r10, [reg_p1+16]
  adc    r10, [reg_p2+16]
  mov    r11, [reg_p1+24]
  adc    r11, [reg_p2+24]
  mov    rax, [reg_p1+32]
  adc    rax, [reg_p2+32]
  mov    rcx, [reg_p1+40]
  adc    rcx, [reg_p2+40]

  // All six inputs of the group have been read; now write c[0-5]
  mov    [reg_p3], r8
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], rax
  mov    [reg_p3+40], rcx

  // Words 6-11: same pattern, continuing the carry chain from word 5
  mov    r8, [reg_p1+48]
  adc    r8, [reg_p2+48]
  mov    r9, [reg_p1+56]
  adc    r9, [reg_p2+56]
  mov    r10, [reg_p1+64]
  adc    r10, [reg_p2+64]
  mov    r11, [reg_p1+72]
  adc    r11, [reg_p2+72]
  mov    rax, [reg_p1+80]
  adc    rax, [reg_p2+80]
  mov    rcx, [reg_p1+88]
  adc    rcx, [reg_p2+88]

  // Write c[6-11]
  mov    [reg_p3+48], r8
  mov    [reg_p3+56], r9
  mov    [reg_p3+64], r10
  mov    [reg_p3+72], r11
  mov    [reg_p3+80], rax
  mov    [reg_p3+88], rcx
  ret


//***********************************************************************
//  2x751-bit multiprecision subtraction/addition
//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p751*2^768
//  a, b and c are 24 x 64-bit words. The correction adds the 12 words of
//  p751 to the upper 12 words of c (c[12..23]); it is applied through an
//  all-ones/zero mask, so there is no branch on the borrow.
//  reg_p1 (rdi) and reg_p2 (rsi) are reused as scratch once a and b have
//  been fully read.
//*********************************************************************** 
.global fmt(mp_subadd751x2_asm)
fmt(mp_subadd751x2_asm):
  // Save the callee-saved registers used below
  push   r12
  push   r13 
  push   r14 
  push   r15
  push   rbx
  xor    rax, rax              // rax = 0, turned into the borrow mask below

  // c[0-4] = a[0-4] - b[0-4]
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  mov    r10, [reg_p1+16]
  mov    r11, [reg_p1+24]
  mov    rcx, [reg_p1+32]
  sub    r8, [reg_p2] 
  sbb    r9, [reg_p2+8] 
  sbb    r10, [reg_p2+16] 
  sbb    r11, [reg_p2+24] 
  sbb    rcx, [reg_p2+32] 
  mov    [reg_p3], r8          // mov does not change CF, the borrow chain continues
  mov    [reg_p3+8], r9
  mov    [reg_p3+16], r10
  mov    [reg_p3+24], r11
  mov    [reg_p3+32], rcx

  // c[5-9] = a[5-9] - b[5-9] - borrow
  mov    r8, [reg_p1+40]
  mov    r9, [reg_p1+48]
  mov    r10, [reg_p1+56] 
  mov    r11, [reg_p1+64]
  mov    rcx, [reg_p1+72] 
  sbb    r8, [reg_p2+40] 
  sbb    r9, [reg_p2+48] 
  sbb    r10, [reg_p2+56]
  sbb    r11, [reg_p2+64] 
  sbb    rcx, [reg_p2+72]
  mov    [reg_p3+40], r8
  mov    [reg_p3+48], r9
  mov    [reg_p3+56], r10
  mov    [reg_p3+64], r11
  mov    [reg_p3+72], rcx
  
  // c[10-14] = a[10-14] - b[10-14] - borrow
  mov    r8, [reg_p1+80]
  mov    r9, [reg_p1+88] 
  mov    r10, [reg_p1+96]
  mov    r11, [reg_p1+104]
  mov    rcx, [reg_p1+112]
  sbb    r8, [reg_p2+80]
  sbb    r9, [reg_p2+88]
  sbb    r10, [reg_p2+96] 
  sbb    r11, [reg_p2+104] 
  sbb    rcx, [reg_p2+112]
  mov    [reg_p3+80], r8 
  mov    [reg_p3+88], r9
  mov    [reg_p3+96], r10
  mov    [reg_p3+104], r11
  mov    [reg_p3+112], rcx
  
  // c[15-19] = a[15-19] - b[15-19] - borrow
  mov    r8, [reg_p1+120]
  mov    r9, [reg_p1+128]
  mov    r10, [reg_p1+136]
  mov    r11, [reg_p1+144]
  mov    rcx, [reg_p1+152]
  sbb    r8, [reg_p2+120] 
  sbb    r9, [reg_p2+128] 
  sbb    r10, [reg_p2+136] 
  sbb    r11, [reg_p2+144] 
  sbb    rcx, [reg_p2+152]
  mov    [reg_p3+120], r8
  mov    [reg_p3+128], r9
  mov    [reg_p3+136], r10
  mov    [reg_p3+144], r11
  mov    [reg_p3+152], rcx  
   
  // Words 20-23 stay in r8-r11 until the correction has been added
  mov    r8, [reg_p1+160]
  mov    r9, [reg_p1+168] 
  mov    r10, [reg_p1+176]  
  mov    r11, [reg_p1+184]
  sbb    r8, [reg_p2+160]
  sbb    r9, [reg_p2+168]
  sbb    r10, [reg_p2+176]
  sbb    r11, [reg_p2+184]
  sbb    rax, 0                // rax = all ones if the subtraction borrowed, else 0
  
  // Add p751 anded with the mask in rax 
  // Only p751 words 0 and 5..11 are loaded; word 0 is reused for words 1-4
  // (presumably the five low words of p751 are equal; the constant is
  // defined outside this routine). rdi/rsi no longer hold the a/b pointers.
  mov    r12, [rip+fmt(p751)]
  mov    r13, [rip+fmt(p751)+40]
  mov    r14, [rip+fmt(p751)+48]
  mov    r15, [rip+fmt(p751)+56]
  mov    rdi, [rip+fmt(p751)+64]
  mov    rsi, [rip+fmt(p751)+72]
  mov    rbx, [rip+fmt(p751)+80]
  mov    rcx, [rip+fmt(p751)+88]
  and    r12, rax
  and    r13, rax
  and    r14, rax
  and    r15, rax
  and    rdi, rax
  and    rsi, rax
  and    rbx, rax
  and    rcx, rax
  // c[12..15] += masked p751 word 0 (rax is free again after the masking)
  mov    rax, [reg_p3+96]
  add    rax, r12
  mov    [reg_p3+96], rax
  mov    rax, [reg_p3+104]
  adc    rax, r12
  mov    [reg_p3+104], rax
  mov    rax, [reg_p3+112]
  adc    rax, r12
  mov    [reg_p3+112], rax
  mov    rax, [reg_p3+120]
  adc    rax, r12
  mov    [reg_p3+120], rax
  // c[16] += masked word 0, c[17] += masked word 5
  adc    r12, [reg_p3+128]
  adc    r13, [reg_p3+136]
  mov    [reg_p3+128], r12
  mov    [reg_p3+136], r13
  // c[18..23] += masked words 6..11
  mov    r12, [reg_p3+144]
  mov    r13, [reg_p3+152]
  adc    r12, r14
  adc    r13, r15
  adc    r8, rdi
  adc    r9, rsi
  adc    r10, rbx
  adc    r11, rcx
  
  mov    [reg_p3+144], r12
  mov    [reg_p3+152], r13
  mov    [reg_p3+160], r8
  mov    [reg_p3+168], r9
  mov    [reg_p3+176], r10
  mov    [reg_p3+184], r11
  // Restore callee-saved registers in reverse order
  pop    rbx
  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret


//***********************************************************************
//  Double 2x751-bit multiprecision subtraction
//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
//
//  The operands are walked as 24 64-bit words (byte offsets 0..184),
//  eight words at a time, with the working words held in r8-r15.
//  Two separate borrow chains are maintained, one for the subtraction
//  of a and one for the subtraction of b. After each group the borrow
//  of a is parked in al and the borrow of b in cl (setc); before the
//  next group each is moved back into CF (bt on bit 0).
//  The borrows leaving the last group are not recorded.
//  r12-r15 are saved and restored; al, cl, r8-r11 and flags are overwritten.
//***********************************************************************

// r8-r15 <- eight consecutive words of c, starting at byte offset off.
.macro DBLSUB751_LOAD8 off
  mov    r8,  [reg_p3+\off]
  mov    r9,  [reg_p3+\off+8]
  mov    r10, [reg_p3+\off+16]
  mov    r11, [reg_p3+\off+24]
  mov    r12, [reg_p3+\off+32]
  mov    r13, [reg_p3+\off+40]
  mov    r14, [reg_p3+\off+48]
  mov    r15, [reg_p3+\off+56]
.endm

// r8-r15 -= eight words at [src+off]. The instruction used for the lowest
// word is given by first: sub starts a new chain, sbb consumes the borrow in CF.
.macro DBLSUB751_SUB8 first, src, off
  \first r8,  [\src+\off]
  sbb    r9,  [\src+\off+8]
  sbb    r10, [\src+\off+16]
  sbb    r11, [\src+\off+24]
  sbb    r12, [\src+\off+32]
  sbb    r13, [\src+\off+40]
  sbb    r14, [\src+\off+48]
  sbb    r15, [\src+\off+56]
.endm

// Eight consecutive words of c, starting at byte offset off <- r8-r15.
.macro DBLSUB751_STORE8 off
  mov    [reg_p3+\off],    r8
  mov    [reg_p3+\off+8],  r9
  mov    [reg_p3+\off+16], r10
  mov    [reg_p3+\off+24], r11
  mov    [reg_p3+\off+32], r12
  mov    [reg_p3+\off+40], r13
  mov    [reg_p3+\off+48], r14
  mov    [reg_p3+\off+56], r15
.endm

.global fmt(mp_dblsub751x2_asm)
fmt(mp_dblsub751x2_asm):
  push   r12
  push   r13
  push   r14
  push   r15

  // Words 0-7: both chains begin without an incoming borrow.
  DBLSUB751_LOAD8  0
  DBLSUB751_SUB8   sub, reg_p1, 0
  setc   al                            // al = borrow of the a chain
  DBLSUB751_SUB8   sub, reg_p2, 0
  setc   cl                            // cl = borrow of the b chain
  DBLSUB751_STORE8 0

  // Words 8-15: resume each chain from its saved borrow.
  DBLSUB751_LOAD8  64
  bt     rax, 0                        // CF = saved borrow of the a chain
  DBLSUB751_SUB8   sbb, reg_p1, 64
  setc   al
  bt     rcx, 0                        // CF = saved borrow of the b chain
  DBLSUB751_SUB8   sbb, reg_p2, 64
  setc   cl
  DBLSUB751_STORE8 64

  // Words 16-23: last group, outgoing borrows are not saved.
  DBLSUB751_LOAD8  128
  bt     rax, 0
  DBLSUB751_SUB8   sbb, reg_p1, 128
  bt     rcx, 0
  DBLSUB751_SUB8   sbb, reg_p2, 128
  DBLSUB751_STORE8 128

  pop    r15
  pop    r14
  pop    r13
  pop    r12
  ret

// The helper macros are local to this routine.
.purgem DBLSUB751_LOAD8
.purgem DBLSUB751_SUB8
.purgem DBLSUB751_STORE8
