Text file src/crypto/elliptic/p256_asm_amd64.s

Documentation: crypto/elliptic

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This file contains a constant-time, 64-bit assembly implementation of
     6  // P256. The optimizations performed here are described in detail in:
     7  // S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
     8  //                          256-bit primes"
     9  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10  // https://eprint.iacr.org/2013/816.pdf
    11  
    12  #include "textflag.h"
    13  
    14  #define res_ptr DI
    15  #define x_ptr SI
    16  #define y_ptr CX
    17  
    18  #define acc0 R8
    19  #define acc1 R9
    20  #define acc2 R10
    21  #define acc3 R11
    22  #define acc4 R12
    23  #define acc5 R13
    24  #define t0 R14
    25  #define t1 R15
    26  
    27  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    28  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
    29  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    30  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    31  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    32  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    33  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    34  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    35  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    36  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    37  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    38  GLOBL p256const0<>(SB), 8, $8
    39  GLOBL p256const1<>(SB), 8, $8
    40  GLOBL p256ordK0<>(SB), 8, $8
    41  GLOBL p256ord<>(SB), 8, $32
    42  GLOBL p256one<>(SB), 8, $32
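        // p256const0 and p256const1 are limbs 1 and 3 of the field prime
        // p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (limbs 0 and 2 are 2^64-1 and 0, so
        // they never need to be loaded). p256ord holds the group order n as
        // little-endian 64-bit limbs, p256ordK0 = -n^-1 mod 2^64 is the Montgomery
        // constant used when reducing modulo n, and p256one = 2^256 mod p, i.e. 1 in
        // the Montgomery domain. The values can be cross-checked with a math/big
        // sketch along these lines (illustrative only, not part of this file):
        //
        //    p, _ := new(big.Int).SetString("ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
        //    n, _ := new(big.Int).SetString("ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16)
        //    r64 := new(big.Int).Lsh(big.NewInt(1), 64)
        //    one := new(big.Int).Mod(new(big.Int).Lsh(big.NewInt(1), 256), p) // limbs of p256one
        //    k0 := new(big.Int).ModInverse(n, r64)
        //    k0.Neg(k0).Mod(k0, r64)                                          // p256ordK0
        //    fmt.Printf("%x\n%x\n", one, k0)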
    43  
    44  /* ---------------------------------------*/
    45  // func p256LittleToBig(res []byte, in []uint64)
    46  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    47  	JMP ·p256BigToLittle(SB)
    48  /* ---------------------------------------*/
    49  // func p256BigToLittle(res []uint64, in []byte)
    50  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    51  	MOVQ res+0(FP), res_ptr
    52  	MOVQ in+24(FP), x_ptr
    53  
    54  	MOVQ (8*0)(x_ptr), acc0
    55  	MOVQ (8*1)(x_ptr), acc1
    56  	MOVQ (8*2)(x_ptr), acc2
    57  	MOVQ (8*3)(x_ptr), acc3
    58  
    59  	BSWAPQ acc0
    60  	BSWAPQ acc1
    61  	BSWAPQ acc2
    62  	BSWAPQ acc3
    63  
    64  	MOVQ acc3, (8*0)(res_ptr)
    65  	MOVQ acc2, (8*1)(res_ptr)
    66  	MOVQ acc1, (8*2)(res_ptr)
    67  	MOVQ acc0, (8*3)(res_ptr)
    68  
    69  	RET
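        // A rough Go equivalent of the two conversions above (illustrative only; the
        // same assembly serves both directions because res and in live at the same
        // frame offsets in either signature, and the byte/limb swap is an involution):
        //
        //    func p256BigToLittleRef(res []uint64, in []byte) {
        //        for i := 0; i < 4; i++ {
        //            // limb 0 is the least significant 8 bytes of the big-endian input
        //            res[i] = binary.BigEndian.Uint64(in[(3-i)*8:])
        //        }
        //    }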
    70  /* ---------------------------------------*/
    71  // func p256MovCond(res, a, b []uint64, cond int)
    72  // If cond == 0 res=b, else res=a
    73  TEXT ·p256MovCond(SB),NOSPLIT,$0
    74  	MOVQ res+0(FP), res_ptr
    75  	MOVQ a+24(FP), x_ptr
    76  	MOVQ b+48(FP), y_ptr
    77  	MOVQ cond+72(FP), X12
    78  
    79  	PXOR X13, X13
    80  	PSHUFD $0, X12, X12
    81  	PCMPEQL X13, X12
    82  
    83  	MOVOU X12, X0
    84  	MOVOU (16*0)(x_ptr), X6
    85  	PANDN X6, X0
    86  	MOVOU X12, X1
    87  	MOVOU (16*1)(x_ptr), X7
    88  	PANDN X7, X1
    89  	MOVOU X12, X2
    90  	MOVOU (16*2)(x_ptr), X8
    91  	PANDN X8, X2
    92  	MOVOU X12, X3
    93  	MOVOU (16*3)(x_ptr), X9
    94  	PANDN X9, X3
    95  	MOVOU X12, X4
    96  	MOVOU (16*4)(x_ptr), X10
    97  	PANDN X10, X4
    98  	MOVOU X12, X5
    99  	MOVOU (16*5)(x_ptr), X11
   100  	PANDN X11, X5
   101  
   102  	MOVOU (16*0)(y_ptr), X6
   103  	MOVOU (16*1)(y_ptr), X7
   104  	MOVOU (16*2)(y_ptr), X8
   105  	MOVOU (16*3)(y_ptr), X9
   106  	MOVOU (16*4)(y_ptr), X10
   107  	MOVOU (16*5)(y_ptr), X11
   108  
   109  	PAND X12, X6
   110  	PAND X12, X7
   111  	PAND X12, X8
   112  	PAND X12, X9
   113  	PAND X12, X10
   114  	PAND X12, X11
   115  
   116  	PXOR X6, X0
   117  	PXOR X7, X1
   118  	PXOR X8, X2
   119  	PXOR X9, X3
   120  	PXOR X10, X4
   121  	PXOR X11, X5
   122  
   123  	MOVOU X0, (16*0)(res_ptr)
   124  	MOVOU X1, (16*1)(res_ptr)
   125  	MOVOU X2, (16*2)(res_ptr)
   126  	MOVOU X3, (16*3)(res_ptr)
   127  	MOVOU X4, (16*4)(res_ptr)
   128  	MOVOU X5, (16*5)(res_ptr)
   129  
   130  	RET
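        // A functional Go sketch of the selection above (illustrative only; the
        // assembly uses SSE2 masks rather than a branch so that the choice is made
        // in constant time):
        //
        //    func p256MovCondRef(res, a, b []uint64, cond int) {
        //        src := b
        //        if cond != 0 {
        //            src = a
        //        }
        //        copy(res, src[:12]) // 12 limbs = one Jacobian (x, y, z) point
        //    }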
   131  /* ---------------------------------------*/
   132  // func p256NegCond(val []uint64, cond int)
   133  TEXT ·p256NegCond(SB),NOSPLIT,$0
   134  	MOVQ val+0(FP), res_ptr
   135  	MOVQ cond+24(FP), t0
   136  	// acc = poly
   137  	MOVQ $-1, acc0
   138  	MOVQ p256const0<>(SB), acc1
   139  	MOVQ $0, acc2
   140  	MOVQ p256const1<>(SB), acc3
   141  	// Load the original value
   142  	MOVQ (8*0)(res_ptr), acc5
   143  	MOVQ (8*1)(res_ptr), x_ptr
   144  	MOVQ (8*2)(res_ptr), y_ptr
   145  	MOVQ (8*3)(res_ptr), t1
   146  	// Speculatively subtract
   147  	SUBQ acc5, acc0
   148  	SBBQ x_ptr, acc1
   149  	SBBQ y_ptr, acc2
   150  	SBBQ t1, acc3
   151  	// If condition is 0, keep original value
   152  	TESTQ t0, t0
   153  	CMOVQEQ acc5, acc0
   154  	CMOVQEQ x_ptr, acc1
   155  	CMOVQEQ y_ptr, acc2
   156  	CMOVQEQ t1, acc3
   157  	// Store result
   158  	MOVQ acc0, (8*0)(res_ptr)
   159  	MOVQ acc1, (8*1)(res_ptr)
   160  	MOVQ acc2, (8*2)(res_ptr)
   161  	MOVQ acc3, (8*3)(res_ptr)
   162  
   163  	RET
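        // A functional Go sketch of the conditional negation above (illustrative
        // only; the assembly makes the choice with CMOV instead of the branch shown
        // here, and like the sketch it computes p - val without special-casing
        // val == 0):
        //
        //    func p256NegCondRef(val []uint64, cond int) {
        //        if cond == 0 {
        //            return
        //        }
        //        p := [4]uint64{0xffffffffffffffff, 0x00000000ffffffff, 0, 0xffffffff00000001}
        //        var borrow uint64
        //        for i := range p {
        //            val[i], borrow = bits.Sub64(p[i], val[i], borrow)
        //        }
        //    }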
   164  /* ---------------------------------------*/
   165  // func p256Sqr(res, in []uint64, n int)
   166  TEXT ·p256Sqr(SB),NOSPLIT,$0
   167  	MOVQ res+0(FP), res_ptr
   168  	MOVQ in+24(FP), x_ptr
   169  	MOVQ n+48(FP), BX
   170  
   171  sqrLoop:
   172  
   173  	// y[1:] * y[0]
   174  	MOVQ (8*0)(x_ptr), t0
   175  
   176  	MOVQ (8*1)(x_ptr), AX
   177  	MULQ t0
   178  	MOVQ AX, acc1
   179  	MOVQ DX, acc2
   180  
   181  	MOVQ (8*2)(x_ptr), AX
   182  	MULQ t0
   183  	ADDQ AX, acc2
   184  	ADCQ $0, DX
   185  	MOVQ DX, acc3
   186  
   187  	MOVQ (8*3)(x_ptr), AX
   188  	MULQ t0
   189  	ADDQ AX, acc3
   190  	ADCQ $0, DX
   191  	MOVQ DX, acc4
   192  	// y[2:] * y[1]
   193  	MOVQ (8*1)(x_ptr), t0
   194  
   195  	MOVQ (8*2)(x_ptr), AX
   196  	MULQ t0
   197  	ADDQ AX, acc3
   198  	ADCQ $0, DX
   199  	MOVQ DX, t1
   200  
   201  	MOVQ (8*3)(x_ptr), AX
   202  	MULQ t0
   203  	ADDQ t1, acc4
   204  	ADCQ $0, DX
   205  	ADDQ AX, acc4
   206  	ADCQ $0, DX
   207  	MOVQ DX, acc5
   208  	// y[3] * y[2]
   209  	MOVQ (8*2)(x_ptr), t0
   210  
   211  	MOVQ (8*3)(x_ptr), AX
   212  	MULQ t0
   213  	ADDQ AX, acc5
   214  	ADCQ $0, DX
   215  	MOVQ DX, y_ptr
   216  	XORQ t1, t1
   217  	// *2
   218  	ADDQ acc1, acc1
   219  	ADCQ acc2, acc2
   220  	ADCQ acc3, acc3
   221  	ADCQ acc4, acc4
   222  	ADCQ acc5, acc5
   223  	ADCQ y_ptr, y_ptr
   224  	ADCQ $0, t1
   225  	// Missing products
   226  	MOVQ (8*0)(x_ptr), AX
   227  	MULQ AX
   228  	MOVQ AX, acc0
   229  	MOVQ DX, t0
   230  
   231  	MOVQ (8*1)(x_ptr), AX
   232  	MULQ AX
   233  	ADDQ t0, acc1
   234  	ADCQ AX, acc2
   235  	ADCQ $0, DX
   236  	MOVQ DX, t0
   237  
   238  	MOVQ (8*2)(x_ptr), AX
   239  	MULQ AX
   240  	ADDQ t0, acc3
   241  	ADCQ AX, acc4
   242  	ADCQ $0, DX
   243  	MOVQ DX, t0
   244  
   245  	MOVQ (8*3)(x_ptr), AX
   246  	MULQ AX
   247  	ADDQ t0, acc5
   248  	ADCQ AX, y_ptr
   249  	ADCQ DX, t1
   250  	MOVQ t1, x_ptr
   251  	// First reduction step
   252  	MOVQ acc0, AX
   253  	MOVQ acc0, t1
   254  	SHLQ $32, acc0
   255  	MULQ p256const1<>(SB)
   256  	SHRQ $32, t1
   257  	ADDQ acc0, acc1
   258  	ADCQ t1, acc2
   259  	ADCQ AX, acc3
   260  	ADCQ $0, DX
   261  	MOVQ DX, acc0
   262  	// Second reduction step
   263  	MOVQ acc1, AX
   264  	MOVQ acc1, t1
   265  	SHLQ $32, acc1
   266  	MULQ p256const1<>(SB)
   267  	SHRQ $32, t1
   268  	ADDQ acc1, acc2
   269  	ADCQ t1, acc3
   270  	ADCQ AX, acc0
   271  	ADCQ $0, DX
   272  	MOVQ DX, acc1
   273  	// Third reduction step
   274  	MOVQ acc2, AX
   275  	MOVQ acc2, t1
   276  	SHLQ $32, acc2
   277  	MULQ p256const1<>(SB)
   278  	SHRQ $32, t1
   279  	ADDQ acc2, acc3
   280  	ADCQ t1, acc0
   281  	ADCQ AX, acc1
   282  	ADCQ $0, DX
   283  	MOVQ DX, acc2
   284  	// Last reduction step
   285  	XORQ t0, t0
   286  	MOVQ acc3, AX
   287  	MOVQ acc3, t1
   288  	SHLQ $32, acc3
   289  	MULQ p256const1<>(SB)
   290  	SHRQ $32, t1
   291  	ADDQ acc3, acc0
   292  	ADCQ t1, acc1
   293  	ADCQ AX, acc2
   294  	ADCQ $0, DX
   295  	MOVQ DX, acc3
   296  	// Add bits [511:256] of the sqr result
   297  	ADCQ acc4, acc0
   298  	ADCQ acc5, acc1
   299  	ADCQ y_ptr, acc2
   300  	ADCQ x_ptr, acc3
   301  	ADCQ $0, t0
   302  
   303  	MOVQ acc0, acc4
   304  	MOVQ acc1, acc5
   305  	MOVQ acc2, y_ptr
   306  	MOVQ acc3, t1
   307  	// Subtract p256
   308  	SUBQ $-1, acc0
   309  	SBBQ p256const0<>(SB) ,acc1
   310  	SBBQ $0, acc2
   311  	SBBQ p256const1<>(SB), acc3
   312  	SBBQ $0, t0
   313  
   314  	CMOVQCS acc4, acc0
   315  	CMOVQCS acc5, acc1
   316  	CMOVQCS y_ptr, acc2
   317  	CMOVQCS t1, acc3
   318  
   319  	MOVQ acc0, (8*0)(res_ptr)
   320  	MOVQ acc1, (8*1)(res_ptr)
   321  	MOVQ acc2, (8*2)(res_ptr)
   322  	MOVQ acc3, (8*3)(res_ptr)
   323  	MOVQ res_ptr, x_ptr
   324  	DECQ BX
   325  	JNE  sqrLoop
   326  
   327  	RET
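        // Each pass of sqrLoop above is one Montgomery squaring:
        // res = in*in*2^-256 mod p. The four "reduction step" blocks each add
        // acc_i*p and shift the accumulator down one limb: since p ≡ -1 (mod 2^64),
        // -p^-1 mod 2^64 = 1 and the Montgomery factor for a limb is the limb
        // itself; adding acc_i*p zeroes the low limb and, because the limbs of p
        // are [2^64-1, 2^32-1, 0, p256const1], the remaining contributions collapse
        // to acc_i<<32 into the next limb, acc_i>>32 into the one above it, and a
        // single 64x64 multiply acc_i*p256const1 into the top two. A big.Int sketch
        // of the whole function (illustrative only; limbsToBig, bigToLimbs, p and
        // rInv = 2^-256 mod p are hypothetical helpers):
        //
        //    func p256SqrRef(res, in []uint64, n int) {
        //        x := limbsToBig(in)
        //        for i := 0; i < n; i++ {
        //            x.Mul(x, x).Mul(x, rInv).Mod(x, p)
        //        }
        //        bigToLimbs(res, x)
        //    }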
   328  /* ---------------------------------------*/
   329  // func p256Mul(res, in1, in2 []uint64)
   330  TEXT ·p256Mul(SB),NOSPLIT,$0
   331  	MOVQ res+0(FP), res_ptr
   332  	MOVQ in1+24(FP), x_ptr
   333  	MOVQ in2+48(FP), y_ptr
   334  	// x * y[0]
   335  	MOVQ (8*0)(y_ptr), t0
   336  
   337  	MOVQ (8*0)(x_ptr), AX
   338  	MULQ t0
   339  	MOVQ AX, acc0
   340  	MOVQ DX, acc1
   341  
   342  	MOVQ (8*1)(x_ptr), AX
   343  	MULQ t0
   344  	ADDQ AX, acc1
   345  	ADCQ $0, DX
   346  	MOVQ DX, acc2
   347  
   348  	MOVQ (8*2)(x_ptr), AX
   349  	MULQ t0
   350  	ADDQ AX, acc2
   351  	ADCQ $0, DX
   352  	MOVQ DX, acc3
   353  
   354  	MOVQ (8*3)(x_ptr), AX
   355  	MULQ t0
   356  	ADDQ AX, acc3
   357  	ADCQ $0, DX
   358  	MOVQ DX, acc4
   359  	XORQ acc5, acc5
   360  	// First reduction step
   361  	MOVQ acc0, AX
   362  	MOVQ acc0, t1
   363  	SHLQ $32, acc0
   364  	MULQ p256const1<>(SB)
   365  	SHRQ $32, t1
   366  	ADDQ acc0, acc1
   367  	ADCQ t1, acc2
   368  	ADCQ AX, acc3
   369  	ADCQ DX, acc4
   370  	ADCQ $0, acc5
   371  	XORQ acc0, acc0
   372  	// x * y[1]
   373  	MOVQ (8*1)(y_ptr), t0
   374  
   375  	MOVQ (8*0)(x_ptr), AX
   376  	MULQ t0
   377  	ADDQ AX, acc1
   378  	ADCQ $0, DX
   379  	MOVQ DX, t1
   380  
   381  	MOVQ (8*1)(x_ptr), AX
   382  	MULQ t0
   383  	ADDQ t1, acc2
   384  	ADCQ $0, DX
   385  	ADDQ AX, acc2
   386  	ADCQ $0, DX
   387  	MOVQ DX, t1
   388  
   389  	MOVQ (8*2)(x_ptr), AX
   390  	MULQ t0
   391  	ADDQ t1, acc3
   392  	ADCQ $0, DX
   393  	ADDQ AX, acc3
   394  	ADCQ $0, DX
   395  	MOVQ DX, t1
   396  
   397  	MOVQ (8*3)(x_ptr), AX
   398  	MULQ t0
   399  	ADDQ t1, acc4
   400  	ADCQ $0, DX
   401  	ADDQ AX, acc4
   402  	ADCQ DX, acc5
   403  	ADCQ $0, acc0
   404  	// Second reduction step
   405  	MOVQ acc1, AX
   406  	MOVQ acc1, t1
   407  	SHLQ $32, acc1
   408  	MULQ p256const1<>(SB)
   409  	SHRQ $32, t1
   410  	ADDQ acc1, acc2
   411  	ADCQ t1, acc3
   412  	ADCQ AX, acc4
   413  	ADCQ DX, acc5
   414  	ADCQ $0, acc0
   415  	XORQ acc1, acc1
   416  	// x * y[2]
   417  	MOVQ (8*2)(y_ptr), t0
   418  
   419  	MOVQ (8*0)(x_ptr), AX
   420  	MULQ t0
   421  	ADDQ AX, acc2
   422  	ADCQ $0, DX
   423  	MOVQ DX, t1
   424  
   425  	MOVQ (8*1)(x_ptr), AX
   426  	MULQ t0
   427  	ADDQ t1, acc3
   428  	ADCQ $0, DX
   429  	ADDQ AX, acc3
   430  	ADCQ $0, DX
   431  	MOVQ DX, t1
   432  
   433  	MOVQ (8*2)(x_ptr), AX
   434  	MULQ t0
   435  	ADDQ t1, acc4
   436  	ADCQ $0, DX
   437  	ADDQ AX, acc4
   438  	ADCQ $0, DX
   439  	MOVQ DX, t1
   440  
   441  	MOVQ (8*3)(x_ptr), AX
   442  	MULQ t0
   443  	ADDQ t1, acc5
   444  	ADCQ $0, DX
   445  	ADDQ AX, acc5
   446  	ADCQ DX, acc0
   447  	ADCQ $0, acc1
   448  	// Third reduction step
   449  	MOVQ acc2, AX
   450  	MOVQ acc2, t1
   451  	SHLQ $32, acc2
   452  	MULQ p256const1<>(SB)
   453  	SHRQ $32, t1
   454  	ADDQ acc2, acc3
   455  	ADCQ t1, acc4
   456  	ADCQ AX, acc5
   457  	ADCQ DX, acc0
   458  	ADCQ $0, acc1
   459  	XORQ acc2, acc2
   460  	// x * y[3]
   461  	MOVQ (8*3)(y_ptr), t0
   462  
   463  	MOVQ (8*0)(x_ptr), AX
   464  	MULQ t0
   465  	ADDQ AX, acc3
   466  	ADCQ $0, DX
   467  	MOVQ DX, t1
   468  
   469  	MOVQ (8*1)(x_ptr), AX
   470  	MULQ t0
   471  	ADDQ t1, acc4
   472  	ADCQ $0, DX
   473  	ADDQ AX, acc4
   474  	ADCQ $0, DX
   475  	MOVQ DX, t1
   476  
   477  	MOVQ (8*2)(x_ptr), AX
   478  	MULQ t0
   479  	ADDQ t1, acc5
   480  	ADCQ $0, DX
   481  	ADDQ AX, acc5
   482  	ADCQ $0, DX
   483  	MOVQ DX, t1
   484  
   485  	MOVQ (8*3)(x_ptr), AX
   486  	MULQ t0
   487  	ADDQ t1, acc0
   488  	ADCQ $0, DX
   489  	ADDQ AX, acc0
   490  	ADCQ DX, acc1
   491  	ADCQ $0, acc2
   492  	// Last reduction step
   493  	MOVQ acc3, AX
   494  	MOVQ acc3, t1
   495  	SHLQ $32, acc3
   496  	MULQ p256const1<>(SB)
   497  	SHRQ $32, t1
   498  	ADDQ acc3, acc4
   499  	ADCQ t1, acc5
   500  	ADCQ AX, acc0
   501  	ADCQ DX, acc1
   502  	ADCQ $0, acc2
   503  	// Copy result [255:0]
   504  	MOVQ acc4, x_ptr
   505  	MOVQ acc5, acc3
   506  	MOVQ acc0, t0
   507  	MOVQ acc1, t1
   508  	// Subtract p256
   509  	SUBQ $-1, acc4
   510  	SBBQ p256const0<>(SB) ,acc5
   511  	SBBQ $0, acc0
   512  	SBBQ p256const1<>(SB), acc1
   513  	SBBQ $0, acc2
   514  
   515  	CMOVQCS x_ptr, acc4
   516  	CMOVQCS acc3, acc5
   517  	CMOVQCS t0, acc0
   518  	CMOVQCS t1, acc1
   519  
   520  	MOVQ acc4, (8*0)(res_ptr)
   521  	MOVQ acc5, (8*1)(res_ptr)
   522  	MOVQ acc0, (8*2)(res_ptr)
   523  	MOVQ acc1, (8*3)(res_ptr)
   524  
   525  	RET
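        // p256Mul is the Montgomery product res = in1*in2*2^-256 mod p: the four
        // schoolbook passes x*y[i] are interleaved with the same reduction steps as
        // in p256Sqr, and a final conditional subtraction of p brings the result
        // below p. In big.Int terms (same hypothetical helpers as the p256Sqr
        // sketch above):
        //
        //    func p256MulRef(res, in1, in2 []uint64) {
        //        x, y := limbsToBig(in1), limbsToBig(in2)
        //        x.Mul(x, y).Mul(x, rInv).Mod(x, p)
        //        bigToLimbs(res, x)
        //    }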
   526  /* ---------------------------------------*/
   527  // func p256FromMont(res, in []uint64)
   528  TEXT ·p256FromMont(SB),NOSPLIT,$0
   529  	MOVQ res+0(FP), res_ptr
   530  	MOVQ in+24(FP), x_ptr
   531  
   532  	MOVQ (8*0)(x_ptr), acc0
   533  	MOVQ (8*1)(x_ptr), acc1
   534  	MOVQ (8*2)(x_ptr), acc2
   535  	MOVQ (8*3)(x_ptr), acc3
   536  	XORQ acc4, acc4
   537  
   538  	// Only reduce, no multiplications are needed
   539  	// First stage
   540  	MOVQ acc0, AX
   541  	MOVQ acc0, t1
   542  	SHLQ $32, acc0
   543  	MULQ p256const1<>(SB)
   544  	SHRQ $32, t1
   545  	ADDQ acc0, acc1
   546  	ADCQ t1, acc2
   547  	ADCQ AX, acc3
   548  	ADCQ DX, acc4
   549  	XORQ acc5, acc5
   550  	// Second stage
   551  	MOVQ acc1, AX
   552  	MOVQ acc1, t1
   553  	SHLQ $32, acc1
   554  	MULQ p256const1<>(SB)
   555  	SHRQ $32, t1
   556  	ADDQ acc1, acc2
   557  	ADCQ t1, acc3
   558  	ADCQ AX, acc4
   559  	ADCQ DX, acc5
   560  	XORQ acc0, acc0
   561  	// Third stage
   562  	MOVQ acc2, AX
   563  	MOVQ acc2, t1
   564  	SHLQ $32, acc2
   565  	MULQ p256const1<>(SB)
   566  	SHRQ $32, t1
   567  	ADDQ acc2, acc3
   568  	ADCQ t1, acc4
   569  	ADCQ AX, acc5
   570  	ADCQ DX, acc0
   571  	XORQ acc1, acc1
   572  	// Last stage
   573  	MOVQ acc3, AX
   574  	MOVQ acc3, t1
   575  	SHLQ $32, acc3
   576  	MULQ p256const1<>(SB)
   577  	SHRQ $32, t1
   578  	ADDQ acc3, acc4
   579  	ADCQ t1, acc5
   580  	ADCQ AX, acc0
   581  	ADCQ DX, acc1
   582  
   583  	MOVQ acc4, x_ptr
   584  	MOVQ acc5, acc3
   585  	MOVQ acc0, t0
   586  	MOVQ acc1, t1
   587  
   588  	SUBQ $-1, acc4
   589  	SBBQ p256const0<>(SB), acc5
   590  	SBBQ $0, acc0
   591  	SBBQ p256const1<>(SB), acc1
   592  
   593  	CMOVQCS x_ptr, acc4
   594  	CMOVQCS acc3, acc5
   595  	CMOVQCS t0, acc0
   596  	CMOVQCS t1, acc1
   597  
   598  	MOVQ acc4, (8*0)(res_ptr)
   599  	MOVQ acc5, (8*1)(res_ptr)
   600  	MOVQ acc0, (8*2)(res_ptr)
   601  	MOVQ acc1, (8*3)(res_ptr)
   602  
   603  	RET
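        // p256FromMont strips the Montgomery factor: res = in*2^-256 mod p, i.e. a
        // Montgomery multiplication by 1, which needs only the four reduction steps
        // and no limb products. In big.Int terms (same hypothetical helpers as
        // above):
        //
        //    func p256FromMontRef(res, in []uint64) {
        //        x := limbsToBig(in)
        //        x.Mul(x, rInv).Mod(x, p)
        //        bigToLimbs(res, x)
        //    }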
   604  /* ---------------------------------------*/
   605  // Constant time point access to arbitrary point table.
   606  // Indexed from 1 to 15, with -1 offset
   607  // (index 0 is implicitly point at infinity)
   608  // func p256Select(point, table []uint64, idx int)
   609  TEXT ·p256Select(SB),NOSPLIT,$0
   610  	MOVQ idx+48(FP),AX
   611  	MOVQ table+24(FP),DI
   612  	MOVQ point+0(FP),DX
   613  
   614  	PXOR X15, X15	// X15 = 0
   615  	PCMPEQL X14, X14 // X14 = -1
   616  	PSUBL X14, X15   // X15 = 1
   617  	MOVL AX, X14
   618  	PSHUFD $0, X14, X14
   619  
   620  	PXOR X0, X0
   621  	PXOR X1, X1
   622  	PXOR X2, X2
   623  	PXOR X3, X3
   624  	PXOR X4, X4
   625  	PXOR X5, X5
   626  	MOVQ $16, AX
   627  
   628  	MOVOU X15, X13
   629  
   630  loop_select:
   631  
   632  		MOVOU X13, X12
   633  		PADDL X15, X13
   634  		PCMPEQL X14, X12
   635  
   636  		MOVOU (16*0)(DI), X6
   637  		MOVOU (16*1)(DI), X7
   638  		MOVOU (16*2)(DI), X8
   639  		MOVOU (16*3)(DI), X9
   640  		MOVOU (16*4)(DI), X10
   641  		MOVOU (16*5)(DI), X11
   642  		ADDQ $(16*6), DI
   643  
   644  		PAND X12, X6
   645  		PAND X12, X7
   646  		PAND X12, X8
   647  		PAND X12, X9
   648  		PAND X12, X10
   649  		PAND X12, X11
   650  
   651  		PXOR X6, X0
   652  		PXOR X7, X1
   653  		PXOR X8, X2
   654  		PXOR X9, X3
   655  		PXOR X10, X4
   656  		PXOR X11, X5
   657  
   658  		DECQ AX
   659  		JNE loop_select
   660  
   661  	MOVOU X0, (16*0)(DX)
   662  	MOVOU X1, (16*1)(DX)
   663  	MOVOU X2, (16*2)(DX)
   664  	MOVOU X3, (16*3)(DX)
   665  	MOVOU X4, (16*4)(DX)
   666  	MOVOU X5, (16*5)(DX)
   667  
   668  	RET
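        // A functional Go sketch of the selection above (illustrative only; the
        // assembly reads all 16 table entries and accumulates the match through
        // equality masks, so the memory access pattern does not depend on idx):
        //
        //    func p256SelectRef(point, table []uint64, idx int) {
        //        for i := range point[:12] {
        //            point[i] = 0 // idx == 0 leaves the point at infinity
        //        }
        //        if idx > 0 {
        //            copy(point, table[(idx-1)*12:idx*12])
        //        }
        //    }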
   669  /* ---------------------------------------*/
   670  // Constant time point access to base point table.
   671  // func p256SelectBase(point, table []uint64, idx int)
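        // Like p256Select, but the table holds 32 affine points of 8 limbs (x, y)
        // each, and two entries are examined per loop iteration.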
   672  TEXT ·p256SelectBase(SB),NOSPLIT,$0
   673  	MOVQ idx+48(FP),AX
   674  	MOVQ table+24(FP),DI
   675  	MOVQ point+0(FP),DX
   676  
   677  	PXOR X15, X15	// X15 = 0
   678  	PCMPEQL X14, X14 // X14 = -1
   679  	PSUBL X14, X15   // X15 = 1
   680  	MOVL AX, X14
   681  	PSHUFD $0, X14, X14
   682  
   683  	PXOR X0, X0
   684  	PXOR X1, X1
   685  	PXOR X2, X2
   686  	PXOR X3, X3
   687  	MOVQ $16, AX
   688  
   689  	MOVOU X15, X13
   690  
   691  loop_select_base:
   692  
   693  		MOVOU X13, X12
   694  		PADDL X15, X13
   695  		PCMPEQL X14, X12
   696  
   697  		MOVOU (16*0)(DI), X4
   698  		MOVOU (16*1)(DI), X5
   699  		MOVOU (16*2)(DI), X6
   700  		MOVOU (16*3)(DI), X7
   701  
   702  		MOVOU (16*4)(DI), X8
   703  		MOVOU (16*5)(DI), X9
   704  		MOVOU (16*6)(DI), X10
   705  		MOVOU (16*7)(DI), X11
   706  
   707  		ADDQ $(16*8), DI
   708  
   709  		PAND X12, X4
   710  		PAND X12, X5
   711  		PAND X12, X6
   712  		PAND X12, X7
   713  
   714  		MOVOU X13, X12
   715  		PADDL X15, X13
   716  		PCMPEQL X14, X12
   717  
   718  		PAND X12, X8
   719  		PAND X12, X9
   720  		PAND X12, X10
   721  		PAND X12, X11
   722  
   723  		PXOR X4, X0
   724  		PXOR X5, X1
   725  		PXOR X6, X2
   726  		PXOR X7, X3
   727  
   728  		PXOR X8, X0
   729  		PXOR X9, X1
   730  		PXOR X10, X2
   731  		PXOR X11, X3
   732  
   733  		DECQ AX
   734  		JNE loop_select_base
   735  
   736  	MOVOU X0, (16*0)(DX)
   737  	MOVOU X1, (16*1)(DX)
   738  	MOVOU X2, (16*2)(DX)
   739  	MOVOU X3, (16*3)(DX)
   740  
   741  	RET
   742  /* ---------------------------------------*/
   743  // func p256OrdMul(res, in1, in2 []uint64)
   744  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   745  	MOVQ res+0(FP), res_ptr
   746  	MOVQ in1+24(FP), x_ptr
   747  	MOVQ in2+48(FP), y_ptr
   748  	// x * y[0]
   749  	MOVQ (8*0)(y_ptr), t0
   750  
   751  	MOVQ (8*0)(x_ptr), AX
   752  	MULQ t0
   753  	MOVQ AX, acc0
   754  	MOVQ DX, acc1
   755  
   756  	MOVQ (8*1)(x_ptr), AX
   757  	MULQ t0
   758  	ADDQ AX, acc1
   759  	ADCQ $0, DX
   760  	MOVQ DX, acc2
   761  
   762  	MOVQ (8*2)(x_ptr), AX
   763  	MULQ t0
   764  	ADDQ AX, acc2
   765  	ADCQ $0, DX
   766  	MOVQ DX, acc3
   767  
   768  	MOVQ (8*3)(x_ptr), AX
   769  	MULQ t0
   770  	ADDQ AX, acc3
   771  	ADCQ $0, DX
   772  	MOVQ DX, acc4
   773  	XORQ acc5, acc5
   774  	// First reduction step
   775  	MOVQ acc0, AX
   776  	MULQ p256ordK0<>(SB)
   777  	MOVQ AX, t0
   778  
   779  	MOVQ p256ord<>+0x00(SB), AX
   780  	MULQ t0
   781  	ADDQ AX, acc0
   782  	ADCQ $0, DX
   783  	MOVQ DX, t1
   784  
   785  	MOVQ p256ord<>+0x08(SB), AX
   786  	MULQ t0
   787  	ADDQ t1, acc1
   788  	ADCQ $0, DX
   789  	ADDQ AX, acc1
   790  	ADCQ $0, DX
   791  	MOVQ DX, t1
   792  
   793  	MOVQ p256ord<>+0x10(SB), AX
   794  	MULQ t0
   795  	ADDQ t1, acc2
   796  	ADCQ $0, DX
   797  	ADDQ AX, acc2
   798  	ADCQ $0, DX
   799  	MOVQ DX, t1
   800  
   801  	MOVQ p256ord<>+0x18(SB), AX
   802  	MULQ t0
   803  	ADDQ t1, acc3
   804  	ADCQ $0, DX
   805  	ADDQ AX, acc3
   806  	ADCQ DX, acc4
   807  	ADCQ $0, acc5
   808  	// x * y[1]
   809  	MOVQ (8*1)(y_ptr), t0
   810  
   811  	MOVQ (8*0)(x_ptr), AX
   812  	MULQ t0
   813  	ADDQ AX, acc1
   814  	ADCQ $0, DX
   815  	MOVQ DX, t1
   816  
   817  	MOVQ (8*1)(x_ptr), AX
   818  	MULQ t0
   819  	ADDQ t1, acc2
   820  	ADCQ $0, DX
   821  	ADDQ AX, acc2
   822  	ADCQ $0, DX
   823  	MOVQ DX, t1
   824  
   825  	MOVQ (8*2)(x_ptr), AX
   826  	MULQ t0
   827  	ADDQ t1, acc3
   828  	ADCQ $0, DX
   829  	ADDQ AX, acc3
   830  	ADCQ $0, DX
   831  	MOVQ DX, t1
   832  
   833  	MOVQ (8*3)(x_ptr), AX
   834  	MULQ t0
   835  	ADDQ t1, acc4
   836  	ADCQ $0, DX
   837  	ADDQ AX, acc4
   838  	ADCQ DX, acc5
   839  	ADCQ $0, acc0
   840  	// Second reduction step
   841  	MOVQ acc1, AX
   842  	MULQ p256ordK0<>(SB)
   843  	MOVQ AX, t0
   844  
   845  	MOVQ p256ord<>+0x00(SB), AX
   846  	MULQ t0
   847  	ADDQ AX, acc1
   848  	ADCQ $0, DX
   849  	MOVQ DX, t1
   850  
   851  	MOVQ p256ord<>+0x08(SB), AX
   852  	MULQ t0
   853  	ADDQ t1, acc2
   854  	ADCQ $0, DX
   855  	ADDQ AX, acc2
   856  	ADCQ $0, DX
   857  	MOVQ DX, t1
   858  
   859  	MOVQ p256ord<>+0x10(SB), AX
   860  	MULQ t0
   861  	ADDQ t1, acc3
   862  	ADCQ $0, DX
   863  	ADDQ AX, acc3
   864  	ADCQ $0, DX
   865  	MOVQ DX, t1
   866  
   867  	MOVQ p256ord<>+0x18(SB), AX
   868  	MULQ t0
   869  	ADDQ t1, acc4
   870  	ADCQ $0, DX
   871  	ADDQ AX, acc4
   872  	ADCQ DX, acc5
   873  	ADCQ $0, acc0
   874  	// x * y[2]
   875  	MOVQ (8*2)(y_ptr), t0
   876  
   877  	MOVQ (8*0)(x_ptr), AX
   878  	MULQ t0
   879  	ADDQ AX, acc2
   880  	ADCQ $0, DX
   881  	MOVQ DX, t1
   882  
   883  	MOVQ (8*1)(x_ptr), AX
   884  	MULQ t0
   885  	ADDQ t1, acc3
   886  	ADCQ $0, DX
   887  	ADDQ AX, acc3
   888  	ADCQ $0, DX
   889  	MOVQ DX, t1
   890  
   891  	MOVQ (8*2)(x_ptr), AX
   892  	MULQ t0
   893  	ADDQ t1, acc4
   894  	ADCQ $0, DX
   895  	ADDQ AX, acc4
   896  	ADCQ $0, DX
   897  	MOVQ DX, t1
   898  
   899  	MOVQ (8*3)(x_ptr), AX
   900  	MULQ t0
   901  	ADDQ t1, acc5
   902  	ADCQ $0, DX
   903  	ADDQ AX, acc5
   904  	ADCQ DX, acc0
   905  	ADCQ $0, acc1
   906  	// Third reduction step
   907  	MOVQ acc2, AX
   908  	MULQ p256ordK0<>(SB)
   909  	MOVQ AX, t0
   910  
   911  	MOVQ p256ord<>+0x00(SB), AX
   912  	MULQ t0
   913  	ADDQ AX, acc2
   914  	ADCQ $0, DX
   915  	MOVQ DX, t1
   916  
   917  	MOVQ p256ord<>+0x08(SB), AX
   918  	MULQ t0
   919  	ADDQ t1, acc3
   920  	ADCQ $0, DX
   921  	ADDQ AX, acc3
   922  	ADCQ $0, DX
   923  	MOVQ DX, t1
   924  
   925  	MOVQ p256ord<>+0x10(SB), AX
   926  	MULQ t0
   927  	ADDQ t1, acc4
   928  	ADCQ $0, DX
   929  	ADDQ AX, acc4
   930  	ADCQ $0, DX
   931  	MOVQ DX, t1
   932  
   933  	MOVQ p256ord<>+0x18(SB), AX
   934  	MULQ t0
   935  	ADDQ t1, acc5
   936  	ADCQ $0, DX
   937  	ADDQ AX, acc5
   938  	ADCQ DX, acc0
   939  	ADCQ $0, acc1
   940  	// x * y[3]
   941  	MOVQ (8*3)(y_ptr), t0
   942  
   943  	MOVQ (8*0)(x_ptr), AX
   944  	MULQ t0
   945  	ADDQ AX, acc3
   946  	ADCQ $0, DX
   947  	MOVQ DX, t1
   948  
   949  	MOVQ (8*1)(x_ptr), AX
   950  	MULQ t0
   951  	ADDQ t1, acc4
   952  	ADCQ $0, DX
   953  	ADDQ AX, acc4
   954  	ADCQ $0, DX
   955  	MOVQ DX, t1
   956  
   957  	MOVQ (8*2)(x_ptr), AX
   958  	MULQ t0
   959  	ADDQ t1, acc5
   960  	ADCQ $0, DX
   961  	ADDQ AX, acc5
   962  	ADCQ $0, DX
   963  	MOVQ DX, t1
   964  
   965  	MOVQ (8*3)(x_ptr), AX
   966  	MULQ t0
   967  	ADDQ t1, acc0
   968  	ADCQ $0, DX
   969  	ADDQ AX, acc0
   970  	ADCQ DX, acc1
   971  	ADCQ $0, acc2
   972  	// Last reduction step
   973  	MOVQ acc3, AX
   974  	MULQ p256ordK0<>(SB)
   975  	MOVQ AX, t0
   976  
   977  	MOVQ p256ord<>+0x00(SB), AX
   978  	MULQ t0
   979  	ADDQ AX, acc3
   980  	ADCQ $0, DX
   981  	MOVQ DX, t1
   982  
   983  	MOVQ p256ord<>+0x08(SB), AX
   984  	MULQ t0
   985  	ADDQ t1, acc4
   986  	ADCQ $0, DX
   987  	ADDQ AX, acc4
   988  	ADCQ $0, DX
   989  	MOVQ DX, t1
   990  
   991  	MOVQ p256ord<>+0x10(SB), AX
   992  	MULQ t0
   993  	ADDQ t1, acc5
   994  	ADCQ $0, DX
   995  	ADDQ AX, acc5
   996  	ADCQ $0, DX
   997  	MOVQ DX, t1
   998  
   999  	MOVQ p256ord<>+0x18(SB), AX
  1000  	MULQ t0
  1001  	ADDQ t1, acc0
  1002  	ADCQ $0, DX
  1003  	ADDQ AX, acc0
  1004  	ADCQ DX, acc1
  1005  	ADCQ $0, acc2
  1006  	// Copy result [255:0]
  1007  	MOVQ acc4, x_ptr
  1008  	MOVQ acc5, acc3
  1009  	MOVQ acc0, t0
  1010  	MOVQ acc1, t1
  1011  	// Subtract p256
  1012  	SUBQ p256ord<>+0x00(SB), acc4
  1013  	SBBQ p256ord<>+0x08(SB) ,acc5
  1014  	SBBQ p256ord<>+0x10(SB), acc0
  1015  	SBBQ p256ord<>+0x18(SB), acc1
  1016  	SBBQ $0, acc2
  1017  
  1018  	CMOVQCS x_ptr, acc4
  1019  	CMOVQCS acc3, acc5
  1020  	CMOVQCS t0, acc0
  1021  	CMOVQCS t1, acc1
  1022  
  1023  	MOVQ acc4, (8*0)(res_ptr)
  1024  	MOVQ acc5, (8*1)(res_ptr)
  1025  	MOVQ acc0, (8*2)(res_ptr)
  1026  	MOVQ acc1, (8*3)(res_ptr)
  1027  
  1028  	RET
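        // p256OrdMul has the same structure as p256Mul but works modulo the group
        // order n. Since n has no special form, each reduction step first computes
        // the Montgomery factor t = acc_i * p256ordK0 mod 2^64
        // (p256ordK0 = -n^-1 mod 2^64) and then adds t*n limb by limb; the final
        // conditional subtraction uses n instead of p. In big.Int terms
        // (hypothetical helpers as above, with rInvN = 2^-256 mod n):
        //
        //    func p256OrdMulRef(res, in1, in2 []uint64) {
        //        x, y := limbsToBig(in1), limbsToBig(in2)
        //        x.Mul(x, y).Mul(x, rInvN).Mod(x, nOrder)
        //        bigToLimbs(res, x)
        //    }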
  1029  /* ---------------------------------------*/
  1030  // func p256OrdSqr(res, in []uint64, n int)
  1031  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1032  	MOVQ res+0(FP), res_ptr
  1033  	MOVQ in+24(FP), x_ptr
  1034  	MOVQ n+48(FP), BX
  1035  
  1036  ordSqrLoop:
  1037  
  1038  	// y[1:] * y[0]
  1039  	MOVQ (8*0)(x_ptr), t0
  1040  
  1041  	MOVQ (8*1)(x_ptr), AX
  1042  	MULQ t0
  1043  	MOVQ AX, acc1
  1044  	MOVQ DX, acc2
  1045  
  1046  	MOVQ (8*2)(x_ptr), AX
  1047  	MULQ t0
  1048  	ADDQ AX, acc2
  1049  	ADCQ $0, DX
  1050  	MOVQ DX, acc3
  1051  
  1052  	MOVQ (8*3)(x_ptr), AX
  1053  	MULQ t0
  1054  	ADDQ AX, acc3
  1055  	ADCQ $0, DX
  1056  	MOVQ DX, acc4
  1057  	// y[2:] * y[1]
  1058  	MOVQ (8*1)(x_ptr), t0
  1059  
  1060  	MOVQ (8*2)(x_ptr), AX
  1061  	MULQ t0
  1062  	ADDQ AX, acc3
  1063  	ADCQ $0, DX
  1064  	MOVQ DX, t1
  1065  
  1066  	MOVQ (8*3)(x_ptr), AX
  1067  	MULQ t0
  1068  	ADDQ t1, acc4
  1069  	ADCQ $0, DX
  1070  	ADDQ AX, acc4
  1071  	ADCQ $0, DX
  1072  	MOVQ DX, acc5
  1073  	// y[3] * y[2]
  1074  	MOVQ (8*2)(x_ptr), t0
  1075  
  1076  	MOVQ (8*3)(x_ptr), AX
  1077  	MULQ t0
  1078  	ADDQ AX, acc5
  1079  	ADCQ $0, DX
  1080  	MOVQ DX, y_ptr
  1081  	XORQ t1, t1
  1082  	// *2
  1083  	ADDQ acc1, acc1
  1084  	ADCQ acc2, acc2
  1085  	ADCQ acc3, acc3
  1086  	ADCQ acc4, acc4
  1087  	ADCQ acc5, acc5
  1088  	ADCQ y_ptr, y_ptr
  1089  	ADCQ $0, t1
  1090  	// Missing products
  1091  	MOVQ (8*0)(x_ptr), AX
  1092  	MULQ AX
  1093  	MOVQ AX, acc0
  1094  	MOVQ DX, t0
  1095  
  1096  	MOVQ (8*1)(x_ptr), AX
  1097  	MULQ AX
  1098  	ADDQ t0, acc1
  1099  	ADCQ AX, acc2
  1100  	ADCQ $0, DX
  1101  	MOVQ DX, t0
  1102  
  1103  	MOVQ (8*2)(x_ptr), AX
  1104  	MULQ AX
  1105  	ADDQ t0, acc3
  1106  	ADCQ AX, acc4
  1107  	ADCQ $0, DX
  1108  	MOVQ DX, t0
  1109  
  1110  	MOVQ (8*3)(x_ptr), AX
  1111  	MULQ AX
  1112  	ADDQ t0, acc5
  1113  	ADCQ AX, y_ptr
  1114  	ADCQ DX, t1
  1115  	MOVQ t1, x_ptr
  1116  	// First reduction step
  1117  	MOVQ acc0, AX
  1118  	MULQ p256ordK0<>(SB)
  1119  	MOVQ AX, t0
  1120  
  1121  	MOVQ p256ord<>+0x00(SB), AX
  1122  	MULQ t0
  1123  	ADDQ AX, acc0
  1124  	ADCQ $0, DX
  1125  	MOVQ DX, t1
  1126  
  1127  	MOVQ p256ord<>+0x08(SB), AX
  1128  	MULQ t0
  1129  	ADDQ t1, acc1
  1130  	ADCQ $0, DX
  1131  	ADDQ AX, acc1
  1132  
  1133  	MOVQ t0, t1
  1134  	ADCQ DX, acc2
  1135  	ADCQ $0, t1
  1136  	SUBQ t0, acc2
  1137  	SBBQ $0, t1
  1138  
  1139  	MOVQ t0, AX
  1140  	MOVQ t0, DX
  1141  	MOVQ t0, acc0
  1142  	SHLQ $32, AX
  1143  	SHRQ $32, DX
  1144  
  1145  	ADDQ t1, acc3
  1146  	ADCQ $0, acc0
  1147  	SUBQ AX, acc3
  1148  	SBBQ DX, acc0
  1149  	// Second reduction step
  1150  	MOVQ acc1, AX
  1151  	MULQ p256ordK0<>(SB)
  1152  	MOVQ AX, t0
  1153  
  1154  	MOVQ p256ord<>+0x00(SB), AX
  1155  	MULQ t0
  1156  	ADDQ AX, acc1
  1157  	ADCQ $0, DX
  1158  	MOVQ DX, t1
  1159  
  1160  	MOVQ p256ord<>+0x08(SB), AX
  1161  	MULQ t0
  1162  	ADDQ t1, acc2
  1163  	ADCQ $0, DX
  1164  	ADDQ AX, acc2
  1165  
  1166  	MOVQ t0, t1
  1167  	ADCQ DX, acc3
  1168  	ADCQ $0, t1
  1169  	SUBQ t0, acc3
  1170  	SBBQ $0, t1
  1171  
  1172  	MOVQ t0, AX
  1173  	MOVQ t0, DX
  1174  	MOVQ t0, acc1
  1175  	SHLQ $32, AX
  1176  	SHRQ $32, DX
  1177  
  1178  	ADDQ t1, acc0
  1179  	ADCQ $0, acc1
  1180  	SUBQ AX, acc0
  1181  	SBBQ DX, acc1
  1182  	// Third reduction step
  1183  	MOVQ acc2, AX
  1184  	MULQ p256ordK0<>(SB)
  1185  	MOVQ AX, t0
  1186  
  1187  	MOVQ p256ord<>+0x00(SB), AX
  1188  	MULQ t0
  1189  	ADDQ AX, acc2
  1190  	ADCQ $0, DX
  1191  	MOVQ DX, t1
  1192  
  1193  	MOVQ p256ord<>+0x08(SB), AX
  1194  	MULQ t0
  1195  	ADDQ t1, acc3
  1196  	ADCQ $0, DX
  1197  	ADDQ AX, acc3
  1198  
  1199  	MOVQ t0, t1
  1200  	ADCQ DX, acc0
  1201  	ADCQ $0, t1
  1202  	SUBQ t0, acc0
  1203  	SBBQ $0, t1
  1204  
  1205  	MOVQ t0, AX
  1206  	MOVQ t0, DX
  1207  	MOVQ t0, acc2
  1208  	SHLQ $32, AX
  1209  	SHRQ $32, DX
  1210  
  1211  	ADDQ t1, acc1
  1212  	ADCQ $0, acc2
  1213  	SUBQ AX, acc1
  1214  	SBBQ DX, acc2
  1215  	// Last reduction step
  1216  	MOVQ acc3, AX
  1217  	MULQ p256ordK0<>(SB)
  1218  	MOVQ AX, t0
  1219  
  1220  	MOVQ p256ord<>+0x00(SB), AX
  1221  	MULQ t0
  1222  	ADDQ AX, acc3
  1223  	ADCQ $0, DX
  1224  	MOVQ DX, t1
  1225  
  1226  	MOVQ p256ord<>+0x08(SB), AX
  1227  	MULQ t0
  1228  	ADDQ t1, acc0
  1229  	ADCQ $0, DX
  1230  	ADDQ AX, acc0
  1231  	ADCQ $0, DX
  1232  	MOVQ DX, t1
  1233  
  1234  	MOVQ t0, t1
  1235  	ADCQ DX, acc1
  1236  	ADCQ $0, t1
  1237  	SUBQ t0, acc1
  1238  	SBBQ $0, t1
  1239  
  1240  	MOVQ t0, AX
  1241  	MOVQ t0, DX
  1242  	MOVQ t0, acc3
  1243  	SHLQ $32, AX
  1244  	SHRQ $32, DX
  1245  
  1246  	ADDQ t1, acc2
  1247  	ADCQ $0, acc3
  1248  	SUBQ AX, acc2
  1249  	SBBQ DX, acc3
  1250  	XORQ t0, t0
  1251  	// Add bits [511:256] of the sqr result
  1252  	ADCQ acc4, acc0
  1253  	ADCQ acc5, acc1
  1254  	ADCQ y_ptr, acc2
  1255  	ADCQ x_ptr, acc3
  1256  	ADCQ $0, t0
  1257  
  1258  	MOVQ acc0, acc4
  1259  	MOVQ acc1, acc5
  1260  	MOVQ acc2, y_ptr
  1261  	MOVQ acc3, t1
  1262  	// Subtract p256
  1263  	SUBQ p256ord<>+0x00(SB), acc0
  1264  	SBBQ p256ord<>+0x08(SB) ,acc1
  1265  	SBBQ p256ord<>+0x10(SB), acc2
  1266  	SBBQ p256ord<>+0x18(SB), acc3
  1267  	SBBQ $0, t0
  1268  
  1269  	CMOVQCS acc4, acc0
  1270  	CMOVQCS acc5, acc1
  1271  	CMOVQCS y_ptr, acc2
  1272  	CMOVQCS t1, acc3
  1273  
  1274  	MOVQ acc0, (8*0)(res_ptr)
  1275  	MOVQ acc1, (8*1)(res_ptr)
  1276  	MOVQ acc2, (8*2)(res_ptr)
  1277  	MOVQ acc3, (8*3)(res_ptr)
  1278  	MOVQ res_ptr, x_ptr
  1279  	DECQ BX
  1280  	JNE ordSqrLoop
  1281  
  1282  	RET
  1283  /* ---------------------------------------*/
  1284  #undef res_ptr
  1285  #undef x_ptr
  1286  #undef y_ptr
  1287  
  1288  #undef acc0
  1289  #undef acc1
  1290  #undef acc2
  1291  #undef acc3
  1292  #undef acc4
  1293  #undef acc5
  1294  #undef t0
  1295  #undef t1
  1296  /* ---------------------------------------*/
  1297  #define mul0 AX
  1298  #define mul1 DX
  1299  #define acc0 BX
  1300  #define acc1 CX
  1301  #define acc2 R8
  1302  #define acc3 R9
  1303  #define acc4 R10
  1304  #define acc5 R11
  1305  #define acc6 R12
  1306  #define acc7 R13
  1307  #define t0 R14
  1308  #define t1 R15
  1309  #define t2 DI
  1310  #define t3 SI
  1311  #define hlp BP
  1312  /* ---------------------------------------*/
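        // p256SubInternal computes [acc4..acc7] = [acc4..acc7] - [t0..t3] mod p.
        // The subtraction is done unconditionally and p is added back with CMOV when
        // it borrowed, so the routine runs in constant time. t0-t3 are preserved;
        // acc0-acc3 and mul0 are clobbered.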
  1313  TEXT p256SubInternal(SB),NOSPLIT,$0
  1314  	XORQ mul0, mul0
  1315  	SUBQ t0, acc4
  1316  	SBBQ t1, acc5
  1317  	SBBQ t2, acc6
  1318  	SBBQ t3, acc7
  1319  	SBBQ $0, mul0
  1320  
  1321  	MOVQ acc4, acc0
  1322  	MOVQ acc5, acc1
  1323  	MOVQ acc6, acc2
  1324  	MOVQ acc7, acc3
  1325  
  1326  	ADDQ $-1, acc4
  1327  	ADCQ p256const0<>(SB), acc5
  1328  	ADCQ $0, acc6
  1329  	ADCQ p256const1<>(SB), acc7
  1330  	ANDQ $1, mul0
  1331  
  1332  	CMOVQEQ acc0, acc4
  1333  	CMOVQEQ acc1, acc5
  1334  	CMOVQEQ acc2, acc6
  1335  	CMOVQEQ acc3, acc7
  1336  
  1337  	RET
  1338  /* ---------------------------------------*/
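        // p256MulInternal computes the Montgomery product
        // [acc4..acc7] = [acc4..acc7] * [t0..t3] * 2^-256 mod p.
        // t0-t3 are preserved; acc0-acc3, mul0, mul1 and hlp are clobbered.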
  1339  TEXT p256MulInternal(SB),NOSPLIT,$8
  1340  	MOVQ acc4, mul0
  1341  	MULQ t0
  1342  	MOVQ mul0, acc0
  1343  	MOVQ mul1, acc1
  1344  
  1345  	MOVQ acc4, mul0
  1346  	MULQ t1
  1347  	ADDQ mul0, acc1
  1348  	ADCQ $0, mul1
  1349  	MOVQ mul1, acc2
  1350  
  1351  	MOVQ acc4, mul0
  1352  	MULQ t2
  1353  	ADDQ mul0, acc2
  1354  	ADCQ $0, mul1
  1355  	MOVQ mul1, acc3
  1356  
  1357  	MOVQ acc4, mul0
  1358  	MULQ t3
  1359  	ADDQ mul0, acc3
  1360  	ADCQ $0, mul1
  1361  	MOVQ mul1, acc4
  1362  
  1363  	MOVQ acc5, mul0
  1364  	MULQ t0
  1365  	ADDQ mul0, acc1
  1366  	ADCQ $0, mul1
  1367  	MOVQ mul1, hlp
  1368  
  1369  	MOVQ acc5, mul0
  1370  	MULQ t1
  1371  	ADDQ hlp, acc2
  1372  	ADCQ $0, mul1
  1373  	ADDQ mul0, acc2
  1374  	ADCQ $0, mul1
  1375  	MOVQ mul1, hlp
  1376  
  1377  	MOVQ acc5, mul0
  1378  	MULQ t2
  1379  	ADDQ hlp, acc3
  1380  	ADCQ $0, mul1
  1381  	ADDQ mul0, acc3
  1382  	ADCQ $0, mul1
  1383  	MOVQ mul1, hlp
  1384  
  1385  	MOVQ acc5, mul0
  1386  	MULQ t3
  1387  	ADDQ hlp, acc4
  1388  	ADCQ $0, mul1
  1389  	ADDQ mul0, acc4
  1390  	ADCQ $0, mul1
  1391  	MOVQ mul1, acc5
  1392  
  1393  	MOVQ acc6, mul0
  1394  	MULQ t0
  1395  	ADDQ mul0, acc2
  1396  	ADCQ $0, mul1
  1397  	MOVQ mul1, hlp
  1398  
  1399  	MOVQ acc6, mul0
  1400  	MULQ t1
  1401  	ADDQ hlp, acc3
  1402  	ADCQ $0, mul1
  1403  	ADDQ mul0, acc3
  1404  	ADCQ $0, mul1
  1405  	MOVQ mul1, hlp
  1406  
  1407  	MOVQ acc6, mul0
  1408  	MULQ t2
  1409  	ADDQ hlp, acc4
  1410  	ADCQ $0, mul1
  1411  	ADDQ mul0, acc4
  1412  	ADCQ $0, mul1
  1413  	MOVQ mul1, hlp
  1414  
  1415  	MOVQ acc6, mul0
  1416  	MULQ t3
  1417  	ADDQ hlp, acc5
  1418  	ADCQ $0, mul1
  1419  	ADDQ mul0, acc5
  1420  	ADCQ $0, mul1
  1421  	MOVQ mul1, acc6
  1422  
  1423  	MOVQ acc7, mul0
  1424  	MULQ t0
  1425  	ADDQ mul0, acc3
  1426  	ADCQ $0, mul1
  1427  	MOVQ mul1, hlp
  1428  
  1429  	MOVQ acc7, mul0
  1430  	MULQ t1
  1431  	ADDQ hlp, acc4
  1432  	ADCQ $0, mul1
  1433  	ADDQ mul0, acc4
  1434  	ADCQ $0, mul1
  1435  	MOVQ mul1, hlp
  1436  
  1437  	MOVQ acc7, mul0
  1438  	MULQ t2
  1439  	ADDQ hlp, acc5
  1440  	ADCQ $0, mul1
  1441  	ADDQ mul0, acc5
  1442  	ADCQ $0, mul1
  1443  	MOVQ mul1, hlp
  1444  
  1445  	MOVQ acc7, mul0
  1446  	MULQ t3
  1447  	ADDQ hlp, acc6
  1448  	ADCQ $0, mul1
  1449  	ADDQ mul0, acc6
  1450  	ADCQ $0, mul1
  1451  	MOVQ mul1, acc7
  1452  	// First reduction step
  1453  	MOVQ acc0, mul0
  1454  	MOVQ acc0, hlp
  1455  	SHLQ $32, acc0
  1456  	MULQ p256const1<>(SB)
  1457  	SHRQ $32, hlp
  1458  	ADDQ acc0, acc1
  1459  	ADCQ hlp, acc2
  1460  	ADCQ mul0, acc3
  1461  	ADCQ $0, mul1
  1462  	MOVQ mul1, acc0
  1463  	// Second reduction step
  1464  	MOVQ acc1, mul0
  1465  	MOVQ acc1, hlp
  1466  	SHLQ $32, acc1
  1467  	MULQ p256const1<>(SB)
  1468  	SHRQ $32, hlp
  1469  	ADDQ acc1, acc2
  1470  	ADCQ hlp, acc3
  1471  	ADCQ mul0, acc0
  1472  	ADCQ $0, mul1
  1473  	MOVQ mul1, acc1
  1474  	// Third reduction step
  1475  	MOVQ acc2, mul0
  1476  	MOVQ acc2, hlp
  1477  	SHLQ $32, acc2
  1478  	MULQ p256const1<>(SB)
  1479  	SHRQ $32, hlp
  1480  	ADDQ acc2, acc3
  1481  	ADCQ hlp, acc0
  1482  	ADCQ mul0, acc1
  1483  	ADCQ $0, mul1
  1484  	MOVQ mul1, acc2
  1485  	// Last reduction step
  1486  	MOVQ acc3, mul0
  1487  	MOVQ acc3, hlp
  1488  	SHLQ $32, acc3
  1489  	MULQ p256const1<>(SB)
  1490  	SHRQ $32, hlp
  1491  	ADDQ acc3, acc0
  1492  	ADCQ hlp, acc1
  1493  	ADCQ mul0, acc2
  1494  	ADCQ $0, mul1
  1495  	MOVQ mul1, acc3
  1496  	MOVQ $0, BP
  1497  	// Add bits [511:256] of the result
  1498  	ADCQ acc0, acc4
  1499  	ADCQ acc1, acc5
  1500  	ADCQ acc2, acc6
  1501  	ADCQ acc3, acc7
  1502  	ADCQ $0, hlp
  1503  	// Copy result
  1504  	MOVQ acc4, acc0
  1505  	MOVQ acc5, acc1
  1506  	MOVQ acc6, acc2
  1507  	MOVQ acc7, acc3
  1508  	// Subtract p256
  1509  	SUBQ $-1, acc4
  1510  	SBBQ p256const0<>(SB) ,acc5
  1511  	SBBQ $0, acc6
  1512  	SBBQ p256const1<>(SB), acc7
  1513  	SBBQ $0, hlp
  1514  	// If the result of the subtraction is negative, restore the previous result
  1515  	CMOVQCS acc0, acc4
  1516  	CMOVQCS acc1, acc5
  1517  	CMOVQCS acc2, acc6
  1518  	CMOVQCS acc3, acc7
  1519  
  1520  	RET
  1521  /* ---------------------------------------*/
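        // p256SqrInternal computes the Montgomery square
        // [acc4..acc7] = [acc4..acc7]^2 * 2^-256 mod p.
        // Unlike p256MulInternal, it clobbers t0-t3 as well (plus acc0-acc3, mul0,
        // mul1 and hlp).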
  1522  TEXT p256SqrInternal(SB),NOSPLIT,$8
  1523  
  1524  	MOVQ acc4, mul0
  1525  	MULQ acc5
  1526  	MOVQ mul0, acc1
  1527  	MOVQ mul1, acc2
  1528  
  1529  	MOVQ acc4, mul0
  1530  	MULQ acc6
  1531  	ADDQ mul0, acc2
  1532  	ADCQ $0, mul1
  1533  	MOVQ mul1, acc3
  1534  
  1535  	MOVQ acc4, mul0
  1536  	MULQ acc7
  1537  	ADDQ mul0, acc3
  1538  	ADCQ $0, mul1
  1539  	MOVQ mul1, t0
  1540  
  1541  	MOVQ acc5, mul0
  1542  	MULQ acc6
  1543  	ADDQ mul0, acc3
  1544  	ADCQ $0, mul1
  1545  	MOVQ mul1, hlp
  1546  
  1547  	MOVQ acc5, mul0
  1548  	MULQ acc7
  1549  	ADDQ hlp, t0
  1550  	ADCQ $0, mul1
  1551  	ADDQ mul0, t0
  1552  	ADCQ $0, mul1
  1553  	MOVQ mul1, t1
  1554  
  1555  	MOVQ acc6, mul0
  1556  	MULQ acc7
  1557  	ADDQ mul0, t1
  1558  	ADCQ $0, mul1
  1559  	MOVQ mul1, t2
  1560  	XORQ t3, t3
  1561  	// *2
  1562  	ADDQ acc1, acc1
  1563  	ADCQ acc2, acc2
  1564  	ADCQ acc3, acc3
  1565  	ADCQ t0, t0
  1566  	ADCQ t1, t1
  1567  	ADCQ t2, t2
  1568  	ADCQ $0, t3
  1569  	// Missing products
  1570  	MOVQ acc4, mul0
  1571  	MULQ mul0
  1572  	MOVQ mul0, acc0
  1573  	MOVQ DX, acc4
  1574  
  1575  	MOVQ acc5, mul0
  1576  	MULQ mul0
  1577  	ADDQ acc4, acc1
  1578  	ADCQ mul0, acc2
  1579  	ADCQ $0, DX
  1580  	MOVQ DX, acc4
  1581  
  1582  	MOVQ acc6, mul0
  1583  	MULQ mul0
  1584  	ADDQ acc4, acc3
  1585  	ADCQ mul0, t0
  1586  	ADCQ $0, DX
  1587  	MOVQ DX, acc4
  1588  
  1589  	MOVQ acc7, mul0
  1590  	MULQ mul0
  1591  	ADDQ acc4, t1
  1592  	ADCQ mul0, t2
  1593  	ADCQ DX, t3
  1594  	// First reduction step
  1595  	MOVQ acc0, mul0
  1596  	MOVQ acc0, hlp
  1597  	SHLQ $32, acc0
  1598  	MULQ p256const1<>(SB)
  1599  	SHRQ $32, hlp
  1600  	ADDQ acc0, acc1
  1601  	ADCQ hlp, acc2
  1602  	ADCQ mul0, acc3
  1603  	ADCQ $0, mul1
  1604  	MOVQ mul1, acc0
  1605  	// Second reduction step
  1606  	MOVQ acc1, mul0
  1607  	MOVQ acc1, hlp
  1608  	SHLQ $32, acc1
  1609  	MULQ p256const1<>(SB)
  1610  	SHRQ $32, hlp
  1611  	ADDQ acc1, acc2
  1612  	ADCQ hlp, acc3
  1613  	ADCQ mul0, acc0
  1614  	ADCQ $0, mul1
  1615  	MOVQ mul1, acc1
  1616  	// Third reduction step
  1617  	MOVQ acc2, mul0
  1618  	MOVQ acc2, hlp
  1619  	SHLQ $32, acc2
  1620  	MULQ p256const1<>(SB)
  1621  	SHRQ $32, hlp
  1622  	ADDQ acc2, acc3
  1623  	ADCQ hlp, acc0
  1624  	ADCQ mul0, acc1
  1625  	ADCQ $0, mul1
  1626  	MOVQ mul1, acc2
  1627  	// Last reduction step
  1628  	MOVQ acc3, mul0
  1629  	MOVQ acc3, hlp
  1630  	SHLQ $32, acc3
  1631  	MULQ p256const1<>(SB)
  1632  	SHRQ $32, hlp
  1633  	ADDQ acc3, acc0
  1634  	ADCQ hlp, acc1
  1635  	ADCQ mul0, acc2
  1636  	ADCQ $0, mul1
  1637  	MOVQ mul1, acc3
  1638  	MOVQ $0, BP
  1639  	// Add bits [511:256] of the result
  1640  	ADCQ acc0, t0
  1641  	ADCQ acc1, t1
  1642  	ADCQ acc2, t2
  1643  	ADCQ acc3, t3
  1644  	ADCQ $0, hlp
  1645  	// Copy result
  1646  	MOVQ t0, acc4
  1647  	MOVQ t1, acc5
  1648  	MOVQ t2, acc6
  1649  	MOVQ t3, acc7
  1650  	// Subtract p256
  1651  	SUBQ $-1, acc4
  1652  	SBBQ p256const0<>(SB) ,acc5
  1653  	SBBQ $0, acc6
  1654  	SBBQ p256const1<>(SB), acc7
  1655  	SBBQ $0, hlp
  1656  	// If the result of the subtraction is negative, restore the previous result
  1657  	CMOVQCS t0, acc4
  1658  	CMOVQCS t1, acc5
  1659  	CMOVQCS t2, acc6
  1660  	CMOVQCS t3, acc7
  1661  
  1662  	RET
  1663  /* ---------------------------------------*/
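        // The two macros below implement a modular doubling and a modular addition,
        // leaving the reduced result in t0-t3 and clobbering acc4-acc7 and mul0:
        //    p256MulBy2Inline: t0..t3 = 2*[acc4..acc7] mod p
        //    p256AddInline:    t0..t3 = [acc4..acc7] + [t0..t3] mod p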
  1664  #define p256MulBy2Inline\
  1665  	XORQ mul0, mul0;\
  1666  	ADDQ acc4, acc4;\
  1667  	ADCQ acc5, acc5;\
  1668  	ADCQ acc6, acc6;\
  1669  	ADCQ acc7, acc7;\
  1670  	ADCQ $0, mul0;\
  1671  	MOVQ acc4, t0;\
  1672  	MOVQ acc5, t1;\
  1673  	MOVQ acc6, t2;\
  1674  	MOVQ acc7, t3;\
  1675  	SUBQ $-1, t0;\
  1676  	SBBQ p256const0<>(SB), t1;\
  1677  	SBBQ $0, t2;\
  1678  	SBBQ p256const1<>(SB), t3;\
  1679  	SBBQ $0, mul0;\
  1680  	CMOVQCS acc4, t0;\
  1681  	CMOVQCS acc5, t1;\
  1682  	CMOVQCS acc6, t2;\
  1683  	CMOVQCS acc7, t3;
  1684  /* ---------------------------------------*/
  1685  #define p256AddInline \
  1686  	XORQ mul0, mul0;\
  1687  	ADDQ t0, acc4;\
  1688  	ADCQ t1, acc5;\
  1689  	ADCQ t2, acc6;\
  1690  	ADCQ t3, acc7;\
  1691  	ADCQ $0, mul0;\
  1692  	MOVQ acc4, t0;\
  1693  	MOVQ acc5, t1;\
  1694  	MOVQ acc6, t2;\
  1695  	MOVQ acc7, t3;\
  1696  	SUBQ $-1, t0;\
  1697  	SBBQ p256const0<>(SB), t1;\
  1698  	SBBQ $0, t2;\
  1699  	SBBQ p256const1<>(SB), t3;\
  1700  	SBBQ $0, mul0;\
  1701  	CMOVQCS acc4, t0;\
  1702  	CMOVQCS acc5, t1;\
  1703  	CMOVQCS acc6, t2;\
  1704  	CMOVQCS acc7, t3;
  1705  /* ---------------------------------------*/
  1706  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1707  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1708  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1709  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1710  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1711  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1712  /* ---------------------------------------*/
  1713  #define x1in(off) (32*0 + off)(SP)
  1714  #define y1in(off) (32*1 + off)(SP)
  1715  #define z1in(off) (32*2 + off)(SP)
  1716  #define x2in(off) (32*3 + off)(SP)
  1717  #define y2in(off) (32*4 + off)(SP)
  1718  #define xout(off) (32*5 + off)(SP)
  1719  #define yout(off) (32*6 + off)(SP)
  1720  #define zout(off) (32*7 + off)(SP)
  1721  #define s2(off)   (32*8 + off)(SP)
  1722  #define z1sqr(off) (32*9 + off)(SP)
  1723  #define h(off)	  (32*10 + off)(SP)
  1724  #define r(off)	  (32*11 + off)(SP)
  1725  #define hsqr(off) (32*12 + off)(SP)
  1726  #define rsqr(off) (32*13 + off)(SP)
  1727  #define hcub(off) (32*14 + off)(SP)
  1728  #define rptr	  (32*15)(SP)
  1729  #define sel_save  (32*15 + 8)(SP)
  1730  #define zero_save (32*15 + 8 + 4)(SP)
  1731  
  1732  // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
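        // The computation below follows the standard Jacobian + affine addition
        // formulas, with in2 implicitly having z2 = 1; the sel and zero flags blend
        // the unmodified inputs back in at the end, as noted in the body:
        //    u2 = x2*z1^2,  s2 = y2*z1^3,  h = u2 - x1,  r = s2 - y1
        //    x3 = r^2 - h^3 - 2*x1*h^2
        //    y3 = r*(x1*h^2 - x3) - y1*h^3
        //    z3 = h*z1
        // All coordinates stay in the Montgomery domain throughout.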
  1733  TEXT ·p256PointAddAffineAsm(SB),0,$512-96
  1734  	// Move input to stack in order to free registers
  1735  	MOVQ res+0(FP), AX
  1736  	MOVQ in1+24(FP), BX
  1737  	MOVQ in2+48(FP), CX
  1738  	MOVQ sign+72(FP), DX
  1739  	MOVQ sel+80(FP), t1
  1740  	MOVQ zero+88(FP), t2
  1741  
  1742  	MOVOU (16*0)(BX), X0
  1743  	MOVOU (16*1)(BX), X1
  1744  	MOVOU (16*2)(BX), X2
  1745  	MOVOU (16*3)(BX), X3
  1746  	MOVOU (16*4)(BX), X4
  1747  	MOVOU (16*5)(BX), X5
  1748  
  1749  	MOVOU X0, x1in(16*0)
  1750  	MOVOU X1, x1in(16*1)
  1751  	MOVOU X2, y1in(16*0)
  1752  	MOVOU X3, y1in(16*1)
  1753  	MOVOU X4, z1in(16*0)
  1754  	MOVOU X5, z1in(16*1)
  1755  
  1756  	MOVOU (16*0)(CX), X0
  1757  	MOVOU (16*1)(CX), X1
  1758  
  1759  	MOVOU X0, x2in(16*0)
  1760  	MOVOU X1, x2in(16*1)
  1761  	// Store pointer to result
  1762  	MOVQ mul0, rptr
  1763  	MOVL t1, sel_save
  1764  	MOVL t2, zero_save
  1765  	// Negate y2in based on sign
  1766  	MOVQ (16*2 + 8*0)(CX), acc4
  1767  	MOVQ (16*2 + 8*1)(CX), acc5
  1768  	MOVQ (16*2 + 8*2)(CX), acc6
  1769  	MOVQ (16*2 + 8*3)(CX), acc7
  1770  	MOVQ $-1, acc0
  1771  	MOVQ p256const0<>(SB), acc1
  1772  	MOVQ $0, acc2
  1773  	MOVQ p256const1<>(SB), acc3
  1774  	XORQ mul0, mul0
  1775  	// Speculatively subtract
  1776  	SUBQ acc4, acc0
  1777  	SBBQ acc5, acc1
  1778  	SBBQ acc6, acc2
  1779  	SBBQ acc7, acc3
  1780  	SBBQ $0, mul0
  1781  	MOVQ acc0, t0
  1782  	MOVQ acc1, t1
  1783  	MOVQ acc2, t2
  1784  	MOVQ acc3, t3
  1785  	// Add in case the operand was > p256
  1786  	ADDQ $-1, acc0
  1787  	ADCQ p256const0<>(SB), acc1
  1788  	ADCQ $0, acc2
  1789  	ADCQ p256const1<>(SB), acc3
  1790  	ADCQ $0, mul0
  1791  	CMOVQNE t0, acc0
  1792  	CMOVQNE t1, acc1
  1793  	CMOVQNE t2, acc2
  1794  	CMOVQNE t3, acc3
  1795  	// If condition is 0, keep original value
  1796  	TESTQ DX, DX
  1797  	CMOVQEQ acc4, acc0
  1798  	CMOVQEQ acc5, acc1
  1799  	CMOVQEQ acc6, acc2
  1800  	CMOVQEQ acc7, acc3
  1801  	// Store result
  1802  	MOVQ acc0, y2in(8*0)
  1803  	MOVQ acc1, y2in(8*1)
  1804  	MOVQ acc2, y2in(8*2)
  1805  	MOVQ acc3, y2in(8*3)
  1806  	// Begin point add
  1807  	LDacc (z1in)
  1808  	CALL p256SqrInternal(SB)	// z1ˆ2
  1809  	ST (z1sqr)
  1810  
  1811  	LDt (x2in)
  1812  	CALL p256MulInternal(SB)	// x2 * z1ˆ2
  1813  
  1814  	LDt (x1in)
  1815  	CALL p256SubInternal(SB)	// h = u2 - u1
  1816  	ST (h)
  1817  
  1818  	LDt (z1in)
  1819  	CALL p256MulInternal(SB)	// z3 = h * z1
  1820  	ST (zout)
  1821  
  1822  	LDacc (z1sqr)
  1823  	CALL p256MulInternal(SB)	// z1ˆ3
  1824  
  1825  	LDt (y2in)
  1826  	CALL p256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1827  	ST (s2)
  1828  
  1829  	LDt (y1in)
  1830  	CALL p256SubInternal(SB)	// r = s2 - s1
  1831  	ST (r)
  1832  
  1833  	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  1834  	ST (rsqr)
  1835  
  1836  	LDacc (h)
  1837  	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  1838  	ST (hsqr)
  1839  
  1840  	LDt (h)
  1841  	CALL p256MulInternal(SB)	// hcub = hˆ3
  1842  	ST (hcub)
  1843  
  1844  	LDt (y1in)
  1845  	CALL p256MulInternal(SB)	// y1 * hˆ3
  1846  	ST (s2)
  1847  
  1848  	LDacc (x1in)
  1849  	LDt (hsqr)
  1850  	CALL p256MulInternal(SB)	// u1 * hˆ2
  1851  	ST (h)
  1852  
  1853  	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1854  	LDacc (rsqr)
  1855  	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1856  
  1857  	LDt (hcub)
  1858  	CALL p256SubInternal(SB)
  1859  	ST (xout)
  1860  
  1861  	MOVQ acc4, t0
  1862  	MOVQ acc5, t1
  1863  	MOVQ acc6, t2
  1864  	MOVQ acc7, t3
  1865  	LDacc (h)
  1866  	CALL p256SubInternal(SB)
  1867  
  1868  	LDt (r)
  1869  	CALL p256MulInternal(SB)
  1870  
  1871  	LDt (s2)
  1872  	CALL p256SubInternal(SB)
  1873  	ST (yout)
  1874  	// Load stored values from stack
  1875  	MOVQ rptr, AX
  1876  	MOVL sel_save, BX
  1877  	MOVL zero_save, CX
  1878  	// The result is not valid if (sel == 0); conditionally select the original point in1 instead
  1879  	MOVOU xout(16*0), X0
  1880  	MOVOU xout(16*1), X1
  1881  	MOVOU yout(16*0), X2
  1882  	MOVOU yout(16*1), X3
  1883  	MOVOU zout(16*0), X4
  1884  	MOVOU zout(16*1), X5
  1885  
  1886  	MOVL BX, X6
  1887  	MOVL CX, X7
  1888  
  1889  	PXOR X8, X8
  1890  	PCMPEQL X9, X9
  1891  
  1892  	PSHUFD $0, X6, X6
  1893  	PSHUFD $0, X7, X7
  1894  
  1895  	PCMPEQL X8, X6
  1896  	PCMPEQL X8, X7
  1897  
  1898  	MOVOU X6, X15
  1899  	PANDN X9, X15
  1900  
  1901  	MOVOU x1in(16*0), X9
  1902  	MOVOU x1in(16*1), X10
  1903  	MOVOU y1in(16*0), X11
  1904  	MOVOU y1in(16*1), X12
  1905  	MOVOU z1in(16*0), X13
  1906  	MOVOU z1in(16*1), X14
  1907  
  1908  	PAND X15, X0
  1909  	PAND X15, X1
  1910  	PAND X15, X2
  1911  	PAND X15, X3
  1912  	PAND X15, X4
  1913  	PAND X15, X5
  1914  
  1915  	PAND X6, X9
  1916  	PAND X6, X10
  1917  	PAND X6, X11
  1918  	PAND X6, X12
  1919  	PAND X6, X13
  1920  	PAND X6, X14
  1921  
  1922  	PXOR X9, X0
  1923  	PXOR X10, X1
  1924  	PXOR X11, X2
  1925  	PXOR X12, X3
  1926  	PXOR X13, X4
  1927  	PXOR X14, X5
  1928  	// Similarly if zero == 0
  1929  	PCMPEQL X9, X9
  1930  	MOVOU X7, X15
  1931  	PANDN X9, X15
  1932  
  1933  	MOVOU x2in(16*0), X9
  1934  	MOVOU x2in(16*1), X10
  1935  	MOVOU y2in(16*0), X11
  1936  	MOVOU y2in(16*1), X12
  1937  	MOVOU p256one<>+0x00(SB), X13
  1938  	MOVOU p256one<>+0x10(SB), X14
  1939  
  1940  	PAND X15, X0
  1941  	PAND X15, X1
  1942  	PAND X15, X2
  1943  	PAND X15, X3
  1944  	PAND X15, X4
  1945  	PAND X15, X5
  1946  
  1947  	PAND X7, X9
  1948  	PAND X7, X10
  1949  	PAND X7, X11
  1950  	PAND X7, X12
  1951  	PAND X7, X13
  1952  	PAND X7, X14
  1953  
  1954  	PXOR X9, X0
  1955  	PXOR X10, X1
  1956  	PXOR X11, X2
  1957  	PXOR X12, X3
  1958  	PXOR X13, X4
  1959  	PXOR X14, X5
  1960  	// Finally output the result
  1961  	MOVOU X0, (16*0)(AX)
  1962  	MOVOU X1, (16*1)(AX)
  1963  	MOVOU X2, (16*2)(AX)
  1964  	MOVOU X3, (16*3)(AX)
  1965  	MOVOU X4, (16*4)(AX)
  1966  	MOVOU X5, (16*5)(AX)
  1967  	MOVQ $0, rptr
  1968  
  1969  	RET
  1970  #undef x1in
  1971  #undef y1in
  1972  #undef z1in
  1973  #undef x2in
  1974  #undef y2in
  1975  #undef xout
  1976  #undef yout
  1977  #undef zout
  1978  #undef s2
  1979  #undef z1sqr
  1980  #undef h
  1981  #undef r
  1982  #undef hsqr
  1983  #undef rsqr
  1984  #undef hcub
  1985  #undef rptr
  1986  #undef sel_save
  1987  #undef zero_save
  1988  
  1989  // p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  1990  // otherwise. It writes to [acc4..acc7], t0 and t1.
  1991  TEXT p256IsZero(SB),NOSPLIT,$0
  1992  	// AX contains a flag that is set if the input is zero.
  1993  	XORQ AX, AX
  1994  	MOVQ $1, t1
  1995  
  1996  	// Check whether [acc4..acc7] are all zero.
  1997  	MOVQ acc4, t0
  1998  	ORQ acc5, t0
  1999  	ORQ acc6, t0
  2000  	ORQ acc7, t0
  2001  
  2002  	// Set the zero flag if so. (CMOV of a constant to a register doesn't
  2003  	// appear to be supported in Go. Thus t1 = 1.)
  2004  	CMOVQEQ t1, AX
  2005  
  2006  	// XOR [acc4..acc7] with P and compare with zero again.
  2007  	XORQ $-1, acc4
  2008  	XORQ p256const0<>(SB), acc5
  2009  	XORQ p256const1<>(SB), acc7
  2010  	ORQ acc5, acc4
  2011  	ORQ acc6, acc4
  2012  	ORQ acc7, acc4
  2013  
  2014  	// Set the zero flag if so.
  2015  	CMOVQEQ t1, AX
  2016  	RET
  2017  
  2018  /* ---------------------------------------*/
  2019  #define x1in(off) (32*0 + off)(SP)
  2020  #define y1in(off) (32*1 + off)(SP)
  2021  #define z1in(off) (32*2 + off)(SP)
  2022  #define x2in(off) (32*3 + off)(SP)
  2023  #define y2in(off) (32*4 + off)(SP)
  2024  #define z2in(off) (32*5 + off)(SP)
  2025  
  2026  #define xout(off) (32*6 + off)(SP)
  2027  #define yout(off) (32*7 + off)(SP)
  2028  #define zout(off) (32*8 + off)(SP)
  2029  
  2030  #define u1(off)    (32*9 + off)(SP)
  2031  #define u2(off)    (32*10 + off)(SP)
  2032  #define s1(off)    (32*11 + off)(SP)
  2033  #define s2(off)    (32*12 + off)(SP)
  2034  #define z1sqr(off) (32*13 + off)(SP)
  2035  #define z2sqr(off) (32*14 + off)(SP)
  2036  #define h(off)     (32*15 + off)(SP)
  2037  #define r(off)     (32*16 + off)(SP)
  2038  #define hsqr(off)  (32*17 + off)(SP)
  2039  #define rsqr(off)  (32*18 + off)(SP)
  2040  #define hcub(off)  (32*19 + off)(SP)
  2041  #define rptr       (32*20)(SP)
  2042  #define points_eq  (32*20+8)(SP)
  2043  
  2044  // func p256PointAddAsm(res, in1, in2 []uint64) int
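        // The computation below uses the standard Jacobian addition formulas (see
        // the EFD reference in the body):
        //    u1 = x1*z2^2,  u2 = x2*z1^2,  s1 = y1*z2^3,  s2 = y2*z1^3
        //    h = u2 - u1,   r = s2 - s1
        //    x3 = r^2 - h^3 - 2*u1*h^2
        //    y3 = r*(u1*h^2 - x3) - s1*h^3
        //    z3 = z1*z2*h
        // The return value is 1 when h == 0 and r == 0, i.e. when in1 and in2
        // represent the same point; the formulas degenerate in that case and the
        // caller is expected to use p256PointDoubleAsm instead.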
  2045  TEXT ·p256PointAddAsm(SB),0,$680-80
  2046  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2047  	// Move input to stack in order to free registers
  2048  	MOVQ res+0(FP), AX
  2049  	MOVQ in1+24(FP), BX
  2050  	MOVQ in2+48(FP), CX
  2051  
  2052  	MOVOU (16*0)(BX), X0
  2053  	MOVOU (16*1)(BX), X1
  2054  	MOVOU (16*2)(BX), X2
  2055  	MOVOU (16*3)(BX), X3
  2056  	MOVOU (16*4)(BX), X4
  2057  	MOVOU (16*5)(BX), X5
  2058  
  2059  	MOVOU X0, x1in(16*0)
  2060  	MOVOU X1, x1in(16*1)
  2061  	MOVOU X2, y1in(16*0)
  2062  	MOVOU X3, y1in(16*1)
  2063  	MOVOU X4, z1in(16*0)
  2064  	MOVOU X5, z1in(16*1)
  2065  
  2066  	MOVOU (16*0)(CX), X0
  2067  	MOVOU (16*1)(CX), X1
  2068  	MOVOU (16*2)(CX), X2
  2069  	MOVOU (16*3)(CX), X3
  2070  	MOVOU (16*4)(CX), X4
  2071  	MOVOU (16*5)(CX), X5
  2072  
  2073  	MOVOU X0, x2in(16*0)
  2074  	MOVOU X1, x2in(16*1)
  2075  	MOVOU X2, y2in(16*0)
  2076  	MOVOU X3, y2in(16*1)
  2077  	MOVOU X4, z2in(16*0)
  2078  	MOVOU X5, z2in(16*1)
  2079  	// Store pointer to result
  2080  	MOVQ AX, rptr
  2081  	// Begin point add
  2082  	LDacc (z2in)
  2083  	CALL p256SqrInternal(SB)	// z2ˆ2
  2084  	ST (z2sqr)
  2085  	LDt (z2in)
  2086  	CALL p256MulInternal(SB)	// z2ˆ3
  2087  	LDt (y1in)
  2088  	CALL p256MulInternal(SB)	// s1 = z2ˆ3*y1
  2089  	ST (s1)
  2090  
  2091  	LDacc (z1in)
  2092  	CALL p256SqrInternal(SB)	// z1ˆ2
  2093  	ST (z1sqr)
  2094  	LDt (z1in)
  2095  	CALL p256MulInternal(SB)	// z1ˆ3
  2096  	LDt (y2in)
  2097  	CALL p256MulInternal(SB)	// s2 = z1ˆ3*y2
  2098  	ST (s2)
  2099  
  2100  	LDt (s1)
  2101  	CALL p256SubInternal(SB)	// r = s2 - s1
  2102  	ST (r)
  2103  	CALL p256IsZero(SB)
  2104  	MOVQ AX, points_eq
  2105  
  2106  	LDacc (z2sqr)
  2107  	LDt (x1in)
  2108  	CALL p256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2109  	ST (u1)
  2110  	LDacc (z1sqr)
  2111  	LDt (x2in)
  2112  	CALL p256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2113  	ST (u2)
  2114  
  2115  	LDt (u1)
  2116  	CALL p256SubInternal(SB)	// h = u2 - u1
  2117  	ST (h)
  2118  	CALL p256IsZero(SB)
  2119  	ANDQ points_eq, AX
  2120  	MOVQ AX, points_eq
  2121  
  2122  	LDacc (r)
  2123  	CALL p256SqrInternal(SB)	// rsqr = rˆ2
  2124  	ST (rsqr)
  2125  
  2126  	LDacc (h)
  2127  	CALL p256SqrInternal(SB)	// hsqr = hˆ2
  2128  	ST (hsqr)
  2129  
  2130  	LDt (h)
  2131  	CALL p256MulInternal(SB)	// hcub = hˆ3
  2132  	ST (hcub)
  2133  
  2134  	LDt (s1)
  2135  	CALL p256MulInternal(SB)
  2136  	ST (s2)
  2137  
  2138  	LDacc (z1in)
  2139  	LDt (z2in)
  2140  	CALL p256MulInternal(SB)	// z1 * z2
  2141  	LDt (h)
  2142  	CALL p256MulInternal(SB)	// z1 * z2 * h
  2143  	ST (zout)
  2144  
  2145  	LDacc (hsqr)
  2146  	LDt (u1)
  2147  	CALL p256MulInternal(SB)	// hˆ2 * u1
  2148  	ST (u2)
  2149  
  2150  	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2151  	LDacc (rsqr)
  2152  	CALL p256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2153  
  2154  	LDt (hcub)
  2155  	CALL p256SubInternal(SB)
  2156  	ST (xout)
  2157  
  2158  	MOVQ acc4, t0
  2159  	MOVQ acc5, t1
  2160  	MOVQ acc6, t2
  2161  	MOVQ acc7, t3
  2162  	LDacc (u2)
  2163  	CALL p256SubInternal(SB)
  2164  
  2165  	LDt (r)
  2166  	CALL p256MulInternal(SB)
  2167  
  2168  	LDt (s2)
  2169  	CALL p256SubInternal(SB)
  2170  	ST (yout)
  2171  
  2172  	MOVOU xout(16*0), X0
  2173  	MOVOU xout(16*1), X1
  2174  	MOVOU yout(16*0), X2
  2175  	MOVOU yout(16*1), X3
  2176  	MOVOU zout(16*0), X4
  2177  	MOVOU zout(16*1), X5
  2178  	// Finally output the result
  2179  	MOVQ rptr, AX
  2180  	MOVQ $0, rptr
  2181  	MOVOU X0, (16*0)(AX)
  2182  	MOVOU X1, (16*1)(AX)
  2183  	MOVOU X2, (16*2)(AX)
  2184  	MOVOU X3, (16*3)(AX)
  2185  	MOVOU X4, (16*4)(AX)
  2186  	MOVOU X5, (16*5)(AX)
  2187  
  2188  	MOVQ points_eq, AX
  2189  	MOVQ AX, ret+72(FP)
  2190  
  2191  	RET
  2192  #undef x1in
  2193  #undef y1in
  2194  #undef z1in
  2195  #undef x2in
  2196  #undef y2in
  2197  #undef z2in
  2198  #undef xout
  2199  #undef yout
  2200  #undef zout
  2201  #undef s1
  2202  #undef s2
  2203  #undef u1
  2204  #undef u2
  2205  #undef z1sqr
  2206  #undef z2sqr
  2207  #undef h
  2208  #undef r
  2209  #undef hsqr
  2210  #undef rsqr
  2211  #undef hcub
  2212  #undef rptr
  2213  /* ---------------------------------------*/
  2214  #define x(off) (32*0 + off)(SP)
  2215  #define y(off) (32*1 + off)(SP)
  2216  #define z(off) (32*2 + off)(SP)
  2217  
  2218  #define s(off)	(32*3 + off)(SP)
  2219  #define m(off)	(32*4 + off)(SP)
  2220  #define zsqr(off) (32*5 + off)(SP)
  2221  #define tmp(off)  (32*6 + off)(SP)
  2222  #define rptr	  (32*7)(SP)
  2223  
  2224  // func p256PointDoubleAsm(res, in []uint64)
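        // The computation below uses the usual a = -3 doubling formulas, rewriting
        // 3*x^2 + a*z^4 as 3*(x - z^2)*(x + z^2):
        //    m = 3*(x - z^2)*(x + z^2),  s = 4*x*y^2
        //    x3 = m^2 - 2*s
        //    y3 = m*(s - x3) - 8*y^4
        //    z3 = 2*y*z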
  2225  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
  2226  	// Move input to stack in order to free registers
  2227  	MOVQ res+0(FP), AX
  2228  	MOVQ in+24(FP), BX
  2229  
  2230  	MOVOU (16*0)(BX), X0
  2231  	MOVOU (16*1)(BX), X1
  2232  	MOVOU (16*2)(BX), X2
  2233  	MOVOU (16*3)(BX), X3
  2234  	MOVOU (16*4)(BX), X4
  2235  	MOVOU (16*5)(BX), X5
  2236  
  2237  	MOVOU X0, x(16*0)
  2238  	MOVOU X1, x(16*1)
  2239  	MOVOU X2, y(16*0)
  2240  	MOVOU X3, y(16*1)
  2241  	MOVOU X4, z(16*0)
  2242  	MOVOU X5, z(16*1)
  2243  	// Store pointer to result
  2244  	MOVQ AX, rptr
  2245  	// Begin point double
  2246  	LDacc (z)
  2247  	CALL p256SqrInternal(SB)
  2248  	ST (zsqr)
  2249  
  2250  	LDt (x)
  2251  	p256AddInline
  2252  	STt (m)
  2253  
  2254  	LDacc (z)
  2255  	LDt (y)
  2256  	CALL p256MulInternal(SB)
  2257  	p256MulBy2Inline
  2258  	MOVQ rptr, AX
  2259  	// Store z
  2260  	MOVQ t0, (16*4 + 8*0)(AX)
  2261  	MOVQ t1, (16*4 + 8*1)(AX)
  2262  	MOVQ t2, (16*4 + 8*2)(AX)
  2263  	MOVQ t3, (16*4 + 8*3)(AX)
  2264  
  2265  	LDacc (x)
  2266  	LDt (zsqr)
  2267  	CALL p256SubInternal(SB)
  2268  	LDt (m)
  2269  	CALL p256MulInternal(SB)
  2270  	ST (m)
  2271  	// Multiply by 3
  2272  	p256MulBy2Inline
  2273  	LDacc (m)
  2274  	p256AddInline
  2275  	STt (m)
  2276  	////////////////////////
  2277  	LDacc (y)
  2278  	p256MulBy2Inline
  2279  	t2acc
  2280  	CALL p256SqrInternal(SB)
  2281  	ST (s)
  2282  	CALL p256SqrInternal(SB)
  2283  	// Divide by 2
  2284  	XORQ mul0, mul0
  2285  	MOVQ acc4, t0
  2286  	MOVQ acc5, t1
  2287  	MOVQ acc6, t2
  2288  	MOVQ acc7, t3
  2289  
  2290  	ADDQ $-1, acc4
  2291  	ADCQ p256const0<>(SB), acc5
  2292  	ADCQ $0, acc6
  2293  	ADCQ p256const1<>(SB), acc7
  2294  	ADCQ $0, mul0
  2295  	TESTQ $1, t0
  2296  
  2297  	CMOVQEQ t0, acc4
  2298  	CMOVQEQ t1, acc5
  2299  	CMOVQEQ t2, acc6
  2300  	CMOVQEQ t3, acc7
  2301  	ANDQ t0, mul0
  2302  
  2303  	SHRQ $1, acc5, acc4
  2304  	SHRQ $1, acc6, acc5
  2305  	SHRQ $1, acc7, acc6
  2306  	SHRQ $1, mul0, acc7
  2307  	ST (y)
  2308  	/////////////////////////
  2309  	LDacc (x)
  2310  	LDt (s)
  2311  	CALL p256MulInternal(SB)
  2312  	ST (s)
  2313  	p256MulBy2Inline
  2314  	STt (tmp)
  2315  
  2316  	LDacc (m)
  2317  	CALL p256SqrInternal(SB)
  2318  	LDt (tmp)
  2319  	CALL p256SubInternal(SB)
  2320  
  2321  	MOVQ rptr, AX
  2322  	// Store x
  2323  	MOVQ acc4, (16*0 + 8*0)(AX)
  2324  	MOVQ acc5, (16*0 + 8*1)(AX)
  2325  	MOVQ acc6, (16*0 + 8*2)(AX)
  2326  	MOVQ acc7, (16*0 + 8*3)(AX)
  2327  
  2328  	acc2t
  2329  	LDacc (s)
  2330  	CALL p256SubInternal(SB)
  2331  
  2332  	LDt (m)
  2333  	CALL p256MulInternal(SB)
  2334  
  2335  	LDt (y)
  2336  	CALL p256SubInternal(SB)
  2337  	MOVQ rptr, AX
  2338  	// Store y
  2339  	MOVQ acc4, (16*2 + 8*0)(AX)
  2340  	MOVQ acc5, (16*2 + 8*1)(AX)
  2341  	MOVQ acc6, (16*2 + 8*2)(AX)
  2342  	MOVQ acc7, (16*2 + 8*3)(AX)
  2343  	///////////////////////
  2344  	MOVQ $0, rptr
  2345  
  2346  	RET
  2347  /* ---------------------------------------*/
  2348  
