#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for the dual-issue pipeline resulted in a 6% improvement
# on the Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in a 7%
# improvement on the Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms that is 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
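# For example, on a little-endian build LO/HI resolve to 0/4 (see the
# __ARMEL__ block further down), so "ldr $Elo,[$ctx,#$Eoff+$lo]" picks up
# the low 32 bits of h[4] and "ldr $Ehi,[$ctx,#$Eoff+$hi]" the high 32 bits;
# on a big-endian build the two offsets simply swap.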

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
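# (the first argument that looks like a file name is taken as the output
#  file and becomes STDOUT; anything before it is ignored)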

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
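# Stack frame sketch: working copies of the eight state words live at
# sp+#0..sp+#56 ($Aoff..$Hoff); the current schedule word X[i] is stored at
# sp+#64 ($Xoff) and the frame is extended by 8 bytes each round
# ("sub sp,sp,#8" in BODY_00_15), keeping the last 16 X[] words addressable;
# the resulting 80*8=640 bytes are released after every block.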

sub BODY_00_15() {
my $magic = shift;
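# $magic is the least-significant byte of K[i].lo for the last round of the
# current phase (0x94 for K[15], 0x17 for K[79]); the code below compares it
# with the low byte of the K word just loaded and, on a match, sets bit 0 of
# $Ktbl as an end-of-loop flag, later tested with "tst $Ktbl,#1".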
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
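	@ (a 64-bit ROTR by n<32 acts on the 32-bit halves as
	@  lo' = lo>>n ^ hi<<(32-n), hi' = hi>>n ^ lo<<(32-n);
	@  for n>=32 the halves swap roles with n-32, hence the
	@  hi>>9^lo<<23 / lo>>9^hi<<23 terms for ROTR by 41)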
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
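@ e.g. on little-endian, WORD64(0x428a2f98,0xd728ae22,...) emits
@ ".word 0xd728ae22,0x428a2f98,...", so each 64-bit K value sits in
@ native dword order and can be read via the LO/HI byte offsets above.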

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
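@ (the 32-byte pad keeps sha512_block_data_order exactly 80*8+32 = 672 bytes
@  past K512 in both configurations, matching the 672-byte offset used below)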

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
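	@ (each SHA-512 block is 128 bytes, hence the lsl#7)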
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
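	@ (the HI half has no third <<-term because (x)>>7 is a plain
	@  shift: no low-word bits move into the high word)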
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
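# Ch and Maj are each computed with a single VBSL: with the mask in the
# destination register, "vbsl m,n,p" yields (m&n)|(~m&p), so e as the mask
# over (f,g) gives Ch(e,f,g), and (a^b) as the mask over (c,b) gives
# ((a^b)&c)|(a&b), which equals Maj(a,b,c).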

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
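# Note that h+=Maj is not performed here (the commented-out vadd above);
# it is carried over and folded in at the start of the next round or into
# the final accumulation, presumably to shorten the dependency chain on h.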

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adrl	$Ktbl,K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
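	@ the unrolled body below covers 16 rounds; four passes handle rounds 16-79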
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
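# Note the ordering above: "bx lr" is rewritten first to its raw opcode
# (0xe12fff1e is the encoding of "bx lr") so ARMv4 assemblers accept it,
# and only then is "ret" turned into a literal "bx lr" for the ARMv7/NEON
# path. The loop below copies this file's own header comments into the
# output, turning leading '#' into assembler '@' comments.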
639
640 open SELF,$0;
641 while(<SELF>) {
642         next if (/^#!/);
643         last if (!s/^#/@/ and !/^$/);
644         print;
645 }
646 close SELF;
647
648 print $code;
649 close STDOUT; # enforce flush