diff options
author | 2025-03-08 22:04:20 +0800 | |
---|---|---|
committer | 2025-03-08 22:04:20 +0800 | |
commit | a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch) | |
tree | 84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /arch/mips/crypto | |
download | ohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.tar.gz ohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.zip |
Initial commit: OpenHarmony-v4.0-ReleaseOpenHarmony-v4.0-Release
Diffstat (limited to 'arch/mips/crypto')
-rw-r--r-- | arch/mips/crypto/Makefile | 24 | ||||
-rw-r--r-- | arch/mips/crypto/chacha-core.S | 497 | ||||
-rw-r--r-- | arch/mips/crypto/chacha-glue.c | 152 | ||||
-rw-r--r-- | arch/mips/crypto/crc32-mips.c | 346 | ||||
-rw-r--r-- | arch/mips/crypto/poly1305-glue.c | 191 | ||||
-rw-r--r-- | arch/mips/crypto/poly1305-mips.pl | 1273 |
6 files changed, 2483 insertions, 0 deletions
diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile new file mode 100644 index 000000000..5e4105ccc --- /dev/null +++ b/arch/mips/crypto/Makefile | |||
@@ -0,0 +1,24 @@ | |||
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for MIPS crypto files.
#

obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o

obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
chacha-mips-y := chacha-core.o chacha-glue.o
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots

# poly1305-core.o comes from poly1305-core.S, which the perlasm rule below
# generates from poly1305-mips.pl.
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
poly1305-mips-y := poly1305-core.o poly1305-glue.o

# Flavour argument handed to the perlasm script: "o32" when CONFIG_32BIT is
# set, "64" when CONFIG_64BIT is set.
perlasm-flavour-$(CONFIG_32BIT) := o32
perlasm-flavour-$(CONFIG_64BIT) := 64

# Invoke the Perl script ($<) with the selected flavour and the output file ($@).
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)

$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
	$(call if_changed,perlasm)

targets += poly1305-core.S
diff --git a/arch/mips/crypto/chacha-core.S b/arch/mips/crypto/chacha-core.S new file mode 100644 index 000000000..5755f69cf --- /dev/null +++ b/arch/mips/crypto/chacha-core.S | |||
@@ -0,0 +1,497 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ | ||
2 | /* | ||
3 | * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. | ||
4 | * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. | ||
5 | */ | ||
6 | |||
7 | #define MASK_U32 0x3c | ||
8 | #define CHACHA20_BLOCK_SIZE 64 | ||
9 | #define STACK_SIZE 32 | ||
10 | |||
11 | #define X0 $t0 | ||
12 | #define X1 $t1 | ||
13 | #define X2 $t2 | ||
14 | #define X3 $t3 | ||
15 | #define X4 $t4 | ||
16 | #define X5 $t5 | ||
17 | #define X6 $t6 | ||
18 | #define X7 $t7 | ||
19 | #define X8 $t8 | ||
20 | #define X9 $t9 | ||
21 | #define X10 $v1 | ||
22 | #define X11 $s6 | ||
23 | #define X12 $s5 | ||
24 | #define X13 $s4 | ||
25 | #define X14 $s3 | ||
26 | #define X15 $s2 | ||
27 | /* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ | ||
28 | #define T0 $s1 | ||
29 | #define T1 $s0 | ||
30 | #define T(n) T ## n | ||
31 | #define X(n) X ## n | ||
32 | |||
33 | /* Input arguments */ | ||
34 | #define STATE $a0 | ||
35 | #define OUT $a1 | ||
36 | #define IN $a2 | ||
37 | #define BYTES $a3 | ||
38 | |||
39 | /* Output argument */ | ||
40 | /* NONCE[0] is kept in a register and not in memory. | ||
41 | * We don't want to touch original value in memory. | ||
42 | * Must be incremented every loop iteration. | ||
43 | */ | ||
44 | #define NONCE_0 $v0 | ||
45 | |||
46 | /* SAVED_X and SAVED_CA are set in the jump table. | ||
47 | * Use regs which are overwritten on exit so we don't leak clear data. | ||
48 | * They are used for handling the last bytes which are not a multiple of 4. | ||
49 | */ | ||
50 | #define SAVED_X X15 | ||
51 | #define SAVED_CA $s7 | ||
52 | |||
53 | #define IS_UNALIGNED $s7 | ||
54 | |||
55 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | ||
56 | #define MSB 0 | ||
57 | #define LSB 3 | ||
58 | #define ROTx rotl | ||
59 | #define ROTR(n) rotr n, 24 | ||
60 | #define CPU_TO_LE32(n) \ | ||
61 | wsbh n; \ | ||
62 | rotr n, 16; | ||
63 | #else | ||
64 | #define MSB 3 | ||
65 | #define LSB 0 | ||
66 | #define ROTx rotr | ||
67 | #define CPU_TO_LE32(n) | ||
68 | #define ROTR(n) | ||
69 | #endif | ||
70 | |||
71 | #define FOR_EACH_WORD(x) \ | ||
72 | x( 0); \ | ||
73 | x( 1); \ | ||
74 | x( 2); \ | ||
75 | x( 3); \ | ||
76 | x( 4); \ | ||
77 | x( 5); \ | ||
78 | x( 6); \ | ||
79 | x( 7); \ | ||
80 | x( 8); \ | ||
81 | x( 9); \ | ||
82 | x(10); \ | ||
83 | x(11); \ | ||
84 | x(12); \ | ||
85 | x(13); \ | ||
86 | x(14); \ | ||
87 | x(15); | ||
88 | |||
89 | #define FOR_EACH_WORD_REV(x) \ | ||
90 | x(15); \ | ||
91 | x(14); \ | ||
92 | x(13); \ | ||
93 | x(12); \ | ||
94 | x(11); \ | ||
95 | x(10); \ | ||
96 | x( 9); \ | ||
97 | x( 8); \ | ||
98 | x( 7); \ | ||
99 | x( 6); \ | ||
100 | x( 5); \ | ||
101 | x( 4); \ | ||
102 | x( 3); \ | ||
103 | x( 2); \ | ||
104 | x( 1); \ | ||
105 | x( 0); | ||
106 | |||
107 | #define PLUS_ONE_0 1 | ||
108 | #define PLUS_ONE_1 2 | ||
109 | #define PLUS_ONE_2 3 | ||
110 | #define PLUS_ONE_3 4 | ||
111 | #define PLUS_ONE_4 5 | ||
112 | #define PLUS_ONE_5 6 | ||
113 | #define PLUS_ONE_6 7 | ||
114 | #define PLUS_ONE_7 8 | ||
115 | #define PLUS_ONE_8 9 | ||
116 | #define PLUS_ONE_9 10 | ||
117 | #define PLUS_ONE_10 11 | ||
118 | #define PLUS_ONE_11 12 | ||
119 | #define PLUS_ONE_12 13 | ||
120 | #define PLUS_ONE_13 14 | ||
121 | #define PLUS_ONE_14 15 | ||
122 | #define PLUS_ONE_15 16 | ||
123 | #define PLUS_ONE(x) PLUS_ONE_ ## x | ||
124 | #define _CONCAT3(a,b,c) a ## b ## c | ||
125 | #define CONCAT3(a,b,c) _CONCAT3(a,b,c) | ||
126 | |||
127 | #define STORE_UNALIGNED(x) \ | ||
128 | CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ | ||
129 | .if (x != 12); \ | ||
130 | lw T0, (x*4)(STATE); \ | ||
131 | .endif; \ | ||
132 | lwl T1, (x*4)+MSB ## (IN); \ | ||
133 | lwr T1, (x*4)+LSB ## (IN); \ | ||
134 | .if (x == 12); \ | ||
135 | addu X ## x, NONCE_0; \ | ||
136 | .else; \ | ||
137 | addu X ## x, T0; \ | ||
138 | .endif; \ | ||
139 | CPU_TO_LE32(X ## x); \ | ||
140 | xor X ## x, T1; \ | ||
141 | swl X ## x, (x*4)+MSB ## (OUT); \ | ||
142 | swr X ## x, (x*4)+LSB ## (OUT); | ||
143 | |||
144 | #define STORE_ALIGNED(x) \ | ||
145 | CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ | ||
146 | .if (x != 12); \ | ||
147 | lw T0, (x*4)(STATE); \ | ||
148 | .endif; \ | ||
149 | lw T1, (x*4) ## (IN); \ | ||
150 | .if (x == 12); \ | ||
151 | addu X ## x, NONCE_0; \ | ||
152 | .else; \ | ||
153 | addu X ## x, T0; \ | ||
154 | .endif; \ | ||
155 | CPU_TO_LE32(X ## x); \ | ||
156 | xor X ## x, T1; \ | ||
157 | sw X ## x, (x*4) ## (OUT); | ||
158 | |||
159 | /* Jump table macro. | ||
160 | * Used for setup and handling the last bytes, which are not multiple of 4. | ||
161 | * X15 is free to store Xn | ||
162 | * Every jumptable entry must be equal in size. | ||
163 | */ | ||
164 | #define JMPTBL_ALIGNED(x) \ | ||
165 | .Lchacha_mips_jmptbl_aligned_ ## x: ; \ | ||
166 | .set noreorder; \ | ||
167 | b .Lchacha_mips_xor_aligned_ ## x ## _b; \ | ||
168 | .if (x == 12); \ | ||
169 | addu SAVED_X, X ## x, NONCE_0; \ | ||
170 | .else; \ | ||
171 | addu SAVED_X, X ## x, SAVED_CA; \ | ||
172 | .endif; \ | ||
173 | .set reorder | ||
174 | |||
175 | #define JMPTBL_UNALIGNED(x) \ | ||
176 | .Lchacha_mips_jmptbl_unaligned_ ## x: ; \ | ||
177 | .set noreorder; \ | ||
178 | b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ | ||
179 | .if (x == 12); \ | ||
180 | addu SAVED_X, X ## x, NONCE_0; \ | ||
181 | .else; \ | ||
182 | addu SAVED_X, X ## x, SAVED_CA; \ | ||
183 | .endif; \ | ||
184 | .set reorder | ||
185 | |||
186 | #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ | ||
187 | addu X(A), X(K); \ | ||
188 | addu X(B), X(L); \ | ||
189 | addu X(C), X(M); \ | ||
190 | addu X(D), X(N); \ | ||
191 | xor X(V), X(A); \ | ||
192 | xor X(W), X(B); \ | ||
193 | xor X(Y), X(C); \ | ||
194 | xor X(Z), X(D); \ | ||
195 | rotl X(V), S; \ | ||
196 | rotl X(W), S; \ | ||
197 | rotl X(Y), S; \ | ||
198 | rotl X(Z), S; | ||
199 | |||
200 | .text | ||
201 | .set reorder | ||
202 | .set noat | ||
203 | .globl chacha_crypt_arch | ||
204 | .ent chacha_crypt_arch | ||
205 | chacha_crypt_arch: | ||
206 | .frame $sp, STACK_SIZE, $ra | ||
207 | |||
208 | /* Load number of rounds */ | ||
209 | lw $at, 16($sp) | ||
210 | |||
211 | addiu $sp, -STACK_SIZE | ||
212 | |||
213 | /* Return bytes = 0. */ | ||
214 | beqz BYTES, .Lchacha_mips_end | ||
215 | |||
216 | lw NONCE_0, 48(STATE) | ||
217 | |||
218 | /* Save s0-s7 */ | ||
219 | sw $s0, 0($sp) | ||
220 | sw $s1, 4($sp) | ||
221 | sw $s2, 8($sp) | ||
222 | sw $s3, 12($sp) | ||
223 | sw $s4, 16($sp) | ||
224 | sw $s5, 20($sp) | ||
225 | sw $s6, 24($sp) | ||
226 | sw $s7, 28($sp) | ||
227 | |||
228 | /* Test IN or OUT is unaligned. | ||
229 | * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 | ||
230 | */ | ||
231 | or IS_UNALIGNED, IN, OUT | ||
232 | andi IS_UNALIGNED, 0x3 | ||
233 | |||
234 | b .Lchacha_rounds_start | ||
235 | |||
236 | .align 4 | ||
237 | .Loop_chacha_rounds: | ||
238 | addiu IN, CHACHA20_BLOCK_SIZE | ||
239 | addiu OUT, CHACHA20_BLOCK_SIZE | ||
240 | addiu NONCE_0, 1 | ||
241 | |||
242 | .Lchacha_rounds_start: | ||
243 | lw X0, 0(STATE) | ||
244 | lw X1, 4(STATE) | ||
245 | lw X2, 8(STATE) | ||
246 | lw X3, 12(STATE) | ||
247 | |||
248 | lw X4, 16(STATE) | ||
249 | lw X5, 20(STATE) | ||
250 | lw X6, 24(STATE) | ||
251 | lw X7, 28(STATE) | ||
252 | lw X8, 32(STATE) | ||
253 | lw X9, 36(STATE) | ||
254 | lw X10, 40(STATE) | ||
255 | lw X11, 44(STATE) | ||
256 | |||
257 | move X12, NONCE_0 | ||
258 | lw X13, 52(STATE) | ||
259 | lw X14, 56(STATE) | ||
260 | lw X15, 60(STATE) | ||
261 | |||
262 | .Loop_chacha_xor_rounds: | ||
263 | addiu $at, -2 | ||
264 | AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); | ||
265 | AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); | ||
266 | AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); | ||
267 | AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); | ||
268 | AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); | ||
269 | AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); | ||
270 | AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); | ||
271 | AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); | ||
272 | bnez $at, .Loop_chacha_xor_rounds | ||
273 | |||
274 | addiu BYTES, -(CHACHA20_BLOCK_SIZE) | ||
275 | |||
276 | /* Is data src/dst unaligned? Jump */ | ||
277 | bnez IS_UNALIGNED, .Loop_chacha_unaligned | ||
278 | |||
279 | /* Set number rounds here to fill delayslot. */ | ||
280 | lw $at, (STACK_SIZE+16)($sp) | ||
281 | |||
282 | /* BYTES < 0, it has no full block. */ | ||
283 | bltz BYTES, .Lchacha_mips_no_full_block_aligned | ||
284 | |||
285 | FOR_EACH_WORD_REV(STORE_ALIGNED) | ||
286 | |||
287 | /* BYTES > 0? Loop again. */ | ||
288 | bgtz BYTES, .Loop_chacha_rounds | ||
289 | |||
290 | /* Place this here to fill delay slot */ | ||
291 | addiu NONCE_0, 1 | ||
292 | |||
293 | /* BYTES < 0? Handle last bytes */ | ||
294 | bltz BYTES, .Lchacha_mips_xor_bytes | ||
295 | |||
296 | .Lchacha_mips_xor_done: | ||
297 | /* Restore used registers */ | ||
298 | lw $s0, 0($sp) | ||
299 | lw $s1, 4($sp) | ||
300 | lw $s2, 8($sp) | ||
301 | lw $s3, 12($sp) | ||
302 | lw $s4, 16($sp) | ||
303 | lw $s5, 20($sp) | ||
304 | lw $s6, 24($sp) | ||
305 | lw $s7, 28($sp) | ||
306 | |||
307 | /* Write NONCE_0 back to right location in state */ | ||
308 | sw NONCE_0, 48(STATE) | ||
309 | |||
310 | .Lchacha_mips_end: | ||
311 | addiu $sp, STACK_SIZE | ||
312 | jr $ra | ||
313 | |||
314 | .Lchacha_mips_no_full_block_aligned: | ||
315 | /* Restore the offset on BYTES */ | ||
316 | addiu BYTES, CHACHA20_BLOCK_SIZE | ||
317 | |||
318 | /* Get number of full WORDS */ | ||
319 | andi $at, BYTES, MASK_U32 | ||
320 | |||
321 | /* Load upper half of jump table addr */ | ||
322 | lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) | ||
323 | |||
324 | /* Calculate lower half jump table offset */ | ||
325 | ins T0, $at, 1, 6 | ||
326 | |||
327 | /* Add offset to STATE */ | ||
328 | addu T1, STATE, $at | ||
329 | |||
330 | /* Add lower half jump table addr */ | ||
331 | addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) | ||
332 | |||
333 | /* Read value from STATE */ | ||
334 | lw SAVED_CA, 0(T1) | ||
335 | |||
336 | /* Store remaining bytecounter as negative value */ | ||
337 | subu BYTES, $at, BYTES | ||
338 | |||
339 | jr T0 | ||
340 | |||
341 | /* Jump table */ | ||
342 | FOR_EACH_WORD(JMPTBL_ALIGNED) | ||
343 | |||
344 | |||
345 | .Loop_chacha_unaligned: | ||
346 | /* Set number rounds here to fill delayslot. */ | ||
347 | lw $at, (STACK_SIZE+16)($sp) | ||
348 | |||
349 | /* BYTES < 0, it has no full block. */ | ||
350 | bltz BYTES, .Lchacha_mips_no_full_block_unaligned | ||
351 | |||
352 | FOR_EACH_WORD_REV(STORE_UNALIGNED) | ||
353 | |||
354 | /* BYTES > 0? Loop again. */ | ||
355 | bgtz BYTES, .Loop_chacha_rounds | ||
356 | |||
357 | /* Write NONCE_0 back to right location in state */ | ||
358 | sw NONCE_0, 48(STATE) | ||
359 | |||
360 | .set noreorder | ||
361 | /* Fall through to byte handling */ | ||
362 | bgez BYTES, .Lchacha_mips_xor_done | ||
363 | .Lchacha_mips_xor_unaligned_0_b: | ||
364 | .Lchacha_mips_xor_aligned_0_b: | ||
365 | /* Place this here to fill delay slot */ | ||
366 | addiu NONCE_0, 1 | ||
367 | .set reorder | ||
368 | |||
369 | .Lchacha_mips_xor_bytes: | ||
370 | addu IN, $at | ||
371 | addu OUT, $at | ||
372 | /* First byte */ | ||
373 | lbu T1, 0(IN) | ||
374 | addiu $at, BYTES, 1 | ||
375 | CPU_TO_LE32(SAVED_X) | ||
376 | ROTR(SAVED_X) | ||
377 | xor T1, SAVED_X | ||
378 | sb T1, 0(OUT) | ||
379 | beqz $at, .Lchacha_mips_xor_done | ||
380 | /* Second byte */ | ||
381 | lbu T1, 1(IN) | ||
382 | addiu $at, BYTES, 2 | ||
383 | ROTx SAVED_X, 8 | ||
384 | xor T1, SAVED_X | ||
385 | sb T1, 1(OUT) | ||
386 | beqz $at, .Lchacha_mips_xor_done | ||
387 | /* Third byte */ | ||
388 | lbu T1, 2(IN) | ||
389 | ROTx SAVED_X, 8 | ||
390 | xor T1, SAVED_X | ||
391 | sb T1, 2(OUT) | ||
392 | b .Lchacha_mips_xor_done | ||
393 | |||
394 | .Lchacha_mips_no_full_block_unaligned: | ||
395 | /* Restore the offset on BYTES */ | ||
396 | addiu BYTES, CHACHA20_BLOCK_SIZE | ||
397 | |||
398 | /* Get number of full WORDS */ | ||
399 | andi $at, BYTES, MASK_U32 | ||
400 | |||
401 | /* Load upper half of jump table addr */ | ||
402 | lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) | ||
403 | |||
404 | /* Calculate lower half jump table offset */ | ||
405 | ins T0, $at, 1, 6 | ||
406 | |||
407 | /* Add offset to STATE */ | ||
408 | addu T1, STATE, $at | ||
409 | |||
410 | /* Add lower half jump table addr */ | ||
411 | addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) | ||
412 | |||
413 | /* Read value from STATE */ | ||
414 | lw SAVED_CA, 0(T1) | ||
415 | |||
416 | /* Store remaining bytecounter as negative value */ | ||
417 | subu BYTES, $at, BYTES | ||
418 | |||
419 | jr T0 | ||
420 | |||
421 | /* Jump table */ | ||
422 | FOR_EACH_WORD(JMPTBL_UNALIGNED) | ||
423 | .end chacha_crypt_arch | ||
424 | .set at | ||
425 | |||
/*
 * hchacha_block_arch
 *
 * Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 *
 * Loads the 16 state words from STATE, runs the round loop until the NROUND
 * counter reaches zero, then stores words 0-3 and 12-15 to OUT.
 * The counter is decremented by 2 per loop iteration and compared against
 * zero, so NROUND is expected to be a positive even number.
 */

/*
 * Remap X12-X15 for this routine.  With this mapping X11 ($s6) is the only
 * register the routine spills to the stack and restores.
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0, 0(STATE)
	lw	X1, 4(STATE)
	lw	X2, 8(STATE)
	lw	X3, 12(STATE)
	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* X15 aliases STATE, so this load overwrites the pointer: it must be last. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Each pass through the eight AXR steps accounts for two rounds. */
	addiu	$a2, -2
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
	AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
	AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register (X11 is not part of the output) */
	lw	X11, 0($sp)

	/* Output is words 0-3 followed by words 12-15 (8 words, 32 bytes). */
	sw	X0, 0(OUT)
	sw	X1, 4(OUT)
	sw	X2, 8(OUT)
	sw	X3, 12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end	hchacha_block_arch
.set at
diff --git a/arch/mips/crypto/chacha-glue.c b/arch/mips/crypto/chacha-glue.c new file mode 100644 index 000000000..d1fd23e6e --- /dev/null +++ b/arch/mips/crypto/chacha-glue.c | |||
@@ -0,0 +1,152 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * MIPS accelerated ChaCha and XChaCha stream ciphers, | ||
4 | * including ChaCha20 (RFC7539) | ||
5 | * | ||
6 | * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> | ||
7 | */ | ||
8 | |||
9 | #include <asm/byteorder.h> | ||
10 | #include <crypto/algapi.h> | ||
11 | #include <crypto/internal/chacha.h> | ||
12 | #include <crypto/internal/skcipher.h> | ||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, | ||
17 | unsigned int bytes, int nrounds); | ||
18 | EXPORT_SYMBOL(chacha_crypt_arch); | ||
19 | |||
20 | asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds); | ||
21 | EXPORT_SYMBOL(hchacha_block_arch); | ||
22 | |||
/*
 * Arch entry point for ChaCha state initialisation.  It forwards all three
 * arguments unchanged to chacha_init_generic().
 */
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
27 | EXPORT_SYMBOL(chacha_init_arch); | ||
28 | |||
29 | static int chacha_mips_stream_xor(struct skcipher_request *req, | ||
30 | const struct chacha_ctx *ctx, const u8 *iv) | ||
31 | { | ||
32 | struct skcipher_walk walk; | ||
33 | u32 state[16]; | ||
34 | int err; | ||
35 | |||
36 | err = skcipher_walk_virt(&walk, req, false); | ||
37 | |||
38 | chacha_init_generic(state, ctx->key, iv); | ||
39 | |||
40 | while (walk.nbytes > 0) { | ||
41 | unsigned int nbytes = walk.nbytes; | ||
42 | |||
43 | if (nbytes < walk.total) | ||
44 | nbytes = round_down(nbytes, walk.stride); | ||
45 | |||
46 | chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr, | ||
47 | nbytes, ctx->nrounds); | ||
48 | err = skcipher_walk_done(&walk, walk.nbytes - nbytes); | ||
49 | } | ||
50 | |||
51 | return err; | ||
52 | } | ||
53 | |||
54 | static int chacha_mips(struct skcipher_request *req) | ||
55 | { | ||
56 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); | ||
57 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); | ||
58 | |||
59 | return chacha_mips_stream_xor(req, ctx, req->iv); | ||
60 | } | ||
61 | |||
62 | static int xchacha_mips(struct skcipher_request *req) | ||
63 | { | ||
64 | struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); | ||
65 | struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); | ||
66 | struct chacha_ctx subctx; | ||
67 | u32 state[16]; | ||
68 | u8 real_iv[16]; | ||
69 | |||
70 | chacha_init_generic(state, ctx->key, req->iv); | ||
71 | |||
72 | hchacha_block(state, subctx.key, ctx->nrounds); | ||
73 | subctx.nrounds = ctx->nrounds; | ||
74 | |||
75 | memcpy(&real_iv[0], req->iv + 24, 8); | ||
76 | memcpy(&real_iv[8], req->iv + 16, 8); | ||
77 | return chacha_mips_stream_xor(req, &subctx, real_iv); | ||
78 | } | ||
79 | |||
/*
 * skcipher algorithms provided by this module: chacha20, xchacha20 and
 * xchacha12.  All three use priority 200, a block size of 1, a chunk size of
 * CHACHA_BLOCK_SIZE and a fixed key size of CHACHA_KEY_SIZE.  Encryption and
 * decryption share one handler per entry.
 */
static struct skcipher_alg algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_mips,
		.decrypt		= chacha_mips,
	}, {
		/* XChaCha variants take the longer XCHACHA_IV_SIZE IV. */
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_mips,
		.decrypt		= xchacha_mips,
	}, {
		/* Same handler as xchacha20; only the setkey routine differs. */
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_mips,
		.decrypt		= xchacha_mips,
	}
};
128 | |||
129 | static int __init chacha_simd_mod_init(void) | ||
130 | { | ||
131 | return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? | ||
132 | crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; | ||
133 | } | ||
134 | |||
135 | static void __exit chacha_simd_mod_fini(void) | ||
136 | { | ||
137 | if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) | ||
138 | crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); | ||
139 | } | ||
140 | |||
141 | module_init(chacha_simd_mod_init); | ||
142 | module_exit(chacha_simd_mod_fini); | ||
143 | |||
144 | MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)"); | ||
145 | MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); | ||
146 | MODULE_LICENSE("GPL v2"); | ||
147 | MODULE_ALIAS_CRYPTO("chacha20"); | ||
148 | MODULE_ALIAS_CRYPTO("chacha20-mips"); | ||
149 | MODULE_ALIAS_CRYPTO("xchacha20"); | ||
150 | MODULE_ALIAS_CRYPTO("xchacha20-mips"); | ||
151 | MODULE_ALIAS_CRYPTO("xchacha12"); | ||
152 | MODULE_ALIAS_CRYPTO("xchacha12-mips"); | ||
diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c new file mode 100644 index 000000000..faa88a6a7 --- /dev/null +++ b/arch/mips/crypto/crc32-mips.c | |||
@@ -0,0 +1,346 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions | ||
4 | * | ||
5 | * Module based on arm64/crypto/crc32-arm.c | ||
6 | * | ||
7 | * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org> | ||
8 | * Copyright (C) 2018 MIPS Tech, LLC | ||
9 | */ | ||
10 | |||
11 | #include <linux/unaligned/access_ok.h> | ||
12 | #include <linux/cpufeature.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/string.h> | ||
17 | #include <asm/mipsregs.h> | ||
18 | |||
19 | #include <crypto/internal/hash.h> | ||
20 | |||
/*
 * Operand-size names.  The same letters are passed as the size suffix to the
 * CRC32()/CRC32C() macros below.
 * NOTE(review): presumably byte/halfword/word/doubleword; the enumerators are
 * not referenced by value in the code visible here - confirm before removing.
 */
enum crc_op_size {
	b, h, w, d,
};
24 | |||
/*
 * CRC flavour names.  The same identifiers are passed as the type argument of
 * _CRC32() by the CRC32()/CRC32C() wrappers below.
 */
enum crc_type {
	crc32,
	crc32c,
};
29 | |||
/*
 * When the toolchain has no CRC support, define assembler macros for the
 * eight crc32{b,h,w,d} / crc32c{b,h,w,d} mnemonics that emit hand-encoded
 * instruction words (one encoding via _ASM_INSN_IF_MIPS, one via
 * _ASM_INSN32_IF_MM).  SZ and TYPE select the operand-size and flavour bit
 * fields of the encoding.  The .ifnc/.error check rejects uses where the
 * first and third operands are not the same register.
 *
 * _ASM_SET_CRC is the assembler directive needed before using the mnemonics:
 * empty when the macros above are used, ".set crc" when the toolchain
 * understands the instructions itself.
 */
#ifndef TOOLCHAIN_SUPPORTS_CRC
#define _ASM_MACRO_CRC32(OP, SZ, TYPE)					  \
_ASM_MACRO_3R(OP, rt, rs, rt2,						  \
	".ifnc \\rt, \\rt2\n\t"						  \
	".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \
	".endif\n\t"							  \
	_ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) |	  \
			  ((SZ) << 6) | ((TYPE) << 8))			  \
	_ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) |	  \
			  ((SZ) << 14) | ((TYPE) << 3)))
_ASM_MACRO_CRC32(crc32b, 0, 0);
_ASM_MACRO_CRC32(crc32h, 1, 0);
_ASM_MACRO_CRC32(crc32w, 2, 0);
_ASM_MACRO_CRC32(crc32d, 3, 0);
_ASM_MACRO_CRC32(crc32cb, 0, 1);
_ASM_MACRO_CRC32(crc32ch, 1, 1);
_ASM_MACRO_CRC32(crc32cw, 2, 1);
_ASM_MACRO_CRC32(crc32cd, 3, 1);
#define _ASM_SET_CRC ""
#else /* TOOLCHAIN_SUPPORTS_CRC */
#define _ASM_SET_CRC ".set\tcrc\n\t"
#endif
52 | |||
/*
 * Emit a single CRC instruction.
 *
 * The mnemonic is built by pasting the stringified @type and @size together
 * (for example crc32 + w -> "crc32w").  @crc is both input and output
 * (operand %0, "+r"); @value is a register input (operand %1).  The
 * instruction is wrapped in .set push/.set pop so that _ASM_SET_CRC does not
 * leak into the surrounding assembly.
 */
#define _CRC32(crc, value, size, type) \
do { \
	__asm__ __volatile__( \
		".set push\n\t" \
		_ASM_SET_CRC \
		#type #size " %0, %1, %0\n\t" \
		".set pop" \
		: "+r" (crc) \
		: "r" (value)); \
} while (0)

/* CRC32 flavour of _CRC32(). */
#define CRC32(crc, value, size) \
	_CRC32(crc, value, size, crc32)

/* CRC32C flavour of _CRC32(). */
#define CRC32C(crc, value, size) \
	_CRC32(crc, value, size, crc32c)
69 | |||
70 | static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) | ||
71 | { | ||
72 | u32 crc = crc_; | ||
73 | |||
74 | #ifdef CONFIG_64BIT | ||
75 | while (len >= sizeof(u64)) { | ||
76 | u64 value = get_unaligned_le64(p); | ||
77 | |||
78 | CRC32(crc, value, d); | ||
79 | p += sizeof(u64); | ||
80 | len -= sizeof(u64); | ||
81 | } | ||
82 | |||
83 | if (len & sizeof(u32)) { | ||
84 | #else /* !CONFIG_64BIT */ | ||
85 | while (len >= sizeof(u32)) { | ||
86 | #endif | ||
87 | u32 value = get_unaligned_le32(p); | ||
88 | |||
89 | CRC32(crc, value, w); | ||
90 | p += sizeof(u32); | ||
91 | len -= sizeof(u32); | ||
92 | } | ||
93 | |||
94 | if (len & sizeof(u16)) { | ||
95 | u16 value = get_unaligned_le16(p); | ||
96 | |||
97 | CRC32(crc, value, h); | ||
98 | p += sizeof(u16); | ||
99 | } | ||
100 | |||
101 | if (len & sizeof(u8)) { | ||
102 | u8 value = *p++; | ||
103 | |||
104 | CRC32(crc, value, b); | ||
105 | } | ||
106 | |||
107 | return crc; | ||
108 | } | ||
109 | |||
110 | static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len) | ||
111 | { | ||
112 | u32 crc = crc_; | ||
113 | |||
114 | #ifdef CONFIG_64BIT | ||
115 | while (len >= sizeof(u64)) { | ||
116 | u64 value = get_unaligned_le64(p); | ||
117 | |||
118 | CRC32C(crc, value, d); | ||
119 | p += sizeof(u64); | ||
120 | len -= sizeof(u64); | ||
121 | } | ||
122 | |||
123 | if (len & sizeof(u32)) { | ||
124 | #else /* !CONFIG_64BIT */ | ||
125 | while (len >= sizeof(u32)) { | ||
126 | #endif | ||
127 | u32 value = get_unaligned_le32(p); | ||
128 | |||
129 | CRC32C(crc, value, w); | ||
130 | p += sizeof(u32); | ||
131 | len -= sizeof(u32); | ||
132 | } | ||
133 | |||
134 | if (len & sizeof(u16)) { | ||
135 | u16 value = get_unaligned_le16(p); | ||
136 | |||
137 | CRC32C(crc, value, h); | ||
138 | p += sizeof(u16); | ||
139 | } | ||
140 | |||
141 | if (len & sizeof(u8)) { | ||
142 | u8 value = *p++; | ||
143 | |||
144 | CRC32C(crc, value, b); | ||
145 | } | ||
146 | return crc; | ||
147 | } | ||
148 | |||
149 | #define CHKSUM_BLOCK_SIZE 1 | ||
150 | #define CHKSUM_DIGEST_SIZE 4 | ||
151 | |||
/*
 * Per-transform context.  @key is the seed: set to ~0 by chksum_cra_init(),
 * replaced by chksum_setkey(), and copied into each descriptor by
 * chksum_init().
 */
struct chksum_ctx {
	u32 key;
};
155 | |||
/* Per-request (descriptor) context: the running CRC value. */
struct chksum_desc_ctx {
	u32 crc;
};
159 | |||
160 | static int chksum_init(struct shash_desc *desc) | ||
161 | { | ||
162 | struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); | ||
163 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
164 | |||
165 | ctx->crc = mctx->key; | ||
166 | |||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Setting the seed allows arbitrary accumulators and flexible XOR policy | ||
172 | * If your algorithm starts with ~0, then XOR with ~0 before you set | ||
173 | * the seed. | ||
174 | */ | ||
175 | static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, | ||
176 | unsigned int keylen) | ||
177 | { | ||
178 | struct chksum_ctx *mctx = crypto_shash_ctx(tfm); | ||
179 | |||
180 | if (keylen != sizeof(mctx->key)) | ||
181 | return -EINVAL; | ||
182 | mctx->key = get_unaligned_le32(key); | ||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | static int chksum_update(struct shash_desc *desc, const u8 *data, | ||
187 | unsigned int length) | ||
188 | { | ||
189 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
190 | |||
191 | ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static int chksumc_update(struct shash_desc *desc, const u8 *data, | ||
196 | unsigned int length) | ||
197 | { | ||
198 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
199 | |||
200 | ctx->crc = crc32c_mips_le_hw(ctx->crc, data, length); | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int chksum_final(struct shash_desc *desc, u8 *out) | ||
205 | { | ||
206 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
207 | |||
208 | put_unaligned_le32(ctx->crc, out); | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static int chksumc_final(struct shash_desc *desc, u8 *out) | ||
213 | { | ||
214 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
215 | |||
216 | put_unaligned_le32(~ctx->crc, out); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) | ||
221 | { | ||
222 | put_unaligned_le32(crc32_mips_le_hw(crc, data, len), out); | ||
223 | return 0; | ||
224 | } | ||
225 | |||
226 | static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out) | ||
227 | { | ||
228 | put_unaligned_le32(~crc32c_mips_le_hw(crc, data, len), out); | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | static int chksum_finup(struct shash_desc *desc, const u8 *data, | ||
233 | unsigned int len, u8 *out) | ||
234 | { | ||
235 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
236 | |||
237 | return __chksum_finup(ctx->crc, data, len, out); | ||
238 | } | ||
239 | |||
240 | static int chksumc_finup(struct shash_desc *desc, const u8 *data, | ||
241 | unsigned int len, u8 *out) | ||
242 | { | ||
243 | struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); | ||
244 | |||
245 | return __chksumc_finup(ctx->crc, data, len, out); | ||
246 | } | ||
247 | |||
248 | static int chksum_digest(struct shash_desc *desc, const u8 *data, | ||
249 | unsigned int length, u8 *out) | ||
250 | { | ||
251 | struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); | ||
252 | |||
253 | return __chksum_finup(mctx->key, data, length, out); | ||
254 | } | ||
255 | |||
256 | static int chksumc_digest(struct shash_desc *desc, const u8 *data, | ||
257 | unsigned int length, u8 *out) | ||
258 | { | ||
259 | struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm); | ||
260 | |||
261 | return __chksumc_finup(mctx->key, data, length, out); | ||
262 | } | ||
263 | |||
264 | static int chksum_cra_init(struct crypto_tfm *tfm) | ||
265 | { | ||
266 | struct chksum_ctx *mctx = crypto_tfm_ctx(tfm); | ||
267 | |||
268 | mctx->key = ~0; | ||
269 | return 0; | ||
270 | } | ||
271 | |||
/*
 * "crc32" shash backed by the hardware CRC32 routine.  The key is optional
 * (CRYPTO_ALG_OPTIONAL_KEY); without one the seed installed by
 * chksum_cra_init() is used.  .final/.finup/.digest emit the CRC as is.
 */
static struct shash_alg crc32_alg = {
	.digestsize		=	CHKSUM_DIGEST_SIZE,
	.setkey			=	chksum_setkey,
	.init			=	chksum_init,
	.update			=	chksum_update,
	.final			=	chksum_final,
	.finup			=	chksum_finup,
	.digest			=	chksum_digest,
	.descsize		=	sizeof(struct chksum_desc_ctx),
	.base			=	{
		.cra_name		=	"crc32",
		.cra_driver_name	=	"crc32-mips-hw",
		.cra_priority		=	300,
		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
		.cra_alignmask		=	0,
		.cra_ctxsize		=	sizeof(struct chksum_ctx),
		.cra_module		=	THIS_MODULE,
		.cra_init		=	chksum_cra_init,
	}
};
293 | |||
/*
 * "crc32c" shash backed by the hardware CRC32C routine.  It shares setkey,
 * init and the transform constructor with crc32_alg; the crc32c-specific
 * .final/.finup/.digest handlers emit the complemented CRC.
 */
static struct shash_alg crc32c_alg = {
	.digestsize		=	CHKSUM_DIGEST_SIZE,
	.setkey			=	chksum_setkey,
	.init			=	chksum_init,
	.update			=	chksumc_update,
	.final			=	chksumc_final,
	.finup			=	chksumc_finup,
	.digest			=	chksumc_digest,
	.descsize		=	sizeof(struct chksum_desc_ctx),
	.base			=	{
		.cra_name		=	"crc32c",
		.cra_driver_name	=	"crc32c-mips-hw",
		.cra_priority		=	300,
		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
		.cra_alignmask		=	0,
		.cra_ctxsize		=	sizeof(struct chksum_ctx),
		.cra_module		=	THIS_MODULE,
		.cra_init		=	chksum_cra_init,
	}
};
315 | |||
316 | static int __init crc32_mod_init(void) | ||
317 | { | ||
318 | int err; | ||
319 | |||
320 | err = crypto_register_shash(&crc32_alg); | ||
321 | |||
322 | if (err) | ||
323 | return err; | ||
324 | |||
325 | err = crypto_register_shash(&crc32c_alg); | ||
326 | |||
327 | if (err) { | ||
328 | crypto_unregister_shash(&crc32_alg); | ||
329 | return err; | ||
330 | } | ||
331 | |||
332 | return 0; | ||
333 | } | ||
334 | |||
/* Module exit: unregister both shash algorithms registered by crc32_mod_init(). */
static void __exit crc32_mod_exit(void)
{
	crypto_unregister_shash(&crc32_alg);
	crypto_unregister_shash(&crc32c_alg);
}
340 | |||
341 | MODULE_AUTHOR("Marcin Nowakowski <marcin.nowakowski@mips.com"); | ||
342 | MODULE_DESCRIPTION("CRC32 and CRC32C using optional MIPS instructions"); | ||
343 | MODULE_LICENSE("GPL v2"); | ||
344 | |||
345 | module_cpu_feature_match(MIPS_CRC32, crc32_mod_init); | ||
346 | module_exit(crc32_mod_exit); | ||
diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c new file mode 100644 index 000000000..bc6110fb9 --- /dev/null +++ b/arch/mips/crypto/poly1305-glue.c | |||
@@ -0,0 +1,191 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS | ||
4 | * | ||
5 | * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> | ||
6 | */ | ||
7 | |||
8 | #include <asm/unaligned.h> | ||
9 | #include <crypto/algapi.h> | ||
10 | #include <crypto/internal/hash.h> | ||
11 | #include <crypto/internal/poly1305.h> | ||
12 | #include <linux/cpufeature.h> | ||
13 | #include <linux/crypto.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | asmlinkage void poly1305_init_mips(void *state, const u8 *key); | ||
17 | asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit); | ||
18 | asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce); | ||
19 | |||
20 | void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) | ||
21 | { | ||
22 | poly1305_init_mips(&dctx->h, key); | ||
23 | dctx->s[0] = get_unaligned_le32(key + 16); | ||
24 | dctx->s[1] = get_unaligned_le32(key + 20); | ||
25 | dctx->s[2] = get_unaligned_le32(key + 24); | ||
26 | dctx->s[3] = get_unaligned_le32(key + 28); | ||
27 | dctx->buflen = 0; | ||
28 | } | ||
29 | EXPORT_SYMBOL(poly1305_init_arch); | ||
30 | |||
31 | static int mips_poly1305_init(struct shash_desc *desc) | ||
32 | { | ||
33 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); | ||
34 | |||
35 | dctx->buflen = 0; | ||
36 | dctx->rset = 0; | ||
37 | dctx->sset = false; | ||
38 | |||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, | ||
43 | u32 len, u32 hibit) | ||
44 | { | ||
45 | if (unlikely(!dctx->sset)) { | ||
46 | if (!dctx->rset) { | ||
47 | poly1305_init_mips(&dctx->h, src); | ||
48 | src += POLY1305_BLOCK_SIZE; | ||
49 | len -= POLY1305_BLOCK_SIZE; | ||
50 | dctx->rset = 1; | ||
51 | } | ||
52 | if (len >= POLY1305_BLOCK_SIZE) { | ||
53 | dctx->s[0] = get_unaligned_le32(src + 0); | ||
54 | dctx->s[1] = get_unaligned_le32(src + 4); | ||
55 | dctx->s[2] = get_unaligned_le32(src + 8); | ||
56 | dctx->s[3] = get_unaligned_le32(src + 12); | ||
57 | src += POLY1305_BLOCK_SIZE; | ||
58 | len -= POLY1305_BLOCK_SIZE; | ||
59 | dctx->sset = true; | ||
60 | } | ||
61 | if (len < POLY1305_BLOCK_SIZE) | ||
62 | return; | ||
63 | } | ||
64 | |||
65 | len &= ~(POLY1305_BLOCK_SIZE - 1); | ||
66 | |||
67 | poly1305_blocks_mips(&dctx->h, src, len, hibit); | ||
68 | } | ||
69 | |||
70 | static int mips_poly1305_update(struct shash_desc *desc, const u8 *src, | ||
71 | unsigned int len) | ||
72 | { | ||
73 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); | ||
74 | |||
75 | if (unlikely(dctx->buflen)) { | ||
76 | u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); | ||
77 | |||
78 | memcpy(dctx->buf + dctx->buflen, src, bytes); | ||
79 | src += bytes; | ||
80 | len -= bytes; | ||
81 | dctx->buflen += bytes; | ||
82 | |||
83 | if (dctx->buflen == POLY1305_BLOCK_SIZE) { | ||
84 | mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1); | ||
85 | dctx->buflen = 0; | ||
86 | } | ||
87 | } | ||
88 | |||
89 | if (likely(len >= POLY1305_BLOCK_SIZE)) { | ||
90 | mips_poly1305_blocks(dctx, src, len, 1); | ||
91 | src += round_down(len, POLY1305_BLOCK_SIZE); | ||
92 | len %= POLY1305_BLOCK_SIZE; | ||
93 | } | ||
94 | |||
95 | if (unlikely(len)) { | ||
96 | dctx->buflen = len; | ||
97 | memcpy(dctx->buf, src, len); | ||
98 | } | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, | ||
103 | unsigned int nbytes) | ||
104 | { | ||
105 | if (unlikely(dctx->buflen)) { | ||
106 | u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); | ||
107 | |||
108 | memcpy(dctx->buf + dctx->buflen, src, bytes); | ||
109 | src += bytes; | ||
110 | nbytes -= bytes; | ||
111 | dctx->buflen += bytes; | ||
112 | |||
113 | if (dctx->buflen == POLY1305_BLOCK_SIZE) { | ||
114 | poly1305_blocks_mips(&dctx->h, dctx->buf, | ||
115 | POLY1305_BLOCK_SIZE, 1); | ||
116 | dctx->buflen = 0; | ||
117 | } | ||
118 | } | ||
119 | |||
120 | if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { | ||
121 | unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); | ||
122 | |||
123 | poly1305_blocks_mips(&dctx->h, src, len, 1); | ||
124 | src += len; | ||
125 | nbytes %= POLY1305_BLOCK_SIZE; | ||
126 | } | ||
127 | |||
128 | if (unlikely(nbytes)) { | ||
129 | dctx->buflen = nbytes; | ||
130 | memcpy(dctx->buf, src, nbytes); | ||
131 | } | ||
132 | } | ||
133 | EXPORT_SYMBOL(poly1305_update_arch); | ||
134 | |||
135 | void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) | ||
136 | { | ||
137 | if (unlikely(dctx->buflen)) { | ||
138 | dctx->buf[dctx->buflen++] = 1; | ||
139 | memset(dctx->buf + dctx->buflen, 0, | ||
140 | POLY1305_BLOCK_SIZE - dctx->buflen); | ||
141 | poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); | ||
142 | } | ||
143 | |||
144 | poly1305_emit_mips(&dctx->h, dst, dctx->s); | ||
145 | *dctx = (struct poly1305_desc_ctx){}; | ||
146 | } | ||
147 | EXPORT_SYMBOL(poly1305_final_arch); | ||
148 | |||
149 | static int mips_poly1305_final(struct shash_desc *desc, u8 *dst) | ||
150 | { | ||
151 | struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); | ||
152 | |||
153 | if (unlikely(!dctx->sset)) | ||
154 | return -ENOKEY; | ||
155 | |||
156 | poly1305_final_arch(dctx, dst); | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static struct shash_alg mips_poly1305_alg = { | ||
161 | .init = mips_poly1305_init, | ||
162 | .update = mips_poly1305_update, | ||
163 | .final = mips_poly1305_final, | ||
164 | .digestsize = POLY1305_DIGEST_SIZE, | ||
165 | .descsize = sizeof(struct poly1305_desc_ctx), | ||
166 | |||
167 | .base.cra_name = "poly1305", | ||
168 | .base.cra_driver_name = "poly1305-mips", | ||
169 | .base.cra_priority = 200, | ||
170 | .base.cra_blocksize = POLY1305_BLOCK_SIZE, | ||
171 | .base.cra_module = THIS_MODULE, | ||
172 | }; | ||
173 | |||
174 | static int __init mips_poly1305_mod_init(void) | ||
175 | { | ||
176 | return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? | ||
177 | crypto_register_shash(&mips_poly1305_alg) : 0; | ||
178 | } | ||
179 | |||
180 | static void __exit mips_poly1305_mod_exit(void) | ||
181 | { | ||
182 | if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) | ||
183 | crypto_unregister_shash(&mips_poly1305_alg); | ||
184 | } | ||
185 | |||
186 | module_init(mips_poly1305_mod_init); | ||
187 | module_exit(mips_poly1305_mod_exit); | ||
188 | |||
189 | MODULE_LICENSE("GPL v2"); | ||
190 | MODULE_ALIAS_CRYPTO("poly1305"); | ||
191 | MODULE_ALIAS_CRYPTO("poly1305-mips"); | ||
diff --git a/arch/mips/crypto/poly1305-mips.pl b/arch/mips/crypto/poly1305-mips.pl new file mode 100644 index 000000000..b05bab884 --- /dev/null +++ b/arch/mips/crypto/poly1305-mips.pl | |||
@@ -0,0 +1,1273 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause | ||
3 | # | ||
4 | # ==================================================================== | ||
5 | # Written by Andy Polyakov, @dot-asm, originally for the OpenSSL | ||
6 | # project. | ||
7 | # ==================================================================== | ||
8 | |||
9 | # Poly1305 hash for MIPS. | ||
10 | # | ||
11 | # May 2016 | ||
12 | # | ||
13 | # Numbers are cycles per processed byte with poly1305_blocks alone. | ||
14 | # | ||
15 | # IALU/gcc | ||
16 | # R1x000 ~5.5/+130% (big-endian) | ||
17 | # Octeon II 2.50/+70% (little-endian) | ||
18 | # | ||
19 | # March 2019 | ||
20 | # | ||
21 | # Add 32-bit code path. | ||
22 | # | ||
23 | # October 2019 | ||
24 | # | ||
25 | # Modulo-scheduling the reduction allows omitting the dependency chain | ||
26 | # at the end of the inner loop, which improves performance. Also optimize | ||
27 | # the MIPS32R2 code path for the MIPS 1004K core. Per René von Dorst's suggestions. | ||
28 | # | ||
29 | # IALU/gcc | ||
30 | # R1x000 ~9.8/? (big-endian) | ||
31 | # Octeon II 3.65/+140% (little-endian) | ||
32 | # MT7621/1004K 4.75/? (little-endian) | ||
33 | # | ||
34 | ###################################################################### | ||
35 | # There are a number of MIPS ABIs in use; O32 and N32/64 are the most | ||
36 | # widely used. Then there is a new contender: NUBI. It appears that if | ||
37 | # one picks the latter, it's possible to arrange code in an ABI-neutral | ||
38 | # manner. Therefore let's stick to the NUBI register layout: | ||
39 | # | ||
40 | ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); | ||
41 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
42 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); | ||
43 | ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); | ||
44 | # | ||
45 | # The return value is placed in $a0. The following coding rules facilitate | ||
46 | # interoperability: | ||
47 | # | ||
48 | # - never ever touch $tp, "thread pointer", former $gp [o32 can be | ||
49 | # excluded from the rule, because it's specified volatile]; | ||
50 | # - copy return value to $t0, former $v0 [or to $a0 if you're adapting | ||
51 | # old code]; | ||
52 | # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; | ||
53 | # | ||
54 | # For reference here is register layout for N32/64 MIPS ABIs: | ||
55 | # | ||
56 | # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | ||
57 | # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | ||
58 | # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | ||
59 | # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | ||
60 | # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | ||
61 | # | ||
62 | # <appro@openssl.org> | ||
63 | # | ||
64 | ###################################################################### | ||
65 | |||
66 | $flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 | ||
67 | |||
68 | $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; | ||
69 | |||
70 | if ($flavour =~ /64|n32/i) {{{ | ||
71 | ###################################################################### | ||
72 | # 64-bit code path | ||
73 | # | ||
74 | |||
75 | my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); | ||
76 | my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); | ||
77 | |||
78 | $code.=<<___; | ||
79 | #if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ | ||
80 | defined(_MIPS_ARCH_MIPS64R6)) \\ | ||
81 | && !defined(_MIPS_ARCH_MIPS64R2) | ||
82 | # define _MIPS_ARCH_MIPS64R2 | ||
83 | #endif | ||
84 | |||
85 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
86 | # define dmultu(rs,rt) | ||
87 | # define mflo(rd,rs,rt) dmulu rd,rs,rt | ||
88 | # define mfhi(rd,rs,rt) dmuhu rd,rs,rt | ||
89 | #else | ||
90 | # define dmultu(rs,rt) dmultu rs,rt | ||
91 | # define mflo(rd,rs,rt) mflo rd | ||
92 | # define mfhi(rd,rs,rt) mfhi rd | ||
93 | #endif | ||
94 | |||
95 | #ifdef __KERNEL__ | ||
96 | # define poly1305_init poly1305_init_mips | ||
97 | # define poly1305_blocks poly1305_blocks_mips | ||
98 | # define poly1305_emit poly1305_emit_mips | ||
99 | #endif | ||
100 | |||
101 | #if defined(__MIPSEB__) && !defined(MIPSEB) | ||
102 | # define MIPSEB | ||
103 | #endif | ||
104 | |||
105 | #ifdef MIPSEB | ||
106 | # define MSB 0 | ||
107 | # define LSB 7 | ||
108 | #else | ||
109 | # define MSB 7 | ||
110 | # define LSB 0 | ||
111 | #endif | ||
112 | |||
113 | .text | ||
114 | .set noat | ||
115 | .set noreorder | ||
116 | |||
117 | .align 5 | ||
118 | .globl poly1305_init | ||
119 | .ent poly1305_init | ||
120 | poly1305_init: | ||
121 | .frame $sp,0,$ra | ||
122 | .set reorder | ||
123 | |||
124 | sd $zero,0($ctx) | ||
125 | sd $zero,8($ctx) | ||
126 | sd $zero,16($ctx) | ||
127 | |||
128 | beqz $inp,.Lno_key | ||
129 | |||
130 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
131 | andi $tmp0,$inp,7 # $inp % 8 | ||
132 | dsubu $inp,$inp,$tmp0 # align $inp | ||
133 | sll $tmp0,$tmp0,3 # byte to bit offset | ||
134 | ld $in0,0($inp) | ||
135 | ld $in1,8($inp) | ||
136 | beqz $tmp0,.Laligned_key | ||
137 | ld $tmp2,16($inp) | ||
138 | |||
139 | subu $tmp1,$zero,$tmp0 | ||
140 | # ifdef MIPSEB | ||
141 | dsllv $in0,$in0,$tmp0 | ||
142 | dsrlv $tmp3,$in1,$tmp1 | ||
143 | dsllv $in1,$in1,$tmp0 | ||
144 | dsrlv $tmp2,$tmp2,$tmp1 | ||
145 | # else | ||
146 | dsrlv $in0,$in0,$tmp0 | ||
147 | dsllv $tmp3,$in1,$tmp1 | ||
148 | dsrlv $in1,$in1,$tmp0 | ||
149 | dsllv $tmp2,$tmp2,$tmp1 | ||
150 | # endif | ||
151 | or $in0,$in0,$tmp3 | ||
152 | or $in1,$in1,$tmp2 | ||
153 | .Laligned_key: | ||
154 | #else | ||
155 | ldl $in0,0+MSB($inp) | ||
156 | ldl $in1,8+MSB($inp) | ||
157 | ldr $in0,0+LSB($inp) | ||
158 | ldr $in1,8+LSB($inp) | ||
159 | #endif | ||
160 | #ifdef MIPSEB | ||
161 | # if defined(_MIPS_ARCH_MIPS64R2) | ||
162 | dsbh $in0,$in0 # byte swap | ||
163 | dsbh $in1,$in1 | ||
164 | dshd $in0,$in0 | ||
165 | dshd $in1,$in1 | ||
166 | # else | ||
167 | ori $tmp0,$zero,0xFF | ||
168 | dsll $tmp2,$tmp0,32 | ||
169 | or $tmp0,$tmp2 # 0x000000FF000000FF | ||
170 | |||
171 | and $tmp1,$in0,$tmp0 # byte swap | ||
172 | and $tmp3,$in1,$tmp0 | ||
173 | dsrl $tmp2,$in0,24 | ||
174 | dsrl $tmp4,$in1,24 | ||
175 | dsll $tmp1,24 | ||
176 | dsll $tmp3,24 | ||
177 | and $tmp2,$tmp0 | ||
178 | and $tmp4,$tmp0 | ||
179 | dsll $tmp0,8 # 0x0000FF000000FF00 | ||
180 | or $tmp1,$tmp2 | ||
181 | or $tmp3,$tmp4 | ||
182 | and $tmp2,$in0,$tmp0 | ||
183 | and $tmp4,$in1,$tmp0 | ||
184 | dsrl $in0,8 | ||
185 | dsrl $in1,8 | ||
186 | dsll $tmp2,8 | ||
187 | dsll $tmp4,8 | ||
188 | and $in0,$tmp0 | ||
189 | and $in1,$tmp0 | ||
190 | or $tmp1,$tmp2 | ||
191 | or $tmp3,$tmp4 | ||
192 | or $in0,$tmp1 | ||
193 | or $in1,$tmp3 | ||
194 | dsrl $tmp1,$in0,32 | ||
195 | dsrl $tmp3,$in1,32 | ||
196 | dsll $in0,32 | ||
197 | dsll $in1,32 | ||
198 | or $in0,$tmp1 | ||
199 | or $in1,$tmp3 | ||
200 | # endif | ||
201 | #endif | ||
202 | li $tmp0,1 | ||
203 | dsll $tmp0,32 # 0x0000000100000000 | ||
204 | daddiu $tmp0,-63 # 0x00000000ffffffc1 | ||
205 | dsll $tmp0,28 # 0x0ffffffc10000000 | ||
206 | daddiu $tmp0,-1 # 0x0ffffffc0fffffff | ||
207 | |||
208 | and $in0,$tmp0 | ||
209 | daddiu $tmp0,-3 # 0x0ffffffc0ffffffc | ||
210 | and $in1,$tmp0 | ||
211 | |||
212 | sd $in0,24($ctx) | ||
213 | dsrl $tmp0,$in1,2 | ||
214 | sd $in1,32($ctx) | ||
215 | daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) | ||
216 | sd $tmp0,40($ctx) | ||
217 | |||
218 | .Lno_key: | ||
219 | li $v0,0 # return 0 | ||
220 | jr $ra | ||
221 | .end poly1305_init | ||
222 | ___ | ||
223 | { | ||
224 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; | ||
225 | |||
226 | my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = | ||
227 | ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); | ||
228 | my ($shr,$shl) = ($s6,$s7); # used on R6 | ||
229 | |||
230 | $code.=<<___; | ||
231 | .align 5 | ||
232 | .globl poly1305_blocks | ||
233 | .ent poly1305_blocks | ||
234 | poly1305_blocks: | ||
235 | .set noreorder | ||
236 | dsrl $len,4 # number of complete blocks | ||
237 | bnez $len,poly1305_blocks_internal | ||
238 | nop | ||
239 | jr $ra | ||
240 | nop | ||
241 | .end poly1305_blocks | ||
242 | |||
243 | .align 5 | ||
244 | .ent poly1305_blocks_internal | ||
245 | poly1305_blocks_internal: | ||
246 | .set noreorder | ||
247 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
248 | .frame $sp,8*8,$ra | ||
249 | .mask $SAVED_REGS_MASK|0x000c0000,-8 | ||
250 | dsubu $sp,8*8 | ||
251 | sd $s7,56($sp) | ||
252 | sd $s6,48($sp) | ||
253 | #else | ||
254 | .frame $sp,6*8,$ra | ||
255 | .mask $SAVED_REGS_MASK,-8 | ||
256 | dsubu $sp,6*8 | ||
257 | #endif | ||
258 | sd $s5,40($sp) | ||
259 | sd $s4,32($sp) | ||
260 | ___ | ||
261 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
262 | sd $s3,24($sp) | ||
263 | sd $s2,16($sp) | ||
264 | sd $s1,8($sp) | ||
265 | sd $s0,0($sp) | ||
266 | ___ | ||
267 | $code.=<<___; | ||
268 | .set reorder | ||
269 | |||
270 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
271 | andi $shr,$inp,7 | ||
272 | dsubu $inp,$inp,$shr # align $inp | ||
273 | sll $shr,$shr,3 # byte to bit offset | ||
274 | subu $shl,$zero,$shr | ||
275 | #endif | ||
276 | |||
277 | ld $h0,0($ctx) # load hash value | ||
278 | ld $h1,8($ctx) | ||
279 | ld $h2,16($ctx) | ||
280 | |||
281 | ld $r0,24($ctx) # load key | ||
282 | ld $r1,32($ctx) | ||
283 | ld $rs1,40($ctx) | ||
284 | |||
285 | dsll $len,4 | ||
286 | daddu $len,$inp # end of buffer | ||
287 | b .Loop | ||
288 | |||
289 | .align 4 | ||
290 | .Loop: | ||
291 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
292 | ld $in0,0($inp) # load input | ||
293 | ld $in1,8($inp) | ||
294 | beqz $shr,.Laligned_inp | ||
295 | |||
296 | ld $tmp2,16($inp) | ||
297 | # ifdef MIPSEB | ||
298 | dsllv $in0,$in0,$shr | ||
299 | dsrlv $tmp3,$in1,$shl | ||
300 | dsllv $in1,$in1,$shr | ||
301 | dsrlv $tmp2,$tmp2,$shl | ||
302 | # else | ||
303 | dsrlv $in0,$in0,$shr | ||
304 | dsllv $tmp3,$in1,$shl | ||
305 | dsrlv $in1,$in1,$shr | ||
306 | dsllv $tmp2,$tmp2,$shl | ||
307 | # endif | ||
308 | or $in0,$in0,$tmp3 | ||
309 | or $in1,$in1,$tmp2 | ||
310 | .Laligned_inp: | ||
311 | #else | ||
312 | ldl $in0,0+MSB($inp) # load input | ||
313 | ldl $in1,8+MSB($inp) | ||
314 | ldr $in0,0+LSB($inp) | ||
315 | ldr $in1,8+LSB($inp) | ||
316 | #endif | ||
317 | daddiu $inp,16 | ||
318 | #ifdef MIPSEB | ||
319 | # if defined(_MIPS_ARCH_MIPS64R2) | ||
320 | dsbh $in0,$in0 # byte swap | ||
321 | dsbh $in1,$in1 | ||
322 | dshd $in0,$in0 | ||
323 | dshd $in1,$in1 | ||
324 | # else | ||
325 | ori $tmp0,$zero,0xFF | ||
326 | dsll $tmp2,$tmp0,32 | ||
327 | or $tmp0,$tmp2 # 0x000000FF000000FF | ||
328 | |||
329 | and $tmp1,$in0,$tmp0 # byte swap | ||
330 | and $tmp3,$in1,$tmp0 | ||
331 | dsrl $tmp2,$in0,24 | ||
332 | dsrl $tmp4,$in1,24 | ||
333 | dsll $tmp1,24 | ||
334 | dsll $tmp3,24 | ||
335 | and $tmp2,$tmp0 | ||
336 | and $tmp4,$tmp0 | ||
337 | dsll $tmp0,8 # 0x0000FF000000FF00 | ||
338 | or $tmp1,$tmp2 | ||
339 | or $tmp3,$tmp4 | ||
340 | and $tmp2,$in0,$tmp0 | ||
341 | and $tmp4,$in1,$tmp0 | ||
342 | dsrl $in0,8 | ||
343 | dsrl $in1,8 | ||
344 | dsll $tmp2,8 | ||
345 | dsll $tmp4,8 | ||
346 | and $in0,$tmp0 | ||
347 | and $in1,$tmp0 | ||
348 | or $tmp1,$tmp2 | ||
349 | or $tmp3,$tmp4 | ||
350 | or $in0,$tmp1 | ||
351 | or $in1,$tmp3 | ||
352 | dsrl $tmp1,$in0,32 | ||
353 | dsrl $tmp3,$in1,32 | ||
354 | dsll $in0,32 | ||
355 | dsll $in1,32 | ||
356 | or $in0,$tmp1 | ||
357 | or $in1,$tmp3 | ||
358 | # endif | ||
359 | #endif | ||
360 | dsrl $tmp1,$h2,2 # modulo-scheduled reduction | ||
361 | andi $h2,$h2,3 | ||
362 | dsll $tmp0,$tmp1,2 | ||
363 | |||
364 | daddu $d0,$h0,$in0 # accumulate input | ||
365 | daddu $tmp1,$tmp0 | ||
366 | sltu $tmp0,$d0,$h0 | ||
367 | daddu $d0,$d0,$tmp1 # ... and residue | ||
368 | sltu $tmp1,$d0,$tmp1 | ||
369 | daddu $d1,$h1,$in1 | ||
370 | daddu $tmp0,$tmp1 | ||
371 | sltu $tmp1,$d1,$h1 | ||
372 | daddu $d1,$tmp0 | ||
373 | |||
374 | dmultu ($r0,$d0) # h0*r0 | ||
375 | daddu $d2,$h2,$padbit | ||
376 | sltu $tmp0,$d1,$tmp0 | ||
377 | mflo ($h0,$r0,$d0) | ||
378 | mfhi ($h1,$r0,$d0) | ||
379 | |||
380 | dmultu ($rs1,$d1) # h1*5*r1 | ||
381 | daddu $d2,$tmp1 | ||
382 | daddu $d2,$tmp0 | ||
383 | mflo ($tmp0,$rs1,$d1) | ||
384 | mfhi ($tmp1,$rs1,$d1) | ||
385 | |||
386 | dmultu ($r1,$d0) # h0*r1 | ||
387 | mflo ($tmp2,$r1,$d0) | ||
388 | mfhi ($h2,$r1,$d0) | ||
389 | daddu $h0,$tmp0 | ||
390 | daddu $h1,$tmp1 | ||
391 | sltu $tmp0,$h0,$tmp0 | ||
392 | |||
393 | dmultu ($r0,$d1) # h1*r0 | ||
394 | daddu $h1,$tmp0 | ||
395 | daddu $h1,$tmp2 | ||
396 | mflo ($tmp0,$r0,$d1) | ||
397 | mfhi ($tmp1,$r0,$d1) | ||
398 | |||
399 | dmultu ($rs1,$d2) # h2*5*r1 | ||
400 | sltu $tmp2,$h1,$tmp2 | ||
401 | daddu $h2,$tmp2 | ||
402 | mflo ($tmp2,$rs1,$d2) | ||
403 | |||
404 | dmultu ($r0,$d2) # h2*r0 | ||
405 | daddu $h1,$tmp0 | ||
406 | daddu $h2,$tmp1 | ||
407 | mflo ($tmp3,$r0,$d2) | ||
408 | sltu $tmp0,$h1,$tmp0 | ||
409 | daddu $h2,$tmp0 | ||
410 | |||
411 | daddu $h1,$tmp2 | ||
412 | sltu $tmp2,$h1,$tmp2 | ||
413 | daddu $h2,$tmp2 | ||
414 | daddu $h2,$tmp3 | ||
415 | |||
416 | bne $inp,$len,.Loop | ||
417 | |||
418 | sd $h0,0($ctx) # store hash value | ||
419 | sd $h1,8($ctx) | ||
420 | sd $h2,16($ctx) | ||
421 | |||
422 | .set noreorder | ||
423 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
424 | ld $s7,56($sp) | ||
425 | ld $s6,48($sp) | ||
426 | #endif | ||
427 | ld $s5,40($sp) # epilogue | ||
428 | ld $s4,32($sp) | ||
429 | ___ | ||
430 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue | ||
431 | ld $s3,24($sp) | ||
432 | ld $s2,16($sp) | ||
433 | ld $s1,8($sp) | ||
434 | ld $s0,0($sp) | ||
435 | ___ | ||
436 | $code.=<<___; | ||
437 | jr $ra | ||
438 | #if defined(_MIPS_ARCH_MIPS64R6) | ||
439 | daddu $sp,8*8 | ||
440 | #else | ||
441 | daddu $sp,6*8 | ||
442 | #endif | ||
443 | .end poly1305_blocks_internal | ||
444 | ___ | ||
445 | } | ||
446 | { | ||
447 | my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); | ||
448 | |||
449 | $code.=<<___; | ||
450 | .align 5 | ||
451 | .globl poly1305_emit | ||
452 | .ent poly1305_emit | ||
453 | poly1305_emit: | ||
454 | .frame $sp,0,$ra | ||
455 | .set reorder | ||
456 | |||
457 | ld $tmp2,16($ctx) | ||
458 | ld $tmp0,0($ctx) | ||
459 | ld $tmp1,8($ctx) | ||
460 | |||
461 | li $in0,-4 # final reduction | ||
462 | dsrl $in1,$tmp2,2 | ||
463 | and $in0,$tmp2 | ||
464 | andi $tmp2,$tmp2,3 | ||
465 | daddu $in0,$in1 | ||
466 | |||
467 | daddu $tmp0,$tmp0,$in0 | ||
468 | sltu $in1,$tmp0,$in0 | ||
469 | daddiu $in0,$tmp0,5 # compare to modulus | ||
470 | daddu $tmp1,$tmp1,$in1 | ||
471 | sltiu $tmp3,$in0,5 | ||
472 | sltu $tmp4,$tmp1,$in1 | ||
473 | daddu $in1,$tmp1,$tmp3 | ||
474 | daddu $tmp2,$tmp2,$tmp4 | ||
475 | sltu $tmp3,$in1,$tmp3 | ||
476 | daddu $tmp2,$tmp2,$tmp3 | ||
477 | |||
478 | dsrl $tmp2,2 # see if it carried/borrowed | ||
479 | dsubu $tmp2,$zero,$tmp2 | ||
480 | |||
481 | xor $in0,$tmp0 | ||
482 | xor $in1,$tmp1 | ||
483 | and $in0,$tmp2 | ||
484 | and $in1,$tmp2 | ||
485 | xor $in0,$tmp0 | ||
486 | xor $in1,$tmp1 | ||
487 | |||
488 | lwu $tmp0,0($nonce) # load nonce | ||
489 | lwu $tmp1,4($nonce) | ||
490 | lwu $tmp2,8($nonce) | ||
491 | lwu $tmp3,12($nonce) | ||
492 | dsll $tmp1,32 | ||
493 | dsll $tmp3,32 | ||
494 | or $tmp0,$tmp1 | ||
495 | or $tmp2,$tmp3 | ||
496 | |||
497 | daddu $in0,$tmp0 # accumulate nonce | ||
498 | daddu $in1,$tmp2 | ||
499 | sltu $tmp0,$in0,$tmp0 | ||
500 | daddu $in1,$tmp0 | ||
501 | |||
502 | dsrl $tmp0,$in0,8 # write mac value | ||
503 | dsrl $tmp1,$in0,16 | ||
504 | dsrl $tmp2,$in0,24 | ||
505 | sb $in0,0($mac) | ||
506 | dsrl $tmp3,$in0,32 | ||
507 | sb $tmp0,1($mac) | ||
508 | dsrl $tmp0,$in0,40 | ||
509 | sb $tmp1,2($mac) | ||
510 | dsrl $tmp1,$in0,48 | ||
511 | sb $tmp2,3($mac) | ||
512 | dsrl $tmp2,$in0,56 | ||
513 | sb $tmp3,4($mac) | ||
514 | dsrl $tmp3,$in1,8 | ||
515 | sb $tmp0,5($mac) | ||
516 | dsrl $tmp0,$in1,16 | ||
517 | sb $tmp1,6($mac) | ||
518 | dsrl $tmp1,$in1,24 | ||
519 | sb $tmp2,7($mac) | ||
520 | |||
521 | sb $in1,8($mac) | ||
522 | dsrl $tmp2,$in1,32 | ||
523 | sb $tmp3,9($mac) | ||
524 | dsrl $tmp3,$in1,40 | ||
525 | sb $tmp0,10($mac) | ||
526 | dsrl $tmp0,$in1,48 | ||
527 | sb $tmp1,11($mac) | ||
528 | dsrl $tmp1,$in1,56 | ||
529 | sb $tmp2,12($mac) | ||
530 | sb $tmp3,13($mac) | ||
531 | sb $tmp0,14($mac) | ||
532 | sb $tmp1,15($mac) | ||
533 | |||
534 | jr $ra | ||
535 | .end poly1305_emit | ||
536 | .rdata | ||
537 | .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" | ||
538 | .align 2 | ||
539 | ___ | ||
540 | } | ||
541 | }}} else {{{ | ||
542 | ###################################################################### | ||
543 | # 32-bit code path | ||
544 | # | ||
545 | |||
546 | my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); | ||
547 | my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = | ||
548 | ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); | ||
549 | |||
550 | $code.=<<___; | ||
551 | #if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ | ||
552 | defined(_MIPS_ARCH_MIPS32R6)) \\ | ||
553 | && !defined(_MIPS_ARCH_MIPS32R2) | ||
554 | # define _MIPS_ARCH_MIPS32R2 | ||
555 | #endif | ||
556 | |||
557 | #if defined(_MIPS_ARCH_MIPS32R6) | ||
558 | # define multu(rs,rt) | ||
559 | # define mflo(rd,rs,rt) mulu rd,rs,rt | ||
560 | # define mfhi(rd,rs,rt) muhu rd,rs,rt | ||
561 | #else | ||
562 | # define multu(rs,rt) multu rs,rt | ||
563 | # define mflo(rd,rs,rt) mflo rd | ||
564 | # define mfhi(rd,rs,rt) mfhi rd | ||
565 | #endif | ||
566 | |||
567 | #ifdef __KERNEL__ | ||
568 | # define poly1305_init poly1305_init_mips | ||
569 | # define poly1305_blocks poly1305_blocks_mips | ||
570 | # define poly1305_emit poly1305_emit_mips | ||
571 | #endif | ||
572 | |||
573 | #if defined(__MIPSEB__) && !defined(MIPSEB) | ||
574 | # define MIPSEB | ||
575 | #endif | ||
576 | |||
577 | #ifdef MIPSEB | ||
578 | # define MSB 0 | ||
579 | # define LSB 3 | ||
580 | #else | ||
581 | # define MSB 3 | ||
582 | # define LSB 0 | ||
583 | #endif | ||
584 | |||
585 | .text | ||
586 | .set noat | ||
587 | .set noreorder | ||
588 | |||
589 | .align 5 | ||
590 | .globl poly1305_init | ||
591 | .ent poly1305_init | ||
592 | poly1305_init: | ||
593 | .frame $sp,0,$ra | ||
594 | .set reorder | ||
595 | |||
596 | sw $zero,0($ctx) | ||
597 | sw $zero,4($ctx) | ||
598 | sw $zero,8($ctx) | ||
599 | sw $zero,12($ctx) | ||
600 | sw $zero,16($ctx) | ||
601 | |||
602 | beqz $inp,.Lno_key | ||
603 | |||
604 | #if defined(_MIPS_ARCH_MIPS32R6) | ||
605 | andi $tmp0,$inp,3 # $inp % 4 | ||
606 | subu $inp,$inp,$tmp0 # align $inp | ||
607 | sll $tmp0,$tmp0,3 # byte to bit offset | ||
608 | lw $in0,0($inp) | ||
609 | lw $in1,4($inp) | ||
610 | lw $in2,8($inp) | ||
611 | lw $in3,12($inp) | ||
612 | beqz $tmp0,.Laligned_key | ||
613 | |||
614 | lw $tmp2,16($inp) | ||
615 | subu $tmp1,$zero,$tmp0 | ||
616 | # ifdef MIPSEB | ||
617 | sllv $in0,$in0,$tmp0 | ||
618 | srlv $tmp3,$in1,$tmp1 | ||
619 | sllv $in1,$in1,$tmp0 | ||
620 | or $in0,$in0,$tmp3 | ||
621 | srlv $tmp3,$in2,$tmp1 | ||
622 | sllv $in2,$in2,$tmp0 | ||
623 | or $in1,$in1,$tmp3 | ||
624 | srlv $tmp3,$in3,$tmp1 | ||
625 | sllv $in3,$in3,$tmp0 | ||
626 | or $in2,$in2,$tmp3 | ||
627 | srlv $tmp2,$tmp2,$tmp1 | ||
628 | or $in3,$in3,$tmp2 | ||
629 | # else | ||
630 | srlv $in0,$in0,$tmp0 | ||
631 | sllv $tmp3,$in1,$tmp1 | ||
632 | srlv $in1,$in1,$tmp0 | ||
633 | or $in0,$in0,$tmp3 | ||
634 | sllv $tmp3,$in2,$tmp1 | ||
635 | srlv $in2,$in2,$tmp0 | ||
636 | or $in1,$in1,$tmp3 | ||
637 | sllv $tmp3,$in3,$tmp1 | ||
638 | srlv $in3,$in3,$tmp0 | ||
639 | or $in2,$in2,$tmp3 | ||
640 | sllv $tmp2,$tmp2,$tmp1 | ||
641 | or $in3,$in3,$tmp2 | ||
642 | # endif | ||
643 | .Laligned_key: | ||
644 | #else | ||
645 | lwl $in0,0+MSB($inp) | ||
646 | lwl $in1,4+MSB($inp) | ||
647 | lwl $in2,8+MSB($inp) | ||
648 | lwl $in3,12+MSB($inp) | ||
649 | lwr $in0,0+LSB($inp) | ||
650 | lwr $in1,4+LSB($inp) | ||
651 | lwr $in2,8+LSB($inp) | ||
652 | lwr $in3,12+LSB($inp) | ||
653 | #endif | ||
654 | #ifdef MIPSEB | ||
655 | # if defined(_MIPS_ARCH_MIPS32R2) | ||
656 | wsbh $in0,$in0 # byte swap | ||
657 | wsbh $in1,$in1 | ||
658 | wsbh $in2,$in2 | ||
659 | wsbh $in3,$in3 | ||
660 | rotr $in0,$in0,16 | ||
661 | rotr $in1,$in1,16 | ||
662 | rotr $in2,$in2,16 | ||
663 | rotr $in3,$in3,16 | ||
664 | # else | ||
665 | srl $tmp0,$in0,24 # byte swap | ||
666 | srl $tmp1,$in0,8 | ||
667 | andi $tmp2,$in0,0xFF00 | ||
668 | sll $in0,$in0,24 | ||
669 | andi $tmp1,0xFF00 | ||
670 | sll $tmp2,$tmp2,8 | ||
671 | or $in0,$tmp0 | ||
672 | srl $tmp0,$in1,24 | ||
673 | or $tmp1,$tmp2 | ||
674 | srl $tmp2,$in1,8 | ||
675 | or $in0,$tmp1 | ||
676 | andi $tmp1,$in1,0xFF00 | ||
677 | sll $in1,$in1,24 | ||
678 | andi $tmp2,0xFF00 | ||
679 | sll $tmp1,$tmp1,8 | ||
680 | or $in1,$tmp0 | ||
681 | srl $tmp0,$in2,24 | ||
682 | or $tmp2,$tmp1 | ||
683 | srl $tmp1,$in2,8 | ||
684 | or $in1,$tmp2 | ||
685 | andi $tmp2,$in2,0xFF00 | ||
686 | sll $in2,$in2,24 | ||
687 | andi $tmp1,0xFF00 | ||
688 | sll $tmp2,$tmp2,8 | ||
689 | or $in2,$tmp0 | ||
690 | srl $tmp0,$in3,24 | ||
691 | or $tmp1,$tmp2 | ||
692 | srl $tmp2,$in3,8 | ||
693 | or $in2,$tmp1 | ||
694 | andi $tmp1,$in3,0xFF00 | ||
695 | sll $in3,$in3,24 | ||
696 | andi $tmp2,0xFF00 | ||
697 | sll $tmp1,$tmp1,8 | ||
698 | or $in3,$tmp0 | ||
699 | or $tmp2,$tmp1 | ||
700 | or $in3,$tmp2 | ||
701 | # endif | ||
702 | #endif | ||
703 | lui $tmp0,0x0fff | ||
704 | ori $tmp0,0xffff # 0x0fffffff | ||
705 | and $in0,$in0,$tmp0 | ||
706 | subu $tmp0,3 # 0x0ffffffc | ||
707 | and $in1,$in1,$tmp0 | ||
708 | and $in2,$in2,$tmp0 | ||
709 | and $in3,$in3,$tmp0 | ||
710 | |||
711 | sw $in0,20($ctx) | ||
712 | sw $in1,24($ctx) | ||
713 | sw $in2,28($ctx) | ||
714 | sw $in3,32($ctx) | ||
715 | |||
716 | srl $tmp1,$in1,2 | ||
717 | srl $tmp2,$in2,2 | ||
718 | srl $tmp3,$in3,2 | ||
719 | addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) | ||
720 | addu $in2,$in2,$tmp2 | ||
721 | addu $in3,$in3,$tmp3 | ||
722 | sw $in1,36($ctx) | ||
723 | sw $in2,40($ctx) | ||
724 | sw $in3,44($ctx) | ||
725 | .Lno_key: | ||
726 | li $v0,0 | ||
727 | jr $ra | ||
728 | .end poly1305_init | ||
729 | ___ | ||
730 | { | ||
731 | my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000"; | ||
732 | |||
733 | my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = | ||
734 | ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); | ||
735 | my ($d0,$d1,$d2,$d3) = | ||
736 | ($a4,$a5,$a6,$a7); | ||
737 | my $shr = $t2; # used on R6 | ||
738 | my $one = $t2; # used on R2 | ||
739 | |||
740 | $code.=<<___; | ||
741 | .globl poly1305_blocks | ||
742 | .align 5 | ||
743 | .ent poly1305_blocks | ||
744 | poly1305_blocks: | ||
745 | .frame $sp,16*4,$ra | ||
746 | .mask $SAVED_REGS_MASK,-4 | ||
747 | .set noreorder | ||
748 | subu $sp, $sp,4*12 | ||
749 | sw $s11,4*11($sp) | ||
750 | sw $s10,4*10($sp) | ||
751 | sw $s9, 4*9($sp) | ||
752 | sw $s8, 4*8($sp) | ||
753 | sw $s7, 4*7($sp) | ||
754 | sw $s6, 4*6($sp) | ||
755 | sw $s5, 4*5($sp) | ||
756 | sw $s4, 4*4($sp) | ||
757 | ___ | ||
758 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
759 | sw $s3, 4*3($sp) | ||
760 | sw $s2, 4*2($sp) | ||
761 | sw $s1, 4*1($sp) | ||
762 | sw $s0, 4*0($sp) | ||
763 | ___ | ||
764 | $code.=<<___; | ||
765 | .set reorder | ||
766 | |||
767 | srl $len,4 # number of complete blocks | ||
768 | li $one,1 | ||
769 | beqz $len,.Labort | ||
770 | |||
771 | #if defined(_MIPS_ARCH_MIPS32R6) | ||
772 | andi $shr,$inp,3 | ||
773 | subu $inp,$inp,$shr # align $inp | ||
774 | sll $shr,$shr,3 # byte to bit offset | ||
775 | #endif | ||
776 | |||
777 | lw $h0,0($ctx) # load hash value | ||
778 | lw $h1,4($ctx) | ||
779 | lw $h2,8($ctx) | ||
780 | lw $h3,12($ctx) | ||
781 | lw $h4,16($ctx) | ||
782 | |||
783 | lw $r0,20($ctx) # load key | ||
784 | lw $r1,24($ctx) | ||
785 | lw $r2,28($ctx) | ||
786 | lw $r3,32($ctx) | ||
787 | lw $rs1,36($ctx) | ||
788 | lw $rs2,40($ctx) | ||
789 | lw $rs3,44($ctx) | ||
790 | |||
791 | sll $len,4 | ||
792 | addu $len,$len,$inp # end of buffer | ||
793 | b .Loop | ||
794 | |||
795 | .align 4 | ||
796 | .Loop: | ||
797 | #if defined(_MIPS_ARCH_MIPS32R6) | ||
798 | lw $d0,0($inp) # load input | ||
799 | lw $d1,4($inp) | ||
800 | lw $d2,8($inp) | ||
801 | lw $d3,12($inp) | ||
802 | beqz $shr,.Laligned_inp | ||
803 | |||
804 | lw $t0,16($inp) | ||
805 | subu $t1,$zero,$shr | ||
806 | # ifdef MIPSEB | ||
807 | sllv $d0,$d0,$shr | ||
808 | srlv $at,$d1,$t1 | ||
809 | sllv $d1,$d1,$shr | ||
810 | or $d0,$d0,$at | ||
811 | srlv $at,$d2,$t1 | ||
812 | sllv $d2,$d2,$shr | ||
813 | or $d1,$d1,$at | ||
814 | srlv $at,$d3,$t1 | ||
815 | sllv $d3,$d3,$shr | ||
816 | or $d2,$d2,$at | ||
817 | srlv $t0,$t0,$t1 | ||
818 | or $d3,$d3,$t0 | ||
819 | # else | ||
820 | srlv $d0,$d0,$shr | ||
821 | sllv $at,$d1,$t1 | ||
822 | srlv $d1,$d1,$shr | ||
823 | or $d0,$d0,$at | ||
824 | sllv $at,$d2,$t1 | ||
825 | srlv $d2,$d2,$shr | ||
826 | or $d1,$d1,$at | ||
827 | sllv $at,$d3,$t1 | ||
828 | srlv $d3,$d3,$shr | ||
829 | or $d2,$d2,$at | ||
830 | sllv $t0,$t0,$t1 | ||
831 | or $d3,$d3,$t0 | ||
832 | # endif | ||
833 | .Laligned_inp: | ||
834 | #else | ||
835 | lwl $d0,0+MSB($inp) # load input | ||
836 | lwl $d1,4+MSB($inp) | ||
837 | lwl $d2,8+MSB($inp) | ||
838 | lwl $d3,12+MSB($inp) | ||
839 | lwr $d0,0+LSB($inp) | ||
840 | lwr $d1,4+LSB($inp) | ||
841 | lwr $d2,8+LSB($inp) | ||
842 | lwr $d3,12+LSB($inp) | ||
843 | #endif | ||
844 | #ifdef MIPSEB | ||
845 | # if defined(_MIPS_ARCH_MIPS32R2) | ||
846 | wsbh $d0,$d0 # byte swap | ||
847 | wsbh $d1,$d1 | ||
848 | wsbh $d2,$d2 | ||
849 | wsbh $d3,$d3 | ||
850 | rotr $d0,$d0,16 | ||
851 | rotr $d1,$d1,16 | ||
852 | rotr $d2,$d2,16 | ||
853 | rotr $d3,$d3,16 | ||
854 | # else | ||
855 | srl $at,$d0,24 # byte swap | ||
856 | srl $t0,$d0,8 | ||
857 | andi $t1,$d0,0xFF00 | ||
858 | sll $d0,$d0,24 | ||
859 | andi $t0,0xFF00 | ||
860 | sll $t1,$t1,8 | ||
861 | or $d0,$at | ||
862 | srl $at,$d1,24 | ||
863 | or $t0,$t1 | ||
864 | srl $t1,$d1,8 | ||
865 | or $d0,$t0 | ||
866 | andi $t0,$d1,0xFF00 | ||
867 | sll $d1,$d1,24 | ||
868 | andi $t1,0xFF00 | ||
869 | sll $t0,$t0,8 | ||
870 | or $d1,$at | ||
871 | srl $at,$d2,24 | ||
872 | or $t1,$t0 | ||
873 | srl $t0,$d2,8 | ||
874 | or $d1,$t1 | ||
875 | andi $t1,$d2,0xFF00 | ||
876 | sll $d2,$d2,24 | ||
877 | andi $t0,0xFF00 | ||
878 | sll $t1,$t1,8 | ||
879 | or $d2,$at | ||
880 | srl $at,$d3,24 | ||
881 | or $t0,$t1 | ||
882 | srl $t1,$d3,8 | ||
883 | or $d2,$t0 | ||
884 | andi $t0,$d3,0xFF00 | ||
885 | sll $d3,$d3,24 | ||
886 | andi $t1,0xFF00 | ||
887 | sll $t0,$t0,8 | ||
888 | or $d3,$at | ||
889 | or $t1,$t0 | ||
890 | or $d3,$t1 | ||
891 | # endif | ||
892 | #endif | ||
893 | srl $t0,$h4,2 # modulo-scheduled reduction | ||
894 | andi $h4,$h4,3 | ||
895 | sll $at,$t0,2 | ||
896 | |||
897 | addu $d0,$d0,$h0 # accumulate input | ||
898 | addu $t0,$t0,$at | ||
899 | sltu $h0,$d0,$h0 | ||
900 | addu $d0,$d0,$t0 # ... and residue | ||
901 | sltu $at,$d0,$t0 | ||
902 | |||
903 | addu $d1,$d1,$h1 | ||
904 | addu $h0,$h0,$at # carry | ||
905 | sltu $h1,$d1,$h1 | ||
906 | addu $d1,$d1,$h0 | ||
907 | sltu $h0,$d1,$h0 | ||
908 | |||
909 | addu $d2,$d2,$h2 | ||
910 | addu $h1,$h1,$h0 # carry | ||
911 | sltu $h2,$d2,$h2 | ||
912 | addu $d2,$d2,$h1 | ||
913 | sltu $h1,$d2,$h1 | ||
914 | |||
915 | addu $d3,$d3,$h3 | ||
916 | addu $h2,$h2,$h1 # carry | ||
917 | sltu $h3,$d3,$h3 | ||
918 | addu $d3,$d3,$h2 | ||
919 | |||
920 | #if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) | ||
921 | multu $r0,$d0 # d0*r0 | ||
922 | sltu $h2,$d3,$h2 | ||
923 | maddu $rs3,$d1 # d1*s3 | ||
924 | addu $h3,$h3,$h2 # carry | ||
925 | maddu $rs2,$d2 # d2*s2 | ||
926 | addu $h4,$h4,$padbit | ||
927 | maddu $rs1,$d3 # d3*s1 | ||
928 | addu $h4,$h4,$h3 | ||
929 | mfhi $at | ||
930 | mflo $h0 | ||
931 | |||
932 | multu $r1,$d0 # d0*r1 | ||
933 | maddu $r0,$d1 # d1*r0 | ||
934 | maddu $rs3,$d2 # d2*s3 | ||
935 | maddu $rs2,$d3 # d3*s2 | ||
936 | maddu $rs1,$h4 # h4*s1 | ||
937 | maddu $at,$one # hi*1 | ||
938 | mfhi $at | ||
939 | mflo $h1 | ||
940 | |||
941 | multu $r2,$d0 # d0*r2 | ||
942 | maddu $r1,$d1 # d1*r1 | ||
943 | maddu $r0,$d2 # d2*r0 | ||
944 | maddu $rs3,$d3 # d3*s3 | ||
945 | maddu $rs2,$h4 # h4*s2 | ||
946 | maddu $at,$one # hi*1 | ||
947 | mfhi $at | ||
948 | mflo $h2 | ||
949 | |||
950 | mul $t0,$r0,$h4 # h4*r0 | ||
951 | |||
952 | multu $r3,$d0 # d0*r3 | ||
953 | maddu $r2,$d1 # d1*r2 | ||
954 | maddu $r1,$d2 # d2*r1 | ||
955 | maddu $r0,$d3 # d3*r0 | ||
956 | maddu $rs3,$h4 # h4*s3 | ||
957 | maddu $at,$one # hi*1 | ||
958 | mfhi $at | ||
959 | mflo $h3 | ||
960 | |||
961 | addiu $inp,$inp,16 | ||
962 | |||
963 | addu $h4,$t0,$at | ||
964 | #else | ||
965 | multu ($r0,$d0) # d0*r0 | ||
966 | mflo ($h0,$r0,$d0) | ||
967 | mfhi ($h1,$r0,$d0) | ||
968 | |||
969 | sltu $h2,$d3,$h2 | ||
970 | addu $h3,$h3,$h2 # carry | ||
971 | |||
972 | multu ($rs3,$d1) # d1*s3 | ||
973 | mflo ($at,$rs3,$d1) | ||
974 | mfhi ($t0,$rs3,$d1) | ||
975 | |||
976 | addu $h4,$h4,$padbit | ||
977 | addiu $inp,$inp,16 | ||
978 | addu $h4,$h4,$h3 | ||
979 | |||
980 | multu ($rs2,$d2) # d2*s2 | ||
981 | mflo ($a3,$rs2,$d2) | ||
982 | mfhi ($t1,$rs2,$d2) | ||
983 | addu $h0,$h0,$at | ||
984 | addu $h1,$h1,$t0 | ||
985 | multu ($rs1,$d3) # d3*s1 | ||
986 | sltu $at,$h0,$at | ||
987 | addu $h1,$h1,$at | ||
988 | |||
989 | mflo ($at,$rs1,$d3) | ||
990 | mfhi ($t0,$rs1,$d3) | ||
991 | addu $h0,$h0,$a3 | ||
992 | addu $h1,$h1,$t1 | ||
993 | multu ($r1,$d0) # d0*r1 | ||
994 | sltu $a3,$h0,$a3 | ||
995 | addu $h1,$h1,$a3 | ||
996 | |||
997 | |||
998 | mflo ($a3,$r1,$d0) | ||
999 | mfhi ($h2,$r1,$d0) | ||
1000 | addu $h0,$h0,$at | ||
1001 | addu $h1,$h1,$t0 | ||
1002 | multu ($r0,$d1) # d1*r0 | ||
1003 | sltu $at,$h0,$at | ||
1004 | addu $h1,$h1,$at | ||
1005 | |||
1006 | mflo ($at,$r0,$d1) | ||
1007 | mfhi ($t0,$r0,$d1) | ||
1008 | addu $h1,$h1,$a3 | ||
1009 | sltu $a3,$h1,$a3 | ||
1010 | multu ($rs3,$d2) # d2*s3 | ||
1011 | addu $h2,$h2,$a3 | ||
1012 | |||
1013 | mflo ($a3,$rs3,$d2) | ||
1014 | mfhi ($t1,$rs3,$d2) | ||
1015 | addu $h1,$h1,$at | ||
1016 | addu $h2,$h2,$t0 | ||
1017 | multu ($rs2,$d3) # d3*s2 | ||
1018 | sltu $at,$h1,$at | ||
1019 | addu $h2,$h2,$at | ||
1020 | |||
1021 | mflo ($at,$rs2,$d3) | ||
1022 | mfhi ($t0,$rs2,$d3) | ||
1023 | addu $h1,$h1,$a3 | ||
1024 | addu $h2,$h2,$t1 | ||
1025 | multu ($rs1,$h4) # h4*s1 | ||
1026 | sltu $a3,$h1,$a3 | ||
1027 | addu $h2,$h2,$a3 | ||
1028 | |||
1029 | mflo ($a3,$rs1,$h4) | ||
1030 | addu $h1,$h1,$at | ||
1031 | addu $h2,$h2,$t0 | ||
1032 | multu ($r2,$d0) # d0*r2 | ||
1033 | sltu $at,$h1,$at | ||
1034 | addu $h2,$h2,$at | ||
1035 | |||
1036 | |||
1037 | mflo ($at,$r2,$d0) | ||
1038 | mfhi ($h3,$r2,$d0) | ||
1039 | addu $h1,$h1,$a3 | ||
1040 | sltu $a3,$h1,$a3 | ||
1041 | multu ($r1,$d1) # d1*r1 | ||
1042 | addu $h2,$h2,$a3 | ||
1043 | |||
1044 | mflo ($a3,$r1,$d1) | ||
1045 | mfhi ($t1,$r1,$d1) | ||
1046 | addu $h2,$h2,$at | ||
1047 | sltu $at,$h2,$at | ||
1048 | multu ($r0,$d2) # d2*r0 | ||
1049 | addu $h3,$h3,$at | ||
1050 | |||
1051 | mflo ($at,$r0,$d2) | ||
1052 | mfhi ($t0,$r0,$d2) | ||
1053 | addu $h2,$h2,$a3 | ||
1054 | addu $h3,$h3,$t1 | ||
1055 | multu ($rs3,$d3) # d3*s3 | ||
1056 | sltu $a3,$h2,$a3 | ||
1057 | addu $h3,$h3,$a3 | ||
1058 | |||
1059 | mflo ($a3,$rs3,$d3) | ||
1060 | mfhi ($t1,$rs3,$d3) | ||
1061 | addu $h2,$h2,$at | ||
1062 | addu $h3,$h3,$t0 | ||
1063 | multu ($rs2,$h4) # h4*s2 | ||
1064 | sltu $at,$h2,$at | ||
1065 | addu $h3,$h3,$at | ||
1066 | |||
1067 | mflo ($at,$rs2,$h4) | ||
1068 | addu $h2,$h2,$a3 | ||
1069 | addu $h3,$h3,$t1 | ||
1070 | multu ($r3,$d0) # d0*r3 | ||
1071 | sltu $a3,$h2,$a3 | ||
1072 | addu $h3,$h3,$a3 | ||
1073 | |||
1074 | |||
1075 | mflo ($a3,$r3,$d0) | ||
1076 | mfhi ($t1,$r3,$d0) | ||
1077 | addu $h2,$h2,$at | ||
1078 | sltu $at,$h2,$at | ||
1079 | multu ($r2,$d1) # d1*r2 | ||
1080 | addu $h3,$h3,$at | ||
1081 | |||
1082 | mflo ($at,$r2,$d1) | ||
1083 | mfhi ($t0,$r2,$d1) | ||
1084 | addu $h3,$h3,$a3 | ||
1085 | sltu $a3,$h3,$a3 | ||
1086 | multu ($r0,$d3) # d3*r0 | ||
1087 | addu $t1,$t1,$a3 | ||
1088 | |||
1089 | mflo ($a3,$r0,$d3) | ||
1090 | mfhi ($d3,$r0,$d3) | ||
1091 | addu $h3,$h3,$at | ||
1092 | addu $t1,$t1,$t0 | ||
1093 | multu ($r1,$d2) # d2*r1 | ||
1094 | sltu $at,$h3,$at | ||
1095 | addu $t1,$t1,$at | ||
1096 | |||
1097 | mflo ($at,$r1,$d2) | ||
1098 | mfhi ($t0,$r1,$d2) | ||
1099 | addu $h3,$h3,$a3 | ||
1100 | addu $t1,$t1,$d3 | ||
1101 | multu ($rs3,$h4) # h4*s3 | ||
1102 | sltu $a3,$h3,$a3 | ||
1103 | addu $t1,$t1,$a3 | ||
1104 | |||
1105 | mflo ($a3,$rs3,$h4) | ||
1106 | addu $h3,$h3,$at | ||
1107 | addu $t1,$t1,$t0 | ||
1108 | multu ($r0,$h4) # h4*r0 | ||
1109 | sltu $at,$h3,$at | ||
1110 | addu $t1,$t1,$at | ||
1111 | |||
1112 | |||
1113 | mflo ($h4,$r0,$h4) | ||
1114 | addu $h3,$h3,$a3 | ||
1115 | sltu $a3,$h3,$a3 | ||
1116 | addu $t1,$t1,$a3 | ||
1117 | addu $h4,$h4,$t1 | ||
1118 | |||
1119 | li $padbit,1 # if we loop, padbit is 1 | ||
1120 | #endif | ||
1121 | bne $inp,$len,.Loop | ||
1122 | |||
1123 | sw $h0,0($ctx) # store hash value | ||
1124 | sw $h1,4($ctx) | ||
1125 | sw $h2,8($ctx) | ||
1126 | sw $h3,12($ctx) | ||
1127 | sw $h4,16($ctx) | ||
1128 | |||
1129 | .set noreorder | ||
1130 | .Labort: | ||
1131 | lw $s11,4*11($sp) | ||
1132 | lw $s10,4*10($sp) | ||
1133 | lw $s9, 4*9($sp) | ||
1134 | lw $s8, 4*8($sp) | ||
1135 | lw $s7, 4*7($sp) | ||
1136 | lw $s6, 4*6($sp) | ||
1137 | lw $s5, 4*5($sp) | ||
1138 | lw $s4, 4*4($sp) | ||
1139 | ___ | ||
1140 | $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue | ||
1141 | lw $s3, 4*3($sp) | ||
1142 | lw $s2, 4*2($sp) | ||
1143 | lw $s1, 4*1($sp) | ||
1144 | lw $s0, 4*0($sp) | ||
1145 | ___ | ||
1146 | $code.=<<___; | ||
1147 | jr $ra | ||
1148 | addu $sp,$sp,4*12 | ||
1149 | .end poly1305_blocks | ||
1150 | ___ | ||
1151 | } | ||
1152 | { # poly1305_emit: reduce the hash mod 2^130-5, add the nonce, store a 16-byte mac | ||
1153 | my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); # $ctx: hash words (read), $mac: output bytes, $nonce: four words added last, $tmp4: fifth hash word | ||
1154 | |||
1155 | $code.=<<___; | ||
1156 | .align 5 | ||
1157 | .globl poly1305_emit | ||
1158 | .ent poly1305_emit | ||
1159 | poly1305_emit: | ||
1160 | .frame $sp,0,$ra # frame size 0: the stack pointer is never adjusted here | ||
1161 | .set reorder | ||
1162 | |||
1163 | lw $tmp4,16($ctx) # fifth (top) hash word | ||
1164 | lw $tmp0,0($ctx) # low four hash words | ||
1165 | lw $tmp1,4($ctx) | ||
1166 | lw $tmp2,8($ctx) | ||
1167 | lw $tmp3,12($ctx) | ||
1168 | |||
1169 | li $in0,-4 # final reduction | ||
1170 | srl $ctx,$tmp4,2 # all loads are done, so ctx is reused as scratch: top>>2 | ||
1171 | and $in0,$in0,$tmp4 # top with its low 2 bits cleared = 4*(top>>2) | ||
1172 | andi $tmp4,$tmp4,3 # keep only the low 2 bits of the top word | ||
1173 | addu $ctx,$ctx,$in0 # 5*(top>>2), the residue folded into the low words | ||
1174 | |||
1175 | addu $tmp0,$tmp0,$ctx # add residue; carries ripple upward via the sltu/addu pairs | ||
1176 | sltu $ctx,$tmp0,$ctx | ||
1177 | addiu $in0,$tmp0,5 # compare to modulus: in0-in3 = hash + 5, interleaved with the carry chain | ||
1178 | addu $tmp1,$tmp1,$ctx | ||
1179 | sltiu $in1,$in0,5 # carry out of the low word of hash + 5 | ||
1180 | sltu $ctx,$tmp1,$ctx | ||
1181 | addu $in1,$in1,$tmp1 | ||
1182 | addu $tmp2,$tmp2,$ctx | ||
1183 | sltu $in2,$in1,$tmp1 | ||
1184 | sltu $ctx,$tmp2,$ctx | ||
1185 | addu $in2,$in2,$tmp2 | ||
1186 | addu $tmp3,$tmp3,$ctx | ||
1187 | sltu $in3,$in2,$tmp2 | ||
1188 | sltu $ctx,$tmp3,$ctx | ||
1189 | addu $in3,$in3,$tmp3 | ||
1190 | addu $tmp4,$tmp4,$ctx # top bits plus carry out of word 3 | ||
1191 | sltu $ctx,$in3,$tmp3 # carry out of word 3 of hash + 5 | ||
1192 | addu $ctx,$tmp4 # top word of hash + 5 | ||
1193 | |||
1194 | srl $ctx,2 # see if it carried/borrowed: 1 if hash + 5 reached 2^130, else 0 | ||
1195 | subu $ctx,$zero,$ctx # turn that into an all-ones or all-zero mask | ||
1196 | |||
1197 | xor $in0,$tmp0 # branch-free select over the next 12 ops: in = mask ? hash + 5 : hash | ||
1198 | xor $in1,$tmp1 | ||
1199 | xor $in2,$tmp2 | ||
1200 | xor $in3,$tmp3 | ||
1201 | and $in0,$ctx | ||
1202 | and $in1,$ctx | ||
1203 | and $in2,$ctx | ||
1204 | and $in3,$ctx | ||
1205 | xor $in0,$tmp0 | ||
1206 | xor $in1,$tmp1 | ||
1207 | xor $in2,$tmp2 | ||
1208 | xor $in3,$tmp3 | ||
1209 | |||
1210 | lw $tmp0,0($nonce) # load nonce | ||
1211 | lw $tmp1,4($nonce) | ||
1212 | lw $tmp2,8($nonce) | ||
1213 | lw $tmp3,12($nonce) | ||
1214 | |||
1215 | addu $in0,$tmp0 # accumulate nonce | ||
1216 | sltu $ctx,$in0,$tmp0 # carry into word 1 | ||
1217 | |||
1218 | addu $in1,$tmp1 | ||
1219 | sltu $tmp1,$in1,$tmp1 # carry from adding the nonce word | ||
1220 | addu $in1,$ctx | ||
1221 | sltu $ctx,$in1,$ctx # carry from adding the incoming carry | ||
1222 | addu $ctx,$tmp1 # combined carry into word 2 | ||
1223 | |||
1224 | addu $in2,$tmp2 | ||
1225 | sltu $tmp2,$in2,$tmp2 | ||
1226 | addu $in2,$ctx | ||
1227 | sltu $ctx,$in2,$ctx | ||
1228 | addu $ctx,$tmp2 # combined carry into word 3 | ||
1229 | |||
1230 | addu $in3,$tmp3 | ||
1231 | addu $in3,$ctx # no carry is computed past word 3: the sum is kept to 128 bits | ||
1232 | |||
1233 | srl $tmp0,$in0,8 # write mac value byte by byte, least significant byte of each word first | ||
1234 | srl $tmp1,$in0,16 | ||
1235 | srl $tmp2,$in0,24 | ||
1236 | sb $in0, 0($mac) | ||
1237 | sb $tmp0,1($mac) | ||
1238 | srl $tmp0,$in1,8 | ||
1239 | sb $tmp1,2($mac) | ||
1240 | srl $tmp1,$in1,16 | ||
1241 | sb $tmp2,3($mac) | ||
1242 | srl $tmp2,$in1,24 | ||
1243 | sb $in1, 4($mac) | ||
1244 | sb $tmp0,5($mac) | ||
1245 | srl $tmp0,$in2,8 | ||
1246 | sb $tmp1,6($mac) | ||
1247 | srl $tmp1,$in2,16 | ||
1248 | sb $tmp2,7($mac) | ||
1249 | srl $tmp2,$in2,24 | ||
1250 | sb $in2, 8($mac) | ||
1251 | sb $tmp0,9($mac) | ||
1252 | srl $tmp0,$in3,8 | ||
1253 | sb $tmp1,10($mac) | ||
1254 | srl $tmp1,$in3,16 | ||
1255 | sb $tmp2,11($mac) | ||
1256 | srl $tmp2,$in3,24 | ||
1257 | sb $in3, 12($mac) | ||
1258 | sb $tmp0,13($mac) | ||
1259 | sb $tmp1,14($mac) | ||
1260 | sb $tmp2,15($mac) | ||
1261 | |||
1262 | jr $ra | ||
1263 | .end poly1305_emit | ||
1264 | .rdata | ||
1265 | .asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" | ||
1266 | .align 2 | ||
1267 | ___ | ||
1268 | } | ||
1269 | }}} | ||
1270 | |||
1271 | $output=pop and (open(STDOUT,">$output") or die "cannot open $output: $!"); # last argument, if any, names the output file; fail loudly instead of silently writing to the inherited stdout | ||
1272 | print $code; # emit the accumulated assembly text | ||
1273 | close STDOUT or die "error closing STDOUT: $!"; # enforce flush: buffered write errors only surface at close | ||