aboutsummaryrefslogtreecommitdiffstats
path: root/arch/mips/crypto
diff options
context:
space:
mode:
authorWe-unite <3205135446@qq.com>2025-03-08 22:04:20 +0800
committerWe-unite <3205135446@qq.com>2025-03-08 22:04:20 +0800
commita07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch)
tree84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /arch/mips/crypto
downloadohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.tar.gz
ohosKernel-a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a.zip
Initial commit: OpenHarmony-v4.0-ReleaseOpenHarmony-v4.0-Release
Diffstat (limited to 'arch/mips/crypto')
-rw-r--r--arch/mips/crypto/Makefile24
-rw-r--r--arch/mips/crypto/chacha-core.S497
-rw-r--r--arch/mips/crypto/chacha-glue.c152
-rw-r--r--arch/mips/crypto/crc32-mips.c346
-rw-r--r--arch/mips/crypto/poly1305-glue.c191
-rw-r--r--arch/mips/crypto/poly1305-mips.pl1273
6 files changed, 2483 insertions, 0 deletions
diff --git a/arch/mips/crypto/Makefile b/arch/mips/crypto/Makefile
new file mode 100644
index 000000000..5e4105ccc
--- /dev/null
+++ b/arch/mips/crypto/Makefile
@@ -0,0 +1,24 @@
1# SPDX-License-Identifier: GPL-2.0
2#
3# Makefile for MIPS crypto files..
4#
5
6obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
7
8obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
9chacha-mips-y := chacha-core.o chacha-glue.o
10AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
11
12obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
13poly1305-mips-y := poly1305-core.o poly1305-glue.o
14
15perlasm-flavour-$(CONFIG_32BIT) := o32
16perlasm-flavour-$(CONFIG_64BIT) := 64
17
18quiet_cmd_perlasm = PERLASM $@
19 cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
20
21$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
22 $(call if_changed,perlasm)
23
24targets += poly1305-core.S
diff --git a/arch/mips/crypto/chacha-core.S b/arch/mips/crypto/chacha-core.S
new file mode 100644
index 000000000..5755f69cf
--- /dev/null
+++ b/arch/mips/crypto/chacha-core.S
@@ -0,0 +1,497 @@
1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
4 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5 */
6
7#define MASK_U32 0x3c
8#define CHACHA20_BLOCK_SIZE 64
9#define STACK_SIZE 32
10
11#define X0 $t0
12#define X1 $t1
13#define X2 $t2
14#define X3 $t3
15#define X4 $t4
16#define X5 $t5
17#define X6 $t6
18#define X7 $t7
19#define X8 $t8
20#define X9 $t9
21#define X10 $v1
22#define X11 $s6
23#define X12 $s5
24#define X13 $s4
25#define X14 $s3
26#define X15 $s2
27/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
28#define T0 $s1
29#define T1 $s0
30#define T(n) T ## n
31#define X(n) X ## n
32
33/* Input arguments */
34#define STATE $a0
35#define OUT $a1
36#define IN $a2
37#define BYTES $a3
38
39/* Output argument */
40/* NONCE[0] is kept in a register and not in memory.
41 * We don't want to touch original value in memory.
42 * Must be incremented every loop iteration.
43 */
44#define NONCE_0 $v0
45
46/* SAVED_X and SAVED_CA are set in the jump table.
47 * Use regs which are overwritten on exit else we don't leak clear data.
48 * They are used to handling the last bytes which are not multiple of 4.
49 */
50#define SAVED_X X15
51#define SAVED_CA $s7
52
53#define IS_UNALIGNED $s7
54
55#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
56#define MSB 0
57#define LSB 3
58#define ROTx rotl
59#define ROTR(n) rotr n, 24
60#define CPU_TO_LE32(n) \
61 wsbh n; \
62 rotr n, 16;
63#else
64#define MSB 3
65#define LSB 0
66#define ROTx rotr
67#define CPU_TO_LE32(n)
68#define ROTR(n)
69#endif
70
71#define FOR_EACH_WORD(x) \
72 x( 0); \
73 x( 1); \
74 x( 2); \
75 x( 3); \
76 x( 4); \
77 x( 5); \
78 x( 6); \
79 x( 7); \
80 x( 8); \
81 x( 9); \
82 x(10); \
83 x(11); \
84 x(12); \
85 x(13); \
86 x(14); \
87 x(15);
88
89#define FOR_EACH_WORD_REV(x) \
90 x(15); \
91 x(14); \
92 x(13); \
93 x(12); \
94 x(11); \
95 x(10); \
96 x( 9); \
97 x( 8); \
98 x( 7); \
99 x( 6); \
100 x( 5); \
101 x( 4); \
102 x( 3); \
103 x( 2); \
104 x( 1); \
105 x( 0);
106
107#define PLUS_ONE_0 1
108#define PLUS_ONE_1 2
109#define PLUS_ONE_2 3
110#define PLUS_ONE_3 4
111#define PLUS_ONE_4 5
112#define PLUS_ONE_5 6
113#define PLUS_ONE_6 7
114#define PLUS_ONE_7 8
115#define PLUS_ONE_8 9
116#define PLUS_ONE_9 10
117#define PLUS_ONE_10 11
118#define PLUS_ONE_11 12
119#define PLUS_ONE_12 13
120#define PLUS_ONE_13 14
121#define PLUS_ONE_14 15
122#define PLUS_ONE_15 16
123#define PLUS_ONE(x) PLUS_ONE_ ## x
124#define _CONCAT3(a,b,c) a ## b ## c
125#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
126
127#define STORE_UNALIGNED(x) \
128CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
129 .if (x != 12); \
130 lw T0, (x*4)(STATE); \
131 .endif; \
132 lwl T1, (x*4)+MSB ## (IN); \
133 lwr T1, (x*4)+LSB ## (IN); \
134 .if (x == 12); \
135 addu X ## x, NONCE_0; \
136 .else; \
137 addu X ## x, T0; \
138 .endif; \
139 CPU_TO_LE32(X ## x); \
140 xor X ## x, T1; \
141 swl X ## x, (x*4)+MSB ## (OUT); \
142 swr X ## x, (x*4)+LSB ## (OUT);
143
144#define STORE_ALIGNED(x) \
145CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
146 .if (x != 12); \
147 lw T0, (x*4)(STATE); \
148 .endif; \
149 lw T1, (x*4) ## (IN); \
150 .if (x == 12); \
151 addu X ## x, NONCE_0; \
152 .else; \
153 addu X ## x, T0; \
154 .endif; \
155 CPU_TO_LE32(X ## x); \
156 xor X ## x, T1; \
157 sw X ## x, (x*4) ## (OUT);
158
159/* Jump table macro.
160 * Used for setup and handling the last bytes, which are not multiple of 4.
161 * X15 is free to store Xn
162 * Every jumptable entry must be equal in size.
163 */
164#define JMPTBL_ALIGNED(x) \
165.Lchacha_mips_jmptbl_aligned_ ## x: ; \
166 .set noreorder; \
167 b .Lchacha_mips_xor_aligned_ ## x ## _b; \
168 .if (x == 12); \
169 addu SAVED_X, X ## x, NONCE_0; \
170 .else; \
171 addu SAVED_X, X ## x, SAVED_CA; \
172 .endif; \
173 .set reorder
174
175#define JMPTBL_UNALIGNED(x) \
176.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
177 .set noreorder; \
178 b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
179 .if (x == 12); \
180 addu SAVED_X, X ## x, NONCE_0; \
181 .else; \
182 addu SAVED_X, X ## x, SAVED_CA; \
183 .endif; \
184 .set reorder
185
186#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
187 addu X(A), X(K); \
188 addu X(B), X(L); \
189 addu X(C), X(M); \
190 addu X(D), X(N); \
191 xor X(V), X(A); \
192 xor X(W), X(B); \
193 xor X(Y), X(C); \
194 xor X(Z), X(D); \
195 rotl X(V), S; \
196 rotl X(W), S; \
197 rotl X(Y), S; \
198 rotl X(Z), S;
199
200.text
201.set reorder
202.set noat
203.globl chacha_crypt_arch
204.ent chacha_crypt_arch
205chacha_crypt_arch:
206 .frame $sp, STACK_SIZE, $ra
207
208 /* Load number of rounds */
209 lw $at, 16($sp)
210
211 addiu $sp, -STACK_SIZE
212
213 /* Return bytes = 0. */
214 beqz BYTES, .Lchacha_mips_end
215
216 lw NONCE_0, 48(STATE)
217
218 /* Save s0-s7 */
219 sw $s0, 0($sp)
220 sw $s1, 4($sp)
221 sw $s2, 8($sp)
222 sw $s3, 12($sp)
223 sw $s4, 16($sp)
224 sw $s5, 20($sp)
225 sw $s6, 24($sp)
226 sw $s7, 28($sp)
227
228 /* Test IN or OUT is unaligned.
229 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
230 */
231 or IS_UNALIGNED, IN, OUT
232 andi IS_UNALIGNED, 0x3
233
234 b .Lchacha_rounds_start
235
236.align 4
237.Loop_chacha_rounds:
238 addiu IN, CHACHA20_BLOCK_SIZE
239 addiu OUT, CHACHA20_BLOCK_SIZE
240 addiu NONCE_0, 1
241
242.Lchacha_rounds_start:
243 lw X0, 0(STATE)
244 lw X1, 4(STATE)
245 lw X2, 8(STATE)
246 lw X3, 12(STATE)
247
248 lw X4, 16(STATE)
249 lw X5, 20(STATE)
250 lw X6, 24(STATE)
251 lw X7, 28(STATE)
252 lw X8, 32(STATE)
253 lw X9, 36(STATE)
254 lw X10, 40(STATE)
255 lw X11, 44(STATE)
256
257 move X12, NONCE_0
258 lw X13, 52(STATE)
259 lw X14, 56(STATE)
260 lw X15, 60(STATE)
261
262.Loop_chacha_xor_rounds:
263 addiu $at, -2
264 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
265 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
266 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
267 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
268 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
269 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
270 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
271 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
272 bnez $at, .Loop_chacha_xor_rounds
273
274 addiu BYTES, -(CHACHA20_BLOCK_SIZE)
275
276 /* Is data src/dst unaligned? Jump */
277 bnez IS_UNALIGNED, .Loop_chacha_unaligned
278
279 /* Set number rounds here to fill delayslot. */
280 lw $at, (STACK_SIZE+16)($sp)
281
282 /* BYTES < 0, it has no full block. */
283 bltz BYTES, .Lchacha_mips_no_full_block_aligned
284
285 FOR_EACH_WORD_REV(STORE_ALIGNED)
286
287 /* BYTES > 0? Loop again. */
288 bgtz BYTES, .Loop_chacha_rounds
289
290 /* Place this here to fill delay slot */
291 addiu NONCE_0, 1
292
293 /* BYTES < 0? Handle last bytes */
294 bltz BYTES, .Lchacha_mips_xor_bytes
295
296.Lchacha_mips_xor_done:
297 /* Restore used registers */
298 lw $s0, 0($sp)
299 lw $s1, 4($sp)
300 lw $s2, 8($sp)
301 lw $s3, 12($sp)
302 lw $s4, 16($sp)
303 lw $s5, 20($sp)
304 lw $s6, 24($sp)
305 lw $s7, 28($sp)
306
307 /* Write NONCE_0 back to right location in state */
308 sw NONCE_0, 48(STATE)
309
310.Lchacha_mips_end:
311 addiu $sp, STACK_SIZE
312 jr $ra
313
314.Lchacha_mips_no_full_block_aligned:
315 /* Restore the offset on BYTES */
316 addiu BYTES, CHACHA20_BLOCK_SIZE
317
318 /* Get number of full WORDS */
319 andi $at, BYTES, MASK_U32
320
321 /* Load upper half of jump table addr */
322 lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
323
324 /* Calculate lower half jump table offset */
325 ins T0, $at, 1, 6
326
327 /* Add offset to STATE */
328 addu T1, STATE, $at
329
330 /* Add lower half jump table addr */
331 addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
332
333 /* Read value from STATE */
334 lw SAVED_CA, 0(T1)
335
336 /* Store remaining bytecounter as negative value */
337 subu BYTES, $at, BYTES
338
339 jr T0
340
341 /* Jump table */
342 FOR_EACH_WORD(JMPTBL_ALIGNED)
343
344
345.Loop_chacha_unaligned:
346 /* Set number rounds here to fill delayslot. */
347 lw $at, (STACK_SIZE+16)($sp)
348
349 /* BYTES > 0, it has no full block. */
350 bltz BYTES, .Lchacha_mips_no_full_block_unaligned
351
352 FOR_EACH_WORD_REV(STORE_UNALIGNED)
353
354 /* BYTES > 0? Loop again. */
355 bgtz BYTES, .Loop_chacha_rounds
356
357 /* Write NONCE_0 back to right location in state */
358 sw NONCE_0, 48(STATE)
359
360 .set noreorder
361 /* Fall through to byte handling */
362 bgez BYTES, .Lchacha_mips_xor_done
363.Lchacha_mips_xor_unaligned_0_b:
364.Lchacha_mips_xor_aligned_0_b:
365 /* Place this here to fill delay slot */
366 addiu NONCE_0, 1
367 .set reorder
368
369.Lchacha_mips_xor_bytes:
370 addu IN, $at
371 addu OUT, $at
372 /* First byte */
373 lbu T1, 0(IN)
374 addiu $at, BYTES, 1
375 CPU_TO_LE32(SAVED_X)
376 ROTR(SAVED_X)
377 xor T1, SAVED_X
378 sb T1, 0(OUT)
379 beqz $at, .Lchacha_mips_xor_done
380 /* Second byte */
381 lbu T1, 1(IN)
382 addiu $at, BYTES, 2
383 ROTx SAVED_X, 8
384 xor T1, SAVED_X
385 sb T1, 1(OUT)
386 beqz $at, .Lchacha_mips_xor_done
387 /* Third byte */
388 lbu T1, 2(IN)
389 ROTx SAVED_X, 8
390 xor T1, SAVED_X
391 sb T1, 2(OUT)
392 b .Lchacha_mips_xor_done
393
394.Lchacha_mips_no_full_block_unaligned:
395 /* Restore the offset on BYTES */
396 addiu BYTES, CHACHA20_BLOCK_SIZE
397
398 /* Get number of full WORDS */
399 andi $at, BYTES, MASK_U32
400
401 /* Load upper half of jump table addr */
402 lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
403
404 /* Calculate lower half jump table offset */
405 ins T0, $at, 1, 6
406
407 /* Add offset to STATE */
408 addu T1, STATE, $at
409
410 /* Add lower half jump table addr */
411 addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
412
413 /* Read value from STATE */
414 lw SAVED_CA, 0(T1)
415
416 /* Store remaining bytecounter as negative value */
417 subu BYTES, $at, BYTES
418
419 jr T0
420
421 /* Jump table */
422 FOR_EACH_WORD(JMPTBL_UNALIGNED)
423.end chacha_crypt_arch
424.set at
425
426/* Input arguments
427 * STATE $a0
428 * OUT $a1
429 * NROUND $a2
430 */
431
432#undef X12
433#undef X13
434#undef X14
435#undef X15
436
437#define X12 $a3
438#define X13 $at
439#define X14 $v0
440#define X15 STATE
441
442.set noat
443.globl hchacha_block_arch
444.ent hchacha_block_arch
445hchacha_block_arch:
446 .frame $sp, STACK_SIZE, $ra
447
448 addiu $sp, -STACK_SIZE
449
450 /* Save X11(s6) */
451 sw X11, 0($sp)
452
453 lw X0, 0(STATE)
454 lw X1, 4(STATE)
455 lw X2, 8(STATE)
456 lw X3, 12(STATE)
457 lw X4, 16(STATE)
458 lw X5, 20(STATE)
459 lw X6, 24(STATE)
460 lw X7, 28(STATE)
461 lw X8, 32(STATE)
462 lw X9, 36(STATE)
463 lw X10, 40(STATE)
464 lw X11, 44(STATE)
465 lw X12, 48(STATE)
466 lw X13, 52(STATE)
467 lw X14, 56(STATE)
468 lw X15, 60(STATE)
469
470.Loop_hchacha_xor_rounds:
471 addiu $a2, -2
472 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
473 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
474 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
475 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
476 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
477 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
478 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
479 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
480 bnez $a2, .Loop_hchacha_xor_rounds
481
482 /* Restore used register */
483 lw X11, 0($sp)
484
485 sw X0, 0(OUT)
486 sw X1, 4(OUT)
487 sw X2, 8(OUT)
488 sw X3, 12(OUT)
489 sw X12, 16(OUT)
490 sw X13, 20(OUT)
491 sw X14, 24(OUT)
492 sw X15, 28(OUT)
493
494 addiu $sp, STACK_SIZE
495 jr $ra
496.end hchacha_block_arch
497.set at
diff --git a/arch/mips/crypto/chacha-glue.c b/arch/mips/crypto/chacha-glue.c
new file mode 100644
index 000000000..d1fd23e6e
--- /dev/null
+++ b/arch/mips/crypto/chacha-glue.c
@@ -0,0 +1,152 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * MIPS accelerated ChaCha and XChaCha stream ciphers,
4 * including ChaCha20 (RFC7539)
5 *
6 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
7 */
8
9#include <asm/byteorder.h>
10#include <crypto/algapi.h>
11#include <crypto/internal/chacha.h>
12#include <crypto/internal/skcipher.h>
13#include <linux/kernel.h>
14#include <linux/module.h>
15
16asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
17 unsigned int bytes, int nrounds);
18EXPORT_SYMBOL(chacha_crypt_arch);
19
20asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
21EXPORT_SYMBOL(hchacha_block_arch);
22
23void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
24{
25 chacha_init_generic(state, key, iv);
26}
27EXPORT_SYMBOL(chacha_init_arch);
28
29static int chacha_mips_stream_xor(struct skcipher_request *req,
30 const struct chacha_ctx *ctx, const u8 *iv)
31{
32 struct skcipher_walk walk;
33 u32 state[16];
34 int err;
35
36 err = skcipher_walk_virt(&walk, req, false);
37
38 chacha_init_generic(state, ctx->key, iv);
39
40 while (walk.nbytes > 0) {
41 unsigned int nbytes = walk.nbytes;
42
43 if (nbytes < walk.total)
44 nbytes = round_down(nbytes, walk.stride);
45
46 chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
47 nbytes, ctx->nrounds);
48 err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
49 }
50
51 return err;
52}
53
54static int chacha_mips(struct skcipher_request *req)
55{
56 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
57 struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
58
59 return chacha_mips_stream_xor(req, ctx, req->iv);
60}
61
62static int xchacha_mips(struct skcipher_request *req)
63{
64 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
65 struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
66 struct chacha_ctx subctx;
67 u32 state[16];
68 u8 real_iv[16];
69
70 chacha_init_generic(state, ctx->key, req->iv);
71
72 hchacha_block(state, subctx.key, ctx->nrounds);
73 subctx.nrounds = ctx->nrounds;
74
75 memcpy(&real_iv[0], req->iv + 24, 8);
76 memcpy(&real_iv[8], req->iv + 16, 8);
77 return chacha_mips_stream_xor(req, &subctx, real_iv);
78}
79
80static struct skcipher_alg algs[] = {
81 {
82 .base.cra_name = "chacha20",
83 .base.cra_driver_name = "chacha20-mips",
84 .base.cra_priority = 200,
85 .base.cra_blocksize = 1,
86 .base.cra_ctxsize = sizeof(struct chacha_ctx),
87 .base.cra_module = THIS_MODULE,
88
89 .min_keysize = CHACHA_KEY_SIZE,
90 .max_keysize = CHACHA_KEY_SIZE,
91 .ivsize = CHACHA_IV_SIZE,
92 .chunksize = CHACHA_BLOCK_SIZE,
93 .setkey = chacha20_setkey,
94 .encrypt = chacha_mips,
95 .decrypt = chacha_mips,
96 }, {
97 .base.cra_name = "xchacha20",
98 .base.cra_driver_name = "xchacha20-mips",
99 .base.cra_priority = 200,
100 .base.cra_blocksize = 1,
101 .base.cra_ctxsize = sizeof(struct chacha_ctx),
102 .base.cra_module = THIS_MODULE,
103
104 .min_keysize = CHACHA_KEY_SIZE,
105 .max_keysize = CHACHA_KEY_SIZE,
106 .ivsize = XCHACHA_IV_SIZE,
107 .chunksize = CHACHA_BLOCK_SIZE,
108 .setkey = chacha20_setkey,
109 .encrypt = xchacha_mips,
110 .decrypt = xchacha_mips,
111 }, {
112 .base.cra_name = "xchacha12",
113 .base.cra_driver_name = "xchacha12-mips",
114 .base.cra_priority = 200,
115 .base.cra_blocksize = 1,
116 .base.cra_ctxsize = sizeof(struct chacha_ctx),
117 .base.cra_module = THIS_MODULE,
118
119 .min_keysize = CHACHA_KEY_SIZE,
120 .max_keysize = CHACHA_KEY_SIZE,
121 .ivsize = XCHACHA_IV_SIZE,
122 .chunksize = CHACHA_BLOCK_SIZE,
123 .setkey = chacha12_setkey,
124 .encrypt = xchacha_mips,
125 .decrypt = xchacha_mips,
126 }
127};
128
129static int __init chacha_simd_mod_init(void)
130{
131 return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
132 crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
133}
134
135static void __exit chacha_simd_mod_fini(void)
136{
137 if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER))
138 crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
139}
140
141module_init(chacha_simd_mod_init);
142module_exit(chacha_simd_mod_fini);
143
144MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
145MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
146MODULE_LICENSE("GPL v2");
147MODULE_ALIAS_CRYPTO("chacha20");
148MODULE_ALIAS_CRYPTO("chacha20-mips");
149MODULE_ALIAS_CRYPTO("xchacha20");
150MODULE_ALIAS_CRYPTO("xchacha20-mips");
151MODULE_ALIAS_CRYPTO("xchacha12");
152MODULE_ALIAS_CRYPTO("xchacha12-mips");
diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
new file mode 100644
index 000000000..faa88a6a7
--- /dev/null
+++ b/arch/mips/crypto/crc32-mips.c
@@ -0,0 +1,346 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions
4 *
5 * Module based on arm64/crypto/crc32-arm.c
6 *
7 * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
8 * Copyright (C) 2018 MIPS Tech, LLC
9 */
10
11#include <linux/unaligned/access_ok.h>
12#include <linux/cpufeature.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/string.h>
17#include <asm/mipsregs.h>
18
19#include <crypto/internal/hash.h>
20
21enum crc_op_size {
22 b, h, w, d,
23};
24
25enum crc_type {
26 crc32,
27 crc32c,
28};
29
30#ifndef TOOLCHAIN_SUPPORTS_CRC
31#define _ASM_MACRO_CRC32(OP, SZ, TYPE) \
32_ASM_MACRO_3R(OP, rt, rs, rt2, \
33 ".ifnc \\rt, \\rt2\n\t" \
34 ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \
35 ".endif\n\t" \
36 _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \
37 ((SZ) << 6) | ((TYPE) << 8)) \
38 _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \
39 ((SZ) << 14) | ((TYPE) << 3)))
40_ASM_MACRO_CRC32(crc32b, 0, 0);
41_ASM_MACRO_CRC32(crc32h, 1, 0);
42_ASM_MACRO_CRC32(crc32w, 2, 0);
43_ASM_MACRO_CRC32(crc32d, 3, 0);
44_ASM_MACRO_CRC32(crc32cb, 0, 1);
45_ASM_MACRO_CRC32(crc32ch, 1, 1);
46_ASM_MACRO_CRC32(crc32cw, 2, 1);
47_ASM_MACRO_CRC32(crc32cd, 3, 1);
48#define _ASM_SET_CRC ""
49#else /* !TOOLCHAIN_SUPPORTS_CRC */
50#define _ASM_SET_CRC ".set\tcrc\n\t"
51#endif
52
53#define _CRC32(crc, value, size, type) \
54do { \
55 __asm__ __volatile__( \
56 ".set push\n\t" \
57 _ASM_SET_CRC \
58 #type #size " %0, %1, %0\n\t" \
59 ".set pop" \
60 : "+r" (crc) \
61 : "r" (value)); \
62} while (0)
63
64#define CRC32(crc, value, size) \
65 _CRC32(crc, value, size, crc32)
66
67#define CRC32C(crc, value, size) \
68 _CRC32(crc, value, size, crc32c)
69
70static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
71{
72 u32 crc = crc_;
73
74#ifdef CONFIG_64BIT
75 while (len >= sizeof(u64)) {
76 u64 value = get_unaligned_le64(p);
77
78 CRC32(crc, value, d);
79 p += sizeof(u64);
80 len -= sizeof(u64);
81 }
82
83 if (len & sizeof(u32)) {
84#else /* !CONFIG_64BIT */
85 while (len >= sizeof(u32)) {
86#endif
87 u32 value = get_unaligned_le32(p);
88
89 CRC32(crc, value, w);
90 p += sizeof(u32);
91 len -= sizeof(u32);
92 }
93
94 if (len & sizeof(u16)) {
95 u16 value = get_unaligned_le16(p);
96
97 CRC32(crc, value, h);
98 p += sizeof(u16);
99 }
100
101 if (len & sizeof(u8)) {
102 u8 value = *p++;
103
104 CRC32(crc, value, b);
105 }
106
107 return crc;
108}
109
110static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
111{
112 u32 crc = crc_;
113
114#ifdef CONFIG_64BIT
115 while (len >= sizeof(u64)) {
116 u64 value = get_unaligned_le64(p);
117
118 CRC32C(crc, value, d);
119 p += sizeof(u64);
120 len -= sizeof(u64);
121 }
122
123 if (len & sizeof(u32)) {
124#else /* !CONFIG_64BIT */
125 while (len >= sizeof(u32)) {
126#endif
127 u32 value = get_unaligned_le32(p);
128
129 CRC32C(crc, value, w);
130 p += sizeof(u32);
131 len -= sizeof(u32);
132 }
133
134 if (len & sizeof(u16)) {
135 u16 value = get_unaligned_le16(p);
136
137 CRC32C(crc, value, h);
138 p += sizeof(u16);
139 }
140
141 if (len & sizeof(u8)) {
142 u8 value = *p++;
143
144 CRC32C(crc, value, b);
145 }
146 return crc;
147}
148
149#define CHKSUM_BLOCK_SIZE 1
150#define CHKSUM_DIGEST_SIZE 4
151
152struct chksum_ctx {
153 u32 key;
154};
155
156struct chksum_desc_ctx {
157 u32 crc;
158};
159
160static int chksum_init(struct shash_desc *desc)
161{
162 struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
163 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
164
165 ctx->crc = mctx->key;
166
167 return 0;
168}
169
170/*
171 * Setting the seed allows arbitrary accumulators and flexible XOR policy
172 * If your algorithm starts with ~0, then XOR with ~0 before you set
173 * the seed.
174 */
175static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
176 unsigned int keylen)
177{
178 struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
179
180 if (keylen != sizeof(mctx->key))
181 return -EINVAL;
182 mctx->key = get_unaligned_le32(key);
183 return 0;
184}
185
186static int chksum_update(struct shash_desc *desc, const u8 *data,
187 unsigned int length)
188{
189 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
190
191 ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);
192 return 0;
193}
194
195static int chksumc_update(struct shash_desc *desc, const u8 *data,
196 unsigned int length)
197{
198 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
199
200 ctx->crc = crc32c_mips_le_hw(ctx->crc, data, length);
201 return 0;
202}
203
204static int chksum_final(struct shash_desc *desc, u8 *out)
205{
206 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
207
208 put_unaligned_le32(ctx->crc, out);
209 return 0;
210}
211
212static int chksumc_final(struct shash_desc *desc, u8 *out)
213{
214 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
215
216 put_unaligned_le32(~ctx->crc, out);
217 return 0;
218}
219
220static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
221{
222 put_unaligned_le32(crc32_mips_le_hw(crc, data, len), out);
223 return 0;
224}
225
226static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
227{
228 put_unaligned_le32(~crc32c_mips_le_hw(crc, data, len), out);
229 return 0;
230}
231
232static int chksum_finup(struct shash_desc *desc, const u8 *data,
233 unsigned int len, u8 *out)
234{
235 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
236
237 return __chksum_finup(ctx->crc, data, len, out);
238}
239
240static int chksumc_finup(struct shash_desc *desc, const u8 *data,
241 unsigned int len, u8 *out)
242{
243 struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
244
245 return __chksumc_finup(ctx->crc, data, len, out);
246}
247
248static int chksum_digest(struct shash_desc *desc, const u8 *data,
249 unsigned int length, u8 *out)
250{
251 struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
252
253 return __chksum_finup(mctx->key, data, length, out);
254}
255
256static int chksumc_digest(struct shash_desc *desc, const u8 *data,
257 unsigned int length, u8 *out)
258{
259 struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
260
261 return __chksumc_finup(mctx->key, data, length, out);
262}
263
264static int chksum_cra_init(struct crypto_tfm *tfm)
265{
266 struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
267
268 mctx->key = ~0;
269 return 0;
270}
271
272static struct shash_alg crc32_alg = {
273 .digestsize = CHKSUM_DIGEST_SIZE,
274 .setkey = chksum_setkey,
275 .init = chksum_init,
276 .update = chksum_update,
277 .final = chksum_final,
278 .finup = chksum_finup,
279 .digest = chksum_digest,
280 .descsize = sizeof(struct chksum_desc_ctx),
281 .base = {
282 .cra_name = "crc32",
283 .cra_driver_name = "crc32-mips-hw",
284 .cra_priority = 300,
285 .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
286 .cra_blocksize = CHKSUM_BLOCK_SIZE,
287 .cra_alignmask = 0,
288 .cra_ctxsize = sizeof(struct chksum_ctx),
289 .cra_module = THIS_MODULE,
290 .cra_init = chksum_cra_init,
291 }
292};
293
294static struct shash_alg crc32c_alg = {
295 .digestsize = CHKSUM_DIGEST_SIZE,
296 .setkey = chksum_setkey,
297 .init = chksum_init,
298 .update = chksumc_update,
299 .final = chksumc_final,
300 .finup = chksumc_finup,
301 .digest = chksumc_digest,
302 .descsize = sizeof(struct chksum_desc_ctx),
303 .base = {
304 .cra_name = "crc32c",
305 .cra_driver_name = "crc32c-mips-hw",
306 .cra_priority = 300,
307 .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
308 .cra_blocksize = CHKSUM_BLOCK_SIZE,
309 .cra_alignmask = 0,
310 .cra_ctxsize = sizeof(struct chksum_ctx),
311 .cra_module = THIS_MODULE,
312 .cra_init = chksum_cra_init,
313 }
314};
315
316static int __init crc32_mod_init(void)
317{
318 int err;
319
320 err = crypto_register_shash(&crc32_alg);
321
322 if (err)
323 return err;
324
325 err = crypto_register_shash(&crc32c_alg);
326
327 if (err) {
328 crypto_unregister_shash(&crc32_alg);
329 return err;
330 }
331
332 return 0;
333}
334
335static void __exit crc32_mod_exit(void)
336{
337 crypto_unregister_shash(&crc32_alg);
338 crypto_unregister_shash(&crc32c_alg);
339}
340
341MODULE_AUTHOR("Marcin Nowakowski <marcin.nowakowski@mips.com");
342MODULE_DESCRIPTION("CRC32 and CRC32C using optional MIPS instructions");
343MODULE_LICENSE("GPL v2");
344
345module_cpu_feature_match(MIPS_CRC32, crc32_mod_init);
346module_exit(crc32_mod_exit);
diff --git a/arch/mips/crypto/poly1305-glue.c b/arch/mips/crypto/poly1305-glue.c
new file mode 100644
index 000000000..bc6110fb9
--- /dev/null
+++ b/arch/mips/crypto/poly1305-glue.c
@@ -0,0 +1,191 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
4 *
5 * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <asm/unaligned.h>
9#include <crypto/algapi.h>
10#include <crypto/internal/hash.h>
11#include <crypto/internal/poly1305.h>
12#include <linux/cpufeature.h>
13#include <linux/crypto.h>
14#include <linux/module.h>
15
16asmlinkage void poly1305_init_mips(void *state, const u8 *key);
17asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
18asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
19
20void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
21{
22 poly1305_init_mips(&dctx->h, key);
23 dctx->s[0] = get_unaligned_le32(key + 16);
24 dctx->s[1] = get_unaligned_le32(key + 20);
25 dctx->s[2] = get_unaligned_le32(key + 24);
26 dctx->s[3] = get_unaligned_le32(key + 28);
27 dctx->buflen = 0;
28}
29EXPORT_SYMBOL(poly1305_init_arch);
30
31static int mips_poly1305_init(struct shash_desc *desc)
32{
33 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
34
35 dctx->buflen = 0;
36 dctx->rset = 0;
37 dctx->sset = false;
38
39 return 0;
40}
41
42static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
43 u32 len, u32 hibit)
44{
45 if (unlikely(!dctx->sset)) {
46 if (!dctx->rset) {
47 poly1305_init_mips(&dctx->h, src);
48 src += POLY1305_BLOCK_SIZE;
49 len -= POLY1305_BLOCK_SIZE;
50 dctx->rset = 1;
51 }
52 if (len >= POLY1305_BLOCK_SIZE) {
53 dctx->s[0] = get_unaligned_le32(src + 0);
54 dctx->s[1] = get_unaligned_le32(src + 4);
55 dctx->s[2] = get_unaligned_le32(src + 8);
56 dctx->s[3] = get_unaligned_le32(src + 12);
57 src += POLY1305_BLOCK_SIZE;
58 len -= POLY1305_BLOCK_SIZE;
59 dctx->sset = true;
60 }
61 if (len < POLY1305_BLOCK_SIZE)
62 return;
63 }
64
65 len &= ~(POLY1305_BLOCK_SIZE - 1);
66
67 poly1305_blocks_mips(&dctx->h, src, len, hibit);
68}
69
70static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
71 unsigned int len)
72{
73 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
74
75 if (unlikely(dctx->buflen)) {
76 u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
77
78 memcpy(dctx->buf + dctx->buflen, src, bytes);
79 src += bytes;
80 len -= bytes;
81 dctx->buflen += bytes;
82
83 if (dctx->buflen == POLY1305_BLOCK_SIZE) {
84 mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
85 dctx->buflen = 0;
86 }
87 }
88
89 if (likely(len >= POLY1305_BLOCK_SIZE)) {
90 mips_poly1305_blocks(dctx, src, len, 1);
91 src += round_down(len, POLY1305_BLOCK_SIZE);
92 len %= POLY1305_BLOCK_SIZE;
93 }
94
95 if (unlikely(len)) {
96 dctx->buflen = len;
97 memcpy(dctx->buf, src, len);
98 }
99 return 0;
100}
101
102void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
103 unsigned int nbytes)
104{
105 if (unlikely(dctx->buflen)) {
106 u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
107
108 memcpy(dctx->buf + dctx->buflen, src, bytes);
109 src += bytes;
110 nbytes -= bytes;
111 dctx->buflen += bytes;
112
113 if (dctx->buflen == POLY1305_BLOCK_SIZE) {
114 poly1305_blocks_mips(&dctx->h, dctx->buf,
115 POLY1305_BLOCK_SIZE, 1);
116 dctx->buflen = 0;
117 }
118 }
119
120 if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
121 unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
122
123 poly1305_blocks_mips(&dctx->h, src, len, 1);
124 src += len;
125 nbytes %= POLY1305_BLOCK_SIZE;
126 }
127
128 if (unlikely(nbytes)) {
129 dctx->buflen = nbytes;
130 memcpy(dctx->buf, src, nbytes);
131 }
132}
133EXPORT_SYMBOL(poly1305_update_arch);
134
135void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
136{
137 if (unlikely(dctx->buflen)) {
138 dctx->buf[dctx->buflen++] = 1;
139 memset(dctx->buf + dctx->buflen, 0,
140 POLY1305_BLOCK_SIZE - dctx->buflen);
141 poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
142 }
143
144 poly1305_emit_mips(&dctx->h, dst, dctx->s);
145 *dctx = (struct poly1305_desc_ctx){};
146}
147EXPORT_SYMBOL(poly1305_final_arch);
148
149static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
150{
151 struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
152
153 if (unlikely(!dctx->sset))
154 return -ENOKEY;
155
156 poly1305_final_arch(dctx, dst);
157 return 0;
158}
159
160static struct shash_alg mips_poly1305_alg = {
161 .init = mips_poly1305_init,
162 .update = mips_poly1305_update,
163 .final = mips_poly1305_final,
164 .digestsize = POLY1305_DIGEST_SIZE,
165 .descsize = sizeof(struct poly1305_desc_ctx),
166
167 .base.cra_name = "poly1305",
168 .base.cra_driver_name = "poly1305-mips",
169 .base.cra_priority = 200,
170 .base.cra_blocksize = POLY1305_BLOCK_SIZE,
171 .base.cra_module = THIS_MODULE,
172};
173
174static int __init mips_poly1305_mod_init(void)
175{
176 return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
177 crypto_register_shash(&mips_poly1305_alg) : 0;
178}
179
180static void __exit mips_poly1305_mod_exit(void)
181{
182 if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
183 crypto_unregister_shash(&mips_poly1305_alg);
184}
185
186module_init(mips_poly1305_mod_init);
187module_exit(mips_poly1305_mod_exit);
188
189MODULE_LICENSE("GPL v2");
190MODULE_ALIAS_CRYPTO("poly1305");
191MODULE_ALIAS_CRYPTO("poly1305-mips");
diff --git a/arch/mips/crypto/poly1305-mips.pl b/arch/mips/crypto/poly1305-mips.pl
new file mode 100644
index 000000000..b05bab884
--- /dev/null
+++ b/arch/mips/crypto/poly1305-mips.pl
@@ -0,0 +1,1273 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
# project.
# ====================================================================

# Poly1305 hash for MIPS.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc
# R1x000 ~5.5/+130% (big-endian)
# Octeon II 2.50/+70% (little-endian)
#
# March 2019
#
# Add 32-bit code path.
#
# October 2019
#
# Modulo-scheduling reduction allows to omit dependency chain at the
# end of inner loop and improve performance. Also optimize MIPS32R2
# code path for MIPS 1004K core. Per René von Dorst's suggestions.
#
# IALU/gcc
# R1x000 ~9.8/? (big-endian)
# Octeon II 3.65/+140% (little-endian)
# MT7621/1004K 4.75/? (little-endian)
#
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
# Symbolic register names ($a0, $s0, ...) are bound here to hardware
# register numbers once, so the code below stays ABI-neutral.
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# Output flavour, taken from the first command-line argument (the
# kernel Makefile passes o32 for 32-bit builds and 64 for 64-bit).
$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64

# NUBI returns in $a0; every other ABI handled here returns in $t0.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
if ($flavour =~ /64|n32/i) {{{
######################################################################
# 64-bit code path
#

# Incoming argument registers: context, input pointer, length, pad bit.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

# poly1305_init(ctx, key): zero the three 64-bit accumulator limbs; if
# key != NULL, load r as two little-endian 64-bit words (handling
# unaligned input either via dsrlv/dsllv merging on R6, which dropped
# ldl/ldr, or via ldl/ldr elsewhere), byte-swap on big-endian, clamp r
# with the 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc masks, and
# precompute s1 = r1 + (r1 >> 2) for the 5*r1 reduction trick.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
     defined(_MIPS_ARCH_MIPS64R6)) \\
     && !defined(_MIPS_ARCH_MIPS64R2)
# define _MIPS_ARCH_MIPS64R2
#endif

#if defined(_MIPS_ARCH_MIPS64R6)
# define dmultu(rs,rt)
# define mflo(rd,rs,rt) dmulu rd,rs,rt
# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
#else
# define dmultu(rs,rt) dmultu rs,rt
# define mflo(rd,rs,rt) mflo rd
# define mfhi(rd,rs,rt) mfhi rd
#endif

#ifdef __KERNEL__
# define poly1305_init poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set noat
.set noreorder

.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
	.frame $sp,0,$ra
	.set reorder

	sd $zero,0($ctx)
	sd $zero,8($ctx)
	sd $zero,16($ctx)

	beqz $inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	andi $tmp0,$inp,7 # $inp % 8
	dsubu $inp,$inp,$tmp0 # align $inp
	sll $tmp0,$tmp0,3 # byte to bit offset
	ld $in0,0($inp)
	ld $in1,8($inp)
	beqz $tmp0,.Laligned_key
	ld $tmp2,16($inp)

	subu $tmp1,$zero,$tmp0
# ifdef MIPSEB
	dsllv $in0,$in0,$tmp0
	dsrlv $tmp3,$in1,$tmp1
	dsllv $in1,$in1,$tmp0
	dsrlv $tmp2,$tmp2,$tmp1
# else
	dsrlv $in0,$in0,$tmp0
	dsllv $tmp3,$in1,$tmp1
	dsrlv $in1,$in1,$tmp0
	dsllv $tmp2,$tmp2,$tmp1
# endif
	or $in0,$in0,$tmp3
	or $in1,$in1,$tmp2
.Laligned_key:
#else
	ldl $in0,0+MSB($inp)
	ldl $in1,8+MSB($inp)
	ldr $in0,0+LSB($inp)
	ldr $in1,8+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh $in0,$in0 # byte swap
	dsbh $in1,$in1
	dshd $in0,$in0
	dshd $in1,$in1
# else
	ori $tmp0,$zero,0xFF
	dsll $tmp2,$tmp0,32
	or $tmp0,$tmp2 # 0x000000FF000000FF

	and $tmp1,$in0,$tmp0 # byte swap
	and $tmp3,$in1,$tmp0
	dsrl $tmp2,$in0,24
	dsrl $tmp4,$in1,24
	dsll $tmp1,24
	dsll $tmp3,24
	and $tmp2,$tmp0
	and $tmp4,$tmp0
	dsll $tmp0,8 # 0x0000FF000000FF00
	or $tmp1,$tmp2
	or $tmp3,$tmp4
	and $tmp2,$in0,$tmp0
	and $tmp4,$in1,$tmp0
	dsrl $in0,8
	dsrl $in1,8
	dsll $tmp2,8
	dsll $tmp4,8
	and $in0,$tmp0
	and $in1,$tmp0
	or $tmp1,$tmp2
	or $tmp3,$tmp4
	or $in0,$tmp1
	or $in1,$tmp3
	dsrl $tmp1,$in0,32
	dsrl $tmp3,$in1,32
	dsll $in0,32
	dsll $in1,32
	or $in0,$tmp1
	or $in1,$tmp3
# endif
#endif
	li $tmp0,1
	dsll $tmp0,32 # 0x0000000100000000
	daddiu $tmp0,-63 # 0x00000000ffffffc1
	dsll $tmp0,28 # 0x0ffffffc10000000
	daddiu $tmp0,-1 # 0x0ffffffc0fffffff

	and $in0,$tmp0
	daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
	and $in1,$tmp0

	sd $in0,24($ctx)
	dsrl $tmp0,$in1,2
	sd $in1,32($ctx)
	daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
	sd $tmp0,40($ctx)

.Lno_key:
	li $v0,0 # return 0
	jr $ra
.end poly1305_init
___
223{
224my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
225
226my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
227 ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
228my ($shr,$shl) = ($s6,$s7); # used on R6
229
230$code.=<<___;
231.align 5
232.globl poly1305_blocks
233.ent poly1305_blocks
234poly1305_blocks:
235 .set noreorder
236 dsrl $len,4 # number of complete blocks
237 bnez $len,poly1305_blocks_internal
238 nop
239 jr $ra
240 nop
241.end poly1305_blocks
242
243.align 5
244.ent poly1305_blocks_internal
245poly1305_blocks_internal:
246 .set noreorder
247#if defined(_MIPS_ARCH_MIPS64R6)
248 .frame $sp,8*8,$ra
249 .mask $SAVED_REGS_MASK|0x000c0000,-8
250 dsubu $sp,8*8
251 sd $s7,56($sp)
252 sd $s6,48($sp)
253#else
254 .frame $sp,6*8,$ra
255 .mask $SAVED_REGS_MASK,-8
256 dsubu $sp,6*8
257#endif
258 sd $s5,40($sp)
259 sd $s4,32($sp)
260___
261$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
262 sd $s3,24($sp)
263 sd $s2,16($sp)
264 sd $s1,8($sp)
265 sd $s0,0($sp)
266___
267$code.=<<___;
268 .set reorder
269
270#if defined(_MIPS_ARCH_MIPS64R6)
271 andi $shr,$inp,7
272 dsubu $inp,$inp,$shr # align $inp
273 sll $shr,$shr,3 # byte to bit offset
274 subu $shl,$zero,$shr
275#endif
276
277 ld $h0,0($ctx) # load hash value
278 ld $h1,8($ctx)
279 ld $h2,16($ctx)
280
281 ld $r0,24($ctx) # load key
282 ld $r1,32($ctx)
283 ld $rs1,40($ctx)
284
285 dsll $len,4
286 daddu $len,$inp # end of buffer
287 b .Loop
288
289.align 4
290.Loop:
291#if defined(_MIPS_ARCH_MIPS64R6)
292 ld $in0,0($inp) # load input
293 ld $in1,8($inp)
294 beqz $shr,.Laligned_inp
295
296 ld $tmp2,16($inp)
297# ifdef MIPSEB
298 dsllv $in0,$in0,$shr
299 dsrlv $tmp3,$in1,$shl
300 dsllv $in1,$in1,$shr
301 dsrlv $tmp2,$tmp2,$shl
302# else
303 dsrlv $in0,$in0,$shr
304 dsllv $tmp3,$in1,$shl
305 dsrlv $in1,$in1,$shr
306 dsllv $tmp2,$tmp2,$shl
307# endif
308 or $in0,$in0,$tmp3
309 or $in1,$in1,$tmp2
310.Laligned_inp:
311#else
312 ldl $in0,0+MSB($inp) # load input
313 ldl $in1,8+MSB($inp)
314 ldr $in0,0+LSB($inp)
315 ldr $in1,8+LSB($inp)
316#endif
317 daddiu $inp,16
318#ifdef MIPSEB
319# if defined(_MIPS_ARCH_MIPS64R2)
320 dsbh $in0,$in0 # byte swap
321 dsbh $in1,$in1
322 dshd $in0,$in0
323 dshd $in1,$in1
324# else
325 ori $tmp0,$zero,0xFF
326 dsll $tmp2,$tmp0,32
327 or $tmp0,$tmp2 # 0x000000FF000000FF
328
329 and $tmp1,$in0,$tmp0 # byte swap
330 and $tmp3,$in1,$tmp0
331 dsrl $tmp2,$in0,24
332 dsrl $tmp4,$in1,24
333 dsll $tmp1,24
334 dsll $tmp3,24
335 and $tmp2,$tmp0
336 and $tmp4,$tmp0
337 dsll $tmp0,8 # 0x0000FF000000FF00
338 or $tmp1,$tmp2
339 or $tmp3,$tmp4
340 and $tmp2,$in0,$tmp0
341 and $tmp4,$in1,$tmp0
342 dsrl $in0,8
343 dsrl $in1,8
344 dsll $tmp2,8
345 dsll $tmp4,8
346 and $in0,$tmp0
347 and $in1,$tmp0
348 or $tmp1,$tmp2
349 or $tmp3,$tmp4
350 or $in0,$tmp1
351 or $in1,$tmp3
352 dsrl $tmp1,$in0,32
353 dsrl $tmp3,$in1,32
354 dsll $in0,32
355 dsll $in1,32
356 or $in0,$tmp1
357 or $in1,$tmp3
358# endif
359#endif
360 dsrl $tmp1,$h2,2 # modulo-scheduled reduction
361 andi $h2,$h2,3
362 dsll $tmp0,$tmp1,2
363
364 daddu $d0,$h0,$in0 # accumulate input
365 daddu $tmp1,$tmp0
366 sltu $tmp0,$d0,$h0
367 daddu $d0,$d0,$tmp1 # ... and residue
368 sltu $tmp1,$d0,$tmp1
369 daddu $d1,$h1,$in1
370 daddu $tmp0,$tmp1
371 sltu $tmp1,$d1,$h1
372 daddu $d1,$tmp0
373
374 dmultu ($r0,$d0) # h0*r0
375 daddu $d2,$h2,$padbit
376 sltu $tmp0,$d1,$tmp0
377 mflo ($h0,$r0,$d0)
378 mfhi ($h1,$r0,$d0)
379
380 dmultu ($rs1,$d1) # h1*5*r1
381 daddu $d2,$tmp1
382 daddu $d2,$tmp0
383 mflo ($tmp0,$rs1,$d1)
384 mfhi ($tmp1,$rs1,$d1)
385
386 dmultu ($r1,$d0) # h0*r1
387 mflo ($tmp2,$r1,$d0)
388 mfhi ($h2,$r1,$d0)
389 daddu $h0,$tmp0
390 daddu $h1,$tmp1
391 sltu $tmp0,$h0,$tmp0
392
393 dmultu ($r0,$d1) # h1*r0
394 daddu $h1,$tmp0
395 daddu $h1,$tmp2
396 mflo ($tmp0,$r0,$d1)
397 mfhi ($tmp1,$r0,$d1)
398
399 dmultu ($rs1,$d2) # h2*5*r1
400 sltu $tmp2,$h1,$tmp2
401 daddu $h2,$tmp2
402 mflo ($tmp2,$rs1,$d2)
403
404 dmultu ($r0,$d2) # h2*r0
405 daddu $h1,$tmp0
406 daddu $h2,$tmp1
407 mflo ($tmp3,$r0,$d2)
408 sltu $tmp0,$h1,$tmp0
409 daddu $h2,$tmp0
410
411 daddu $h1,$tmp2
412 sltu $tmp2,$h1,$tmp2
413 daddu $h2,$tmp2
414 daddu $h2,$tmp3
415
416 bne $inp,$len,.Loop
417
418 sd $h0,0($ctx) # store hash value
419 sd $h1,8($ctx)
420 sd $h2,16($ctx)
421
422 .set noreorder
423#if defined(_MIPS_ARCH_MIPS64R6)
424 ld $s7,56($sp)
425 ld $s6,48($sp)
426#endif
427 ld $s5,40($sp) # epilogue
428 ld $s4,32($sp)
429___
430$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
431 ld $s3,24($sp)
432 ld $s2,16($sp)
433 ld $s1,8($sp)
434 ld $s0,0($sp)
435___
436$code.=<<___;
437 jr $ra
438#if defined(_MIPS_ARCH_MIPS64R6)
439 daddu $sp,8*8
440#else
441 daddu $sp,6*8
442#endif
443.end poly1305_blocks_internal
444___
445}
446{
447my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
448
449$code.=<<___;
450.align 5
451.globl poly1305_emit
452.ent poly1305_emit
453poly1305_emit:
454 .frame $sp,0,$ra
455 .set reorder
456
457 ld $tmp2,16($ctx)
458 ld $tmp0,0($ctx)
459 ld $tmp1,8($ctx)
460
461 li $in0,-4 # final reduction
462 dsrl $in1,$tmp2,2
463 and $in0,$tmp2
464 andi $tmp2,$tmp2,3
465 daddu $in0,$in1
466
467 daddu $tmp0,$tmp0,$in0
468 sltu $in1,$tmp0,$in0
469 daddiu $in0,$tmp0,5 # compare to modulus
470 daddu $tmp1,$tmp1,$in1
471 sltiu $tmp3,$in0,5
472 sltu $tmp4,$tmp1,$in1
473 daddu $in1,$tmp1,$tmp3
474 daddu $tmp2,$tmp2,$tmp4
475 sltu $tmp3,$in1,$tmp3
476 daddu $tmp2,$tmp2,$tmp3
477
478 dsrl $tmp2,2 # see if it carried/borrowed
479 dsubu $tmp2,$zero,$tmp2
480
481 xor $in0,$tmp0
482 xor $in1,$tmp1
483 and $in0,$tmp2
484 and $in1,$tmp2
485 xor $in0,$tmp0
486 xor $in1,$tmp1
487
488 lwu $tmp0,0($nonce) # load nonce
489 lwu $tmp1,4($nonce)
490 lwu $tmp2,8($nonce)
491 lwu $tmp3,12($nonce)
492 dsll $tmp1,32
493 dsll $tmp3,32
494 or $tmp0,$tmp1
495 or $tmp2,$tmp3
496
497 daddu $in0,$tmp0 # accumulate nonce
498 daddu $in1,$tmp2
499 sltu $tmp0,$in0,$tmp0
500 daddu $in1,$tmp0
501
502 dsrl $tmp0,$in0,8 # write mac value
503 dsrl $tmp1,$in0,16
504 dsrl $tmp2,$in0,24
505 sb $in0,0($mac)
506 dsrl $tmp3,$in0,32
507 sb $tmp0,1($mac)
508 dsrl $tmp0,$in0,40
509 sb $tmp1,2($mac)
510 dsrl $tmp1,$in0,48
511 sb $tmp2,3($mac)
512 dsrl $tmp2,$in0,56
513 sb $tmp3,4($mac)
514 dsrl $tmp3,$in1,8
515 sb $tmp0,5($mac)
516 dsrl $tmp0,$in1,16
517 sb $tmp1,6($mac)
518 dsrl $tmp1,$in1,24
519 sb $tmp2,7($mac)
520
521 sb $in1,8($mac)
522 dsrl $tmp2,$in1,32
523 sb $tmp3,9($mac)
524 dsrl $tmp3,$in1,40
525 sb $tmp0,10($mac)
526 dsrl $tmp0,$in1,48
527 sb $tmp1,11($mac)
528 dsrl $tmp1,$in1,56
529 sb $tmp2,12($mac)
530 sb $tmp3,13($mac)
531 sb $tmp0,14($mac)
532 sb $tmp1,15($mac)
533
534 jr $ra
535.end poly1305_emit
536.rdata
537.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
538.align 2
539___
540}
}}} else {{{
######################################################################
# 32-bit code path
#

# Incoming argument registers: context, input pointer, length, pad bit.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);

# poly1305_init(ctx, key), 32-bit variant: zero the five 32-bit
# accumulator limbs; if key != NULL, load r as four little-endian
# 32-bit words (srlv/sllv merging on R6, lwl/lwr elsewhere), byte-swap
# on big-endian, clamp (r0 with 0x0fffffff, r1-r3 with 0x0ffffffc),
# and precompute sN = rN + (rN >> 2) for N = 1..3.
$code.=<<___;
#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
     defined(_MIPS_ARCH_MIPS32R6)) \\
     && !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
#endif

#if defined(_MIPS_ARCH_MIPS32R6)
# define multu(rs,rt)
# define mflo(rd,rs,rt) mulu rd,rs,rt
# define mfhi(rd,rs,rt) muhu rd,rs,rt
#else
# define multu(rs,rt) multu rs,rt
# define mflo(rd,rs,rt) mflo rd
# define mfhi(rd,rs,rt) mfhi rd
#endif

#ifdef __KERNEL__
# define poly1305_init poly1305_init_mips
# define poly1305_blocks poly1305_blocks_mips
# define poly1305_emit poly1305_emit_mips
#endif

#if defined(__MIPSEB__) && !defined(MIPSEB)
# define MIPSEB
#endif

#ifdef MIPSEB
# define MSB 0
# define LSB 3
#else
# define MSB 3
# define LSB 0
#endif

.text
.set noat
.set noreorder

.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
	.frame $sp,0,$ra
	.set reorder

	sw $zero,0($ctx)
	sw $zero,4($ctx)
	sw $zero,8($ctx)
	sw $zero,12($ctx)
	sw $zero,16($ctx)

	beqz $inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS32R6)
	andi $tmp0,$inp,3 # $inp % 4
	subu $inp,$inp,$tmp0 # align $inp
	sll $tmp0,$tmp0,3 # byte to bit offset
	lw $in0,0($inp)
	lw $in1,4($inp)
	lw $in2,8($inp)
	lw $in3,12($inp)
	beqz $tmp0,.Laligned_key

	lw $tmp2,16($inp)
	subu $tmp1,$zero,$tmp0
# ifdef MIPSEB
	sllv $in0,$in0,$tmp0
	srlv $tmp3,$in1,$tmp1
	sllv $in1,$in1,$tmp0
	or $in0,$in0,$tmp3
	srlv $tmp3,$in2,$tmp1
	sllv $in2,$in2,$tmp0
	or $in1,$in1,$tmp3
	srlv $tmp3,$in3,$tmp1
	sllv $in3,$in3,$tmp0
	or $in2,$in2,$tmp3
	srlv $tmp2,$tmp2,$tmp1
	or $in3,$in3,$tmp2
# else
	srlv $in0,$in0,$tmp0
	sllv $tmp3,$in1,$tmp1
	srlv $in1,$in1,$tmp0
	or $in0,$in0,$tmp3
	sllv $tmp3,$in2,$tmp1
	srlv $in2,$in2,$tmp0
	or $in1,$in1,$tmp3
	sllv $tmp3,$in3,$tmp1
	srlv $in3,$in3,$tmp0
	or $in2,$in2,$tmp3
	sllv $tmp2,$tmp2,$tmp1
	or $in3,$in3,$tmp2
# endif
.Laligned_key:
#else
	lwl $in0,0+MSB($inp)
	lwl $in1,4+MSB($inp)
	lwl $in2,8+MSB($inp)
	lwl $in3,12+MSB($inp)
	lwr $in0,0+LSB($inp)
	lwr $in1,4+LSB($inp)
	lwr $in2,8+LSB($inp)
	lwr $in3,12+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS32R2)
	wsbh $in0,$in0 # byte swap
	wsbh $in1,$in1
	wsbh $in2,$in2
	wsbh $in3,$in3
	rotr $in0,$in0,16
	rotr $in1,$in1,16
	rotr $in2,$in2,16
	rotr $in3,$in3,16
# else
	srl $tmp0,$in0,24 # byte swap
	srl $tmp1,$in0,8
	andi $tmp2,$in0,0xFF00
	sll $in0,$in0,24
	andi $tmp1,0xFF00
	sll $tmp2,$tmp2,8
	or $in0,$tmp0
	srl $tmp0,$in1,24
	or $tmp1,$tmp2
	srl $tmp2,$in1,8
	or $in0,$tmp1
	andi $tmp1,$in1,0xFF00
	sll $in1,$in1,24
	andi $tmp2,0xFF00
	sll $tmp1,$tmp1,8
	or $in1,$tmp0
	srl $tmp0,$in2,24
	or $tmp2,$tmp1
	srl $tmp1,$in2,8
	or $in1,$tmp2
	andi $tmp2,$in2,0xFF00
	sll $in2,$in2,24
	andi $tmp1,0xFF00
	sll $tmp2,$tmp2,8
	or $in2,$tmp0
	srl $tmp0,$in3,24
	or $tmp1,$tmp2
	srl $tmp2,$in3,8
	or $in2,$tmp1
	andi $tmp1,$in3,0xFF00
	sll $in3,$in3,24
	andi $tmp2,0xFF00
	sll $tmp1,$tmp1,8
	or $in3,$tmp0
	or $tmp2,$tmp1
	or $in3,$tmp2
# endif
#endif
	lui $tmp0,0x0fff
	ori $tmp0,0xffff # 0x0fffffff
	and $in0,$in0,$tmp0
	subu $tmp0,3 # 0x0ffffffc
	and $in1,$in1,$tmp0
	and $in2,$in2,$tmp0
	and $in3,$in3,$tmp0

	sw $in0,20($ctx)
	sw $in1,24($ctx)
	sw $in2,28($ctx)
	sw $in3,32($ctx)

	srl $tmp1,$in1,2
	srl $tmp2,$in2,2
	srl $tmp3,$in3,2
	addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
	addu $in2,$in2,$tmp2
	addu $in3,$in3,$tmp3
	sw $in1,36($ctx)
	sw $in2,40($ctx)
	sw $in3,44($ctx)
.Lno_key:
	li $v0,0
	jr $ra
.end poly1305_init
___
730{
731my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
732
733my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
734 ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
735my ($d0,$d1,$d2,$d3) =
736 ($a4,$a5,$a6,$a7);
737my $shr = $t2; # used on R6
738my $one = $t2; # used on R2
739
740$code.=<<___;
741.globl poly1305_blocks
742.align 5
743.ent poly1305_blocks
744poly1305_blocks:
745 .frame $sp,16*4,$ra
746 .mask $SAVED_REGS_MASK,-4
747 .set noreorder
748 subu $sp, $sp,4*12
749 sw $s11,4*11($sp)
750 sw $s10,4*10($sp)
751 sw $s9, 4*9($sp)
752 sw $s8, 4*8($sp)
753 sw $s7, 4*7($sp)
754 sw $s6, 4*6($sp)
755 sw $s5, 4*5($sp)
756 sw $s4, 4*4($sp)
757___
758$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
759 sw $s3, 4*3($sp)
760 sw $s2, 4*2($sp)
761 sw $s1, 4*1($sp)
762 sw $s0, 4*0($sp)
763___
764$code.=<<___;
765 .set reorder
766
767 srl $len,4 # number of complete blocks
768 li $one,1
769 beqz $len,.Labort
770
771#if defined(_MIPS_ARCH_MIPS32R6)
772 andi $shr,$inp,3
773 subu $inp,$inp,$shr # align $inp
774 sll $shr,$shr,3 # byte to bit offset
775#endif
776
777 lw $h0,0($ctx) # load hash value
778 lw $h1,4($ctx)
779 lw $h2,8($ctx)
780 lw $h3,12($ctx)
781 lw $h4,16($ctx)
782
783 lw $r0,20($ctx) # load key
784 lw $r1,24($ctx)
785 lw $r2,28($ctx)
786 lw $r3,32($ctx)
787 lw $rs1,36($ctx)
788 lw $rs2,40($ctx)
789 lw $rs3,44($ctx)
790
791 sll $len,4
792 addu $len,$len,$inp # end of buffer
793 b .Loop
794
795.align 4
796.Loop:
797#if defined(_MIPS_ARCH_MIPS32R6)
798 lw $d0,0($inp) # load input
799 lw $d1,4($inp)
800 lw $d2,8($inp)
801 lw $d3,12($inp)
802 beqz $shr,.Laligned_inp
803
804 lw $t0,16($inp)
805 subu $t1,$zero,$shr
806# ifdef MIPSEB
807 sllv $d0,$d0,$shr
808 srlv $at,$d1,$t1
809 sllv $d1,$d1,$shr
810 or $d0,$d0,$at
811 srlv $at,$d2,$t1
812 sllv $d2,$d2,$shr
813 or $d1,$d1,$at
814 srlv $at,$d3,$t1
815 sllv $d3,$d3,$shr
816 or $d2,$d2,$at
817 srlv $t0,$t0,$t1
818 or $d3,$d3,$t0
819# else
820 srlv $d0,$d0,$shr
821 sllv $at,$d1,$t1
822 srlv $d1,$d1,$shr
823 or $d0,$d0,$at
824 sllv $at,$d2,$t1
825 srlv $d2,$d2,$shr
826 or $d1,$d1,$at
827 sllv $at,$d3,$t1
828 srlv $d3,$d3,$shr
829 or $d2,$d2,$at
830 sllv $t0,$t0,$t1
831 or $d3,$d3,$t0
832# endif
833.Laligned_inp:
834#else
835 lwl $d0,0+MSB($inp) # load input
836 lwl $d1,4+MSB($inp)
837 lwl $d2,8+MSB($inp)
838 lwl $d3,12+MSB($inp)
839 lwr $d0,0+LSB($inp)
840 lwr $d1,4+LSB($inp)
841 lwr $d2,8+LSB($inp)
842 lwr $d3,12+LSB($inp)
843#endif
844#ifdef MIPSEB
845# if defined(_MIPS_ARCH_MIPS32R2)
846 wsbh $d0,$d0 # byte swap
847 wsbh $d1,$d1
848 wsbh $d2,$d2
849 wsbh $d3,$d3
850 rotr $d0,$d0,16
851 rotr $d1,$d1,16
852 rotr $d2,$d2,16
853 rotr $d3,$d3,16
854# else
855 srl $at,$d0,24 # byte swap
856 srl $t0,$d0,8
857 andi $t1,$d0,0xFF00
858 sll $d0,$d0,24
859 andi $t0,0xFF00
860 sll $t1,$t1,8
861 or $d0,$at
862 srl $at,$d1,24
863 or $t0,$t1
864 srl $t1,$d1,8
865 or $d0,$t0
866 andi $t0,$d1,0xFF00
867 sll $d1,$d1,24
868 andi $t1,0xFF00
869 sll $t0,$t0,8
870 or $d1,$at
871 srl $at,$d2,24
872 or $t1,$t0
873 srl $t0,$d2,8
874 or $d1,$t1
875 andi $t1,$d2,0xFF00
876 sll $d2,$d2,24
877 andi $t0,0xFF00
878 sll $t1,$t1,8
879 or $d2,$at
880 srl $at,$d3,24
881 or $t0,$t1
882 srl $t1,$d3,8
883 or $d2,$t0
884 andi $t0,$d3,0xFF00
885 sll $d3,$d3,24
886 andi $t1,0xFF00
887 sll $t0,$t0,8
888 or $d3,$at
889 or $t1,$t0
890 or $d3,$t1
891# endif
892#endif
893 srl $t0,$h4,2 # modulo-scheduled reduction
894 andi $h4,$h4,3
895 sll $at,$t0,2
896
897 addu $d0,$d0,$h0 # accumulate input
898 addu $t0,$t0,$at
899 sltu $h0,$d0,$h0
900 addu $d0,$d0,$t0 # ... and residue
901 sltu $at,$d0,$t0
902
903 addu $d1,$d1,$h1
904 addu $h0,$h0,$at # carry
905 sltu $h1,$d1,$h1
906 addu $d1,$d1,$h0
907 sltu $h0,$d1,$h0
908
909 addu $d2,$d2,$h2
910 addu $h1,$h1,$h0 # carry
911 sltu $h2,$d2,$h2
912 addu $d2,$d2,$h1
913 sltu $h1,$d2,$h1
914
915 addu $d3,$d3,$h3
916 addu $h2,$h2,$h1 # carry
917 sltu $h3,$d3,$h3
918 addu $d3,$d3,$h2
919
920#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
921 multu $r0,$d0 # d0*r0
922 sltu $h2,$d3,$h2
923 maddu $rs3,$d1 # d1*s3
924 addu $h3,$h3,$h2 # carry
925 maddu $rs2,$d2 # d2*s2
926 addu $h4,$h4,$padbit
927 maddu $rs1,$d3 # d3*s1
928 addu $h4,$h4,$h3
929 mfhi $at
930 mflo $h0
931
932 multu $r1,$d0 # d0*r1
933 maddu $r0,$d1 # d1*r0
934 maddu $rs3,$d2 # d2*s3
935 maddu $rs2,$d3 # d3*s2
936 maddu $rs1,$h4 # h4*s1
937 maddu $at,$one # hi*1
938 mfhi $at
939 mflo $h1
940
941 multu $r2,$d0 # d0*r2
942 maddu $r1,$d1 # d1*r1
943 maddu $r0,$d2 # d2*r0
944 maddu $rs3,$d3 # d3*s3
945 maddu $rs2,$h4 # h4*s2
946 maddu $at,$one # hi*1
947 mfhi $at
948 mflo $h2
949
950 mul $t0,$r0,$h4 # h4*r0
951
952 multu $r3,$d0 # d0*r3
953 maddu $r2,$d1 # d1*r2
954 maddu $r1,$d2 # d2*r1
955 maddu $r0,$d3 # d3*r0
956 maddu $rs3,$h4 # h4*s3
957 maddu $at,$one # hi*1
958 mfhi $at
959 mflo $h3
960
961 addiu $inp,$inp,16
962
963 addu $h4,$t0,$at
964#else
965 multu ($r0,$d0) # d0*r0
966 mflo ($h0,$r0,$d0)
967 mfhi ($h1,$r0,$d0)
968
969 sltu $h2,$d3,$h2
970 addu $h3,$h3,$h2 # carry
971
972 multu ($rs3,$d1) # d1*s3
973 mflo ($at,$rs3,$d1)
974 mfhi ($t0,$rs3,$d1)
975
976 addu $h4,$h4,$padbit
977 addiu $inp,$inp,16
978 addu $h4,$h4,$h3
979
980 multu ($rs2,$d2) # d2*s2
981 mflo ($a3,$rs2,$d2)
982 mfhi ($t1,$rs2,$d2)
983 addu $h0,$h0,$at
984 addu $h1,$h1,$t0
985 multu ($rs1,$d3) # d3*s1
986 sltu $at,$h0,$at
987 addu $h1,$h1,$at
988
989 mflo ($at,$rs1,$d3)
990 mfhi ($t0,$rs1,$d3)
991 addu $h0,$h0,$a3
992 addu $h1,$h1,$t1
993 multu ($r1,$d0) # d0*r1
994 sltu $a3,$h0,$a3
995 addu $h1,$h1,$a3
996
997
998 mflo ($a3,$r1,$d0)
999 mfhi ($h2,$r1,$d0)
1000 addu $h0,$h0,$at
1001 addu $h1,$h1,$t0
1002 multu ($r0,$d1) # d1*r0
1003 sltu $at,$h0,$at
1004 addu $h1,$h1,$at
1005
1006 mflo ($at,$r0,$d1)
1007 mfhi ($t0,$r0,$d1)
1008 addu $h1,$h1,$a3
1009 sltu $a3,$h1,$a3
1010 multu ($rs3,$d2) # d2*s3
1011 addu $h2,$h2,$a3
1012
1013 mflo ($a3,$rs3,$d2)
1014 mfhi ($t1,$rs3,$d2)
1015 addu $h1,$h1,$at
1016 addu $h2,$h2,$t0
1017 multu ($rs2,$d3) # d3*s2
1018 sltu $at,$h1,$at
1019 addu $h2,$h2,$at
1020
1021 mflo ($at,$rs2,$d3)
1022 mfhi ($t0,$rs2,$d3)
1023 addu $h1,$h1,$a3
1024 addu $h2,$h2,$t1
1025 multu ($rs1,$h4) # h4*s1
1026 sltu $a3,$h1,$a3
1027 addu $h2,$h2,$a3
1028
1029 mflo ($a3,$rs1,$h4)
1030 addu $h1,$h1,$at
1031 addu $h2,$h2,$t0
1032 multu ($r2,$d0) # d0*r2
1033 sltu $at,$h1,$at
1034 addu $h2,$h2,$at
1035
1036
1037 mflo ($at,$r2,$d0)
1038 mfhi ($h3,$r2,$d0)
1039 addu $h1,$h1,$a3
1040 sltu $a3,$h1,$a3
1041 multu ($r1,$d1) # d1*r1
1042 addu $h2,$h2,$a3
1043
1044 mflo ($a3,$r1,$d1)
1045 mfhi ($t1,$r1,$d1)
1046 addu $h2,$h2,$at
1047 sltu $at,$h2,$at
1048 multu ($r0,$d2) # d2*r0
1049 addu $h3,$h3,$at
1050
1051 mflo ($at,$r0,$d2)
1052 mfhi ($t0,$r0,$d2)
1053 addu $h2,$h2,$a3
1054 addu $h3,$h3,$t1
1055 multu ($rs3,$d3) # d3*s3
1056 sltu $a3,$h2,$a3
1057 addu $h3,$h3,$a3
1058
1059 mflo ($a3,$rs3,$d3)
1060 mfhi ($t1,$rs3,$d3)
1061 addu $h2,$h2,$at
1062 addu $h3,$h3,$t0
1063 multu ($rs2,$h4) # h4*s2
1064 sltu $at,$h2,$at
1065 addu $h3,$h3,$at
1066
1067 mflo ($at,$rs2,$h4)
1068 addu $h2,$h2,$a3
1069 addu $h3,$h3,$t1
1070 multu ($r3,$d0) # d0*r3
1071 sltu $a3,$h2,$a3
1072 addu $h3,$h3,$a3
1073
1074
1075 mflo ($a3,$r3,$d0)
1076 mfhi ($t1,$r3,$d0)
1077 addu $h2,$h2,$at
1078 sltu $at,$h2,$at
1079 multu ($r2,$d1) # d1*r2
1080 addu $h3,$h3,$at
1081
1082 mflo ($at,$r2,$d1)
1083 mfhi ($t0,$r2,$d1)
1084 addu $h3,$h3,$a3
1085 sltu $a3,$h3,$a3
1086 multu ($r0,$d3) # d3*r0
1087 addu $t1,$t1,$a3
1088
1089 mflo ($a3,$r0,$d3)
1090 mfhi ($d3,$r0,$d3)
1091 addu $h3,$h3,$at
1092 addu $t1,$t1,$t0
1093 multu ($r1,$d2) # d2*r1
1094 sltu $at,$h3,$at
1095 addu $t1,$t1,$at
1096
1097 mflo ($at,$r1,$d2)
1098 mfhi ($t0,$r1,$d2)
1099 addu $h3,$h3,$a3
1100 addu $t1,$t1,$d3
1101 multu ($rs3,$h4) # h4*s3
1102 sltu $a3,$h3,$a3
1103 addu $t1,$t1,$a3
1104
1105 mflo ($a3,$rs3,$h4)
1106 addu $h3,$h3,$at
1107 addu $t1,$t1,$t0
1108 multu ($r0,$h4) # h4*r0
1109 sltu $at,$h3,$at
1110 addu $t1,$t1,$at
1111
1112
1113 mflo ($h4,$r0,$h4)
1114 addu $h3,$h3,$a3
1115 sltu $a3,$h3,$a3
1116 addu $t1,$t1,$a3
1117 addu $h4,$h4,$t1
1118
1119 li $padbit,1 # if we loop, padbit is 1
1120#endif
1121 bne $inp,$len,.Loop
1122
1123 sw $h0,0($ctx) # store hash value
1124 sw $h1,4($ctx)
1125 sw $h2,8($ctx)
1126 sw $h3,12($ctx)
1127 sw $h4,16($ctx)
1128
1129 .set noreorder
1130.Labort:
1131 lw $s11,4*11($sp)
1132 lw $s10,4*10($sp)
1133 lw $s9, 4*9($sp)
1134 lw $s8, 4*8($sp)
1135 lw $s7, 4*7($sp)
1136 lw $s6, 4*6($sp)
1137 lw $s5, 4*5($sp)
1138 lw $s4, 4*4($sp)
1139___
1140$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1141 lw $s3, 4*3($sp)
1142 lw $s2, 4*2($sp)
1143 lw $s1, 4*1($sp)
1144 lw $s0, 4*0($sp)
1145___
1146$code.=<<___;
1147 jr $ra
1148 addu $sp,$sp,4*12
1149.end poly1305_blocks
1150___
1151}
1152{
1153my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1154
1155$code.=<<___;
1156.align 5
1157.globl poly1305_emit
1158.ent poly1305_emit
1159poly1305_emit:
1160 .frame $sp,0,$ra
1161 .set reorder
1162
1163 lw $tmp4,16($ctx)
1164 lw $tmp0,0($ctx)
1165 lw $tmp1,4($ctx)
1166 lw $tmp2,8($ctx)
1167 lw $tmp3,12($ctx)
1168
1169 li $in0,-4 # final reduction
1170 srl $ctx,$tmp4,2
1171 and $in0,$in0,$tmp4
1172 andi $tmp4,$tmp4,3
1173 addu $ctx,$ctx,$in0
1174
1175 addu $tmp0,$tmp0,$ctx
1176 sltu $ctx,$tmp0,$ctx
1177 addiu $in0,$tmp0,5 # compare to modulus
1178 addu $tmp1,$tmp1,$ctx
1179 sltiu $in1,$in0,5
1180 sltu $ctx,$tmp1,$ctx
1181 addu $in1,$in1,$tmp1
1182 addu $tmp2,$tmp2,$ctx
1183 sltu $in2,$in1,$tmp1
1184 sltu $ctx,$tmp2,$ctx
1185 addu $in2,$in2,$tmp2
1186 addu $tmp3,$tmp3,$ctx
1187 sltu $in3,$in2,$tmp2
1188 sltu $ctx,$tmp3,$ctx
1189 addu $in3,$in3,$tmp3
1190 addu $tmp4,$tmp4,$ctx
1191 sltu $ctx,$in3,$tmp3
1192 addu $ctx,$tmp4
1193
1194 srl $ctx,2 # see if it carried/borrowed
1195 subu $ctx,$zero,$ctx
1196
1197 xor $in0,$tmp0
1198 xor $in1,$tmp1
1199 xor $in2,$tmp2
1200 xor $in3,$tmp3
1201 and $in0,$ctx
1202 and $in1,$ctx
1203 and $in2,$ctx
1204 and $in3,$ctx
1205 xor $in0,$tmp0
1206 xor $in1,$tmp1
1207 xor $in2,$tmp2
1208 xor $in3,$tmp3
1209
1210 lw $tmp0,0($nonce) # load nonce
1211 lw $tmp1,4($nonce)
1212 lw $tmp2,8($nonce)
1213 lw $tmp3,12($nonce)
1214
1215 addu $in0,$tmp0 # accumulate nonce
1216 sltu $ctx,$in0,$tmp0
1217
1218 addu $in1,$tmp1
1219 sltu $tmp1,$in1,$tmp1
1220 addu $in1,$ctx
1221 sltu $ctx,$in1,$ctx
1222 addu $ctx,$tmp1
1223
1224 addu $in2,$tmp2
1225 sltu $tmp2,$in2,$tmp2
1226 addu $in2,$ctx
1227 sltu $ctx,$in2,$ctx
1228 addu $ctx,$tmp2
1229
1230 addu $in3,$tmp3
1231 addu $in3,$ctx
1232
1233 srl $tmp0,$in0,8 # write mac value
1234 srl $tmp1,$in0,16
1235 srl $tmp2,$in0,24
1236 sb $in0, 0($mac)
1237 sb $tmp0,1($mac)
1238 srl $tmp0,$in1,8
1239 sb $tmp1,2($mac)
1240 srl $tmp1,$in1,16
1241 sb $tmp2,3($mac)
1242 srl $tmp2,$in1,24
1243 sb $in1, 4($mac)
1244 sb $tmp0,5($mac)
1245 srl $tmp0,$in2,8
1246 sb $tmp1,6($mac)
1247 srl $tmp1,$in2,16
1248 sb $tmp2,7($mac)
1249 srl $tmp2,$in2,24
1250 sb $in2, 8($mac)
1251 sb $tmp0,9($mac)
1252 srl $tmp0,$in3,8
1253 sb $tmp1,10($mac)
1254 srl $tmp1,$in3,16
1255 sb $tmp2,11($mac)
1256 srl $tmp2,$in3,24
1257 sb $in3, 12($mac)
1258 sb $tmp0,13($mac)
1259 sb $tmp1,14($mac)
1260 sb $tmp2,15($mac)
1261
1262 jr $ra
1263.end poly1305_emit
1264.rdata
1265.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1266.align 2
1267___
1268}
1269}}}
1270
# Emit the generated assembly.  If an output path was given as the
# last command-line argument, redirect STDOUT to it first.  Check the
# close: Perl buffers STDOUT, so a write failure (e.g. disk full) may
# only surface at close time -- silently ignoring it could leave a
# truncated .S file that the build would treat as up to date.
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";