-
Notifications
You must be signed in to change notification settings - Fork 12
/
vec_barrett_reduction.c
155 lines (136 loc) · 5.22 KB
/
vec_barrett_reduction.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
* Use the fixed point version of Barrett reduction to compute a mod n
* over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2017 Rogerio Alves <rogealve@br.ibm.com>, IBM
* Copyright (C) 2017 Steven Munroe <sjmunroe@us.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#include <stdint.h>
#include <altivec.h>
#if defined (__clang__)
#include "clang_workaround.h"
#else
#define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b))
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0)
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1)
#endif
#if defined(__LITTLE_ENDIAN__)
static const __vector unsigned long long v_Barrett_const[2]
__attribute__ ((aligned (16))) = {
/* Barrett constant m - (4^32)/n */
{ 0x0000000104d101dfUL, 0x0000000000000000UL },
/* Barrett constant n */
{ 0x0000000104c11db7UL, 0x0000000000000000UL }
};
static const __vector unsigned long long v_Barrett_reflect_const[2]
__attribute__ ((aligned (16))) = {
/* Barrett constant m - (4^32)/n */
{ 0x00000001f7011641UL, 0x0000000000000000UL },
/* Barrett constant n */
{ 0x00000001db710641UL, 0x0000000000000000UL }
};
#else
static const __vector unsigned long long v_Barrett_const[2]
__attribute__ ((aligned (16))) = {
/* Barrett constant m - (4^32)/n */
{ 0x0000000000000000UL, 0x0000000104d101dfUL },
/* Barrett constant n */
{ 0x0000000000000000UL, 0x0000000104c11db7UL }
};
static const __vector unsigned long long v_Barrett_reflect_const[2]
__attribute__ ((aligned (16))) = {
/* Barrett constant m - (4^32)/n */
{ 0x0000000000000000UL, 0x00000001f7011641UL },
/* Barrett constant n */
{ 0x0000000000000000UL, 0x00000001db710641UL }
};
#endif
unsigned long __attribute__ ((aligned (32)))
barrett_reduction (unsigned long data){
const __vector unsigned long long vzero = {0,0};
__vector unsigned long long vconst1 = vec_ld(0, v_Barrett_const);
__vector unsigned long long vconst2 = vec_ld(16,v_Barrett_const);
__vector unsigned long long v0;
unsigned long result = 0;
/* Get (unsigned long) a into vdata */
__vector unsigned long long vdata = (__vector unsigned long long)
__builtin_pack_vector(0UL, data);
/*
* Now for the actual algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
/* ma */
v0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata,
(__vector unsigned long long)vconst1);
/* q = floor(ma/(2^64)) */
v0 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero,
(__vector unsigned char)v0, 8);
/* qn */
v0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (vdata, v0);
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
result = __builtin_unpack_vector_1(v0);
return result;
}
unsigned long __attribute__ ((aligned (32)))
barrett_reduction_reflected (unsigned long data){
const __vector unsigned long long vzero = {0,0};
const __vector unsigned long long vones = {0xffffffffffffffffUL,
0xffffffffffffffffUL};
const __vector unsigned long long vmask_32bit =
(__vector unsigned long long)vec_sld((__vector unsigned char)vzero,
(__vector unsigned char)vones, 4);
__vector unsigned long long vconst1 = vec_ld(0, v_Barrett_reflect_const);
__vector unsigned long long vconst2 = vec_ld(16,v_Barrett_reflect_const);
__vector unsigned long long v0;
unsigned long result = 0;
/* Get (unsigned long) a into vdata */
__vector unsigned long long vdata = (__vector unsigned long long)
__builtin_pack_vector(0UL, data);
/*
* Now for the actual algorithm. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
/* bottom 32 bits of a */
v0 = vec_and (vdata, vmask_32bit);
/* ma */
v0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata,
(__vector unsigned long long)vconst1);
/* bottom 32 bits of a */
v0 = vec_and (v0, vmask_32bit);
/* qn */
v0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0,
(__vector unsigned long long)vconst2);
/* a - qn, subtraction is xor in GF(2) */
v0 = vec_xor (vdata, v0);
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in the
* high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
v0 = (__vector unsigned long long )vec_sld((__vector unsigned char)v0,
(__vector unsigned char)vzero, 4);
result = __builtin_unpack_vector_0(v0);
return result;
}