-
Notifications
You must be signed in to change notification settings - Fork 0
/
vec.h
156 lines (127 loc) · 4.48 KB
/
vec.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#pragma once
#include <cstdint>
#include <ostream>
#include <vecintrin.h>
// Size in bytes of every vector type declared in this header; it is the value
// handed to the vector_size attribute and the numerator of vec_type_t::size().
constexpr int VLEN_BYTES = 16;
// Shorthand for the GCC/Clang always_inline function attribute.
#define ALWAYS_INLINE __attribute__((always_inline))
// Maps an element type T to the compiler vector-extension type that is
// VLEN_BYTES bytes wide and whose lanes are of type T.
template <typename T>
struct vec_inner_type_t {
// The vector_size attribute is attached to the alias itself, so Type is a
// vector of T rather than a plain T.
using Type __attribute__((vector_size(VLEN_BYTES))) = T;
};
template <typename T>
struct vec_type_t {
public:
using Type = typename vec_inner_type_t<T>::Type;
using ElementType = T;
operator Type &() { return _val; }
operator Type() const { return _val; }
static constexpr int size() { return VLEN_BYTES / sizeof(ElementType); }
ALWAYS_INLINE vec_type_t() { _val = Type {}; }
ALWAYS_INLINE explicit vec_type_t(T scalar)
: _val {vec_splats((T)scalar)} {}
ALWAYS_INLINE vec_type_t(Type v) : _val {v} {}
static vec_type_t<T> ALWAYS_INLINE loadu(const void *ptr) {
return {vec_xl(0, reinterpret_cast<const ElementType *>(ptr))};
}
static ALWAYS_INLINE vec_type_t<T> loadLen(
const void *ptr, uint32_t BYTE_INDEX) {
return {vec_load_len(
reinterpret_cast<const ElementType *>(ptr), BYTE_INDEX)};
}
static vec_type_t<T> ALWAYS_INLINE load_hinted(const void *ptr) {
Type const *addr = (Type const *)ptr;
Type y;
// Doubleword aligned hint
#if __GNUC__ < 9 && !defined(__clang__)
// hex-encode vl %[out],%[addr],3
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
: [out] "=v"(y)
: [addr] "R"(*addr));
#else
y = *addr;
#endif
return y;
}
void ALWAYS_INLINE store(void *ptr) const {
vec_xst(_val, 0, reinterpret_cast<ElementType *>(ptr));
}
void ALWAYS_INLINE storeLen(void *ptr, uint32_t BYTE_INDEX) const {
vec_store_len(_val, reinterpret_cast<ElementType *>(ptr), BYTE_INDEX);
}
ALWAYS_INLINE const Type &vec() const { return _val; }
vec_type_t<T> &ALWAYS_INLINE operator+=(const vec_type_t<T> &other) {
_val = _val + other._val;
return *this;
}
private:
Type _val;
};
using vuint8 = typename vec_type_t<uint8_t>::Type;
using vuint16 = typename vec_type_t<uint16_t>::Type;
using vint16 = typename vec_type_t<int16_t>::Type;
using vuint32 = typename vec_type_t<uint32_t>::Type;
using vint32 = typename vec_type_t<int32_t>::Type;
template <typename T>
std::ostream &operator<<(std::ostream &stream, const vec_type_t<T> &vec) {
const typename vec_type_t<T>::Type v = vec;
stream << "vec[";
for (int i = 0; i != vec_type_t<T>::size(); i++) {
if (i != 0) { stream << ", "; }
stream << (typename conv_t<typename vec_type_t<T>::ElementType>::V)(
v[i]);
}
stream << "]";
return stream;
}
/// Converts a vector of T lanes into a vector of V lanes by applying a
/// C-style vector cast to the wrapped raw vector.
/// The explicit template argument V selects the destination lane type.
template <typename V, typename T>
vec_type_t<V> cast(const vec_type_t<T> &x) {
    using Target = typename vec_type_t<V>::Type;
    // Kept as a C-style cast: that is the form the compiler's vector
    // extension accepts between vector types.
    const Target converted = (Target)(x.vec());
    return vec_type_t<V> {converted};
}
//
// const vuint16 vone16 = {1, 1, 1, 1, 1, 1, 1, 1};
/// Multiply-accumulate of two int16 vectors into an int32 accumulator.
/// Runs vec_moadd and then vec_meadd on (va, vb), threading vc through both,
/// and returns the updated accumulator.
inline vec_type_t<int32_t> multiplyAdd(vec_type_t<int16_t> va,
        vec_type_t<int16_t> vb, vec_type_t<int32_t> vc) {
    // Two instructions in total: one vec_moadd, one vec_meadd.
    const auto lhs = va.vec();
    const auto rhs = vb.vec();
    auto acc = vc.vec();
    acc = vec_moadd(lhs, rhs, acc);
    acc = vec_meadd(lhs, rhs, acc);
    return vec_type_t<int32_t> {acc};
}
/// Multiply-accumulate of two uint8 vectors into a uint32 accumulator.
/// The vec_mulo and vec_mule products (16-bit lanes) are each reduced with
/// vec_sum4 against a zero vector and added onto vc.
inline vec_type_t<uint32_t> multiplySum4(vec_type_t<uint8_t> va, vec_type_t<uint8_t> vb, vec_type_t<uint32_t> vc) {
    // Six instructions: 2 multiplies, 2 vec_sum4, 2 additions.
    const vuint16 zero16 = {};
    const auto lhs = va.vec();
    const auto rhs = vb.vec();
    const auto oddProducts = vec_mulo(lhs, rhs);
    const auto evenProducts = vec_mule(lhs, rhs);

    auto acc = vc.vec();
    acc += vec_sum4(oddProducts, zero16);
    acc += vec_sum4(evenProducts, zero16);
    return vec_type_t<uint32_t> {acc};
}
/// Multiply-accumulate of two uint8 vectors into a uint32 accumulator.
/// The vec_mulo and vec_mule products (16-bit lanes) are folded into vc by
/// multiplying them with a vector of ones through vec_moadd / vec_meadd.
inline vec_type_t<uint32_t> multiplyAdd(vec_type_t<uint8_t> va,
        vec_type_t<uint8_t> vb, vec_type_t<uint32_t> vc) {
    // Six instructions: 2 multiplies followed by 4 multiply-adds.
    const vuint16 ones16 = {1, 1, 1, 1, 1, 1, 1, 1};
    const vuint8 lhs = va.vec();
    const vuint8 rhs = vb.vec();
    const vuint16 oddProducts = vec_mulo(lhs, rhs);
    const vuint16 evenProducts = vec_mule(lhs, rhs);

    auto acc = vc.vec();
    // Multiplying by one leaves each product unchanged; the multiply-add
    // forms are used only to add the 16-bit products onto the 32-bit lanes.
    acc = vec_moadd(oddProducts, ones16, acc);
    acc = vec_meadd(oddProducts, ones16, acc);
    acc = vec_moadd(evenProducts, ones16, acc);
    acc = vec_meadd(evenProducts, ones16, acc);
    return vec_type_t<uint32_t> {acc};
}
/// Shorter variant of multiplySum4: the vec_mulo products are fed straight
/// into vec_meadd, so a single vec_sum4 and one addition finish the update.
/// NOTE(review): the combined products are held in 16-bit lanes before the
/// reduction, so this presumably requires inputs small enough not to wrap
/// there - confirm against callers.
inline vec_type_t<uint32_t> multiplySum4Low(vec_type_t<uint8_t> va, vec_type_t<uint8_t> vb, vec_type_t<uint32_t> vc) {
    // Four instructions: 2 multiplies, 1 vec_sum4, 1 addition.
    const vuint16 zero16 = {};
    const auto lhs = va.vec();
    const auto rhs = vb.vec();
    const vuint16 oddProducts = vec_mulo(lhs, rhs);
    const vuint16 pairSums = vec_meadd(lhs, rhs, oddProducts);

    auto acc = vc.vec();
    acc += vec_sum4(pairSums, zero16);
    return vec_type_t<uint32_t> {acc};
}