1 // Boost.uBLAS
2 //
3 // Copyright (c) 2018 Fady Essam
4 // Copyright (c) 2018 Stefan Seefeld
5 //
6 // Distributed under the Boost Software License, Version 1.0.
7 // (See accompanying file LICENSE_1_0.txt or
8 // copy at http://www.boost.org/LICENSE_1_0.txt)
9
10 #ifndef boost_numeric_ublas_opencl_prod_hpp_
11 #define boost_numeric_ublas_opencl_prod_hpp_
12
13 #include <boost/numeric/ublas/opencl/library.hpp>
14 #include <boost/numeric/ublas/opencl/vector.hpp>
15 #include <boost/numeric/ublas/opencl/matrix.hpp>
16 #include <boost/numeric/ublas/opencl/transpose.hpp>
17 #include <boost/compute/buffer.hpp>
18
19 namespace boost { namespace numeric { namespace ublas { namespace opencl {
20
21 #define ONE_DOUBLE_COMPLEX {{1.0, 00.0}}
22 #define ONE_FLOAT_COMPLEX {{1.0f, 00.0f}}
23
24 template <typename T, typename L1, typename L2>
25 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L1,opencl::storage> const & a,ublas::matrix<T,L2,opencl::storage> const & b,ublas::matrix<T,L1,opencl::storage> & result,compute::command_queue & queue)26 prod(ublas::matrix<T, L1, opencl::storage> const &a,
27 ublas::matrix<T, L2, opencl::storage> const &b,
28 ublas::matrix<T, L1, opencl::storage> &result,
29 compute::command_queue &queue)
30 {
31 assert(a.device() == b.device() &&
32 a.device() == result.device() &&
33 a.device() == queue.get_device());
34 assert(a.size2() == b.size1());
35
36 result.fill(0, queue);
37
38 //to hold matrix b with layout 1 if the b has different layout
39 std::unique_ptr<ublas::matrix<T, L1, opencl::storage>> bl1;
40
41 cl_event event = NULL;
42
43 cl_mem buffer_a = a.begin().get_buffer().get();
44 cl_mem buffer_b = b.begin().get_buffer().get();
45 cl_mem buffer_result = result.begin().get_buffer().get();
46
47 if (!(std::is_same<L1, L2>::value))
48 {
49 bl1.reset(new ublas::matrix<T, L1, opencl::storage>(b.size1(), b.size2(), queue.get_context()));
50 change_layout(b, *bl1, queue);
51 buffer_b = bl1->begin().get_buffer().get();
52 }
53
54 clblasOrder Order = std::is_same<L1, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
55 size_t lda = Order == clblasRowMajor ? a.size2() : a.size1();
56 size_t ldb = Order == clblasRowMajor ? b.size2() : a.size2();
57 size_t ldc = Order == clblasRowMajor ? b.size2() : a.size1();
58
59 if (std::is_same<T, float>::value)
60 clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
61 a.size1(), b.size2(), a.size2(),
62 1, buffer_a, 0, lda,
63 buffer_b, 0, ldb, 1,
64 buffer_result, 0, ldc,
65 1, &(queue.get()), 0, NULL, &event);
66 else if (std::is_same<T, double>::value)
67 clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
68 a.size1(), b.size2(), a.size2(),
69 1, buffer_a, 0, lda,
70 buffer_b, 0, ldb, 1,
71 buffer_result, 0, ldc,
72 1, &(queue.get()), 0, NULL, &event);
73 else if (std::is_same<T, std::complex<float>>::value)
74 clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
75 a.size1(), b.size2(), a.size2(),
76 ONE_FLOAT_COMPLEX, buffer_a, 0, lda,
77 buffer_b, 0, ldb, ONE_FLOAT_COMPLEX,
78 buffer_result, 0, ldc,
79 1, &(queue.get()), 0, NULL, &event);
80 else if (std::is_same<T, std::complex<double>>::value)
81 clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
82 a.size1(), b.size2(), a.size2(),
83 ONE_DOUBLE_COMPLEX, buffer_a, 0, lda,
84 buffer_b, 0, ldb, ONE_DOUBLE_COMPLEX,
85 buffer_result, 0, ldc,
86 1, &(queue.get()), 0, NULL, &event);
87 clWaitForEvents(1, &event);
88 }
89
90 template <typename T, typename L1, typename L2, typename A>
91 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L1,A> const & a,ublas::matrix<T,L2,A> const & b,ublas::matrix<T,L1,A> & result,compute::command_queue & queue)92 prod(ublas::matrix<T, L1, A> const &a,
93 ublas::matrix<T, L2, A> const &b,
94 ublas::matrix<T, L1, A> &result,
95 compute::command_queue &queue)
96 {
97 ublas::matrix<T, L1, opencl::storage> adev(a, queue);
98 ublas::matrix<T, L2, opencl::storage> bdev(b, queue);
99 ublas::matrix<T, L1, opencl::storage> rdev(a.size1(), b.size2(), queue.get_context());
100 prod(adev, bdev, rdev, queue);
101 rdev.to_host(result,queue);
102 }
103
104 template <typename T, typename L1, typename L2, typename A>
105 typename std::enable_if<is_numeric<T>::value, ublas::matrix<T, L1, A>>::type
prod(ublas::matrix<T,L1,A> const & a,ublas::matrix<T,L2,A> const & b,compute::command_queue & queue)106 prod(ublas::matrix<T, L1, A> const &a,
107 ublas::matrix<T, L2, A> const &b,
108 compute::command_queue &queue)
109 {
110 ublas::matrix<T, L1, A> result(a.size1(), b.size2());
111 prod(a, b, result, queue);
112 return result;
113 }
114
115 template <typename T, typename L>
116 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,ublas::vector<T,opencl::storage> & result,compute::command_queue & queue)117 prod(ublas::matrix<T, L, opencl::storage> const &a,
118 ublas::vector<T, opencl::storage> const &b,
119 ublas::vector<T, opencl::storage> &result,
120 compute::command_queue &queue)
121 {
122 assert(a.device() == b.device() &&
123 a.device() == result.device() &&
124 a.device() == queue.get_device());
125 assert(a.size2() == b.size());
126 result.fill(0, queue);
127
128 cl_event event = NULL;
129 clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
130 int lda = Order == clblasRowMajor ? a.size2() : a.size1();
131 int ldb = Order == clblasRowMajor ? 1 : a.size2();
132 int ldc = Order == clblasRowMajor ? 1 : a.size1();
133
134 if (std::is_same<T, float>::value)
135 clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
136 a.size1(), 1, a.size2(),
137 1, a.begin().get_buffer().get(), 0, lda,
138 b.begin().get_buffer().get(), 0, ldb, 1,
139 result.begin().get_buffer().get(), 0, ldc,
140 1, &(queue.get()), 0, NULL, &event);
141 else if (std::is_same<T, double>::value)
142 clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
143 a.size1(), 1, a.size2(),
144 1, a.begin().get_buffer().get(), 0, lda,
145 b.begin().get_buffer().get(), 0, ldb, 1,
146 result.begin().get_buffer().get(), 0, ldc,
147 1, &(queue.get()), 0, NULL, &event);
148 else if (std::is_same<T, std::complex<float>>::value)
149 clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
150 a.size1(), 1, a.size2(),
151 ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
152 b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
153 result.begin().get_buffer().get(), 0, ldc,
154 1, &(queue.get()), 0, NULL, &event);
155 else if (std::is_same<T, std::complex<double>>::value)
156 clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
157 a.size1(), 1, a.size2(),
158 ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
159 b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
160 result.begin().get_buffer().get(), 0, ldc,
161 1, &(queue.get()), 0, NULL, &event);
162 clWaitForEvents(1, &event);
163 }
164
165 template <typename T, typename L, typename A>
166 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L,A> const & a,ublas::vector<T,A> const & b,ublas::vector<T,A> & result,compute::command_queue & queue)167 prod(ublas::matrix<T, L, A> const &a,
168 ublas::vector<T, A> const &b,
169 ublas::vector<T, A> &result,
170 compute::command_queue &queue)
171 {
172 ublas::matrix<T, L, opencl::storage> adev(a, queue);
173 ublas::vector<T, opencl::storage> bdev(b, queue);
174 ublas::vector<T, opencl::storage> rdev(a.size1(), queue.get_context());
175 prod(adev, bdev, rdev, queue);
176 rdev.to_host(result, queue);
177 }
178
179 template <typename T, typename L, typename A>
180 typename std::enable_if<is_numeric<T>::value, ublas::vector<T, A>>::type
prod(ublas::matrix<T,L,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)181 prod(ublas::matrix<T, L, A> const &a,
182 ublas::vector<T, A> const &b,
183 compute::command_queue &queue)
184 {
185 ublas::vector<T, A> result(a.size1());
186 prod(a, b, result, queue);
187 return result;
188 }
189
190 template <typename T, typename L>
191 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::vector<T,opencl::storage> const & a,ublas::matrix<T,L,opencl::storage> const & b,ublas::vector<T,opencl::storage> & result,compute::command_queue & queue)192 prod(ublas::vector<T, opencl::storage> const &a,
193 ublas::matrix<T, L, opencl::storage> const &b,
194 ublas::vector<T, opencl::storage> &result,
195 compute::command_queue &queue)
196 {
197 assert(a.device() == b.device() &&
198 a.device() == result.device() &&
199 a.device() == queue.get_device());
200 assert(a.size() == b.size1());
201 result.fill(0, queue);
202 cl_event event = NULL;
203 clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
204 size_t lda = Order == clblasRowMajor ? a.size() : 1;
205 size_t ldb = Order == clblasRowMajor ? b.size2() : a.size();
206 size_t ldc = Order == clblasRowMajor ? b.size2() : 1;
207
208 if (std::is_same<T, float>::value)
209 clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
210 1, b.size2(), a.size(),
211 1, a.begin().get_buffer().get(), 0, lda,
212 b.begin().get_buffer().get(), 0, ldb, 1,
213 result.begin().get_buffer().get(), 0, ldc,
214 1, &(queue.get()), 0, NULL, &event);
215 else if (std::is_same<T, double>::value)
216 clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
217 1, b.size2(), a.size(),
218 1, a.begin().get_buffer().get(), 0, lda,
219 b.begin().get_buffer().get(), 0, ldb, 1,
220 result.begin().get_buffer().get(), 0, ldc,
221 1, &(queue.get()), 0, NULL, &event);
222 else if (std::is_same<T, std::complex<float>>::value)
223 clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
224 1, b.size2(), a.size(),
225 ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
226 b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
227 result.begin().get_buffer().get(), 0, ldc,
228 1, &(queue.get()), 0, NULL, &event);
229 else if (std::is_same<T, std::complex<double>>::value)
230 clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
231 1, b.size2(), a.size(),
232 ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
233 b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
234 result.begin().get_buffer().get(), 0, ldc,
235 1, &(queue.get()), 0, NULL, &event);
236 clWaitForEvents(1, &event);
237 }
238
239 template <class T, class L, class A>
240 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::vector<T,A> const & a,ublas::matrix<T,L,A> const & b,ublas::vector<T,A> & result,compute::command_queue & queue)241 prod(ublas::vector<T, A> const &a,
242 ublas::matrix<T, L, A> const &b,
243 ublas::vector<T, A> &result,
244 compute::command_queue &queue)
245 {
246 ublas::vector<T, opencl::storage> adev(a, queue);
247 ublas::matrix<T, L, opencl::storage> bdev(b, queue);
248 ublas::vector<T, opencl::storage> rdev(b.size2(), queue.get_context());
249 prod(adev, bdev, rdev, queue);
250 rdev.to_host(result, queue);
251 }
252
253 template <class T, class L, class A>
254 typename std::enable_if<is_numeric<T>::value, ublas::vector<T, A>>::type
prod(ublas::vector<T,A> const & a,ublas::matrix<T,L,A> const & b,compute::command_queue & queue)255 prod(ublas::vector<T, A> const &a,
256 ublas::matrix<T, L, A> const &b,
257 compute::command_queue &queue)
258 {
259 ublas::vector<T, A> result(b.size2());
260 prod(a, b, result, queue);
261 return result;
262 }
263
264 template<class T>
265 typename std::enable_if<std::is_fundamental<T>::value, T>::type
inner_prod(ublas::vector<T,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,compute::command_queue & queue)266 inner_prod(ublas::vector<T, opencl::storage> const &a,
267 ublas::vector<T, opencl::storage> const &b,
268 compute::command_queue &queue)
269 {
270 assert(a.device() == b.device() && a.device() == queue.get_device());
271 assert(a.size() == b.size());
272 return compute::inner_product(a.begin(), a.end(), b.begin(), T(0), queue);
273 }
274
275 template<class T, class A>
276 typename std::enable_if<std::is_fundamental<T>::value, T>::type
inner_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)277 inner_prod(ublas::vector<T, A> const &a,
278 ublas::vector<T, A> const &b,
279 compute::command_queue& queue)
280 {
281 ublas::vector<T, opencl::storage> adev(a, queue);
282 ublas::vector<T, opencl::storage> bdev(b, queue);
283 return inner_prod(adev, bdev, queue);
284 }
285
286 template <class T, class L>
287 typename std::enable_if<is_numeric<T>::value>::type
outer_prod(ublas::vector<T,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,ublas::matrix<T,L,opencl::storage> & result,compute::command_queue & queue)288 outer_prod(ublas::vector<T, opencl::storage> const &a,
289 ublas::vector<T, opencl::storage> const &b,
290 ublas::matrix<T, L, opencl::storage> &result,
291 compute::command_queue & queue)
292 {
293 assert(a.device() == b.device() &&
294 a.device() == result.device() &&
295 a.device() == queue.get_device());
296 result.fill(0, queue);
297 cl_event event = NULL;
298 clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
299 size_t lda = Order == clblasRowMajor ? 1 : a.size();
300 size_t ldb = Order == clblasRowMajor ? b.size() : 1;
301 size_t ldc = Order == clblasRowMajor ? b.size() : a.size();
302
303 if (std::is_same<T, float>::value)
304 clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
305 a.size(), b.size(), 1,
306 1, a.begin().get_buffer().get(), 0, lda,
307 b.begin().get_buffer().get(), 0, ldb, 1,
308 result.begin().get_buffer().get(), 0, ldc,
309 1, &(queue.get()), 0, NULL, &event);
310 else if (std::is_same<T, double>::value)
311 clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
312 a.size(), b.size(), 1,
313 1, a.begin().get_buffer().get(), 0, lda,
314 b.begin().get_buffer().get(), 0, ldb, 1,
315 result.begin().get_buffer().get(), 0, ldc,
316 1, &(queue.get()), 0, NULL, &event);
317 else if (std::is_same<T, std::complex<float>>::value)
318 clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
319 a.size(), b.size(), 1,
320 ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
321 b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
322 result.begin().get_buffer().get(), 0, ldc,
323 1, &(queue.get()), 0, NULL, &event);
324 else if (std::is_same<T, std::complex<double>>::value)
325 clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
326 a.size(), b.size(), 1,
327 ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
328 b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
329 result.begin().get_buffer().get(), 0, ldc,
330 1, &(queue.get()), 0, NULL, &event);
331 clWaitForEvents(1, &event);
332 }
333
334 template <class T, class L, class A>
335 typename std::enable_if<is_numeric<T>::value>::type
outer_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,ublas::matrix<T,L,A> & result,compute::command_queue & queue)336 outer_prod(ublas::vector<T, A> const &a,
337 ublas::vector<T, A> const &b,
338 ublas::matrix<T, L, A> &result,
339 compute::command_queue &queue)
340 {
341 ublas::vector<T, opencl::storage> adev(a, queue);
342 ublas::vector<T, opencl::storage> bdev(b, queue);
343 ublas::matrix<T, L, opencl::storage> rdev(a.size(), b.size(), queue.get_context());
344 outer_prod(adev, bdev, rdev, queue);
345 rdev.to_host(result, queue);
346 }
347
348 template <class T,class L = ublas::basic_row_major<>, class A>
349 typename std::enable_if<is_numeric<T>::value, ublas::matrix<T, L, A>>::type
outer_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)350 outer_prod(ublas::vector<T, A> const &a,
351 ublas::vector<T, A> const &b,
352 compute::command_queue &queue)
353 {
354 ublas::matrix<T, L, A> result(a.size(), b.size());
355 outer_prod(a, b, result, queue);
356 return result;
357 }
358
359 #undef ONE_DOUBLE_COMPLEX
360 #undef ONE_FLOAT_COMPLEX
361
362 }}}}
363
364 #endif
365