1 // Boost.uBLAS
2 //
3 // Copyright (c) 2018 Fady Essam
4 // Copyright (c) 2018 Stefan Seefeld
5 //
6 // Distributed under the Boost Software License, Version 1.0.
7 // (See accompanying file LICENSE_1_0.txt or
8 // copy at http://www.boost.org/LICENSE_1_0.txt)
9 
10 #ifndef boost_numeric_ublas_opencl_prod_hpp_
11 #define boost_numeric_ublas_opencl_prod_hpp_
12 
13 #include <boost/numeric/ublas/opencl/library.hpp>
14 #include <boost/numeric/ublas/opencl/vector.hpp>
15 #include <boost/numeric/ublas/opencl/matrix.hpp>
16 #include <boost/numeric/ublas/opencl/transpose.hpp>
17 #include <boost/compute/buffer.hpp>
18 
19 namespace boost { namespace numeric { namespace ublas { namespace opencl {
20 
21 #define ONE_DOUBLE_COMPLEX  {{1.0, 00.0}}
22 #define ONE_FLOAT_COMPLEX  {{1.0f, 00.0f}}
23 
24 template <typename T, typename L1, typename L2>
25 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L1,opencl::storage> const & a,ublas::matrix<T,L2,opencl::storage> const & b,ublas::matrix<T,L1,opencl::storage> & result,compute::command_queue & queue)26 prod(ublas::matrix<T, L1, opencl::storage> const &a,
27      ublas::matrix<T, L2, opencl::storage> const &b,
28      ublas::matrix<T, L1, opencl::storage> &result,
29      compute::command_queue &queue)
30 {
31   assert(a.device() == b.device() &&
32 	 a.device() == result.device() &&
33 	 a.device() == queue.get_device());
34   assert(a.size2() == b.size1());
35 
36   result.fill(0, queue);
37 
38   //to hold matrix b with layout 1 if the b has different layout
39   std::unique_ptr<ublas::matrix<T, L1, opencl::storage>> bl1;
40 
41   cl_event event = NULL;
42 
43   cl_mem buffer_a = a.begin().get_buffer().get();
44   cl_mem buffer_b = b.begin().get_buffer().get();
45   cl_mem buffer_result = result.begin().get_buffer().get();
46 
47   if (!(std::is_same<L1, L2>::value))
48   {
49     bl1.reset(new ublas::matrix<T, L1, opencl::storage>(b.size1(), b.size2(), queue.get_context()));
50     change_layout(b, *bl1, queue);
51     buffer_b = bl1->begin().get_buffer().get();
52   }
53 
54   clblasOrder Order = std::is_same<L1, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
55   size_t lda = Order == clblasRowMajor ? a.size2() : a.size1();
56   size_t ldb = Order == clblasRowMajor ? b.size2() : a.size2();
57   size_t ldc = Order == clblasRowMajor ? b.size2() : a.size1();
58 
59   if (std::is_same<T, float>::value)
60     clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
61 		a.size1(), b.size2(), a.size2(),
62 		1, buffer_a, 0, lda,
63 		buffer_b, 0, ldb, 1,
64 		buffer_result, 0, ldc,
65 		1, &(queue.get()), 0, NULL, &event);
66   else if (std::is_same<T, double>::value)
67     clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
68 		a.size1(), b.size2(), a.size2(),
69 		1, buffer_a, 0, lda,
70 		buffer_b, 0, ldb, 1,
71 		buffer_result, 0, ldc,
72 		1, &(queue.get()), 0, NULL, &event);
73   else if (std::is_same<T, std::complex<float>>::value)
74     clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
75 		a.size1(), b.size2(), a.size2(),
76 		ONE_FLOAT_COMPLEX, buffer_a, 0, lda,
77 		buffer_b, 0, ldb, ONE_FLOAT_COMPLEX,
78 		buffer_result, 0, ldc,
79 		1, &(queue.get()), 0, NULL, &event);
80   else if (std::is_same<T, std::complex<double>>::value)
81     clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
82 		a.size1(), b.size2(), a.size2(),
83 		ONE_DOUBLE_COMPLEX, buffer_a, 0, lda,
84 		buffer_b, 0, ldb, ONE_DOUBLE_COMPLEX,
85 		buffer_result, 0, ldc,
86 		1, &(queue.get()), 0, NULL, &event);
87   clWaitForEvents(1, &event);
88 }
89 
90 template <typename T, typename L1, typename L2, typename A>
91 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L1,A> const & a,ublas::matrix<T,L2,A> const & b,ublas::matrix<T,L1,A> & result,compute::command_queue & queue)92 prod(ublas::matrix<T, L1, A> const &a,
93      ublas::matrix<T, L2, A> const &b,
94      ublas::matrix<T, L1, A> &result,
95      compute::command_queue &queue)
96 {
97   ublas::matrix<T, L1, opencl::storage> adev(a, queue);
98   ublas::matrix<T, L2, opencl::storage> bdev(b, queue);
99   ublas::matrix<T, L1, opencl::storage> rdev(a.size1(), b.size2(), queue.get_context());
100   prod(adev, bdev, rdev, queue);
101   rdev.to_host(result,queue);
102 }
103 
104 template <typename T, typename L1, typename L2, typename A>
105 typename std::enable_if<is_numeric<T>::value, ublas::matrix<T, L1, A>>::type
prod(ublas::matrix<T,L1,A> const & a,ublas::matrix<T,L2,A> const & b,compute::command_queue & queue)106 prod(ublas::matrix<T, L1, A> const &a,
107      ublas::matrix<T, L2, A> const &b,
108      compute::command_queue &queue)
109 {
110   ublas::matrix<T, L1, A> result(a.size1(), b.size2());
111   prod(a, b, result, queue);
112   return result;
113 }
114 
115 template <typename T, typename L>
116 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,ublas::vector<T,opencl::storage> & result,compute::command_queue & queue)117 prod(ublas::matrix<T, L, opencl::storage> const &a,
118      ublas::vector<T, opencl::storage> const &b,
119      ublas::vector<T, opencl::storage> &result,
120      compute::command_queue &queue)
121 {
122   assert(a.device() == b.device() &&
123 	 a.device() == result.device() &&
124 	 a.device() == queue.get_device());
125   assert(a.size2() == b.size());
126   result.fill(0, queue);
127 
128   cl_event event = NULL;
129   clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
130   int lda = Order == clblasRowMajor ? a.size2() : a.size1();
131   int ldb = Order == clblasRowMajor ? 1 : a.size2();
132   int ldc = Order == clblasRowMajor ? 1 : a.size1();
133 
134   if (std::is_same<T, float>::value)
135     clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
136 		a.size1(), 1, a.size2(),
137 		1, a.begin().get_buffer().get(), 0, lda,
138 		b.begin().get_buffer().get(), 0, ldb, 1,
139 		result.begin().get_buffer().get(), 0, ldc,
140 		1, &(queue.get()), 0, NULL, &event);
141   else if (std::is_same<T, double>::value)
142     clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
143 		a.size1(), 1, a.size2(),
144 		1, a.begin().get_buffer().get(), 0, lda,
145 		b.begin().get_buffer().get(), 0, ldb, 1,
146 		result.begin().get_buffer().get(), 0, ldc,
147 		1, &(queue.get()), 0, NULL, &event);
148   else if (std::is_same<T, std::complex<float>>::value)
149     clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
150 		a.size1(), 1, a.size2(),
151 		ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
152 		b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
153 		result.begin().get_buffer().get(), 0, ldc,
154 		1, &(queue.get()), 0, NULL, &event);
155   else if (std::is_same<T, std::complex<double>>::value)
156     clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
157 		a.size1(), 1, a.size2(),
158 		ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
159 		b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
160 		result.begin().get_buffer().get(), 0, ldc,
161 		1, &(queue.get()), 0, NULL, &event);
162   clWaitForEvents(1, &event);
163 }
164 
165 template <typename T, typename L, typename A>
166 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::matrix<T,L,A> const & a,ublas::vector<T,A> const & b,ublas::vector<T,A> & result,compute::command_queue & queue)167 prod(ublas::matrix<T, L, A> const &a,
168      ublas::vector<T, A> const &b,
169      ublas::vector<T, A> &result,
170      compute::command_queue &queue)
171 {
172   ublas::matrix<T, L, opencl::storage> adev(a, queue);
173   ublas::vector<T, opencl::storage> bdev(b, queue);
174   ublas::vector<T, opencl::storage> rdev(a.size1(), queue.get_context());
175   prod(adev, bdev, rdev, queue);
176   rdev.to_host(result, queue);
177 }
178 
179 template <typename T, typename L, typename A>
180 typename std::enable_if<is_numeric<T>::value, ublas::vector<T, A>>::type
prod(ublas::matrix<T,L,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)181 prod(ublas::matrix<T, L, A> const &a,
182      ublas::vector<T, A> const &b,
183      compute::command_queue &queue)
184 {
185   ublas::vector<T, A> result(a.size1());
186   prod(a, b, result, queue);
187   return result;
188 }
189 
190 template <typename T, typename L>
191 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::vector<T,opencl::storage> const & a,ublas::matrix<T,L,opencl::storage> const & b,ublas::vector<T,opencl::storage> & result,compute::command_queue & queue)192 prod(ublas::vector<T, opencl::storage> const &a,
193      ublas::matrix<T, L, opencl::storage> const &b,
194      ublas::vector<T, opencl::storage> &result,
195      compute::command_queue &queue)
196 {
197   assert(a.device() == b.device() &&
198 	 a.device() == result.device() &&
199 	 a.device() == queue.get_device());
200   assert(a.size() == b.size1());
201   result.fill(0, queue);
202   cl_event event = NULL;
203   clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
204   size_t lda = Order == clblasRowMajor ? a.size() : 1;
205   size_t ldb = Order == clblasRowMajor ? b.size2() : a.size();
206   size_t ldc = Order == clblasRowMajor ? b.size2() : 1;
207 
208   if (std::is_same<T, float>::value)
209     clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
210 		1, b.size2(), a.size(),
211 		1, a.begin().get_buffer().get(), 0, lda,
212 		b.begin().get_buffer().get(), 0, ldb, 1,
213 		result.begin().get_buffer().get(), 0, ldc,
214 		1, &(queue.get()), 0, NULL, &event);
215   else if (std::is_same<T, double>::value)
216     clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
217 		1, b.size2(), a.size(),
218 		1, a.begin().get_buffer().get(), 0, lda,
219 		b.begin().get_buffer().get(), 0, ldb, 1,
220 		result.begin().get_buffer().get(), 0, ldc,
221 		1, &(queue.get()), 0, NULL, &event);
222   else if (std::is_same<T, std::complex<float>>::value)
223     clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
224 		1, b.size2(), a.size(),
225 		ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
226 		b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
227 		result.begin().get_buffer().get(), 0, ldc,
228 		1, &(queue.get()), 0, NULL, &event);
229   else if (std::is_same<T, std::complex<double>>::value)
230     clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
231 		1, b.size2(), a.size(),
232 		ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
233 		b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
234 		result.begin().get_buffer().get(), 0, ldc,
235 		1, &(queue.get()), 0, NULL, &event);
236   clWaitForEvents(1, &event);
237 }
238 
239 template <class T, class L, class A>
240 typename std::enable_if<is_numeric<T>::value>::type
prod(ublas::vector<T,A> const & a,ublas::matrix<T,L,A> const & b,ublas::vector<T,A> & result,compute::command_queue & queue)241 prod(ublas::vector<T, A> const &a,
242      ublas::matrix<T, L, A> const &b,
243      ublas::vector<T, A> &result,
244      compute::command_queue &queue)
245 {
246   ublas::vector<T, opencl::storage> adev(a, queue);
247   ublas::matrix<T, L, opencl::storage> bdev(b, queue);
248   ublas::vector<T, opencl::storage> rdev(b.size2(), queue.get_context());
249   prod(adev, bdev, rdev, queue);
250   rdev.to_host(result, queue);
251 }
252 
253 template <class T, class L, class A>
254 typename std::enable_if<is_numeric<T>::value, ublas::vector<T, A>>::type
prod(ublas::vector<T,A> const & a,ublas::matrix<T,L,A> const & b,compute::command_queue & queue)255 prod(ublas::vector<T, A> const &a,
256      ublas::matrix<T, L, A> const &b,
257      compute::command_queue &queue)
258 {
259   ublas::vector<T, A> result(b.size2());
260   prod(a, b, result, queue);
261   return result;
262 }
263 
264 template<class T>
265 typename std::enable_if<std::is_fundamental<T>::value, T>::type
inner_prod(ublas::vector<T,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,compute::command_queue & queue)266 inner_prod(ublas::vector<T, opencl::storage> const &a,
267 	   ublas::vector<T, opencl::storage> const &b,
268 	   compute::command_queue &queue)
269 {
270   assert(a.device() == b.device() && a.device() == queue.get_device());
271   assert(a.size() == b.size());
272   return compute::inner_product(a.begin(), a.end(), b.begin(), T(0), queue);
273 }
274 
275 template<class T, class A>
276 typename std::enable_if<std::is_fundamental<T>::value, T>::type
inner_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)277 inner_prod(ublas::vector<T, A> const &a,
278 	   ublas::vector<T, A> const &b,
279 	   compute::command_queue& queue)
280 {
281   ublas::vector<T, opencl::storage> adev(a, queue);
282   ublas::vector<T, opencl::storage> bdev(b, queue);
283   return inner_prod(adev, bdev, queue);
284 }
285 
286 template <class T, class L>
287 typename std::enable_if<is_numeric<T>::value>::type
outer_prod(ublas::vector<T,opencl::storage> const & a,ublas::vector<T,opencl::storage> const & b,ublas::matrix<T,L,opencl::storage> & result,compute::command_queue & queue)288 outer_prod(ublas::vector<T, opencl::storage> const &a,
289 	   ublas::vector<T, opencl::storage> const &b,
290 	   ublas::matrix<T, L, opencl::storage> &result,
291 	   compute::command_queue & queue)
292 {
293   assert(a.device() == b.device() &&
294 	 a.device() == result.device() &&
295 	 a.device() == queue.get_device());
296   result.fill(0, queue);
297   cl_event event = NULL;
298   clblasOrder Order = std::is_same<L, ublas::basic_row_major<> >::value ? clblasRowMajor : clblasColumnMajor;
299   size_t lda = Order == clblasRowMajor ? 1 : a.size();
300   size_t ldb = Order == clblasRowMajor ? b.size() : 1;
301   size_t ldc = Order == clblasRowMajor ? b.size() : a.size();
302 
303   if (std::is_same<T, float>::value)
304     clblasSgemm(Order, clblasNoTrans, clblasNoTrans,
305 		a.size(), b.size(), 1,
306 		1, a.begin().get_buffer().get(), 0, lda,
307 		b.begin().get_buffer().get(), 0, ldb, 1,
308 		result.begin().get_buffer().get(), 0, ldc,
309 		1, &(queue.get()), 0, NULL, &event);
310   else if (std::is_same<T, double>::value)
311     clblasDgemm(Order, clblasNoTrans, clblasNoTrans,
312 		a.size(), b.size(), 1,
313 		1, a.begin().get_buffer().get(), 0, lda,
314 		b.begin().get_buffer().get(), 0, ldb, 1,
315 		result.begin().get_buffer().get(), 0, ldc,
316 		1, &(queue.get()), 0, NULL, &event);
317   else if (std::is_same<T, std::complex<float>>::value)
318     clblasCgemm(Order, clblasNoTrans, clblasNoTrans,
319 		a.size(), b.size(), 1,
320 		ONE_FLOAT_COMPLEX, a.begin().get_buffer().get(), 0, lda,
321 		b.begin().get_buffer().get(), 0, ldb, ONE_FLOAT_COMPLEX,
322 		result.begin().get_buffer().get(), 0, ldc,
323 		1, &(queue.get()), 0, NULL, &event);
324   else if (std::is_same<T, std::complex<double>>::value)
325     clblasZgemm(Order, clblasNoTrans, clblasNoTrans,
326 		a.size(), b.size(), 1,
327 		ONE_DOUBLE_COMPLEX, a.begin().get_buffer().get(), 0, lda,
328 		b.begin().get_buffer().get(), 0, ldb, ONE_DOUBLE_COMPLEX,
329 		result.begin().get_buffer().get(), 0, ldc,
330 		1, &(queue.get()), 0, NULL, &event);
331   clWaitForEvents(1, &event);
332 }
333 
334 template <class T, class L, class A>
335 typename std::enable_if<is_numeric<T>::value>::type
outer_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,ublas::matrix<T,L,A> & result,compute::command_queue & queue)336 outer_prod(ublas::vector<T, A> const &a,
337 	   ublas::vector<T, A> const &b,
338 	   ublas::matrix<T, L, A> &result,
339 	   compute::command_queue &queue)
340 {
341   ublas::vector<T, opencl::storage> adev(a, queue);
342   ublas::vector<T, opencl::storage> bdev(b, queue);
343   ublas::matrix<T, L, opencl::storage> rdev(a.size(), b.size(), queue.get_context());
344   outer_prod(adev, bdev, rdev, queue);
345   rdev.to_host(result, queue);
346 }
347 
348 template <class T,class L = ublas::basic_row_major<>, class A>
349 typename std::enable_if<is_numeric<T>::value, ublas::matrix<T, L, A>>::type
outer_prod(ublas::vector<T,A> const & a,ublas::vector<T,A> const & b,compute::command_queue & queue)350 outer_prod(ublas::vector<T, A> const &a,
351 	   ublas::vector<T, A> const &b,
352 	   compute::command_queue &queue)
353 {
354   ublas::matrix<T, L, A> result(a.size(), b.size());
355   outer_prod(a, b, result, queue);
356   return result;
357 }
358 
359 #undef ONE_DOUBLE_COMPLEX
360 #undef ONE_FLOAT_COMPLEX
361 
362 }}}}
363 
364 #endif
365