1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28
29 using namespace clover;
30
///
/// Build a kernel object for symbol \a name of \a prog.  Explicit
/// ("general") entries of \a bargs become settable argument objects, and
/// any per-device constant data section is uploaded into a read-only
/// buffer for use by constant_buffer arguments at bind time.
///
kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<binary::argument> &bargs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   for (auto &barg : bargs) {
      // Only "general" arguments are set through clSetKernelArg(); the
      // remaining semantics are bound implicitly at launch time.
      if (barg.semantic == binary::argument::general)
         _args.emplace_back(argument::create(barg));
   }
   for (auto &dev : prog.devices()) {
      auto &b = prog.build(dev).bin;
      auto bsym = find(name_equals(name), b.syms);
      const auto f = id_type_equals(bsym.section, binary::section::data_constant);
      // Not every device binary carries a constant data section.
      if (!any_of(f, b.secs))
         continue;

      auto mconst = find(f, b.secs);
      // Upload the section contents once per device into a read-only
      // buffer, referenced later by exec_context::bind().
      auto rb = std::make_unique<root_buffer>(prog.context(), std::vector<cl_mem_properties>(),
                                              CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
                                              mconst.size, mconst.data.data());
      _constant_buffers.emplace(&dev, std::move(rb));
   }
}
53
54 template<typename V>
55 static inline std::vector<uint>
pad_vector(command_queue & q,const V & v,uint x)56 pad_vector(command_queue &q, const V &v, uint x) {
57 std::vector<uint> w { v.begin(), v.end() };
58 w.resize(q.device().max_block_size().size(), x);
59 return w;
60 }
61
62 void
launch(command_queue & q,const std::vector<size_t> & grid_offset,const std::vector<size_t> & grid_size,const std::vector<size_t> & block_size)63 kernel::launch(command_queue &q,
64 const std::vector<size_t> &grid_offset,
65 const std::vector<size_t> &grid_size,
66 const std::vector<size_t> &block_size) {
67 const auto b = program().build(q.device()).bin;
68 const auto reduced_grid_size =
69 map(divides(), grid_size, block_size);
70
71 if (any_of(is_zero(), grid_size))
72 return;
73
74 void *st = exec.bind(&q, grid_offset);
75 struct pipe_grid_info info = {};
76
77 // The handles are created during exec_context::bind(), so we need make
78 // sure to call exec_context::bind() before retrieving them.
79 std::vector<uint32_t *> g_handles = map([&](size_t h) {
80 return (uint32_t *)&exec.input[h];
81 }, exec.g_handles);
82
83 q.pipe->bind_compute_state(q.pipe, st);
84 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
85 0, exec.samplers.size(),
86 exec.samplers.data());
87
88 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
89 exec.sviews.size(), 0, false, exec.sviews.data());
90 q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
91 exec.iviews.size(), 0, exec.iviews.data());
92 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
93 exec.resources.data());
94 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
95 exec.g_buffers.data(), g_handles.data());
96
97 // Fill information for the launch_grid() call.
98 info.work_dim = grid_size.size();
99 copy(pad_vector(q, block_size, 1), info.block);
100 copy(pad_vector(q, reduced_grid_size, 1), info.grid);
101 info.pc = find(name_equals(_name), b.syms).offset;
102 info.input = exec.input.data();
103 info.variable_shared_mem = exec.mem_local;
104
105 q.pipe->launch_grid(q.pipe, &info);
106
107 q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
108 q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
109 q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
110 0, exec.iviews.size(), NULL);
111 q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
112 0, exec.sviews.size(), false, NULL);
113 q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
114 exec.samplers.size(), NULL);
115
116 q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
117 exec.unbind();
118 }
119
120 size_t
mem_local() const121 kernel::mem_local() const {
122 size_t sz = 0;
123
124 for (auto &arg : args()) {
125 if (dynamic_cast<local_argument *>(&arg))
126 sz += arg.storage();
127 }
128
129 return sz;
130 }
131
///
/// Private memory usage isn't tracked here; always reports zero.
///
size_t
kernel::mem_private() const {
   return 0;
}
136
///
/// Name of the kernel as it appears in the binary's symbol table.
///
const std::string &
kernel::name() const {
   return _name;
}
141
///
/// Pick a block size for \a grid_size on the device of queue \a q by
/// delegating to factor::find_grid_optimal_factor() within the device's
/// per-block limits.
///
std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   // A zero-sized grid can't be factored; return it unchanged.
   if (any_of(is_zero(), grid_size))
      return grid_size;

   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}
152
///
/// Work-group size required by the kernel, as recorded in the binary's
/// symbol table (reqd_work_group_size).
///
std::vector<size_t>
kernel::required_block_size() const {
   return find(name_equals(_name), program().symbols()).reqd_work_group_size;
}
157
///
/// Range of the kernel's explicit arguments (dereferenced view of _args).
///
kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}
162
///
/// Read-only range of the kernel's explicit arguments.
///
kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}
167
168 std::vector<clover::binary::arg_info>
args_infos()169 kernel::args_infos() {
170 std::vector<clover::binary::arg_info> infos;
171 for (auto &barg: find(name_equals(_name), program().symbols()).args)
172 if (barg.semantic == clover::binary::argument::general)
173 infos.emplace_back(barg.info);
174
175 return infos;
176 }
177
///
/// Device binary this kernel was built into for the device of queue \a q.
///
const binary &
kernel::binary(const command_queue &q) const {
   return program().build(q.device()).bin;
}
182
///
/// Per-kernel execution context: accumulates the input buffer and
/// resource bindings while arguments are bound for a launch.
///
kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), print_handler(), mem_local(0), st(NULL), cs() {
}
186
kernel::exec_context::~exec_context() {
   // The compute state is owned by this context; return it to the pipe
   // context it was created from.
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}
191
///
/// Bind all kernel arguments for a launch on queue \a _q and (re)create
/// the pipe compute state if needed.  Returns the compute state handle.
///
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   // Swap rather than assign so the previous queue stays available in
   // _q for the queue-switch check and cleanup below.
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &b = kern.program().build(q->device()).bin;
   auto bsym = find(name_equals(kern.name()), b.syms);
   auto bargs = bsym.args;
   auto msec = find(id_type_equals(bsym.section, binary::section::text_executable), b.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &barg : bargs) {
      switch (barg.semantic) {
      case binary::argument::general:
         // Explicit argument previously supplied via clSetKernelArg().
         (*(explicit_arg++))->bind(*this, barg);
         break;

      case binary::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(barg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, barg);
         break;
      }
      case binary::argument::grid_offset: {
         // Implicit argument: one component per device grid dimension,
         // padded with zeros.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::image_size: {
         // Implicit argument: dimensions of the preceding image argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
               static_cast<cl_uint>(img->width()),
               static_cast<cl_uint>(img->height()),
               static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::image_format: {
         // Implicit argument: channel data type and order of the
         // preceding image argument.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
               static_cast<cl_uint>(fmt.image_channel_data_type),
               static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(barg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, barg);
         }
         break;
      }
      case binary::argument::constant_buffer: {
         // Implicit argument: buffer holding the binary's constant data
         // section, uploaded per device in kernel::kernel().
         auto arg = argument::create(barg);
         cl_mem buf = kern._constant_buffers.at(&q->device()).get();
         arg->set(sizeof(buf), &buf);
         arg->bind(*this, barg);
         break;
      }
      case binary::argument::printf_buffer: {
         // Implicit argument: buffer for printf() output, drained by
         // exec_context::unbind().
         print_handler = printf_handler::create(q, b.printf_infos,
                                                b.printf_strings_in_buffer,
                                                q->device().max_printf_buffer_size());
         cl_mem print_mem = print_handler->get_mem();

         auto arg = argument::create(barg);
         arg->set(sizeof(cl_mem), &print_mem);
         arg->bind(*this, barg);
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_input_mem != input.size()) {
      if (st)
         // Delete the stale state on the queue it was created for
         // (_q holds the previous queue after the swap above).
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      // we only pass in NIRs or LLVMs and both IRs decode the size
      cs.static_shared_mem = 0;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
      if (!st) {
         unbind(); // Cleanup
         throw error(CL_OUT_OF_RESOURCES);
      }
   }

   return st;
}
296
///
/// Tear down per-launch state: flush printf output, let each argument
/// release what it bound, and clear all accumulated buffers.
///
void
kernel::exec_context::unbind() {
   if (print_handler)
      print_handler->print();

   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   iviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}
314
namespace {
   ///
   /// Raw byte representation of \a x in native byte order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align_vector(T &v, size_t n) {
      // util_align_npot also handles non-power-of-two alignments.
      v.resize(util_align_npot(v.size(), n));
   }

   ///
   /// Sign bit of the byte buffer \a s interpreted as an integer in
   /// native byte order.
   ///
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum binary::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == binary::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // Copy the value-carrying bytes so they stay at the least
      // significant end for either endianness.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v.  Returns the
   /// offset of the first newly added element.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}
390
391 std::unique_ptr<kernel::argument>
create(const binary::argument & barg)392 kernel::argument::create(const binary::argument &barg) {
393 switch (barg.type) {
394 case binary::argument::scalar:
395 return std::unique_ptr<kernel::argument>(new scalar_argument(barg.size));
396
397 case binary::argument::global:
398 return std::unique_ptr<kernel::argument>(new global_argument);
399
400 case binary::argument::local:
401 return std::unique_ptr<kernel::argument>(new local_argument);
402
403 case binary::argument::constant:
404 return std::unique_ptr<kernel::argument>(new constant_argument);
405
406 case binary::argument::image_rd:
407 return std::unique_ptr<kernel::argument>(new image_rd_argument);
408
409 case binary::argument::image_wr:
410 return std::unique_ptr<kernel::argument>(new image_wr_argument);
411
412 case binary::argument::sampler:
413 return std::unique_ptr<kernel::argument>(new sampler_argument);
414
415 }
416 throw error(CL_INVALID_KERNEL_DEFINITION);
417 }
418
// Arguments start out unset; set() must be called before binding.
kernel::argument::argument() : _set(false) {
}
421
///
/// Whether a value has been supplied for this argument via set().
///
bool
kernel::argument::set() const {
   return _set;
}
426
///
/// Bytes of local memory consumed by this argument.  Zero by default;
/// overridden by local_argument.
///
size_t
kernel::argument::storage() const {
   return 0;
}
431
// \a size is the declared byte size of the scalar argument.
kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}
434
435 void
set(size_t size,const void * value)436 kernel::scalar_argument::set(size_t size, const void *value) {
437 if (!value)
438 throw error(CL_INVALID_ARG_VALUE);
439
440 if (size != this->size)
441 throw error(CL_INVALID_ARG_SIZE);
442
443 v = { (uint8_t *)value, (uint8_t *)value + size };
444 _set = true;
445 }
446
447 void
bind(exec_context & ctx,const binary::argument & barg)448 kernel::scalar_argument::bind(exec_context &ctx,
449 const binary::argument &barg) {
450 auto w = v;
451
452 extend(w, barg.ext_type, barg.target_size);
453 byteswap(w, ctx.q->device().endianness());
454 align_vector(ctx.input, barg.target_align);
455 insert(ctx.input, w);
456 }
457
void
kernel::scalar_argument::unbind(exec_context &ctx) {
   // Nothing to release: the value bytes were copied into ctx.input.
}
461
// Starts with neither a cl_mem buffer nor an SVM pointer bound.
kernel::global_argument::global_argument() : buf(nullptr), svm(nullptr) {
}
464
465 void
set(size_t size,const void * value)466 kernel::global_argument::set(size_t size, const void *value) {
467 if (size != sizeof(cl_mem))
468 throw error(CL_INVALID_ARG_SIZE);
469
470 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
471 svm = nullptr;
472 _set = true;
473 }
474
475 void
set_svm(const void * value)476 kernel::global_argument::set_svm(const void *value) {
477 svm = value;
478 buf = nullptr;
479 _set = true;
480 }
481
void
kernel::global_argument::bind(exec_context &ctx,
                              const binary::argument &barg) {
   align_vector(ctx.input, barg.target_align);

   if (buf) {
      const resource &r = buf->resource_in(*ctx.q);
      // Record the input-buffer offset of this handle; launch() patches
      // the real address there via set_global_binding().
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-dimensional offsets?
      // We don't need to. Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, barg.ext_type, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else if (svm) {
      // Pass the SVM host pointer value through verbatim.
      auto v = bytes(svm);
      extend(v, barg.ext_type, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, barg.target_size);
   }
}
509
void
kernel::global_argument::unbind(exec_context &ctx) {
   // Nothing to do per argument; global buffers are unbound via
   // set_global_binding(..., NULL, NULL) in kernel::launch().
}
513
///
/// Size in bytes of the local-memory allocation requested via set().
///
size_t
kernel::local_argument::storage() const {
   return _storage;
}
518
void
kernel::local_argument::set(size_t size, const void *value) {
   // __local arguments carry only an allocation size: the value pointer
   // must be NULL and the size non-zero.
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   if (!size)
      throw error(CL_INVALID_ARG_SIZE);

   _storage = size;
   _set = true;
}
530
void
kernel::local_argument::bind(exec_context &ctx,
                             const binary::argument &barg) {
   // Reserve this argument's chunk of local memory and pass its byte
   // offset as the argument value.
   ctx.mem_local = ::align(ctx.mem_local, barg.target_align);
   auto v = bytes(ctx.mem_local);

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   // Offsets are pointer-sized on the device.
   align_vector(ctx.input, ctx.q->device().address_bits() / 8);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}
544
void
kernel::local_argument::unbind(exec_context &ctx) {
   // Nothing to release: exec_context::unbind() resets mem_local.
}
548
// Starts with no buffer bound and no surface created.
kernel::constant_argument::constant_argument() : buf(nullptr), st(nullptr) {
}
551
552 void
set(size_t size,const void * value)553 kernel::constant_argument::set(size_t size, const void *value) {
554 if (size != sizeof(cl_mem))
555 throw error(CL_INVALID_ARG_SIZE);
556
557 buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
558 _set = true;
559 }
560
void
kernel::constant_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   align_vector(ctx.input, barg.target_align);

   if (buf) {
      resource &r = buf->resource_in(*ctx.q);
      // Encode the compute-resource index in the top byte and the
      // buffer offset in the lower 24 bits.
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, binary::argument::zero_ext, barg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, barg.target_size);
   }
}
581
void
kernel::constant_argument::unbind(exec_context &ctx) {
   // Release the surface created in bind(), if a buffer was bound.
   if (buf)
      buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);
}
587
// The sampler view is created later, at bind() time.
kernel::image_rd_argument::image_rd_argument() : st(nullptr) {
}
590
591 void
set(size_t size,const void * value)592 kernel::image_rd_argument::set(size_t size, const void *value) {
593 if (!value)
594 throw error(CL_INVALID_ARG_VALUE);
595
596 if (size != sizeof(cl_mem))
597 throw error(CL_INVALID_ARG_SIZE);
598
599 img = &obj<image>(*(cl_mem *)value);
600 _set = true;
601 }
602
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   // Pass the index of the sampler view as the argument value.
   auto v = bytes(ctx.sviews.size());

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align_vector(ctx.input, barg.target_align);
   insert(ctx.input, v);

   st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}
616
void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   // Release the sampler view created in bind().
   img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);
}
621
622 void
set(size_t size,const void * value)623 kernel::image_wr_argument::set(size_t size, const void *value) {
624 if (!value)
625 throw error(CL_INVALID_ARG_VALUE);
626
627 if (size != sizeof(cl_mem))
628 throw error(CL_INVALID_ARG_SIZE);
629
630 img = &obj<image>(*(cl_mem *)value);
631 _set = true;
632 }
633
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const binary::argument &barg) {
   // Pass the index of the image view as the argument value.
   auto v = bytes(ctx.iviews.size());

   extend(v, binary::argument::zero_ext, barg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align_vector(ctx.input, barg.target_align);
   insert(ctx.input, v);
   ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));
}
645
void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   // Nothing to do per argument; image views are unbound wholesale via
   // set_shader_images() in kernel::launch().
}
649
// Starts with no sampler object and no bound sampler state.
kernel::sampler_argument::sampler_argument() : s(nullptr), st(nullptr) {
}
652
653 void
set(size_t size,const void * value)654 kernel::sampler_argument::set(size_t size, const void *value) {
655 if (!value)
656 throw error(CL_INVALID_SAMPLER);
657
658 if (size != sizeof(cl_sampler))
659 throw error(CL_INVALID_ARG_SIZE);
660
661 s = &obj(*(cl_sampler *)value);
662 _set = true;
663 }
664
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const binary::argument &barg) {
   // Samplers take no space in the input buffer; they are bound as pipe
   // sampler states instead.
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}
671
void
kernel::sampler_argument::unbind(exec_context &ctx) {
   // Release the sampler state created in bind().
   s->unbind(*ctx.q, st);
}
676