1 /*
2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34
35 #include <linux/kref.h>
36 #include <linux/random.h>
37 #include <linux/debugfs.h>
38 #include <linux/export.h>
39 #include <linux/delay.h>
40 #include <linux/dma-buf.h>
41 #include <linux/dma-resv.h>
42 #include <rdma/ib_umem_odp.h>
43 #include "dm.h"
44 #include "mlx5_ib.h"
45 #include "umr.h"
46 #include "data_direct.h"
47
48 enum {
49 MAX_PENDING_REG_MR = 8,
50 };
51
52 #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
53 #define MLX5_UMR_ALIGN 2048
54
55 static void
56 create_mkey_callback(int status, struct mlx5_async_work *context);
57 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
58 u64 iova, int access_flags,
59 unsigned long page_size, bool populate,
60 int access_mode);
61 static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
62
63 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
64 struct ib_pd *pd)
65 {
66 struct mlx5_ib_dev *dev = to_mdev(pd->device);
67
68 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
69 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
70 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
71 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
72 MLX5_SET(mkc, mkc, lr, 1);
73
74 if (acc & IB_ACCESS_RELAXED_ORDERING) {
75 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
76 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
77
78 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
79 (MLX5_CAP_GEN(dev->mdev,
80 relaxed_ordering_read_pci_enabled) &&
81 pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
82 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
83 }
84
85 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
86 MLX5_SET(mkc, mkc, qpn, 0xffffff);
87 MLX5_SET64(mkc, mkc, start_addr, start_addr);
88 }
89
90 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
91 {
92 u8 key = atomic_inc_return(&dev->mkey_var);
93 void *mkc;
94
95 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
96 MLX5_SET(mkc, mkc, mkey_7_0, key);
97 *mkey = key;
98 }
99
100 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
101 struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
102 {
103 int ret;
104
105 assign_mkey_variant(dev, &mkey->key, in);
106 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
107 if (!ret)
108 init_waitqueue_head(&mkey->wait);
109
110 return ret;
111 }
112
113 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
114 {
115 struct mlx5_ib_dev *dev = async_create->ent->dev;
116 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
117 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
118
119 MLX5_SET(create_mkey_in, async_create->in, opcode,
120 MLX5_CMD_OP_CREATE_MKEY);
121 assign_mkey_variant(dev, &async_create->mkey, async_create->in);
122 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
123 async_create->out, outlen, create_mkey_callback,
124 &async_create->cb_work);
125 }
126
127 static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
128 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
129
130 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
131 {
132 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
133
134 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
135 }
136
137 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
138 {
139 if (status == -ENXIO) /* core driver is not available */
140 return;
141
142 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
143 if (status != -EREMOTEIO) /* driver specific failure */
144 return;
145
146 /* Failed in FW, print cmd out failure details */
147 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
148 }
149
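/*
 * Append a free mkey to the entry's paged queue. Callers hold
 * mkeys_queue.lock; a new page is allocated with GFP_ATOMIC (this can run
 * from the async completion path) once the existing pages are full.
 */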
150 static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
151 {
152 unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
153 struct mlx5_mkeys_page *page;
154
155 lockdep_assert_held(&ent->mkeys_queue.lock);
156 if (ent->mkeys_queue.ci >=
157 ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
158 page = kzalloc(sizeof(*page), GFP_ATOMIC);
159 if (!page)
160 return -ENOMEM;
161 ent->mkeys_queue.num_pages++;
162 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
163 } else {
164 page = list_last_entry(&ent->mkeys_queue.pages_list,
165 struct mlx5_mkeys_page, list);
166 }
167
168 page->mkeys[tmp] = mkey;
169 ent->mkeys_queue.ci++;
170 return 0;
171 }
172
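/*
 * Remove and return the most recently pushed mkey (LIFO order). The trailing
 * page is freed once it becomes empty; the first page is always kept.
 */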
173 static int pop_mkey_locked(struct mlx5_cache_ent *ent)
174 {
175 unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
176 struct mlx5_mkeys_page *last_page;
177 u32 mkey;
178
179 lockdep_assert_held(&ent->mkeys_queue.lock);
180 last_page = list_last_entry(&ent->mkeys_queue.pages_list,
181 struct mlx5_mkeys_page, list);
182 mkey = last_page->mkeys[tmp];
183 last_page->mkeys[tmp] = 0;
184 ent->mkeys_queue.ci--;
185 if (ent->mkeys_queue.num_pages > 1 && !tmp) {
186 list_del(&last_page->list);
187 ent->mkeys_queue.num_pages--;
188 kfree(last_page);
189 }
190 return mkey;
191 }
192
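/*
 * Completion handler for the asynchronous CREATE_MKEY issued by add_keys().
 * On failure the cache fill is throttled by setting fill_delay and arming the
 * delay timer; on success the new mkey is pushed into the entry and the queue
 * state is re-evaluated so filling continues toward the high water mark.
 */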
193 static void create_mkey_callback(int status, struct mlx5_async_work *context)
194 {
195 struct mlx5r_async_create_mkey *mkey_out =
196 container_of(context, struct mlx5r_async_create_mkey, cb_work);
197 struct mlx5_cache_ent *ent = mkey_out->ent;
198 struct mlx5_ib_dev *dev = ent->dev;
199 unsigned long flags;
200
201 if (status) {
202 create_mkey_warn(dev, status, mkey_out->out);
203 kfree(mkey_out);
204 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
205 ent->pending--;
206 WRITE_ONCE(dev->fill_delay, 1);
207 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
208 mod_timer(&dev->delay_timer, jiffies + HZ);
209 return;
210 }
211
212 mkey_out->mkey |= mlx5_idx_to_mkey(
213 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
214 WRITE_ONCE(dev->cache.last_add, jiffies);
215
216 spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
217 push_mkey_locked(ent, mkey_out->mkey);
218 ent->pending--;
219 /* If we are doing fill_to_high_water then keep going. */
220 queue_adjust_cache_locked(ent);
221 spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
222 kfree(mkey_out);
223 }
224
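/*
 * Convert a descriptor count into the mkey context translations_octword_size,
 * i.e. the number of 16-byte octowords occupied by MTT or KLM/KSM descriptors.
 */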
225 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
226 {
227 int ret = 0;
228
229 switch (access_mode) {
230 case MLX5_MKC_ACCESS_MODE_MTT:
231 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
232 sizeof(struct mlx5_mtt));
233 break;
234 case MLX5_MKC_ACCESS_MODE_KSM:
235 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
236 sizeof(struct mlx5_klm));
237 break;
238 default:
239 WARN_ON(1);
240 }
241 return ret;
242 }
243
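/*
 * Fill the mkey context for a cache entry: a free, UMR-enabled mkey whose
 * access mode, access flags and translation size come from the entry's rb_key.
 */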
244 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
245 {
246 set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
247 ent->dev->umrc.pd);
248 MLX5_SET(mkc, mkc, free, 1);
249 MLX5_SET(mkc, mkc, umr_en, 1);
250 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
251 MLX5_SET(mkc, mkc, access_mode_4_2,
252 (ent->rb_key.access_mode >> 2) & 0x7);
253 MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
254
255 MLX5_SET(mkc, mkc, translations_octword_size,
256 get_mkc_octo_size(ent->rb_key.access_mode,
257 ent->rb_key.ndescs));
258 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
259 }
260
261 /* Asynchronously schedule new MRs to be populated in the cache. */
262 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
263 {
264 struct mlx5r_async_create_mkey *async_create;
265 void *mkc;
266 int err = 0;
267 int i;
268
269 for (i = 0; i < num; i++) {
270 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
271 GFP_KERNEL);
272 if (!async_create)
273 return -ENOMEM;
274 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
275 memory_key_mkey_entry);
276 set_cache_mkc(ent, mkc);
277 async_create->ent = ent;
278
279 spin_lock_irq(&ent->mkeys_queue.lock);
280 if (ent->pending >= MAX_PENDING_REG_MR) {
281 err = -EAGAIN;
282 goto free_async_create;
283 }
284 ent->pending++;
285 spin_unlock_irq(&ent->mkeys_queue.lock);
286
287 err = mlx5_ib_create_mkey_cb(async_create);
288 if (err) {
289 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
290 goto err_create_mkey;
291 }
292 }
293
294 return 0;
295
296 err_create_mkey:
297 spin_lock_irq(&ent->mkeys_queue.lock);
298 ent->pending--;
299 free_async_create:
300 spin_unlock_irq(&ent->mkeys_queue.lock);
301 kfree(async_create);
302 return err;
303 }
304
305 /* Synchronously create an mkey for a cache entry */
306 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
307 {
308 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
309 void *mkc;
310 u32 *in;
311 int err;
312
313 in = kzalloc(inlen, GFP_KERNEL);
314 if (!in)
315 return -ENOMEM;
316 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
317 set_cache_mkc(ent, mkc);
318
319 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
320 if (err)
321 goto free_in;
322
323 WRITE_ONCE(ent->dev->cache.last_add, jiffies);
324 free_in:
325 kfree(in);
326 return err;
327 }
328
329 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
330 {
331 u32 mkey;
332
333 lockdep_assert_held(&ent->mkeys_queue.lock);
334 if (!ent->mkeys_queue.ci)
335 return;
336 mkey = pop_mkey_locked(ent);
337 spin_unlock_irq(&ent->mkeys_queue.lock);
338 mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
339 spin_lock_irq(&ent->mkeys_queue.lock);
340 }
341
342 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
343 bool limit_fill)
344 __acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
345 {
346 int err;
347
348 lockdep_assert_held(&ent->mkeys_queue.lock);
349
350 while (true) {
351 if (limit_fill)
352 target = ent->limit * 2;
353 if (target == ent->pending + ent->mkeys_queue.ci)
354 return 0;
355 if (target > ent->pending + ent->mkeys_queue.ci) {
356 u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
357
358 spin_unlock_irq(&ent->mkeys_queue.lock);
359 err = add_keys(ent, todo);
360 if (err == -EAGAIN)
361 usleep_range(3000, 5000);
362 spin_lock_irq(&ent->mkeys_queue.lock);
363 if (err) {
364 if (err != -EAGAIN)
365 return err;
366 } else
367 return 0;
368 } else {
369 remove_cache_mr_locked(ent);
370 }
371 }
372 }
373
374 static ssize_t size_write(struct file *filp, const char __user *buf,
375 size_t count, loff_t *pos)
376 {
377 struct mlx5_cache_ent *ent = filp->private_data;
378 u32 target;
379 int err;
380
381 err = kstrtou32_from_user(buf, count, 0, &target);
382 if (err)
383 return err;
384
385 /*
386 * Target is the new value of total_mrs the user requests, however we
387 * cannot free MRs that are in use. Compute the target value for stored
388 * mkeys.
389 */
390 spin_lock_irq(&ent->mkeys_queue.lock);
391 if (target < ent->in_use) {
392 err = -EINVAL;
393 goto err_unlock;
394 }
395 target = target - ent->in_use;
396 if (target < ent->limit || target > ent->limit*2) {
397 err = -EINVAL;
398 goto err_unlock;
399 }
400 err = resize_available_mrs(ent, target, false);
401 if (err)
402 goto err_unlock;
403 spin_unlock_irq(&ent->mkeys_queue.lock);
404
405 return count;
406
407 err_unlock:
408 spin_unlock_irq(&ent->mkeys_queue.lock);
409 return err;
410 }
411
412 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
413 loff_t *pos)
414 {
415 struct mlx5_cache_ent *ent = filp->private_data;
416 char lbuf[20];
417 int err;
418
419 err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
420 ent->mkeys_queue.ci + ent->in_use);
421 if (err < 0)
422 return err;
423
424 return simple_read_from_buffer(buf, count, pos, lbuf, err);
425 }
426
427 static const struct file_operations size_fops = {
428 .owner = THIS_MODULE,
429 .open = simple_open,
430 .write = size_write,
431 .read = size_read,
432 };
433
434 static ssize_t limit_write(struct file *filp, const char __user *buf,
435 size_t count, loff_t *pos)
436 {
437 struct mlx5_cache_ent *ent = filp->private_data;
438 u32 var;
439 int err;
440
441 err = kstrtou32_from_user(buf, count, 0, &var);
442 if (err)
443 return err;
444
445 /*
446 * Upon set we immediately fill the cache to high water mark implied by
447 * the limit.
448 */
449 spin_lock_irq(&ent->mkeys_queue.lock);
450 ent->limit = var;
451 err = resize_available_mrs(ent, 0, true);
452 spin_unlock_irq(&ent->mkeys_queue.lock);
453 if (err)
454 return err;
455 return count;
456 }
457
458 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
459 loff_t *pos)
460 {
461 struct mlx5_cache_ent *ent = filp->private_data;
462 char lbuf[20];
463 int err;
464
465 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
466 if (err < 0)
467 return err;
468
469 return simple_read_from_buffer(buf, count, pos, lbuf, err);
470 }
471
472 static const struct file_operations limit_fops = {
473 .owner = THIS_MODULE,
474 .open = simple_open,
475 .write = limit_write,
476 .read = limit_read,
477 };
478
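/*
 * Return true if any cache entry currently holds fewer mkeys than its limit,
 * i.e. some entry is still being (or needs to be) refilled.
 */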
479 static bool someone_adding(struct mlx5_mkey_cache *cache)
480 {
481 struct mlx5_cache_ent *ent;
482 struct rb_node *node;
483 bool ret;
484
485 mutex_lock(&cache->rb_lock);
486 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
487 ent = rb_entry(node, struct mlx5_cache_ent, node);
488 spin_lock_irq(&ent->mkeys_queue.lock);
489 ret = ent->mkeys_queue.ci < ent->limit;
490 spin_unlock_irq(&ent->mkeys_queue.lock);
491 if (ret) {
492 mutex_unlock(&cache->rb_lock);
493 return true;
494 }
495 }
496 mutex_unlock(&cache->rb_lock);
497 return false;
498 }
499
500 /*
501 * Check if the bucket is outside the high/low water mark and schedule an async
502 * update. The cache refill has hysteresis, once the low water mark is hit it is
503 * refilled up to the high mark.
504 */
505 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
506 {
507 lockdep_assert_held(&ent->mkeys_queue.lock);
508
509 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
510 return;
511 if (ent->mkeys_queue.ci < ent->limit) {
512 ent->fill_to_high_water = true;
513 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
514 } else if (ent->fill_to_high_water &&
515 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
516 /*
517 * Once we start populating due to hitting a low water mark
518 * continue until we pass the high water mark.
519 */
520 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
521 } else if (ent->mkeys_queue.ci == 2 * ent->limit) {
522 ent->fill_to_high_water = false;
523 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
524 /* Queue deletion of excess entries */
525 ent->fill_to_high_water = false;
526 if (ent->pending)
527 queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
528 msecs_to_jiffies(1000));
529 else
530 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
531 }
532 }
533
534 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
535 {
536 u32 mkey;
537
538 spin_lock_irq(&ent->mkeys_queue.lock);
539 while (ent->mkeys_queue.ci) {
540 mkey = pop_mkey_locked(ent);
541 spin_unlock_irq(&ent->mkeys_queue.lock);
542 mlx5_core_destroy_mkey(dev->mdev, mkey);
543 spin_lock_irq(&ent->mkeys_queue.lock);
544 }
545 ent->tmp_cleanup_scheduled = false;
546 spin_unlock_irq(&ent->mkeys_queue.lock);
547 }
548
549 static void __cache_work_func(struct mlx5_cache_ent *ent)
550 {
551 struct mlx5_ib_dev *dev = ent->dev;
552 struct mlx5_mkey_cache *cache = &dev->cache;
553 int err;
554
555 spin_lock_irq(&ent->mkeys_queue.lock);
556 if (ent->disabled)
557 goto out;
558
559 if (ent->fill_to_high_water &&
560 ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
561 !READ_ONCE(dev->fill_delay)) {
562 spin_unlock_irq(&ent->mkeys_queue.lock);
563 err = add_keys(ent, 1);
564 spin_lock_irq(&ent->mkeys_queue.lock);
565 if (ent->disabled)
566 goto out;
567 if (err) {
568 /*
569 * EAGAIN only happens if there are pending MRs, so we
570 * will be rescheduled when storing them. The only
571 * failure path here is ENOMEM.
572 */
573 if (err != -EAGAIN) {
574 mlx5_ib_warn(
575 dev,
576 "add keys command failed, err %d\n",
577 err);
578 queue_delayed_work(cache->wq, &ent->dwork,
579 msecs_to_jiffies(1000));
580 }
581 }
582 } else if (ent->mkeys_queue.ci > 2 * ent->limit) {
583 bool need_delay;
584
585 /*
586 * The remove_cache_mr() logic is performed as garbage
587 * collection task. Such task is intended to be run when no
588 * other active processes are running.
589 *
590 * The need_resched() will return TRUE if there are user tasks
591 * to be activated in near future.
592 *
593 * In such case, we don't execute remove_cache_mr() and postpone
594 * the garbage collection work to try to run in next cycle, in
595 * order to free CPU resources to other tasks.
596 */
597 spin_unlock_irq(&ent->mkeys_queue.lock);
598 need_delay = need_resched() || someone_adding(cache) ||
599 !time_after(jiffies,
600 READ_ONCE(cache->last_add) + 300 * HZ);
601 spin_lock_irq(&ent->mkeys_queue.lock);
602 if (ent->disabled)
603 goto out;
604 if (need_delay) {
605 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
606 goto out;
607 }
608 remove_cache_mr_locked(ent);
609 queue_adjust_cache_locked(ent);
610 }
611 out:
612 spin_unlock_irq(&ent->mkeys_queue.lock);
613 }
614
615 static void delayed_cache_work_func(struct work_struct *work)
616 {
617 struct mlx5_cache_ent *ent;
618
619 ent = container_of(work, struct mlx5_cache_ent, dwork.work);
620 /* temp entries are never filled, only cleaned */
621 if (ent->is_tmp)
622 clean_keys(ent->dev, ent);
623 else
624 __cache_work_func(ent);
625 }
626
627 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
628 struct mlx5r_cache_rb_key key2)
629 {
630 int res;
631
632 res = key1.ats - key2.ats;
633 if (res)
634 return res;
635
636 res = key1.access_mode - key2.access_mode;
637 if (res)
638 return res;
639
640 res = key1.access_flags - key2.access_flags;
641 if (res)
642 return res;
643
644 /*
645 * keep ndescs the last in the compare table since the find function
646 * searches for an exact match on all properties and only closest
647 * match in size.
648 */
649 return key1.ndescs - key2.ndescs;
650 }
651
652 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
653 struct mlx5_cache_ent *ent)
654 {
655 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
656 struct mlx5_cache_ent *cur;
657 int cmp;
658
659 /* Figure out where to put new node */
660 while (*new) {
661 cur = rb_entry(*new, struct mlx5_cache_ent, node);
662 parent = *new;
663 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
664 if (cmp > 0)
665 new = &((*new)->rb_left);
666 if (cmp < 0)
667 new = &((*new)->rb_right);
668 if (cmp == 0)
669 return -EEXIST;
670 }
671
672 /* Add new node and rebalance tree. */
673 rb_link_node(&ent->node, parent, new);
674 rb_insert_color(&ent->node, &cache->rb_root);
675
676 return 0;
677 }
678
679 static struct mlx5_cache_ent *
680 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
681 struct mlx5r_cache_rb_key rb_key)
682 {
683 struct rb_node *node = dev->cache.rb_root.rb_node;
684 struct mlx5_cache_ent *cur, *smallest = NULL;
685 u64 ndescs_limit;
686 int cmp;
687
688 /*
689 * Find the smallest ent with order >= requested_order.
690 */
691 while (node) {
692 cur = rb_entry(node, struct mlx5_cache_ent, node);
693 cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
694 if (cmp > 0) {
695 smallest = cur;
696 node = node->rb_left;
697 }
698 if (cmp < 0)
699 node = node->rb_right;
700 if (cmp == 0)
701 return cur;
702 }
703
704 /*
705 * Limit the usage of mkeys larger than twice the required size while
706 * also allowing the usage of smallest cache entry for small MRs.
707 */
708 ndescs_limit = max_t(u64, rb_key.ndescs * 2,
709 MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
710
711 return (smallest &&
712 smallest->rb_key.access_mode == rb_key.access_mode &&
713 smallest->rb_key.access_flags == rb_key.access_flags &&
714 smallest->rb_key.ats == rb_key.ats &&
715 smallest->rb_key.ndescs <= ndescs_limit) ?
716 smallest :
717 NULL;
718 }
719
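/*
 * Allocate an MR backed by an mkey from the given cache entry. If the entry
 * is empty this counts as a cache miss and an mkey is created synchronously
 * instead.
 */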
720 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
721 struct mlx5_cache_ent *ent,
722 int access_flags)
723 {
724 struct mlx5_ib_mr *mr;
725 int err;
726
727 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
728 if (!mr)
729 return ERR_PTR(-ENOMEM);
730
731 spin_lock_irq(&ent->mkeys_queue.lock);
732 ent->in_use++;
733
734 if (!ent->mkeys_queue.ci) {
735 queue_adjust_cache_locked(ent);
736 ent->miss++;
737 spin_unlock_irq(&ent->mkeys_queue.lock);
738 err = create_cache_mkey(ent, &mr->mmkey.key);
739 if (err) {
740 spin_lock_irq(&ent->mkeys_queue.lock);
741 ent->in_use--;
742 spin_unlock_irq(&ent->mkeys_queue.lock);
743 kfree(mr);
744 return ERR_PTR(err);
745 }
746 } else {
747 mr->mmkey.key = pop_mkey_locked(ent);
748 queue_adjust_cache_locked(ent);
749 spin_unlock_irq(&ent->mkeys_queue.lock);
750 }
751 mr->mmkey.cache_ent = ent;
752 mr->mmkey.type = MLX5_MKEY_MR;
753 mr->mmkey.rb_key = ent->rb_key;
754 mr->mmkey.cacheable = true;
755 init_waitqueue_head(&mr->mmkey.wait);
756 return mr;
757 }
758
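/*
 * Return the subset of the requested access flags that UMR cannot modify on
 * this device. These flags are part of the cache rb_key so that a cached
 * mkey already carries them.
 */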
759 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
760 int access_flags)
761 {
762 int ret = 0;
763
764 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
765 MLX5_CAP_GEN(dev->mdev, atomic) &&
766 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
767 ret |= IB_ACCESS_REMOTE_ATOMIC;
768
769 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
770 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
771 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
772 ret |= IB_ACCESS_RELAXED_ORDERING;
773
774 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
775 (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
776 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
777 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
778 ret |= IB_ACCESS_RELAXED_ORDERING;
779
780 return ret;
781 }
782
783 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
784 int access_flags, int access_mode,
785 int ndescs)
786 {
787 struct mlx5r_cache_rb_key rb_key = {
788 .ndescs = ndescs,
789 .access_mode = access_mode,
790 .access_flags = get_unchangeable_access_flags(dev, access_flags)
791 };
792 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
793
794 if (!ent)
795 return ERR_PTR(-EOPNOTSUPP);
796
797 return _mlx5_mr_cache_alloc(dev, ent, access_flags);
798 }
799
800 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
801 {
802 if (!mlx5_debugfs_root || dev->is_rep)
803 return;
804
805 debugfs_remove_recursive(dev->cache.fs_root);
806 dev->cache.fs_root = NULL;
807 }
808
809 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
810 struct mlx5_cache_ent *ent)
811 {
812 int order = order_base_2(ent->rb_key.ndescs);
813 struct dentry *dir;
814
815 if (!mlx5_debugfs_root || dev->is_rep)
816 return;
817
818 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
819 order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
820
821 sprintf(ent->name, "%d", order);
822 dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
823 debugfs_create_file("size", 0600, dir, ent, &size_fops);
824 debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
825 debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
826 debugfs_create_u32("miss", 0600, dir, &ent->miss);
827 }
828
829 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
830 {
831 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
832 struct mlx5_mkey_cache *cache = &dev->cache;
833
834 if (!mlx5_debugfs_root || dev->is_rep)
835 return;
836
837 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
838 }
839
840 static void delay_time_func(struct timer_list *t)
841 {
842 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
843
844 WRITE_ONCE(dev->fill_delay, 0);
845 }
846
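/* Allocate the first page of the entry's mkey queue and init its list/lock. */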
847 static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
848 {
849 struct mlx5_mkeys_page *page;
850
851 page = kzalloc(sizeof(*page), GFP_KERNEL);
852 if (!page)
853 return -ENOMEM;
854 INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
855 spin_lock_init(&ent->mkeys_queue.lock);
856 list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
857 ent->mkeys_queue.num_pages++;
858 return 0;
859 }
860
861 static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
862 {
863 struct mlx5_mkeys_page *page;
864
865 WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
866 page = list_last_entry(&ent->mkeys_queue.pages_list,
867 struct mlx5_mkeys_page, list);
868 list_del(&page->list);
869 kfree(page);
870 }
871
872 struct mlx5_cache_ent *
873 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
874 struct mlx5r_cache_rb_key rb_key,
875 bool persistent_entry)
876 {
877 struct mlx5_cache_ent *ent;
878 int order;
879 int ret;
880
881 ent = kzalloc(sizeof(*ent), GFP_KERNEL);
882 if (!ent)
883 return ERR_PTR(-ENOMEM);
884
885 ret = mlx5r_mkeys_init(ent);
886 if (ret)
887 goto mkeys_err;
888 ent->rb_key = rb_key;
889 ent->dev = dev;
890 ent->is_tmp = !persistent_entry;
891
892 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
893
894 ret = mlx5_cache_ent_insert(&dev->cache, ent);
895 if (ret)
896 goto ent_insert_err;
897
898 if (persistent_entry) {
899 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
900 order = MLX5_IMR_KSM_CACHE_ENTRY;
901 else
902 order = order_base_2(rb_key.ndescs) - 2;
903
904 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
905 !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
906 mlx5r_umr_can_load_pas(dev, 0))
907 ent->limit = dev->mdev->profile.mr_cache[order].limit;
908 else
909 ent->limit = 0;
910
911 mlx5_mkey_cache_debugfs_add_ent(dev, ent);
912 }
913
914 return ent;
915 ent_insert_err:
916 mlx5r_mkeys_uninit(ent);
917 mkeys_err:
918 kfree(ent);
919 return ERR_PTR(ret);
920 }
921
922 static void mlx5r_destroy_cache_entries(struct mlx5_ib_dev *dev)
923 {
924 struct rb_root *root = &dev->cache.rb_root;
925 struct mlx5_cache_ent *ent;
926 struct rb_node *node;
927
928 mutex_lock(&dev->cache.rb_lock);
929 node = rb_first(root);
930 while (node) {
931 ent = rb_entry(node, struct mlx5_cache_ent, node);
932 node = rb_next(node);
933 clean_keys(dev, ent);
934 rb_erase(&ent->node, root);
935 mlx5r_mkeys_uninit(ent);
936 kfree(ent);
937 }
938 mutex_unlock(&dev->cache.rb_lock);
939 }
940
941 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
942 {
943 struct mlx5_mkey_cache *cache = &dev->cache;
944 struct rb_root *root = &dev->cache.rb_root;
945 struct mlx5r_cache_rb_key rb_key = {
946 .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
947 };
948 struct mlx5_cache_ent *ent;
949 struct rb_node *node;
950 int ret;
951 int i;
952
953 mutex_init(&dev->slow_path_mutex);
954 mutex_init(&dev->cache.rb_lock);
955 dev->cache.rb_root = RB_ROOT;
956 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
957 if (!cache->wq) {
958 mlx5_ib_warn(dev, "failed to create work queue\n");
959 return -ENOMEM;
960 }
961
962 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
963 timer_setup(&dev->delay_timer, delay_time_func, 0);
964 mlx5_mkey_cache_debugfs_init(dev);
965 mutex_lock(&cache->rb_lock);
966 for (i = 0; i <= mkey_cache_max_order(dev); i++) {
967 rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
968 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
969 if (IS_ERR(ent)) {
970 ret = PTR_ERR(ent);
971 goto err;
972 }
973 }
974
975 ret = mlx5_odp_init_mkey_cache(dev);
976 if (ret)
977 goto err;
978
979 mutex_unlock(&cache->rb_lock);
980 for (node = rb_first(root); node; node = rb_next(node)) {
981 ent = rb_entry(node, struct mlx5_cache_ent, node);
982 spin_lock_irq(&ent->mkeys_queue.lock);
983 queue_adjust_cache_locked(ent);
984 spin_unlock_irq(&ent->mkeys_queue.lock);
985 }
986
987 return 0;
988
989 err:
990 mutex_unlock(&cache->rb_lock);
991 mlx5_mkey_cache_debugfs_cleanup(dev);
992 mlx5r_destroy_cache_entries(dev);
993 destroy_workqueue(cache->wq);
994 mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
995 return ret;
996 }
997
998 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
999 {
1000 struct rb_root *root = &dev->cache.rb_root;
1001 struct mlx5_cache_ent *ent;
1002 struct rb_node *node;
1003
1004 if (!dev->cache.wq)
1005 return;
1006
1007 mutex_lock(&dev->cache.rb_lock);
1008 for (node = rb_first(root); node; node = rb_next(node)) {
1009 ent = rb_entry(node, struct mlx5_cache_ent, node);
1010 spin_lock_irq(&ent->mkeys_queue.lock);
1011 ent->disabled = true;
1012 spin_unlock_irq(&ent->mkeys_queue.lock);
1013 cancel_delayed_work(&ent->dwork);
1014 }
1015 mutex_unlock(&dev->cache.rb_lock);
1016
1017 /*
1018 * After all entries are disabled and will not reschedule on WQ,
1019 * flush it and all async commands.
1020 */
1021 flush_workqueue(dev->cache.wq);
1022
1023 mlx5_mkey_cache_debugfs_cleanup(dev);
1024 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1025
1026 /* At this point all entries are disabled and have no concurrent work. */
1027 mlx5r_destroy_cache_entries(dev);
1028
1029 destroy_workqueue(dev->cache.wq);
1030 del_timer_sync(&dev->delay_timer);
1031 }
1032
1033 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1034 {
1035 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1036 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1037 struct mlx5_ib_mr *mr;
1038 void *mkc;
1039 u32 *in;
1040 int err;
1041
1042 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1043 if (!mr)
1044 return ERR_PTR(-ENOMEM);
1045
1046 in = kzalloc(inlen, GFP_KERNEL);
1047 if (!in) {
1048 err = -ENOMEM;
1049 goto err_free;
1050 }
1051
1052 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1053
1054 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1055 MLX5_SET(mkc, mkc, length64, 1);
1056 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1057 pd);
1058 MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
1059
1060 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1061 if (err)
1062 goto err_in;
1063
1064 kfree(in);
1065 mr->mmkey.type = MLX5_MKEY_MR;
1066 mr->ibmr.lkey = mr->mmkey.key;
1067 mr->ibmr.rkey = mr->mmkey.key;
1068 mr->umem = NULL;
1069
1070 return &mr->ibmr;
1071
1072 err_in:
1073 kfree(in);
1074
1075 err_free:
1076 kfree(mr);
1077
1078 return ERR_PTR(err);
1079 }
1080
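/*
 * Number of octowords needed for the MTT entries covering len bytes starting
 * at addr with the given page shift (two 8-byte MTT entries per octoword).
 */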
1081 static int get_octo_len(u64 addr, u64 len, int page_shift)
1082 {
1083 u64 page_size = 1ULL << page_shift;
1084 u64 offset;
1085 int npages;
1086
1087 offset = addr & (page_size - 1);
1088 npages = ALIGN(len + offset, page_size) >> page_shift;
1089 return (npages + 1) / 2;
1090 }
1091
1092 static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1093 {
1094 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1095 return MKEY_CACHE_LAST_STD_ENTRY;
1096 return MLX5_MAX_UMR_SHIFT;
1097 }
1098
1099 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1100 u64 length, int access_flags, u64 iova)
1101 {
1102 mr->ibmr.lkey = mr->mmkey.key;
1103 mr->ibmr.rkey = mr->mmkey.key;
1104 mr->ibmr.length = length;
1105 mr->ibmr.device = &dev->ib_dev;
1106 mr->ibmr.iova = iova;
1107 mr->access_flags = access_flags;
1108 }
1109
1110 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1111 u64 iova)
1112 {
1113 /*
1114 * The alignment of iova has already been checked upon entering
1115 * UVERBS_METHOD_REG_DMABUF_MR
1116 */
1117 umem->iova = iova;
1118 return PAGE_SIZE;
1119 }
1120
1121 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1122 struct ib_umem *umem, u64 iova,
1123 int access_flags, int access_mode)
1124 {
1125 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1126 struct mlx5r_cache_rb_key rb_key = {};
1127 struct mlx5_cache_ent *ent;
1128 struct mlx5_ib_mr *mr;
1129 unsigned long page_size;
1130
1131 if (umem->is_dmabuf)
1132 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1133 else
1134 page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1135 if (WARN_ON(!page_size))
1136 return ERR_PTR(-EINVAL);
1137
1138 rb_key.access_mode = access_mode;
1139 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1140 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1141 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1142 ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1143 /*
1144 * If the MR can't come from the cache then synchronously create an uncached
1145 * one.
1146 */
1147 if (!ent) {
1148 mutex_lock(&dev->slow_path_mutex);
1149 mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
1150 mutex_unlock(&dev->slow_path_mutex);
1151 if (IS_ERR(mr))
1152 return mr;
1153 mr->mmkey.rb_key = rb_key;
1154 mr->mmkey.cacheable = true;
1155 return mr;
1156 }
1157
1158 mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1159 if (IS_ERR(mr))
1160 return mr;
1161
1162 mr->ibmr.pd = pd;
1163 mr->umem = umem;
1164 mr->page_shift = order_base_2(page_size);
1165 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1166
1167 return mr;
1168 }
1169
1170 static struct ib_mr *
1171 reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
1172 u32 crossed_lkey)
1173 {
1174 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1175 int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
1176 struct mlx5_ib_mr *mr;
1177 void *mkc;
1178 int inlen;
1179 u32 *in;
1180 int err;
1181
1182 if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
1183 return ERR_PTR(-EOPNOTSUPP);
1184
1185 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1186 if (!mr)
1187 return ERR_PTR(-ENOMEM);
1188
1189 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1190 in = kvzalloc(inlen, GFP_KERNEL);
1191 if (!in) {
1192 err = -ENOMEM;
1193 goto err_1;
1194 }
1195
1196 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1197 MLX5_SET(mkc, mkc, crossing_target_vhca_id,
1198 MLX5_CAP_GEN(dev->mdev, vhca_id));
1199 MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
1200 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1201 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1202
1203 /* For this crossing mkey the start address is 0 and the length must cover iova + length */
1204 set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
1205 MLX5_SET64(mkc, mkc, len, iova + length);
1206
1207 MLX5_SET(mkc, mkc, free, 0);
1208 MLX5_SET(mkc, mkc, umr_en, 0);
1209 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1210 if (err)
1211 goto err_2;
1212
1213 mr->mmkey.type = MLX5_MKEY_MR;
1214 set_mr_fields(dev, mr, length, access_flags, iova);
1215 mr->ibmr.pd = pd;
1216 kvfree(in);
1217 mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
1218
1219 return &mr->ibmr;
1220 err_2:
1221 kvfree(in);
1222 err_1:
1223 kfree(mr);
1224 return ERR_PTR(err);
1225 }
1226
1227 /*
1228  * Create an mkey describing the given umem. If populate is true the PAS is
1229  * inlined in the CREATE_MKEY command, else the mkey is left disabled for a later UMR update.
1230  */
1231 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1232 u64 iova, int access_flags,
1233 unsigned long page_size, bool populate,
1234 int access_mode)
1235 {
1236 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1237 struct mlx5_ib_mr *mr;
1238 __be64 *pas;
1239 void *mkc;
1240 int inlen;
1241 u32 *in;
1242 int err;
1243 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
1244 (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
1245 bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1246
1247 if (!page_size)
1248 return ERR_PTR(-EINVAL);
1249 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1250 if (!mr)
1251 return ERR_PTR(-ENOMEM);
1252
1253 mr->ibmr.pd = pd;
1254 mr->access_flags = access_flags;
1255 mr->page_shift = order_base_2(page_size);
1256
1257 inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1258 if (populate)
1259 inlen += sizeof(*pas) *
1260 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1261 in = kvzalloc(inlen, GFP_KERNEL);
1262 if (!in) {
1263 err = -ENOMEM;
1264 goto err_1;
1265 }
1266 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1267 if (populate) {
1268 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
1269 err = -EINVAL;
1270 goto err_2;
1271 }
1272 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1273 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1274 }
1275
1276 /* The pg_access bit allows setting the access flags
1277 * in the page list submitted with the command.
1278 */
1279 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1280
1281 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1282 set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1283 populate ? pd : dev->umrc.pd);
1284 /* For the data direct flow, overwrite the pdn field with the device's internal kernel PD */
1285 if (umem->is_dmabuf && ksm_mode)
1286 MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
1287
1288 MLX5_SET(mkc, mkc, free, !populate);
1289 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
1290 MLX5_SET(mkc, mkc, umr_en, 1);
1291
1292 MLX5_SET64(mkc, mkc, len, umem->length);
1293 MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1294 if (ksm_mode)
1295 MLX5_SET(mkc, mkc, translations_octword_size,
1296 get_octo_len(iova, umem->length, mr->page_shift) * 2);
1297 else
1298 MLX5_SET(mkc, mkc, translations_octword_size,
1299 get_octo_len(iova, umem->length, mr->page_shift));
1300 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1301 if (mlx5_umem_needs_ats(dev, umem, access_flags))
1302 MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1303 if (populate) {
1304 MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1305 get_octo_len(iova, umem->length, mr->page_shift));
1306 }
1307
1308 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1309 if (err) {
1310 mlx5_ib_warn(dev, "create mkey failed\n");
1311 goto err_2;
1312 }
1313 mr->mmkey.type = MLX5_MKEY_MR;
1314 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1315 mr->umem = umem;
1316 set_mr_fields(dev, mr, umem->length, access_flags, iova);
1317 kvfree(in);
1318
1319 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1320
1321 return mr;
1322
1323 err_2:
1324 kvfree(in);
1325 err_1:
1326 kfree(mr);
1327 return ERR_PTR(err);
1328 }
1329
1330 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1331 u64 length, int acc, int mode)
1332 {
1333 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1334 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1335 struct mlx5_ib_mr *mr;
1336 void *mkc;
1337 u32 *in;
1338 int err;
1339
1340 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1341 if (!mr)
1342 return ERR_PTR(-ENOMEM);
1343
1344 in = kzalloc(inlen, GFP_KERNEL);
1345 if (!in) {
1346 err = -ENOMEM;
1347 goto err_free;
1348 }
1349
1350 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1351
1352 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1353 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1354 MLX5_SET64(mkc, mkc, len, length);
1355 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1356
1357 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1358 if (err)
1359 goto err_in;
1360
1361 kfree(in);
1362
1363 set_mr_fields(dev, mr, length, acc, start_addr);
1364
1365 return &mr->ibmr;
1366
1367 err_in:
1368 kfree(in);
1369
1370 err_free:
1371 kfree(mr);
1372
1373 return ERR_PTR(err);
1374 }
1375
1376 int mlx5_ib_advise_mr(struct ib_pd *pd,
1377 enum ib_uverbs_advise_mr_advice advice,
1378 u32 flags,
1379 struct ib_sge *sg_list,
1380 u32 num_sge,
1381 struct uverbs_attr_bundle *attrs)
1382 {
1383 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1384 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1385 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1386 return -EOPNOTSUPP;
1387
1388 return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1389 sg_list, num_sge);
1390 }
1391
1392 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1393 struct ib_dm_mr_attr *attr,
1394 struct uverbs_attr_bundle *attrs)
1395 {
1396 struct mlx5_ib_dm *mdm = to_mdm(dm);
1397 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1398 u64 start_addr = mdm->dev_addr + attr->offset;
1399 int mode;
1400
1401 switch (mdm->type) {
1402 case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1403 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1404 return ERR_PTR(-EINVAL);
1405
1406 mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1407 start_addr -= pci_resource_start(dev->pdev, 0);
1408 break;
1409 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1410 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1411 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1412 case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
1413 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1414 return ERR_PTR(-EINVAL);
1415
1416 mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1417 break;
1418 default:
1419 return ERR_PTR(-EINVAL);
1420 }
1421
1422 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1423 attr->access_flags, mode);
1424 }
1425
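/*
 * Register a regular (non-ODP) user MR: use a UMR-capable cached mkey when
 * the PAS can be loaded via UMR, otherwise fall back to the slow path
 * reg_create() with the PAS inlined in the CREATE_MKEY command.
 */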
1426 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1427 u64 iova, int access_flags)
1428 {
1429 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1430 struct mlx5_ib_mr *mr = NULL;
1431 bool xlt_with_umr;
1432 int err;
1433
1434 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1435 if (xlt_with_umr) {
1436 mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
1437 MLX5_MKC_ACCESS_MODE_MTT);
1438 } else {
1439 unsigned long page_size =
1440 mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1441
1442 mutex_lock(&dev->slow_path_mutex);
1443 mr = reg_create(pd, umem, iova, access_flags, page_size,
1444 true, MLX5_MKC_ACCESS_MODE_MTT);
1445 mutex_unlock(&dev->slow_path_mutex);
1446 }
1447 if (IS_ERR(mr)) {
1448 ib_umem_release(umem);
1449 return ERR_CAST(mr);
1450 }
1451
1452 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1453
1454 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1455
1456 if (xlt_with_umr) {
1457 /*
1458 * If the MR was created with reg_create then it will be
1459 * configured properly but left disabled. It is safe to go ahead
1460 * and configure it again via UMR while enabling it.
1461 */
1462 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1463 if (err) {
1464 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1465 return ERR_PTR(err);
1466 }
1467 }
1468 return &mr->ibmr;
1469 }
1470
1471 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1472 u64 iova, int access_flags,
1473 struct ib_udata *udata)
1474 {
1475 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1476 struct ib_umem_odp *odp;
1477 struct mlx5_ib_mr *mr;
1478 int err;
1479
1480 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1481 return ERR_PTR(-EOPNOTSUPP);
1482
1483 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1484 if (err)
1485 return ERR_PTR(err);
1486 if (!start && length == U64_MAX) {
1487 if (iova != 0)
1488 return ERR_PTR(-EINVAL);
1489 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1490 return ERR_PTR(-EINVAL);
1491
1492 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1493 if (IS_ERR(mr))
1494 return ERR_CAST(mr);
1495 return &mr->ibmr;
1496 }
1497
1498 /* ODP requires xlt update via umr to work. */
1499 if (!mlx5r_umr_can_load_pas(dev, length))
1500 return ERR_PTR(-EINVAL);
1501
1502 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1503 &mlx5_mn_ops);
1504 if (IS_ERR(odp))
1505 return ERR_CAST(odp);
1506
1507 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
1508 MLX5_MKC_ACCESS_MODE_MTT);
1509 if (IS_ERR(mr)) {
1510 ib_umem_release(&odp->umem);
1511 return ERR_CAST(mr);
1512 }
1513 xa_init(&mr->implicit_children);
1514
1515 odp->private = mr;
1516 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1517 if (err)
1518 goto err_dereg_mr;
1519
1520 err = mlx5_ib_init_odp_mr(mr);
1521 if (err)
1522 goto err_dereg_mr;
1523 return &mr->ibmr;
1524
1525 err_dereg_mr:
1526 mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1527 return ERR_PTR(err);
1528 }
1529
1530 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1531 u64 iova, int access_flags,
1532 struct ib_udata *udata)
1533 {
1534 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1535 struct ib_umem *umem;
1536 int err;
1537
1538 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1539 return ERR_PTR(-EOPNOTSUPP);
1540
1541 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1542 start, iova, length, access_flags);
1543
1544 err = mlx5r_umr_resource_init(dev);
1545 if (err)
1546 return ERR_PTR(err);
1547
1548 if (access_flags & IB_ACCESS_ON_DEMAND)
1549 return create_user_odp_mr(pd, start, length, iova, access_flags,
1550 udata);
1551 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1552 if (IS_ERR(umem))
1553 return ERR_CAST(umem);
1554 return create_real_mr(pd, umem, iova, access_flags);
1555 }
1556
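/*
 * move_notify callback for non-pinned dma-buf MRs. With the dma-buf
 * reservation lock held, zap the mkey's translation via UMR and unmap the
 * pages so the exporter can move the buffer.
 */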
1557 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1558 {
1559 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1560 struct mlx5_ib_mr *mr = umem_dmabuf->private;
1561
1562 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1563
1564 if (!umem_dmabuf->sgt || !mr)
1565 return;
1566
1567 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1568 ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1569 }
1570
1571 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1572 .allow_peer2peer = 1,
1573 .move_notify = mlx5_ib_dmabuf_invalidate_cb,
1574 };
1575
1576 static struct ib_mr *
1577 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
1578 u64 offset, u64 length, u64 virt_addr,
1579 int fd, int access_flags, int access_mode)
1580 {
1581 bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1582 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1583 struct mlx5_ib_mr *mr = NULL;
1584 struct ib_umem_dmabuf *umem_dmabuf;
1585 int err;
1586
1587 err = mlx5r_umr_resource_init(dev);
1588 if (err)
1589 return ERR_PTR(err);
1590
1591 if (!pinned_mode)
1592 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
1593 offset, length, fd,
1594 access_flags,
1595 &mlx5_ib_dmabuf_attach_ops);
1596 else
1597 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
1598 dma_device, offset, length,
1599 fd, access_flags);
1600
1601 if (IS_ERR(umem_dmabuf)) {
1602 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1603 PTR_ERR(umem_dmabuf));
1604 return ERR_CAST(umem_dmabuf);
1605 }
1606
1607 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1608 access_flags, access_mode);
1609 if (IS_ERR(mr)) {
1610 ib_umem_release(&umem_dmabuf->umem);
1611 return ERR_CAST(mr);
1612 }
1613
1614 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1615
1616 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1617 umem_dmabuf->private = mr;
1618 if (!pinned_mode) {
1619 err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1620 if (err)
1621 goto err_dereg_mr;
1622 } else {
1623 mr->data_direct = true;
1624 }
1625
1626 err = mlx5_ib_init_dmabuf_mr(mr);
1627 if (err)
1628 goto err_dereg_mr;
1629 return &mr->ibmr;
1630
1631 err_dereg_mr:
1632 __mlx5_ib_dereg_mr(&mr->ibmr);
1633 return ERR_PTR(err);
1634 }
1635
1636 static struct ib_mr *
1637 reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
1638 u64 length, u64 virt_addr,
1639 int fd, int access_flags)
1640 {
1641 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1642 struct mlx5_data_direct_dev *data_direct_dev;
1643 struct ib_mr *crossing_mr;
1644 struct ib_mr *crossed_mr;
1645 int ret = 0;
1646
1647 /* As of HW behaviour the IOVA must be page aligned in KSM mode */
1648 if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
1649 return ERR_PTR(-EOPNOTSUPP);
1650
1651 mutex_lock(&dev->data_direct_lock);
1652 data_direct_dev = dev->data_direct_dev;
1653 if (!data_direct_dev) {
1654 ret = -EINVAL;
1655 goto end;
1656 }
1657
1658 /* The device's 'data direct mkey' was created without RO flags to
1659 * simplify things and allow for a single mkey per device.
1660 * Since RO is not a must, mask it out accordingly.
1661 */
1662 access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
1663 crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
1664 offset, length, virt_addr, fd,
1665 access_flags, MLX5_MKC_ACCESS_MODE_KSM);
1666 if (IS_ERR(crossed_mr)) {
1667 ret = PTR_ERR(crossed_mr);
1668 goto end;
1669 }
1670
1671 mutex_lock(&dev->slow_path_mutex);
1672 crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
1673 crossed_mr->lkey);
1674 mutex_unlock(&dev->slow_path_mutex);
1675 if (IS_ERR(crossing_mr)) {
1676 __mlx5_ib_dereg_mr(crossed_mr);
1677 ret = PTR_ERR(crossing_mr);
1678 goto end;
1679 }
1680
1681 list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
1682 to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
1683 to_mmr(crossing_mr)->data_direct = true;
1684 end:
1685 mutex_unlock(&dev->data_direct_lock);
1686 return ret ? ERR_PTR(ret) : crossing_mr;
1687 }
1688
1689 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1690 u64 length, u64 virt_addr,
1691 int fd, int access_flags,
1692 struct uverbs_attr_bundle *attrs)
1693 {
1694 struct mlx5_ib_dev *dev = to_mdev(pd->device);
1695 int mlx5_access_flags = 0;
1696 int err;
1697
1698 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1699 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1700 return ERR_PTR(-EOPNOTSUPP);
1701
1702 if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
1703 err = uverbs_get_flags32(&mlx5_access_flags, attrs,
1704 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
1705 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
1706 if (err)
1707 return ERR_PTR(err);
1708 }
1709
1710 mlx5_ib_dbg(dev,
1711 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
1712 offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
1713
1714 /* dmabuf requires xlt update via umr to work. */
1715 if (!mlx5r_umr_can_load_pas(dev, length))
1716 return ERR_PTR(-EINVAL);
1717
1718 if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
1719 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
1720 fd, access_flags);
1721
1722 return reg_user_mr_dmabuf(pd, pd->device->dma_device,
1723 offset, length, virt_addr,
1724 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
1725 }
1726
1727 /*
1728 * True if the change in access flags can be done via UMR, only some access
1729 * flags can be updated.
1730 */
1731 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1732 unsigned int current_access_flags,
1733 unsigned int target_access_flags)
1734 {
1735 unsigned int diffs = current_access_flags ^ target_access_flags;
1736
1737 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1738 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1739 IB_ACCESS_REMOTE_ATOMIC))
1740 return false;
1741 return mlx5r_umr_can_reconfig(dev, current_access_flags,
1742 target_access_flags);
1743 }
1744
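/*
 * Return true if a rereg can keep the existing cached mkey for the new umem:
 * the MR must come from the cache, UMR must be able to load the PAS, and the
 * new umem's DMA block count must fit the entry's ndescs. The selected page
 * size is returned through *page_size.
 */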
1745 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1746 struct ib_umem *new_umem,
1747 int new_access_flags, u64 iova,
1748 unsigned long *page_size)
1749 {
1750 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1751
1752 /* We only track the allocated sizes of MRs from the cache */
1753 if (!mr->mmkey.cache_ent)
1754 return false;
1755 if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1756 return false;
1757
1758 *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
1759 if (WARN_ON(!*page_size))
1760 return false;
1761 return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1762 ib_umem_num_dma_blocks(new_umem, *page_size);
1763 }
1764
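/*
 * Replace the MR's umem (and optionally its PD and access flags) by revoking
 * the mkey and reprogramming it via UMR. On failure the old umem pointer is
 * restored and the caller keeps ownership of new_umem; on success the old
 * umem is released and the reg_pages accounting is updated.
 */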
static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so it is safe to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MRs don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it; odp/dmabuf does,
	 * but the logic around releasing the umem is different.
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}

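/*
 * Allocate the private descriptor buffer used by kernel MRs and DMA map it
 * towards the device. Extra bytes may be allocated so that the descriptors
 * can be aligned to MLX5_UMR_ALIGN.
 */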
static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

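/*
 * Free the private descriptor buffer allocated by mlx5_alloc_priv_descs().
 * MRs that are backed by a umem, data-direct MRs and DM MRs have no such
 * buffer and are skipped.
 */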
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct &&
	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

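/*
 * Return a revoked mkey to the mkey cache. If the MR already belongs to a
 * cache entry the mkey is pushed back there; otherwise an entry matching
 * mr->mmkey.rb_key is looked up in the cache rb-tree, and a new entry is
 * created when no suitable one exists.
 */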
static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		mr->mmkey.cache_ent->in_use--;
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}

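/*
 * Revoke a data-direct (crossed) MR: invalidate its mkey via UMR and revoke
 * the underlying dmabuf umem mapping. Must be called with data_direct_lock
 * held.
 */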
static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (WARN_ON(err))
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}

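/*
 * Stop all DMA through the mkey. Cacheable mkeys that can be revoked via UMR
 * are returned to the mkey cache; everything else is destroyed. For ODP and
 * non-pinned dmabuf MRs the umem's private pointer to the MR is cleared
 * under the relevant lock.
 */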
static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
	bool is_odp = is_odp_mr(mr);
	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
			      !to_ib_umem_dmabuf(mr->umem)->pinned;
	int ret = 0;

	if (is_odp)
		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);

	if (is_odp_dma_buf)
		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing to a clean temp entry - schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 msecs_to_jiffies(30 * 1000));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		goto out;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	ret = destroy_mkey(dev, mr);
out:
	if (is_odp) {
		if (!ret)
			to_ib_umem_odp(mr->umem)->private = NULL;
		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
	}

	if (is_odp_dma_buf) {
		if (!ret)
			to_ib_umem_dmabuf(mr->umem)->private = NULL;
		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
	}

	return ret;
}

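/*
 * Common MR teardown: wait for any ODP users of the mkey, tear down the
 * integrity (signature) resources if present, stop DMA, release the umem
 * and finally free the private descriptors and the MR itself.
 */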
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the MR must hold the refcount; once the refcount
	 * goes to zero, no other thread (such as ODP page faults, prefetch or
	 * any UMR activity) can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}

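/*
 * Fill the mkey context for a kernel-owned, UMR-enabled mkey that is created
 * in the free state and populated later through UMR.
 */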
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

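/*
 * Allocate one of the internal MRs used by an integrity MR to map data and
 * protection information, using either MTT or KLM descriptors according to
 * access_mode.
 */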
static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32 comp_mask;
		__u32 response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

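/*
 * Translate the data and (optionally) metadata scatterlists into KLM
 * descriptors keyed by the PD's local_dma_lkey. Returns the number of
 * descriptors written; translation stops once max_descs is reached.
 */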
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

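/*
 * ib_sg_to_pages() callback: store one page address as a __be64 page
 * descriptor with the read/write enable bits set.
 */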
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals the first data page address + the size of the data
		 * pages + the metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}


static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}


int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}


int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
