// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

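	/*
	 * Unless the context opted out of TID caching (TID_UNMAP cap),
	 * the driver tracks invalidations itself: allocate the
	 * invalid-TID array and use MMU interval notifiers.
	 */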
	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
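		/*
		 * Spread any leftover entries across the lowest-numbered
		 * subcontexts, one extra entry each.
		 */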
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped. false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: No of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by mapped parameter), their
 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 * their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tidbuf->length);
	if (!npages)
		return -EINVAL;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
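	/* Refuse to pin anything that would exceed the pinned-memory limit. */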
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	tidbuf->npages = npages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *             of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageidx = 0, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		kfree(tidbuf);
		return -ENOMEM;
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		kfree(tidbuf->psets);
		kfree(tidbuf);
		return pinned;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

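	/* Whole groups are programmed first; leftover pagesets fill partial groups. */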
	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       pageidx, dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       pageidx, use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fd, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
				(pinned - mapped_pages), false);
bail:
	kfree(tidbuf->psets);
	kfree(tidlist);
	kfree(tidbuf->pages);
	kfree(tidbuf);
	return ret > 0 ? 0 : ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
			      sizeof(tidinfo[0]) * tinfo->tidcnt);
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
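	/* tididx TIDs were torn down even if the loop stopped early. */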
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
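	/* The extra iteration (i == npages) flushes the final contiguous run. */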
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *   1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *      If the total set size is bigger than that,
			 *      program only a MAX_EXPECTED_BUFFER chunk.
			 *   2. The buffer size has to be a power of two. If
			 *      it is not, round down to the closest power of
			 *      2 and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
 * @tidlist: the array of u32 elements where the information about the
 *	     programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *	     entries.
 *
 * This function will program up to 'count' number of RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp,
			    unsigned int start, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

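		/*
		 * Encode the entry index and page count so user space can
		 * hand this TID back to hfi1_user_exp_rcv_clear() later.
		 */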
		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
			EXP_TID_SET(LEN, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

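	/* The pageset is physically contiguous, so one mapping covers it all. */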
	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
		/*
		 * FIXME: This is in the wrong order, the notifier should be
		 * established before the pages are pinned by pin_rcv_pages.
		 */
		mmu_interval_read_begin(&node->notifier);
	}
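	/* Remember the node so unprogram_rcvarray()/unlock_exp_tids() can find it. */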
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
			      struct tid_group **grp)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tididx >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   tididx, uctxt->ctxt);
		return -EINVAL;
	}

	if (tidctrl == 0x3)
		return -EINVAL;

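	/* IDX addresses a pair of entries; CTRL (1 or 2) picks one of the pair. */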
	rcventry = tididx + (tidctrl - 1);

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (grp)
		*grp = node->grp;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/*
	 * Make sure device has seen the write before we unpin the
	 * pages.
	 */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

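	/*
	 * Freeing this entry may demote the group: from full back to
	 * partially used, or from partially used back to empty.
	 */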
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

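		/* Tear down every programmed entry this fd still tracks in the group. */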
		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
						       uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

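	/* Already reported as invalid; don't queue the same TID twice. */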
	if (node->freed)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);
	node->freed = true;

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
		fdata->invalid_tids[fdata->invalid_tid_idx] |=
			EXP_TID_SET(LEN, node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}