1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3 *
4 * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5 * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6 * list for access by an in-kernel user.
7 *
8 * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9 * between the domains and xarray.
10 */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18
19 #include "io_pagetable.h"
20 #include "double_span.h"
21
22 struct iopt_pages_list {
23 struct iopt_pages *pages;
24 struct iopt_area *area;
25 struct list_head next;
26 unsigned long start_byte;
27 unsigned long length;
28 };
29
iopt_area_contig_init(struct iopt_area_contig_iter * iter,struct io_pagetable * iopt,unsigned long iova,unsigned long last_iova)30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 struct io_pagetable *iopt,
32 unsigned long iova,
33 unsigned long last_iova)
34 {
35 lockdep_assert_held(&iopt->iova_rwsem);
36
37 iter->cur_iova = iova;
38 iter->last_iova = last_iova;
39 iter->area = iopt_area_iter_first(iopt, iova, iova);
40 if (!iter->area)
41 return NULL;
42 if (!iter->area->pages) {
43 iter->area = NULL;
44 return NULL;
45 }
46 return iter->area;
47 }
48
iopt_area_contig_next(struct iopt_area_contig_iter * iter)49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51 unsigned long last_iova;
52
53 if (!iter->area)
54 return NULL;
55 last_iova = iopt_area_last_iova(iter->area);
56 if (iter->last_iova <= last_iova)
57 return NULL;
58
59 iter->cur_iova = last_iova + 1;
60 iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 iter->last_iova);
62 if (!iter->area)
63 return NULL;
64 if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 !iter->area->pages) {
66 iter->area = NULL;
67 return NULL;
68 }
69 return iter->area;
70 }
71
__alloc_iova_check_hole(struct interval_tree_double_span_iter * span,unsigned long length,unsigned long iova_alignment,unsigned long page_offset)72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 unsigned long length,
74 unsigned long iova_alignment,
75 unsigned long page_offset)
76 {
77 if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 return false;
79
80 span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 page_offset;
82 if (span->start_hole > span->last_hole ||
83 span->last_hole - span->start_hole < length - 1)
84 return false;
85 return true;
86 }
87
__alloc_iova_check_used(struct interval_tree_span_iter * span,unsigned long length,unsigned long iova_alignment,unsigned long page_offset)88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 unsigned long length,
90 unsigned long iova_alignment,
91 unsigned long page_offset)
92 {
93 if (span->is_hole || span->last_used - span->start_used < length - 1)
94 return false;
95
96 span->start_used = ALIGN(span->start_used, iova_alignment) |
97 page_offset;
98 if (span->start_used > span->last_used ||
99 span->last_used - span->start_used < length - 1)
100 return false;
101 return true;
102 }
103
104 /*
105 * Automatically find a block of IOVA that is not being used and not reserved.
106 * Does not return a 0 IOVA even if it is valid.
107 */
iopt_alloc_iova(struct io_pagetable * iopt,unsigned long * iova,unsigned long uptr,unsigned long length)108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109 unsigned long uptr, unsigned long length)
110 {
111 unsigned long page_offset = uptr % PAGE_SIZE;
112 struct interval_tree_double_span_iter used_span;
113 struct interval_tree_span_iter allowed_span;
114 unsigned long iova_alignment;
115
116 lockdep_assert_held(&iopt->iova_rwsem);
117
118 /* Protect roundup_pow-of_two() from overflow */
119 if (length == 0 || length >= ULONG_MAX / 2)
120 return -EOVERFLOW;
121
122 /*
123 * Keep alignment present in the uptr when building the IOVA, this
124 * increases the chance we can map a THP.
125 */
126 if (!uptr)
127 iova_alignment = roundup_pow_of_two(length);
128 else
129 iova_alignment = min_t(unsigned long,
130 roundup_pow_of_two(length),
131 1UL << __ffs64(uptr));
132
133 if (iova_alignment < iopt->iova_alignment)
134 return -EINVAL;
135
136 interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
137 PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
138 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
139 allowed_span.start_used = PAGE_SIZE;
140 allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
141 allowed_span.is_hole = false;
142 }
143
144 if (!__alloc_iova_check_used(&allowed_span, length,
145 iova_alignment, page_offset))
146 continue;
147
148 interval_tree_for_each_double_span(
149 &used_span, &iopt->reserved_itree, &iopt->area_itree,
150 allowed_span.start_used, allowed_span.last_used) {
151 if (!__alloc_iova_check_hole(&used_span, length,
152 iova_alignment,
153 page_offset))
154 continue;
155
156 *iova = used_span.start_hole;
157 return 0;
158 }
159 }
160 return -ENOSPC;
161 }
162
iopt_check_iova(struct io_pagetable * iopt,unsigned long iova,unsigned long length)163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164 unsigned long length)
165 {
166 unsigned long last;
167
168 lockdep_assert_held(&iopt->iova_rwsem);
169
170 if ((iova & (iopt->iova_alignment - 1)))
171 return -EINVAL;
172
173 if (check_add_overflow(iova, length - 1, &last))
174 return -EOVERFLOW;
175
176 /* No reserved IOVA intersects the range */
177 if (iopt_reserved_iter_first(iopt, iova, last))
178 return -EINVAL;
179
180 /* Check that there is not already a mapping in the range */
181 if (iopt_area_iter_first(iopt, iova, last))
182 return -EEXIST;
183 return 0;
184 }
185
186 /*
187 * The area takes a slice of the pages from start_bytes to start_byte + length
188 */
iopt_insert_area(struct io_pagetable * iopt,struct iopt_area * area,struct iopt_pages * pages,unsigned long iova,unsigned long start_byte,unsigned long length,int iommu_prot)189 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
190 struct iopt_pages *pages, unsigned long iova,
191 unsigned long start_byte, unsigned long length,
192 int iommu_prot)
193 {
194 lockdep_assert_held_write(&iopt->iova_rwsem);
195
196 if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
197 return -EPERM;
198
199 area->iommu_prot = iommu_prot;
200 area->page_offset = start_byte % PAGE_SIZE;
201 if (area->page_offset & (iopt->iova_alignment - 1))
202 return -EINVAL;
203
204 area->node.start = iova;
205 if (check_add_overflow(iova, length - 1, &area->node.last))
206 return -EOVERFLOW;
207
208 area->pages_node.start = start_byte / PAGE_SIZE;
209 if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
210 return -EOVERFLOW;
211 area->pages_node.last = area->pages_node.last / PAGE_SIZE;
212 if (WARN_ON(area->pages_node.last >= pages->npages))
213 return -EOVERFLOW;
214
215 /*
216 * The area is inserted with a NULL pages indicating it is not fully
217 * initialized yet.
218 */
219 area->iopt = iopt;
220 interval_tree_insert(&area->node, &iopt->area_itree);
221 return 0;
222 }
223
iopt_alloc_area_pages(struct io_pagetable * iopt,struct list_head * pages_list,unsigned long length,unsigned long * dst_iova,int iommu_prot,unsigned int flags)224 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
225 struct list_head *pages_list,
226 unsigned long length, unsigned long *dst_iova,
227 int iommu_prot, unsigned int flags)
228 {
229 struct iopt_pages_list *elm;
230 unsigned long iova;
231 int rc = 0;
232
233 list_for_each_entry(elm, pages_list, next) {
234 elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
235 if (!elm->area)
236 return -ENOMEM;
237 }
238
239 down_write(&iopt->iova_rwsem);
240 if ((length & (iopt->iova_alignment - 1)) || !length) {
241 rc = -EINVAL;
242 goto out_unlock;
243 }
244
245 if (flags & IOPT_ALLOC_IOVA) {
246 /* Use the first entry to guess the ideal IOVA alignment */
247 elm = list_first_entry(pages_list, struct iopt_pages_list,
248 next);
249 rc = iopt_alloc_iova(
250 iopt, dst_iova,
251 (uintptr_t)elm->pages->uptr + elm->start_byte, length);
252 if (rc)
253 goto out_unlock;
254 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
255 WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
256 rc = -EINVAL;
257 goto out_unlock;
258 }
259 } else {
260 rc = iopt_check_iova(iopt, *dst_iova, length);
261 if (rc)
262 goto out_unlock;
263 }
264
265 /*
266 * Areas are created with a NULL pages so that the IOVA space is
267 * reserved and we can unlock the iova_rwsem.
268 */
269 iova = *dst_iova;
270 list_for_each_entry(elm, pages_list, next) {
271 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
272 elm->start_byte, elm->length, iommu_prot);
273 if (rc)
274 goto out_unlock;
275 iova += elm->length;
276 }
277
278 out_unlock:
279 up_write(&iopt->iova_rwsem);
280 return rc;
281 }
282
iopt_abort_area(struct iopt_area * area)283 static void iopt_abort_area(struct iopt_area *area)
284 {
285 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
286 WARN_ON(area->pages);
287 if (area->iopt) {
288 down_write(&area->iopt->iova_rwsem);
289 interval_tree_remove(&area->node, &area->iopt->area_itree);
290 up_write(&area->iopt->iova_rwsem);
291 }
292 kfree(area);
293 }
294
iopt_free_pages_list(struct list_head * pages_list)295 void iopt_free_pages_list(struct list_head *pages_list)
296 {
297 struct iopt_pages_list *elm;
298
299 while ((elm = list_first_entry_or_null(pages_list,
300 struct iopt_pages_list, next))) {
301 if (elm->area)
302 iopt_abort_area(elm->area);
303 if (elm->pages)
304 iopt_put_pages(elm->pages);
305 list_del(&elm->next);
306 kfree(elm);
307 }
308 }
309
iopt_fill_domains_pages(struct list_head * pages_list)310 static int iopt_fill_domains_pages(struct list_head *pages_list)
311 {
312 struct iopt_pages_list *undo_elm;
313 struct iopt_pages_list *elm;
314 int rc;
315
316 list_for_each_entry(elm, pages_list, next) {
317 rc = iopt_area_fill_domains(elm->area, elm->pages);
318 if (rc)
319 goto err_undo;
320 }
321 return 0;
322
323 err_undo:
324 list_for_each_entry(undo_elm, pages_list, next) {
325 if (undo_elm == elm)
326 break;
327 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
328 }
329 return rc;
330 }
331
iopt_map_pages(struct io_pagetable * iopt,struct list_head * pages_list,unsigned long length,unsigned long * dst_iova,int iommu_prot,unsigned int flags)332 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
333 unsigned long length, unsigned long *dst_iova,
334 int iommu_prot, unsigned int flags)
335 {
336 struct iopt_pages_list *elm;
337 int rc;
338
339 rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
340 iommu_prot, flags);
341 if (rc)
342 return rc;
343
344 down_read(&iopt->domains_rwsem);
345 rc = iopt_fill_domains_pages(pages_list);
346 if (rc)
347 goto out_unlock_domains;
348
349 down_write(&iopt->iova_rwsem);
350 list_for_each_entry(elm, pages_list, next) {
351 /*
352 * area->pages must be set inside the domains_rwsem to ensure
353 * any newly added domains will get filled. Moves the reference
354 * in from the list.
355 */
356 elm->area->pages = elm->pages;
357 elm->pages = NULL;
358 elm->area = NULL;
359 }
360 up_write(&iopt->iova_rwsem);
361 out_unlock_domains:
362 up_read(&iopt->domains_rwsem);
363 return rc;
364 }
365
366 /**
367 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
368 * @ictx: iommufd_ctx the iopt is part of
369 * @iopt: io_pagetable to act on
370 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
371 * the chosen iova on output. Otherwise is the iova to map to on input
372 * @uptr: User VA to map
373 * @length: Number of bytes to map
374 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
375 * @flags: IOPT_ALLOC_IOVA or zero
376 *
377 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
378 * page tables this will pin the pages and load them into the domain at iova.
379 * For non-domain page tables this will only setup a lazy reference and the
380 * caller must use iopt_access_pages() to touch them.
381 *
382 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
383 * destroyed.
384 */
iopt_map_user_pages(struct iommufd_ctx * ictx,struct io_pagetable * iopt,unsigned long * iova,void __user * uptr,unsigned long length,int iommu_prot,unsigned int flags)385 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
386 unsigned long *iova, void __user *uptr,
387 unsigned long length, int iommu_prot,
388 unsigned int flags)
389 {
390 struct iopt_pages_list elm = {};
391 LIST_HEAD(pages_list);
392 int rc;
393
394 elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
395 if (IS_ERR(elm.pages))
396 return PTR_ERR(elm.pages);
397 if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
398 elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
399 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
400 elm.start_byte = uptr - elm.pages->uptr;
401 elm.length = length;
402 list_add(&elm.next, &pages_list);
403
404 rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
405 if (rc) {
406 if (elm.area)
407 iopt_abort_area(elm.area);
408 if (elm.pages)
409 iopt_put_pages(elm.pages);
410 return rc;
411 }
412 return 0;
413 }
414
iopt_get_pages(struct io_pagetable * iopt,unsigned long iova,unsigned long length,struct list_head * pages_list)415 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
416 unsigned long length, struct list_head *pages_list)
417 {
418 struct iopt_area_contig_iter iter;
419 unsigned long last_iova;
420 struct iopt_area *area;
421 int rc;
422
423 if (!length)
424 return -EINVAL;
425 if (check_add_overflow(iova, length - 1, &last_iova))
426 return -EOVERFLOW;
427
428 down_read(&iopt->iova_rwsem);
429 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
430 struct iopt_pages_list *elm;
431 unsigned long last = min(last_iova, iopt_area_last_iova(area));
432
433 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
434 if (!elm) {
435 rc = -ENOMEM;
436 goto err_free;
437 }
438 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
439 elm->pages = area->pages;
440 elm->length = (last - iter.cur_iova) + 1;
441 kref_get(&elm->pages->kref);
442 list_add_tail(&elm->next, pages_list);
443 }
444 if (!iopt_area_contig_done(&iter)) {
445 rc = -ENOENT;
446 goto err_free;
447 }
448 up_read(&iopt->iova_rwsem);
449 return 0;
450 err_free:
451 up_read(&iopt->iova_rwsem);
452 iopt_free_pages_list(pages_list);
453 return rc;
454 }
455
iopt_unmap_iova_range(struct io_pagetable * iopt,unsigned long start,unsigned long last,unsigned long * unmapped)456 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
457 unsigned long last, unsigned long *unmapped)
458 {
459 struct iopt_area *area;
460 unsigned long unmapped_bytes = 0;
461 unsigned int tries = 0;
462 int rc = -ENOENT;
463
464 /*
465 * The domains_rwsem must be held in read mode any time any area->pages
466 * is NULL. This prevents domain attach/detatch from running
467 * concurrently with cleaning up the area.
468 */
469 again:
470 down_read(&iopt->domains_rwsem);
471 down_write(&iopt->iova_rwsem);
472 while ((area = iopt_area_iter_first(iopt, start, last))) {
473 unsigned long area_last = iopt_area_last_iova(area);
474 unsigned long area_first = iopt_area_iova(area);
475 struct iopt_pages *pages;
476
477 /* Userspace should not race map/unmap's of the same area */
478 if (!area->pages) {
479 rc = -EBUSY;
480 goto out_unlock_iova;
481 }
482
483 if (area_first < start || area_last > last) {
484 rc = -ENOENT;
485 goto out_unlock_iova;
486 }
487
488 if (area_first != start)
489 tries = 0;
490
491 /*
492 * num_accesses writers must hold the iova_rwsem too, so we can
493 * safely read it under the write side of the iovam_rwsem
494 * without the pages->mutex.
495 */
496 if (area->num_accesses) {
497 size_t length = iopt_area_length(area);
498
499 start = area_first;
500 area->prevent_access = true;
501 up_write(&iopt->iova_rwsem);
502 up_read(&iopt->domains_rwsem);
503
504 iommufd_access_notify_unmap(iopt, area_first, length);
505 /* Something is not responding to unmap requests. */
506 tries++;
507 if (WARN_ON(tries > 100))
508 return -EDEADLOCK;
509 goto again;
510 }
511
512 pages = area->pages;
513 area->pages = NULL;
514 up_write(&iopt->iova_rwsem);
515
516 iopt_area_unfill_domains(area, pages);
517 iopt_abort_area(area);
518 iopt_put_pages(pages);
519
520 unmapped_bytes += area_last - area_first + 1;
521
522 down_write(&iopt->iova_rwsem);
523 }
524 if (unmapped_bytes)
525 rc = 0;
526
527 out_unlock_iova:
528 up_write(&iopt->iova_rwsem);
529 up_read(&iopt->domains_rwsem);
530 if (unmapped)
531 *unmapped = unmapped_bytes;
532 return rc;
533 }
534
535 /**
536 * iopt_unmap_iova() - Remove a range of iova
537 * @iopt: io_pagetable to act on
538 * @iova: Starting iova to unmap
539 * @length: Number of bytes to unmap
540 * @unmapped: Return number of bytes unmapped
541 *
542 * The requested range must be a superset of existing ranges.
543 * Splitting/truncating IOVA mappings is not allowed.
544 */
iopt_unmap_iova(struct io_pagetable * iopt,unsigned long iova,unsigned long length,unsigned long * unmapped)545 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
546 unsigned long length, unsigned long *unmapped)
547 {
548 unsigned long iova_last;
549
550 if (!length)
551 return -EINVAL;
552
553 if (check_add_overflow(iova, length - 1, &iova_last))
554 return -EOVERFLOW;
555
556 return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
557 }
558
iopt_unmap_all(struct io_pagetable * iopt,unsigned long * unmapped)559 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
560 {
561 int rc;
562
563 rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
564 /* If the IOVAs are empty then unmap all succeeds */
565 if (rc == -ENOENT)
566 return 0;
567 return rc;
568 }
569
570 /* The caller must always free all the nodes in the allowed_iova rb_root. */
iopt_set_allow_iova(struct io_pagetable * iopt,struct rb_root_cached * allowed_iova)571 int iopt_set_allow_iova(struct io_pagetable *iopt,
572 struct rb_root_cached *allowed_iova)
573 {
574 struct iopt_allowed *allowed;
575
576 down_write(&iopt->iova_rwsem);
577 swap(*allowed_iova, iopt->allowed_itree);
578
579 for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
580 allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
581 if (iopt_reserved_iter_first(iopt, allowed->node.start,
582 allowed->node.last)) {
583 swap(*allowed_iova, iopt->allowed_itree);
584 up_write(&iopt->iova_rwsem);
585 return -EADDRINUSE;
586 }
587 }
588 up_write(&iopt->iova_rwsem);
589 return 0;
590 }
591
iopt_reserve_iova(struct io_pagetable * iopt,unsigned long start,unsigned long last,void * owner)592 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
593 unsigned long last, void *owner)
594 {
595 struct iopt_reserved *reserved;
596
597 lockdep_assert_held_write(&iopt->iova_rwsem);
598
599 if (iopt_area_iter_first(iopt, start, last) ||
600 iopt_allowed_iter_first(iopt, start, last))
601 return -EADDRINUSE;
602
603 reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
604 if (!reserved)
605 return -ENOMEM;
606 reserved->node.start = start;
607 reserved->node.last = last;
608 reserved->owner = owner;
609 interval_tree_insert(&reserved->node, &iopt->reserved_itree);
610 return 0;
611 }
612
__iopt_remove_reserved_iova(struct io_pagetable * iopt,void * owner)613 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
614 {
615 struct iopt_reserved *reserved, *next;
616
617 lockdep_assert_held_write(&iopt->iova_rwsem);
618
619 for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
620 reserved = next) {
621 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
622
623 if (reserved->owner == owner) {
624 interval_tree_remove(&reserved->node,
625 &iopt->reserved_itree);
626 kfree(reserved);
627 }
628 }
629 }
630
iopt_remove_reserved_iova(struct io_pagetable * iopt,void * owner)631 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
632 {
633 down_write(&iopt->iova_rwsem);
634 __iopt_remove_reserved_iova(iopt, owner);
635 up_write(&iopt->iova_rwsem);
636 }
637
iopt_init_table(struct io_pagetable * iopt)638 void iopt_init_table(struct io_pagetable *iopt)
639 {
640 init_rwsem(&iopt->iova_rwsem);
641 init_rwsem(&iopt->domains_rwsem);
642 iopt->area_itree = RB_ROOT_CACHED;
643 iopt->allowed_itree = RB_ROOT_CACHED;
644 iopt->reserved_itree = RB_ROOT_CACHED;
645 xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
646 xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
647
648 /*
649 * iopt's start as SW tables that can use the entire size_t IOVA space
650 * due to the use of size_t in the APIs. They have no alignment
651 * restriction.
652 */
653 iopt->iova_alignment = 1;
654 }
655
iopt_destroy_table(struct io_pagetable * iopt)656 void iopt_destroy_table(struct io_pagetable *iopt)
657 {
658 struct interval_tree_node *node;
659
660 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
661 iopt_remove_reserved_iova(iopt, NULL);
662
663 while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
664 ULONG_MAX))) {
665 interval_tree_remove(node, &iopt->allowed_itree);
666 kfree(container_of(node, struct iopt_allowed, node));
667 }
668
669 WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
670 WARN_ON(!xa_empty(&iopt->domains));
671 WARN_ON(!xa_empty(&iopt->access_list));
672 WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
673 }
674
675 /**
676 * iopt_unfill_domain() - Unfill a domain with PFNs
677 * @iopt: io_pagetable to act on
678 * @domain: domain to unfill
679 *
680 * This is used when removing a domain from the iopt. Every area in the iopt
681 * will be unmapped from the domain. The domain must already be removed from the
682 * domains xarray.
683 */
iopt_unfill_domain(struct io_pagetable * iopt,struct iommu_domain * domain)684 static void iopt_unfill_domain(struct io_pagetable *iopt,
685 struct iommu_domain *domain)
686 {
687 struct iopt_area *area;
688
689 lockdep_assert_held(&iopt->iova_rwsem);
690 lockdep_assert_held_write(&iopt->domains_rwsem);
691
692 /*
693 * Some other domain is holding all the pfns still, rapidly unmap this
694 * domain.
695 */
696 if (iopt->next_domain_id != 0) {
697 /* Pick an arbitrary remaining domain to act as storage */
698 struct iommu_domain *storage_domain =
699 xa_load(&iopt->domains, 0);
700
701 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
702 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
703 struct iopt_pages *pages = area->pages;
704
705 if (!pages)
706 continue;
707
708 mutex_lock(&pages->mutex);
709 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
710 WARN_ON(!area->storage_domain);
711 if (area->storage_domain == domain)
712 area->storage_domain = storage_domain;
713 mutex_unlock(&pages->mutex);
714
715 iopt_area_unmap_domain(area, domain);
716 }
717 return;
718 }
719
720 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
721 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
722 struct iopt_pages *pages = area->pages;
723
724 if (!pages)
725 continue;
726
727 mutex_lock(&pages->mutex);
728 interval_tree_remove(&area->pages_node, &pages->domains_itree);
729 WARN_ON(area->storage_domain != domain);
730 area->storage_domain = NULL;
731 iopt_area_unfill_domain(area, pages, domain);
732 mutex_unlock(&pages->mutex);
733 }
734 }
735
736 /**
737 * iopt_fill_domain() - Fill a domain with PFNs
738 * @iopt: io_pagetable to act on
739 * @domain: domain to fill
740 *
741 * Fill the domain with PFNs from every area in the iopt. On failure the domain
742 * is left unchanged.
743 */
iopt_fill_domain(struct io_pagetable * iopt,struct iommu_domain * domain)744 static int iopt_fill_domain(struct io_pagetable *iopt,
745 struct iommu_domain *domain)
746 {
747 struct iopt_area *end_area;
748 struct iopt_area *area;
749 int rc;
750
751 lockdep_assert_held(&iopt->iova_rwsem);
752 lockdep_assert_held_write(&iopt->domains_rwsem);
753
754 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
755 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
756 struct iopt_pages *pages = area->pages;
757
758 if (!pages)
759 continue;
760
761 mutex_lock(&pages->mutex);
762 rc = iopt_area_fill_domain(area, domain);
763 if (rc) {
764 mutex_unlock(&pages->mutex);
765 goto out_unfill;
766 }
767 if (!area->storage_domain) {
768 WARN_ON(iopt->next_domain_id != 0);
769 area->storage_domain = domain;
770 interval_tree_insert(&area->pages_node,
771 &pages->domains_itree);
772 }
773 mutex_unlock(&pages->mutex);
774 }
775 return 0;
776
777 out_unfill:
778 end_area = area;
779 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
780 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
781 struct iopt_pages *pages = area->pages;
782
783 if (area == end_area)
784 break;
785 if (!pages)
786 continue;
787 mutex_lock(&pages->mutex);
788 if (iopt->next_domain_id == 0) {
789 interval_tree_remove(&area->pages_node,
790 &pages->domains_itree);
791 area->storage_domain = NULL;
792 }
793 iopt_area_unfill_domain(area, pages, domain);
794 mutex_unlock(&pages->mutex);
795 }
796 return rc;
797 }
798
799 /* All existing area's conform to an increased page size */
iopt_check_iova_alignment(struct io_pagetable * iopt,unsigned long new_iova_alignment)800 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
801 unsigned long new_iova_alignment)
802 {
803 unsigned long align_mask = new_iova_alignment - 1;
804 struct iopt_area *area;
805
806 lockdep_assert_held(&iopt->iova_rwsem);
807 lockdep_assert_held(&iopt->domains_rwsem);
808
809 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
810 area = iopt_area_iter_next(area, 0, ULONG_MAX))
811 if ((iopt_area_iova(area) & align_mask) ||
812 (iopt_area_length(area) & align_mask) ||
813 (area->page_offset & align_mask))
814 return -EADDRINUSE;
815
816 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
817 struct iommufd_access *access;
818 unsigned long index;
819
820 xa_for_each(&iopt->access_list, index, access)
821 if (WARN_ON(access->iova_alignment >
822 new_iova_alignment))
823 return -EADDRINUSE;
824 }
825 return 0;
826 }
827
iopt_table_add_domain(struct io_pagetable * iopt,struct iommu_domain * domain)828 int iopt_table_add_domain(struct io_pagetable *iopt,
829 struct iommu_domain *domain)
830 {
831 const struct iommu_domain_geometry *geometry = &domain->geometry;
832 struct iommu_domain *iter_domain;
833 unsigned int new_iova_alignment;
834 unsigned long index;
835 int rc;
836
837 down_write(&iopt->domains_rwsem);
838 down_write(&iopt->iova_rwsem);
839
840 xa_for_each(&iopt->domains, index, iter_domain) {
841 if (WARN_ON(iter_domain == domain)) {
842 rc = -EEXIST;
843 goto out_unlock;
844 }
845 }
846
847 /*
848 * The io page size drives the iova_alignment. Internally the iopt_pages
849 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
850 * objects into the iommu_domain.
851 *
852 * A iommu_domain must always be able to accept PAGE_SIZE to be
853 * compatible as we can't guarantee higher contiguity.
854 */
855 new_iova_alignment = max_t(unsigned long,
856 1UL << __ffs(domain->pgsize_bitmap),
857 iopt->iova_alignment);
858 if (new_iova_alignment > PAGE_SIZE) {
859 rc = -EINVAL;
860 goto out_unlock;
861 }
862 if (new_iova_alignment != iopt->iova_alignment) {
863 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
864 if (rc)
865 goto out_unlock;
866 }
867
868 /* No area exists that is outside the allowed domain aperture */
869 if (geometry->aperture_start != 0) {
870 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
871 domain);
872 if (rc)
873 goto out_reserved;
874 }
875 if (geometry->aperture_end != ULONG_MAX) {
876 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
877 ULONG_MAX, domain);
878 if (rc)
879 goto out_reserved;
880 }
881
882 rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
883 if (rc)
884 goto out_reserved;
885
886 rc = iopt_fill_domain(iopt, domain);
887 if (rc)
888 goto out_release;
889
890 iopt->iova_alignment = new_iova_alignment;
891 xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
892 iopt->next_domain_id++;
893 up_write(&iopt->iova_rwsem);
894 up_write(&iopt->domains_rwsem);
895 return 0;
896 out_release:
897 xa_release(&iopt->domains, iopt->next_domain_id);
898 out_reserved:
899 __iopt_remove_reserved_iova(iopt, domain);
900 out_unlock:
901 up_write(&iopt->iova_rwsem);
902 up_write(&iopt->domains_rwsem);
903 return rc;
904 }
905
iopt_calculate_iova_alignment(struct io_pagetable * iopt)906 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
907 {
908 unsigned long new_iova_alignment;
909 struct iommufd_access *access;
910 struct iommu_domain *domain;
911 unsigned long index;
912
913 lockdep_assert_held_write(&iopt->iova_rwsem);
914 lockdep_assert_held(&iopt->domains_rwsem);
915
916 /* See batch_iommu_map_small() */
917 if (iopt->disable_large_pages)
918 new_iova_alignment = PAGE_SIZE;
919 else
920 new_iova_alignment = 1;
921
922 xa_for_each(&iopt->domains, index, domain)
923 new_iova_alignment = max_t(unsigned long,
924 1UL << __ffs(domain->pgsize_bitmap),
925 new_iova_alignment);
926 xa_for_each(&iopt->access_list, index, access)
927 new_iova_alignment = max_t(unsigned long,
928 access->iova_alignment,
929 new_iova_alignment);
930
931 if (new_iova_alignment > iopt->iova_alignment) {
932 int rc;
933
934 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
935 if (rc)
936 return rc;
937 }
938 iopt->iova_alignment = new_iova_alignment;
939 return 0;
940 }
941
iopt_table_remove_domain(struct io_pagetable * iopt,struct iommu_domain * domain)942 void iopt_table_remove_domain(struct io_pagetable *iopt,
943 struct iommu_domain *domain)
944 {
945 struct iommu_domain *iter_domain = NULL;
946 unsigned long index;
947
948 down_write(&iopt->domains_rwsem);
949 down_write(&iopt->iova_rwsem);
950
951 xa_for_each(&iopt->domains, index, iter_domain)
952 if (iter_domain == domain)
953 break;
954 if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
955 goto out_unlock;
956
957 /*
958 * Compress the xarray to keep it linear by swapping the entry to erase
959 * with the tail entry and shrinking the tail.
960 */
961 iopt->next_domain_id--;
962 iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
963 if (index != iopt->next_domain_id)
964 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
965
966 iopt_unfill_domain(iopt, domain);
967 __iopt_remove_reserved_iova(iopt, domain);
968
969 WARN_ON(iopt_calculate_iova_alignment(iopt));
970 out_unlock:
971 up_write(&iopt->iova_rwsem);
972 up_write(&iopt->domains_rwsem);
973 }
974
975 /**
976 * iopt_area_split - Split an area into two parts at iova
977 * @area: The area to split
978 * @iova: Becomes the last of a new area
979 *
980 * This splits an area into two. It is part of the VFIO compatibility to allow
981 * poking a hole in the mapping. The two areas continue to point at the same
982 * iopt_pages, just with different starting bytes.
983 */
iopt_area_split(struct iopt_area * area,unsigned long iova)984 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
985 {
986 unsigned long alignment = area->iopt->iova_alignment;
987 unsigned long last_iova = iopt_area_last_iova(area);
988 unsigned long start_iova = iopt_area_iova(area);
989 unsigned long new_start = iova + 1;
990 struct io_pagetable *iopt = area->iopt;
991 struct iopt_pages *pages = area->pages;
992 struct iopt_area *lhs;
993 struct iopt_area *rhs;
994 int rc;
995
996 lockdep_assert_held_write(&iopt->iova_rwsem);
997
998 if (iova == start_iova || iova == last_iova)
999 return 0;
1000
1001 if (!pages || area->prevent_access)
1002 return -EBUSY;
1003
1004 if (new_start & (alignment - 1) ||
1005 iopt_area_start_byte(area, new_start) & (alignment - 1))
1006 return -EINVAL;
1007
1008 lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1009 if (!lhs)
1010 return -ENOMEM;
1011
1012 rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1013 if (!rhs) {
1014 rc = -ENOMEM;
1015 goto err_free_lhs;
1016 }
1017
1018 mutex_lock(&pages->mutex);
1019 /*
1020 * Splitting is not permitted if an access exists, we don't track enough
1021 * information to split existing accesses.
1022 */
1023 if (area->num_accesses) {
1024 rc = -EINVAL;
1025 goto err_unlock;
1026 }
1027
1028 /*
1029 * Splitting is not permitted if a domain could have been mapped with
1030 * huge pages.
1031 */
1032 if (area->storage_domain && !iopt->disable_large_pages) {
1033 rc = -EINVAL;
1034 goto err_unlock;
1035 }
1036
1037 interval_tree_remove(&area->node, &iopt->area_itree);
1038 rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1039 iopt_area_start_byte(area, start_iova),
1040 (new_start - 1) - start_iova + 1,
1041 area->iommu_prot);
1042 if (WARN_ON(rc))
1043 goto err_insert;
1044
1045 rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1046 iopt_area_start_byte(area, new_start),
1047 last_iova - new_start + 1, area->iommu_prot);
1048 if (WARN_ON(rc))
1049 goto err_remove_lhs;
1050
1051 lhs->storage_domain = area->storage_domain;
1052 lhs->pages = area->pages;
1053 rhs->storage_domain = area->storage_domain;
1054 rhs->pages = area->pages;
1055 kref_get(&rhs->pages->kref);
1056 kfree(area);
1057 mutex_unlock(&pages->mutex);
1058
1059 /*
1060 * No change to domains or accesses because the pages hasn't been
1061 * changed
1062 */
1063 return 0;
1064
1065 err_remove_lhs:
1066 interval_tree_remove(&lhs->node, &iopt->area_itree);
1067 err_insert:
1068 interval_tree_insert(&area->node, &iopt->area_itree);
1069 err_unlock:
1070 mutex_unlock(&pages->mutex);
1071 kfree(rhs);
1072 err_free_lhs:
1073 kfree(lhs);
1074 return rc;
1075 }
1076
iopt_cut_iova(struct io_pagetable * iopt,unsigned long * iovas,size_t num_iovas)1077 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1078 size_t num_iovas)
1079 {
1080 int rc = 0;
1081 int i;
1082
1083 down_write(&iopt->iova_rwsem);
1084 for (i = 0; i < num_iovas; i++) {
1085 struct iopt_area *area;
1086
1087 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1088 if (!area)
1089 continue;
1090 rc = iopt_area_split(area, iovas[i]);
1091 if (rc)
1092 break;
1093 }
1094 up_write(&iopt->iova_rwsem);
1095 return rc;
1096 }
1097
iopt_enable_large_pages(struct io_pagetable * iopt)1098 void iopt_enable_large_pages(struct io_pagetable *iopt)
1099 {
1100 int rc;
1101
1102 down_write(&iopt->domains_rwsem);
1103 down_write(&iopt->iova_rwsem);
1104 WRITE_ONCE(iopt->disable_large_pages, false);
1105 rc = iopt_calculate_iova_alignment(iopt);
1106 WARN_ON(rc);
1107 up_write(&iopt->iova_rwsem);
1108 up_write(&iopt->domains_rwsem);
1109 }
1110
iopt_disable_large_pages(struct io_pagetable * iopt)1111 int iopt_disable_large_pages(struct io_pagetable *iopt)
1112 {
1113 int rc = 0;
1114
1115 down_write(&iopt->domains_rwsem);
1116 down_write(&iopt->iova_rwsem);
1117 if (iopt->disable_large_pages)
1118 goto out_unlock;
1119
1120 /* Won't do it if domains already have pages mapped in them */
1121 if (!xa_empty(&iopt->domains) &&
1122 !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1123 rc = -EINVAL;
1124 goto out_unlock;
1125 }
1126
1127 WRITE_ONCE(iopt->disable_large_pages, true);
1128 rc = iopt_calculate_iova_alignment(iopt);
1129 if (rc)
1130 WRITE_ONCE(iopt->disable_large_pages, false);
1131 out_unlock:
1132 up_write(&iopt->iova_rwsem);
1133 up_write(&iopt->domains_rwsem);
1134 return rc;
1135 }
1136
iopt_add_access(struct io_pagetable * iopt,struct iommufd_access * access)1137 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1138 {
1139 int rc;
1140
1141 down_write(&iopt->domains_rwsem);
1142 down_write(&iopt->iova_rwsem);
1143 rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1144 xa_limit_16b, GFP_KERNEL_ACCOUNT);
1145 if (rc)
1146 goto out_unlock;
1147
1148 rc = iopt_calculate_iova_alignment(iopt);
1149 if (rc) {
1150 xa_erase(&iopt->access_list, access->iopt_access_list_id);
1151 goto out_unlock;
1152 }
1153
1154 out_unlock:
1155 up_write(&iopt->iova_rwsem);
1156 up_write(&iopt->domains_rwsem);
1157 return rc;
1158 }
1159
/*
 * Erase the entry at @iopt_access_list_id from iopt->access_list and
 * recompute the IOVA alignment. Warns if the erased entry is not @access or
 * if the alignment recomputation fails.
 */
void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	void *erased;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	erased = xa_erase(&iopt->access_list, iopt_access_list_id);
	WARN_ON(erased != access);

	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);

	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
1171
1172 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
iopt_table_enforce_dev_resv_regions(struct io_pagetable * iopt,struct device * dev,phys_addr_t * sw_msi_start)1173 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1174 struct device *dev,
1175 phys_addr_t *sw_msi_start)
1176 {
1177 struct iommu_resv_region *resv;
1178 LIST_HEAD(resv_regions);
1179 unsigned int num_hw_msi = 0;
1180 unsigned int num_sw_msi = 0;
1181 int rc;
1182
1183 if (iommufd_should_fail())
1184 return -EINVAL;
1185
1186 down_write(&iopt->iova_rwsem);
1187 /* FIXME: drivers allocate memory but there is no failure propogated */
1188 iommu_get_resv_regions(dev, &resv_regions);
1189
1190 list_for_each_entry(resv, &resv_regions, list) {
1191 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1192 continue;
1193
1194 if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1195 num_hw_msi++;
1196 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1197 *sw_msi_start = resv->start;
1198 num_sw_msi++;
1199 }
1200
1201 rc = iopt_reserve_iova(iopt, resv->start,
1202 resv->length - 1 + resv->start, dev);
1203 if (rc)
1204 goto out_reserved;
1205 }
1206
1207 /* Drivers must offer sane combinations of regions */
1208 if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1209 rc = -EINVAL;
1210 goto out_reserved;
1211 }
1212
1213 rc = 0;
1214 goto out_free_resv;
1215
1216 out_reserved:
1217 __iopt_remove_reserved_iova(iopt, dev);
1218 out_free_resv:
1219 iommu_put_resv_regions(dev, &resv_regions);
1220 up_write(&iopt->iova_rwsem);
1221 return rc;
1222 }
1223