1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  *
12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14  * VT-d, but that makes it harder to re-use as theoretically anyone
15  * implementing a similar IOMMU could make use of this.  We expect the
16  * IOMMU to support the IOMMU API and have few to no restrictions around
17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18  * optimized for relatively static mappings of a userspace process with
19  * userspace pages pinned into memory.  We also assume devices and IOMMU
20  * domains are PCI based as the IOMMU API is still centered around a
21  * device/bus interface rather than a group interface.
22  */
23 
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/iommu.h>
28 #include <linux/module.h>
29 #include <linux/mm.h>
30 #include <linux/rbtree.h>
31 #include <linux/sched/signal.h>
32 #include <linux/sched/mm.h>
33 #include <linux/slab.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/workqueue.h>
37 #include <linux/mdev.h>
38 #include <linux/notifier.h>
39 #include <linux/dma-iommu.h>
40 #include <linux/irqdomain.h>
41 
42 #define DRIVER_VERSION  "0.2"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
45 
46 static bool allow_unsafe_interrupts;
47 module_param_named(allow_unsafe_interrupts,
48 		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
49 MODULE_PARM_DESC(allow_unsafe_interrupts,
50 		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
51 
52 static bool disable_hugepages;
53 module_param_named(disable_hugepages,
54 		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(disable_hugepages,
56 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
57 
58 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
60 MODULE_PARM_DESC(dma_entry_limit,
61 		 "Maximum number of user DMA mappings per container (65535).");
62 
63 struct vfio_iommu {
64 	struct list_head	domain_list;
65 	struct list_head	iova_list;
66 	struct vfio_domain	*external_domain; /* domain for external user */
67 	struct mutex		lock;
68 	struct rb_root		dma_list;
69 	struct blocking_notifier_head notifier;
70 	unsigned int		dma_avail;
71 	bool			v2;
72 	bool			nesting;
73 };
74 
75 struct vfio_domain {
76 	struct iommu_domain	*domain;
77 	struct list_head	next;
78 	struct list_head	group_list;
79 	int			prot;		/* IOMMU_CACHE */
80 	bool			fgsp;		/* Fine-grained super pages */
81 };
82 
83 struct vfio_dma {
84 	struct rb_node		node;
85 	dma_addr_t		iova;		/* Device address */
86 	unsigned long		vaddr;		/* Process virtual addr */
87 	size_t			size;		/* Map size (bytes) */
88 	int			prot;		/* IOMMU_READ/WRITE */
89 	bool			iommu_mapped;
90 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
91 	struct task_struct	*task;
92 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
93 };
94 
95 struct vfio_group {
96 	struct iommu_group	*iommu_group;
97 	struct list_head	next;
98 	bool			mdev_group;	/* An mdev group */
99 };
100 
101 struct vfio_iova {
102 	struct list_head	list;
103 	dma_addr_t		start;
104 	dma_addr_t		end;
105 };
106 
107 /*
108  * Guest RAM pinning working set or DMA target
109  */
110 struct vfio_pfn {
111 	struct rb_node		node;
112 	dma_addr_t		iova;		/* Device address */
113 	unsigned long		pfn;		/* Host pfn */
114 	atomic_t		ref_count;
115 };
116 
117 struct vfio_regions {
118 	struct list_head list;
119 	dma_addr_t iova;
120 	phys_addr_t phys;
121 	size_t len;
122 };
123 
124 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
125 					(!list_empty(&iommu->domain_list))
126 
127 static int put_pfn(unsigned long pfn, int prot);
128 
129 /*
130  * This code handles mapping and unmapping of user data buffers
131  * into DMA'ble space using the IOMMU
132  */
133 
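/*
 * Look up the vfio_dma whose IOVA range overlaps [start, start + size).
 * Returns NULL if no tracked mapping intersects the range.
 */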
134 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
135 				      dma_addr_t start, size_t size)
136 {
137 	struct rb_node *node = iommu->dma_list.rb_node;
138 
139 	while (node) {
140 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
141 
142 		if (start + size <= dma->iova)
143 			node = node->rb_left;
144 		else if (start >= dma->iova + dma->size)
145 			node = node->rb_right;
146 		else
147 			return dma;
148 	}
149 
150 	return NULL;
151 }
152 
153 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
154 {
155 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
156 	struct vfio_dma *dma;
157 
158 	while (*link) {
159 		parent = *link;
160 		dma = rb_entry(parent, struct vfio_dma, node);
161 
162 		if (new->iova + new->size <= dma->iova)
163 			link = &(*link)->rb_left;
164 		else
165 			link = &(*link)->rb_right;
166 	}
167 
168 	rb_link_node(&new->node, parent, link);
169 	rb_insert_color(&new->node, &iommu->dma_list);
170 }
171 
172 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
173 {
174 	rb_erase(&old->node, &iommu->dma_list);
175 }
176 
177 /*
178  * Helper Functions for host iova-pfn list
179  */
180 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
181 {
182 	struct vfio_pfn *vpfn;
183 	struct rb_node *node = dma->pfn_list.rb_node;
184 
185 	while (node) {
186 		vpfn = rb_entry(node, struct vfio_pfn, node);
187 
188 		if (iova < vpfn->iova)
189 			node = node->rb_left;
190 		else if (iova > vpfn->iova)
191 			node = node->rb_right;
192 		else
193 			return vpfn;
194 	}
195 	return NULL;
196 }
197 
198 static void vfio_link_pfn(struct vfio_dma *dma,
199 			  struct vfio_pfn *new)
200 {
201 	struct rb_node **link, *parent = NULL;
202 	struct vfio_pfn *vpfn;
203 
204 	link = &dma->pfn_list.rb_node;
205 	while (*link) {
206 		parent = *link;
207 		vpfn = rb_entry(parent, struct vfio_pfn, node);
208 
209 		if (new->iova < vpfn->iova)
210 			link = &(*link)->rb_left;
211 		else
212 			link = &(*link)->rb_right;
213 	}
214 
215 	rb_link_node(&new->node, parent, link);
216 	rb_insert_color(&new->node, &dma->pfn_list);
217 }
218 
219 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
220 {
221 	rb_erase(&old->node, &dma->pfn_list);
222 }
223 
224 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
225 				unsigned long pfn)
226 {
227 	struct vfio_pfn *vpfn;
228 
229 	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
230 	if (!vpfn)
231 		return -ENOMEM;
232 
233 	vpfn->iova = iova;
234 	vpfn->pfn = pfn;
235 	atomic_set(&vpfn->ref_count, 1);
236 	vfio_link_pfn(dma, vpfn);
237 	return 0;
238 }
239 
240 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
241 				      struct vfio_pfn *vpfn)
242 {
243 	vfio_unlink_pfn(dma, vpfn);
244 	kfree(vpfn);
245 }
246 
247 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
248 					       unsigned long iova)
249 {
250 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
251 
252 	if (vpfn)
253 		atomic_inc(&vpfn->ref_count);
254 	return vpfn;
255 }
256 
257 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
258 {
259 	int ret = 0;
260 
261 	if (atomic_dec_and_test(&vpfn->ref_count)) {
262 		ret = put_pfn(vpfn->pfn, dma->prot);
263 		vfio_remove_from_pfn_list(dma, vpfn);
264 	}
265 	return ret;
266 }
267 
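/*
 * Adjust the locked_vm accounting of the task that created this mapping.
 * @npage may be negative to undo a previous charge.  With @async set, the
 * mm is taken via get_task_mm() since we may not be running in the context
 * of the mapping task.
 */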
268 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
269 {
270 	struct mm_struct *mm;
271 	int ret;
272 
273 	if (!npage)
274 		return 0;
275 
276 	mm = async ? get_task_mm(dma->task) : dma->task->mm;
277 	if (!mm)
278 		return -ESRCH; /* process exited */
279 
280 	ret = down_write_killable(&mm->mmap_sem);
281 	if (!ret) {
282 		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
283 					  dma->lock_cap);
284 		up_write(&mm->mmap_sem);
285 	}
286 
287 	if (async)
288 		mmput(mm);
289 
290 	return ret;
291 }
292 
293 /*
294  * Some mappings aren't backed by a struct page, for example an mmap'd
295  * MMIO range for our own or another device.  These use a different
296  * pfn conversion and shouldn't be tracked as locked pages.
297  */
298 static bool is_invalid_reserved_pfn(unsigned long pfn)
299 {
300 	if (pfn_valid(pfn)) {
301 		bool reserved;
302 		struct page *tail = pfn_to_page(pfn);
303 		struct page *head = compound_head(tail);
304 		reserved = !!(PageReserved(head));
305 		if (head != tail) {
306 			/*
307 			 * "head" is not a dangling pointer
308 			 * (compound_head takes care of that)
309 			 * but the hugepage may have been split
310 			 * from under us (and we may not hold a
311 			 * reference count on the head page so it can
312 			 * be reused before we run PageReferenced), so
313 			 * we have to check PageTail before returning
314 			 * what we just read.
315 			 */
316 			smp_rmb();
317 			if (PageTail(tail))
318 				return reserved;
319 		}
320 		return PageReserved(tail);
321 	}
322 
323 	return true;
324 }
325 
326 static int put_pfn(unsigned long pfn, int prot)
327 {
328 	if (!is_invalid_reserved_pfn(pfn)) {
329 		struct page *page = pfn_to_page(pfn);
330 		if (prot & IOMMU_WRITE)
331 			SetPageDirty(page);
332 		put_page(page);
333 		return 1;
334 	}
335 	return 0;
336 }
337 
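/*
 * Translate a user virtual address into a host pfn, either by pinning the
 * backing page via get_user_pages*() or, for VM_PFNMAP vmas (e.g. mmap'd
 * MMIO), by computing the pfn directly without taking a page reference.
 */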
338 static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
339 			 int prot, unsigned long *pfn)
340 {
341 	struct page *page[1];
342 	struct vm_area_struct *vma;
343 	struct vm_area_struct *vmas[1];
344 	unsigned int flags = 0;
345 	int ret;
346 
347 	if (prot & IOMMU_WRITE)
348 		flags |= FOLL_WRITE;
349 
350 	down_read(&mm->mmap_sem);
351 	if (mm == current->mm) {
352 		ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
353 				     vmas);
354 	} else {
355 		ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
356 					    vmas, NULL);
357 		/*
358 		 * The lifetime of a vaddr_get_pfn() page pin is
359 		 * userspace-controlled. In the fs-dax case this could
360 		 * lead to indefinite stalls in filesystem operations.
361 		 * Disallow attempts to pin fs-dax pages via this
362 		 * interface.
363 		 */
364 		if (ret > 0 && vma_is_fsdax(vmas[0])) {
365 			ret = -EOPNOTSUPP;
366 			put_page(page[0]);
367 		}
368 	}
369 	up_read(&mm->mmap_sem);
370 
371 	if (ret == 1) {
372 		*pfn = page_to_pfn(page[0]);
373 		return 0;
374 	}
375 
376 	down_read(&mm->mmap_sem);
377 
378 	vaddr = untagged_addr(vaddr);
379 
380 	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
381 
382 	if (vma && vma->vm_flags & VM_PFNMAP) {
383 		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
384 		if (is_invalid_reserved_pfn(*pfn))
385 			ret = 0;
386 	}
387 
388 	up_read(&mm->mmap_sem);
389 	return ret;
390 }
391 
392 /*
393  * Attempt to pin pages.  We really don't want to track all the pfns and
394  * the iommu can only map chunks of consecutive pfns anyway, so get the
395  * first page and all consecutive pages with the same locking.
396  */
397 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
398 				  long npage, unsigned long *pfn_base,
399 				  unsigned long limit)
400 {
401 	unsigned long pfn = 0;
402 	long ret, pinned = 0, lock_acct = 0;
403 	bool rsvd;
404 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
405 
406 	/* This code path is only user initiated */
407 	if (!current->mm)
408 		return -ENODEV;
409 
410 	ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
411 	if (ret)
412 		return ret;
413 
414 	pinned++;
415 	rsvd = is_invalid_reserved_pfn(*pfn_base);
416 
417 	/*
418 	 * Reserved pages aren't counted against the user, externally pinned
419 	 * pages are already counted against the user.
420 	 */
421 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
422 		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
423 			put_pfn(*pfn_base, dma->prot);
424 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
425 					limit << PAGE_SHIFT);
426 			return -ENOMEM;
427 		}
428 		lock_acct++;
429 	}
430 
431 	if (unlikely(disable_hugepages))
432 		goto out;
433 
434 	/* Lock all the consecutive pages from pfn_base */
435 	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
436 	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
437 		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
438 		if (ret)
439 			break;
440 
441 		if (pfn != *pfn_base + pinned ||
442 		    rsvd != is_invalid_reserved_pfn(pfn)) {
443 			put_pfn(pfn, dma->prot);
444 			break;
445 		}
446 
447 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
448 			if (!dma->lock_cap &&
449 			    current->mm->locked_vm + lock_acct + 1 > limit) {
450 				put_pfn(pfn, dma->prot);
451 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
452 					__func__, limit << PAGE_SHIFT);
453 				ret = -ENOMEM;
454 				goto unpin_out;
455 			}
456 			lock_acct++;
457 		}
458 	}
459 
460 out:
461 	ret = vfio_lock_acct(dma, lock_acct, false);
462 
463 unpin_out:
464 	if (ret) {
465 		if (!rsvd) {
466 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
467 				put_pfn(pfn, dma->prot);
468 		}
469 
470 		return ret;
471 	}
472 
473 	return pinned;
474 }
475 
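/*
 * Drop the references taken by vfio_pin_pages_remote() for @npage pages
 * starting at @pfn.  Pages still held externally via the pfn_list remain
 * accounted as locked.  Returns the number of page references dropped.
 */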
476 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
477 				    unsigned long pfn, long npage,
478 				    bool do_accounting)
479 {
480 	long unlocked = 0, locked = 0;
481 	long i;
482 
483 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
484 		if (put_pfn(pfn++, dma->prot)) {
485 			unlocked++;
486 			if (vfio_find_vpfn(dma, iova))
487 				locked++;
488 		}
489 	}
490 
491 	if (do_accounting)
492 		vfio_lock_acct(dma, locked - unlocked, true);
493 
494 	return unlocked;
495 }
496 
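/*
 * Pin a single page on behalf of an external (mdev) user, using the mm of
 * the task that created the mapping and, if @do_accounting is set,
 * charging it against that task's RLIMIT_MEMLOCK.
 */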
497 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
498 				  unsigned long *pfn_base, bool do_accounting)
499 {
500 	struct mm_struct *mm;
501 	int ret;
502 
503 	mm = get_task_mm(dma->task);
504 	if (!mm)
505 		return -ENODEV;
506 
507 	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
508 	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
509 		ret = vfio_lock_acct(dma, 1, true);
510 		if (ret) {
511 			put_pfn(*pfn_base, dma->prot);
512 			if (ret == -ENOMEM)
513 				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
514 					"(%ld) exceeded\n", __func__,
515 					dma->task->comm, task_pid_nr(dma->task),
516 					task_rlimit(dma->task, RLIMIT_MEMLOCK));
517 		}
518 	}
519 
520 	mmput(mm);
521 	return ret;
522 }
523 
524 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
525 				    bool do_accounting)
526 {
527 	int unlocked;
528 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
529 
530 	if (!vpfn)
531 		return 0;
532 
533 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
534 
535 	if (do_accounting)
536 		vfio_lock_acct(dma, -unlocked, true);
537 
538 	return unlocked;
539 }
540 
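/*
 * External pinning callback (used by mdev vendor drivers): translate each
 * user pfn (IOVA >> PAGE_SHIFT) into a host physical pfn, pinning the page
 * and tracking it in the owning vfio_dma's pfn_list.  Returns the number
 * of pages pinned or a negative errno.
 */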
541 static int vfio_iommu_type1_pin_pages(void *iommu_data,
542 				      unsigned long *user_pfn,
543 				      int npage, int prot,
544 				      unsigned long *phys_pfn)
545 {
546 	struct vfio_iommu *iommu = iommu_data;
547 	int i, j, ret;
548 	unsigned long remote_vaddr;
549 	struct vfio_dma *dma;
550 	bool do_accounting;
551 
552 	if (!iommu || !user_pfn || !phys_pfn)
553 		return -EINVAL;
554 
555 	/* Supported for v2 version only */
556 	if (!iommu->v2)
557 		return -EACCES;
558 
559 	mutex_lock(&iommu->lock);
560 
561 	/* Fail if notifier list is empty */
562 	if (!iommu->notifier.head) {
563 		ret = -EINVAL;
564 		goto pin_done;
565 	}
566 
567 	/*
568 	 * If iommu capable domain exist in the container then all pages are
569 	 * already pinned and accounted. Accounting should be done if there is no
570 	 * iommu capable domain in the container.
571 	 */
572 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
573 
574 	for (i = 0; i < npage; i++) {
575 		dma_addr_t iova;
576 		struct vfio_pfn *vpfn;
577 
578 		iova = user_pfn[i] << PAGE_SHIFT;
579 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
580 		if (!dma) {
581 			ret = -EINVAL;
582 			goto pin_unwind;
583 		}
584 
585 		if ((dma->prot & prot) != prot) {
586 			ret = -EPERM;
587 			goto pin_unwind;
588 		}
589 
590 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
591 		if (vpfn) {
592 			phys_pfn[i] = vpfn->pfn;
593 			continue;
594 		}
595 
596 		remote_vaddr = dma->vaddr + iova - dma->iova;
597 		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
598 					     do_accounting);
599 		if (ret)
600 			goto pin_unwind;
601 
602 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
603 		if (ret) {
604 			vfio_unpin_page_external(dma, iova, do_accounting);
605 			goto pin_unwind;
606 		}
607 	}
608 
609 	ret = i;
610 	goto pin_done;
611 
612 pin_unwind:
613 	phys_pfn[i] = 0;
614 	for (j = 0; j < i; j++) {
615 		dma_addr_t iova;
616 
617 		iova = user_pfn[j] << PAGE_SHIFT;
618 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
619 		vfio_unpin_page_external(dma, iova, do_accounting);
620 		phys_pfn[j] = 0;
621 	}
622 pin_done:
623 	mutex_unlock(&iommu->lock);
624 	return ret;
625 }
626 
627 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
628 					unsigned long *user_pfn,
629 					int npage)
630 {
631 	struct vfio_iommu *iommu = iommu_data;
632 	bool do_accounting;
633 	int i;
634 
635 	if (!iommu || !user_pfn)
636 		return -EINVAL;
637 
638 	/* Supported for v2 version only */
639 	if (!iommu->v2)
640 		return -EACCES;
641 
642 	mutex_lock(&iommu->lock);
643 
644 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
645 	for (i = 0; i < npage; i++) {
646 		struct vfio_dma *dma;
647 		dma_addr_t iova;
648 
649 		iova = user_pfn[i] << PAGE_SHIFT;
650 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
651 		if (!dma)
652 			goto unpin_exit;
653 		vfio_unpin_page_external(dma, iova, do_accounting);
654 	}
655 
656 unpin_exit:
657 	mutex_unlock(&iommu->lock);
658 	return i > npage ? npage : (i > 0 ? i : -EINVAL);
659 }
660 
661 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
662 			    struct list_head *regions,
663 			    struct iommu_iotlb_gather *iotlb_gather)
664 {
665 	long unlocked = 0;
666 	struct vfio_regions *entry, *next;
667 
668 	iommu_tlb_sync(domain->domain, iotlb_gather);
669 
670 	list_for_each_entry_safe(entry, next, regions, list) {
671 		unlocked += vfio_unpin_pages_remote(dma,
672 						    entry->iova,
673 						    entry->phys >> PAGE_SHIFT,
674 						    entry->len >> PAGE_SHIFT,
675 						    false);
676 		list_del(&entry->list);
677 		kfree(entry);
678 	}
679 
680 	cond_resched();
681 
682 	return unlocked;
683 }
684 
685 /*
686  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
687  * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep track
688  * of these regions (currently using a list).
689  *
690  * This value specifies maximum number of regions for each IOTLB flush sync.
691  */
692 #define VFIO_IOMMU_TLB_SYNC_MAX		512
693 
694 static size_t unmap_unpin_fast(struct vfio_domain *domain,
695 			       struct vfio_dma *dma, dma_addr_t *iova,
696 			       size_t len, phys_addr_t phys, long *unlocked,
697 			       struct list_head *unmapped_list,
698 			       int *unmapped_cnt,
699 			       struct iommu_iotlb_gather *iotlb_gather)
700 {
701 	size_t unmapped = 0;
702 	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
703 
704 	if (entry) {
705 		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
706 					    iotlb_gather);
707 
708 		if (!unmapped) {
709 			kfree(entry);
710 		} else {
711 			entry->iova = *iova;
712 			entry->phys = phys;
713 			entry->len  = unmapped;
714 			list_add_tail(&entry->list, unmapped_list);
715 
716 			*iova += unmapped;
717 			(*unmapped_cnt)++;
718 		}
719 	}
720 
721 	/*
722 	 * Sync if the number of fast-unmap regions hits the limit
723 	 * or in case of errors.
724 	 */
725 	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
726 		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
727 					     iotlb_gather);
728 		*unmapped_cnt = 0;
729 	}
730 
731 	return unmapped;
732 }
733 
734 static size_t unmap_unpin_slow(struct vfio_domain *domain,
735 			       struct vfio_dma *dma, dma_addr_t *iova,
736 			       size_t len, phys_addr_t phys,
737 			       long *unlocked)
738 {
739 	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
740 
741 	if (unmapped) {
742 		*unlocked += vfio_unpin_pages_remote(dma, *iova,
743 						     phys >> PAGE_SHIFT,
744 						     unmapped >> PAGE_SHIFT,
745 						     false);
746 		*iova += unmapped;
747 		cond_resched();
748 	}
749 	return unmapped;
750 }
751 
752 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
753 			     bool do_accounting)
754 {
755 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
756 	struct vfio_domain *domain, *d;
757 	LIST_HEAD(unmapped_region_list);
758 	struct iommu_iotlb_gather iotlb_gather;
759 	int unmapped_region_cnt = 0;
760 	long unlocked = 0;
761 
762 	if (!dma->size)
763 		return 0;
764 
765 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
766 		return 0;
767 
768 	/*
769 	 * We use the IOMMU to track the physical addresses, otherwise we'd
770 	 * need a much more complicated tracking system.  Unfortunately that
771 	 * means we need to use one of the iommu domains to figure out the
772 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
773 	 * no iommu translations remaining when the pages are unpinned.
774 	 */
775 	domain = d = list_first_entry(&iommu->domain_list,
776 				      struct vfio_domain, next);
777 
778 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
779 		iommu_unmap(d->domain, dma->iova, dma->size);
780 		cond_resched();
781 	}
782 
783 	iommu_iotlb_gather_init(&iotlb_gather);
784 	while (iova < end) {
785 		size_t unmapped, len;
786 		phys_addr_t phys, next;
787 
788 		phys = iommu_iova_to_phys(domain->domain, iova);
789 		if (WARN_ON(!phys)) {
790 			iova += PAGE_SIZE;
791 			continue;
792 		}
793 
794 		/*
795 		 * To optimize for fewer iommu_unmap() calls, each of which
796 		 * may require hardware cache flushing, try to find the
797 		 * largest contiguous physical memory chunk to unmap.
798 		 */
799 		for (len = PAGE_SIZE;
800 		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
801 			next = iommu_iova_to_phys(domain->domain, iova + len);
802 			if (next != phys + len)
803 				break;
804 		}
805 
806 		/*
807 		 * First, try to use fast unmap/unpin. In case of failure,
808 		 * switch to slow unmap/unpin path.
809 		 */
810 		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
811 					    &unlocked, &unmapped_region_list,
812 					    &unmapped_region_cnt,
813 					    &iotlb_gather);
814 		if (!unmapped) {
815 			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
816 						    phys, &unlocked);
817 			if (WARN_ON(!unmapped))
818 				break;
819 		}
820 	}
821 
822 	dma->iommu_mapped = false;
823 
824 	if (unmapped_region_cnt) {
825 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
826 					    &iotlb_gather);
827 	}
828 
829 	if (do_accounting) {
830 		vfio_lock_acct(dma, -unlocked, true);
831 		return 0;
832 	}
833 	return unlocked;
834 }
835 
836 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
837 {
838 	vfio_unmap_unpin(iommu, dma, true);
839 	vfio_unlink_dma(iommu, dma);
840 	put_task_struct(dma->task);
841 	kfree(dma);
842 	iommu->dma_avail++;
843 }
844 
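/*
 * Return the bitmap of IOMMU page sizes supported by every domain in the
 * container (with the sub-PAGE_SIZE adjustment described below).
 */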
845 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
846 {
847 	struct vfio_domain *domain;
848 	unsigned long bitmap = ULONG_MAX;
849 
850 	mutex_lock(&iommu->lock);
851 	list_for_each_entry(domain, &iommu->domain_list, next)
852 		bitmap &= domain->domain->pgsize_bitmap;
853 	mutex_unlock(&iommu->lock);
854 
855 	/*
856 	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
857 	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
858 	 * That way the user will be able to map/unmap buffers whose size/
859 	 * start address is aligned with PAGE_SIZE. Pinning code uses that
860 	 * granularity while the iommu driver can use the sub-PAGE_SIZE size
861 	 * to map the buffer.
862 	 */
863 	if (bitmap & ~PAGE_MASK) {
864 		bitmap &= PAGE_MASK;
865 		bitmap |= PAGE_SIZE;
866 	}
867 
868 	return bitmap;
869 }
870 
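/*
 * Illustrative example (not kernel code): userspace typically reaches this
 * path through the VFIO_IOMMU_UNMAP_DMA ioctl on the container fd, e.g.:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = iova,
 *		.size  = size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *
 * On return, unmap.size reports how many bytes were actually unmapped.
 */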
871 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
872 			     struct vfio_iommu_type1_dma_unmap *unmap)
873 {
874 	uint64_t mask;
875 	struct vfio_dma *dma, *dma_last = NULL;
876 	size_t unmapped = 0;
877 	int ret = 0, retries = 0;
878 
879 	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
880 
881 	if (unmap->iova & mask)
882 		return -EINVAL;
883 	if (!unmap->size || unmap->size & mask)
884 		return -EINVAL;
885 	if (unmap->iova + unmap->size - 1 < unmap->iova ||
886 	    unmap->size > SIZE_MAX)
887 		return -EINVAL;
888 
889 	WARN_ON(mask & PAGE_MASK);
890 again:
891 	mutex_lock(&iommu->lock);
892 
893 	/*
894 	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
895 	 * avoid tracking individual mappings.  This means that the granularity
896 	 * of the original mapping was lost and the user was allowed to attempt
897 	 * to unmap any range.  Depending on the contiguousness of physical
898 	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
899 	 * or may not have worked.  We only guaranteed unmap granularity
900 	 * matching the original mapping; even though it was untracked here,
901 	 * the original mappings are reflected in IOMMU mappings.  This
902 	 * resulted in a couple unusual behaviors.  First, if a range is not
903 	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
904 	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
905 	 * a zero sized unmap.  Also, if an unmap request overlaps the first
906 	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
907 	 * This also returns success and the returned unmap size reflects the
908 	 * actual size unmapped.
909 	 *
910 	 * We attempt to maintain compatibility with this "v1" interface, but
911 	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
912 	 * request offset from the beginning of the original mapping will
913 	 * return success with zero sized unmap.  And an unmap request covering
914 	 * the first iova of mapping will unmap the entire range.
915 	 *
916 	 * The v2 version of this interface intends to be more deterministic.
917 	 * Unmap requests must fully cover previous mappings.  Multiple
918 	 * mappings may still be unmapped by specifying large ranges, but there
919 	 * must not be any previous mappings bisected by the range.  An error
920 	 * will be returned if these conditions are not met.  The v2 interface
921 	 * will only return success and a size of zero if there were no
922 	 * mappings within the range.
923 	 */
924 	if (iommu->v2) {
925 		dma = vfio_find_dma(iommu, unmap->iova, 1);
926 		if (dma && dma->iova != unmap->iova) {
927 			ret = -EINVAL;
928 			goto unlock;
929 		}
930 		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
931 		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
932 			ret = -EINVAL;
933 			goto unlock;
934 		}
935 	}
936 
937 	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
938 		if (!iommu->v2 && unmap->iova > dma->iova)
939 			break;
940 		/*
941 		 * Only a task with the same address space as the one that
942 		 * mapped this iova range is allowed to unmap it.
943 		 */
944 		if (dma->task->mm != current->mm)
945 			break;
946 
947 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
948 			struct vfio_iommu_type1_dma_unmap nb_unmap;
949 
950 			if (dma_last == dma) {
951 				BUG_ON(++retries > 10);
952 			} else {
953 				dma_last = dma;
954 				retries = 0;
955 			}
956 
957 			nb_unmap.iova = dma->iova;
958 			nb_unmap.size = dma->size;
959 
960 			/*
961 			 * Notify anyone (mdev vendor drivers) to invalidate and
962 			 * unmap iovas within the range we're about to unmap.
963 			 * Vendor drivers MUST unpin pages in response to an
964 			 * invalidation.
965 			 */
966 			mutex_unlock(&iommu->lock);
967 			blocking_notifier_call_chain(&iommu->notifier,
968 						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
969 						    &nb_unmap);
970 			goto again;
971 		}
972 		unmapped += dma->size;
973 		vfio_remove_dma(iommu, dma);
974 	}
975 
976 unlock:
977 	mutex_unlock(&iommu->lock);
978 
979 	/* Report how much was unmapped */
980 	unmap->size = unmapped;
981 
982 	return ret;
983 }
984 
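/*
 * Map a pinned, physically contiguous run of @npage pages at @iova into
 * every IOMMU domain in the container, unwinding on failure.
 */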
985 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
986 			  unsigned long pfn, long npage, int prot)
987 {
988 	struct vfio_domain *d;
989 	int ret;
990 
991 	list_for_each_entry(d, &iommu->domain_list, next) {
992 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
993 				npage << PAGE_SHIFT, prot | d->prot);
994 		if (ret)
995 			goto unwind;
996 
997 		cond_resched();
998 	}
999 
1000 	return 0;
1001 
1002 unwind:
1003 	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
1004 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1005 
1006 	return ret;
1007 }
1008 
1009 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1010 			    size_t map_size)
1011 {
1012 	dma_addr_t iova = dma->iova;
1013 	unsigned long vaddr = dma->vaddr;
1014 	size_t size = map_size;
1015 	long npage;
1016 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1017 	int ret = 0;
1018 
1019 	while (size) {
1020 		/* Pin a contiguous chunk of memory */
1021 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1022 					      size >> PAGE_SHIFT, &pfn, limit);
1023 		if (npage <= 0) {
1024 			WARN_ON(!npage);
1025 			ret = (int)npage;
1026 			break;
1027 		}
1028 
1029 		/* Map it! */
1030 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1031 				     dma->prot);
1032 		if (ret) {
1033 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1034 						npage, true);
1035 			break;
1036 		}
1037 
1038 		size -= npage << PAGE_SHIFT;
1039 		dma->size += npage << PAGE_SHIFT;
1040 	}
1041 
1042 	dma->iommu_mapped = true;
1043 
1044 	if (ret)
1045 		vfio_remove_dma(iommu, dma);
1046 
1047 	return ret;
1048 }
1049 
1050 /*
1051  * Check that a dma map request falls within a valid iova range
1052  */
1053 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1054 				      dma_addr_t start, dma_addr_t end)
1055 {
1056 	struct list_head *iova = &iommu->iova_list;
1057 	struct vfio_iova *node;
1058 
1059 	list_for_each_entry(node, iova, list) {
1060 		if (start >= node->start && end <= node->end)
1061 			return true;
1062 	}
1063 
1064 	/*
1065 	 * Check for list_empty() as well since a container with
1066 	 * a single mdev device will have an empty list.
1067 	 */
1068 	return list_empty(iova);
1069 }
1070 
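/*
 * Illustrative example (not kernel code): userspace typically reaches this
 * path through the VFIO_IOMMU_MAP_DMA ioctl on the container fd, e.g.:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,	/* process virtual address */
 *		.iova  = iova,			/* device address */
 *		.size  = size,			/* multiple of the IOMMU page size */
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */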
1071 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1072 			   struct vfio_iommu_type1_dma_map *map)
1073 {
1074 	dma_addr_t iova = map->iova;
1075 	unsigned long vaddr = map->vaddr;
1076 	size_t size = map->size;
1077 	int ret = 0, prot = 0;
1078 	uint64_t mask;
1079 	struct vfio_dma *dma;
1080 
1081 	/* Verify that none of our __u64 fields overflow */
1082 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1083 		return -EINVAL;
1084 
1085 	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
1086 
1087 	WARN_ON(mask & PAGE_MASK);
1088 
1089 	/* READ/WRITE from device perspective */
1090 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1091 		prot |= IOMMU_WRITE;
1092 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1093 		prot |= IOMMU_READ;
1094 
1095 	if (!prot || !size || (size | iova | vaddr) & mask)
1096 		return -EINVAL;
1097 
1098 	/* Don't allow IOVA or virtual address wrap */
1099 	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
1100 		return -EINVAL;
1101 
1102 	mutex_lock(&iommu->lock);
1103 
1104 	if (vfio_find_dma(iommu, iova, size)) {
1105 		ret = -EEXIST;
1106 		goto out_unlock;
1107 	}
1108 
1109 	if (!iommu->dma_avail) {
1110 		ret = -ENOSPC;
1111 		goto out_unlock;
1112 	}
1113 
1114 	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1115 		ret = -EINVAL;
1116 		goto out_unlock;
1117 	}
1118 
1119 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1120 	if (!dma) {
1121 		ret = -ENOMEM;
1122 		goto out_unlock;
1123 	}
1124 
1125 	iommu->dma_avail--;
1126 	dma->iova = iova;
1127 	dma->vaddr = vaddr;
1128 	dma->prot = prot;
1129 
1130 	/*
1131 	 * We need to be able to both add to a task's locked memory and test
1132 	 * against the locked memory limit and we need to be able to do both
1133 	 * outside of this call path as pinning can be asynchronous via the
1134 	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1135 	 * task_struct and VM locked pages requires an mm_struct, however
1136 	 * holding an indefinite mm reference is not recommended, therefore we
1137 	 * only hold a reference to a task.  We could hold a reference to
1138 	 * current, however QEMU uses this call path through vCPU threads,
1139 	 * which can be killed resulting in a NULL mm and failure in the unmap
1140 	 * path when called via a different thread.  Avoid this problem by
1141 	 * using the group_leader as threads within the same group require
1142 	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
1143 	 * mm_struct.
1144 	 *
1145 	 * Previously we also used the task for testing CAP_IPC_LOCK at the
1146 	 * time of pinning and accounting, however has_capability() makes use
1147 	 * of real_cred, a copy-on-write field, so we can't guarantee that it
1148 	 * matches group_leader, or in fact that it might not change by the
1149 	 * time it's evaluated.  If a process were to call MAP_DMA with
1150 	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
1151 	 * possibly see different results for an iommu_mapped vfio_dma vs
1152 	 * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
1153 	 * time of calling MAP_DMA.
1154 	 */
1155 	get_task_struct(current->group_leader);
1156 	dma->task = current->group_leader;
1157 	dma->lock_cap = capable(CAP_IPC_LOCK);
1158 
1159 	dma->pfn_list = RB_ROOT;
1160 
1161 	/* Insert zero-sized and grow as we map chunks of it */
1162 	vfio_link_dma(iommu, dma);
1163 
1164 	/* Don't pin and map if container doesn't contain IOMMU capable domain */
1165 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1166 		dma->size = size;
1167 	else
1168 		ret = vfio_pin_map_dma(iommu, dma, size);
1169 
1170 out_unlock:
1171 	mutex_unlock(&iommu->lock);
1172 	return ret;
1173 }
1174 
1175 static int vfio_bus_type(struct device *dev, void *data)
1176 {
1177 	struct bus_type **bus = data;
1178 
1179 	if (*bus && *bus != dev->bus)
1180 		return -EINVAL;
1181 
1182 	*bus = dev->bus;
1183 
1184 	return 0;
1185 }
1186 
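/*
 * Replay all existing DMA mappings into a newly added IOMMU domain.  For
 * ranges already iommu_mapped, the physical addresses are read back from an
 * existing domain; otherwise the pages are pinned here first.
 */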
1187 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1188 			     struct vfio_domain *domain)
1189 {
1190 	struct vfio_domain *d;
1191 	struct rb_node *n;
1192 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1193 	int ret;
1194 
1195 	/* Arbitrarily pick the first domain in the list for lookups */
1196 	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
1197 	n = rb_first(&iommu->dma_list);
1198 
1199 	for (; n; n = rb_next(n)) {
1200 		struct vfio_dma *dma;
1201 		dma_addr_t iova;
1202 
1203 		dma = rb_entry(n, struct vfio_dma, node);
1204 		iova = dma->iova;
1205 
1206 		while (iova < dma->iova + dma->size) {
1207 			phys_addr_t phys;
1208 			size_t size;
1209 
1210 			if (dma->iommu_mapped) {
1211 				phys_addr_t p;
1212 				dma_addr_t i;
1213 
1214 				phys = iommu_iova_to_phys(d->domain, iova);
1215 
1216 				if (WARN_ON(!phys)) {
1217 					iova += PAGE_SIZE;
1218 					continue;
1219 				}
1220 
1221 				size = PAGE_SIZE;
1222 				p = phys + size;
1223 				i = iova + size;
1224 				while (i < dma->iova + dma->size &&
1225 				       p == iommu_iova_to_phys(d->domain, i)) {
1226 					size += PAGE_SIZE;
1227 					p += PAGE_SIZE;
1228 					i += PAGE_SIZE;
1229 				}
1230 			} else {
1231 				unsigned long pfn;
1232 				unsigned long vaddr = dma->vaddr +
1233 						     (iova - dma->iova);
1234 				size_t n = dma->iova + dma->size - iova;
1235 				long npage;
1236 
1237 				npage = vfio_pin_pages_remote(dma, vaddr,
1238 							      n >> PAGE_SHIFT,
1239 							      &pfn, limit);
1240 				if (npage <= 0) {
1241 					WARN_ON(!npage);
1242 					ret = (int)npage;
1243 					return ret;
1244 				}
1245 
1246 				phys = pfn << PAGE_SHIFT;
1247 				size = npage << PAGE_SHIFT;
1248 			}
1249 
1250 			ret = iommu_map(domain->domain, iova, phys,
1251 					size, dma->prot | domain->prot);
1252 			if (ret)
1253 				return ret;
1254 
1255 			iova += size;
1256 		}
1257 		dma->iommu_mapped = true;
1258 	}
1259 	return 0;
1260 }
1261 
1262 /*
1263  * We change our unmap behavior slightly depending on whether the IOMMU
1264  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1265  * for practically any contiguous power-of-two mapping we give it.  This means
1266  * we don't need to look for contiguous chunks ourselves to make unmapping
1267  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1268  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1269  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1270  * hugetlbfs is in use.
1271  */
1272 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1273 {
1274 	struct page *pages;
1275 	int ret, order = get_order(PAGE_SIZE * 2);
1276 
1277 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1278 	if (!pages)
1279 		return;
1280 
1281 	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1282 			IOMMU_READ | IOMMU_WRITE | domain->prot);
1283 	if (!ret) {
1284 		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1285 
1286 		if (unmapped == PAGE_SIZE)
1287 			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1288 		else
1289 			domain->fgsp = true;
1290 	}
1291 
1292 	__free_pages(pages, order);
1293 }
1294 
1295 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1296 					   struct iommu_group *iommu_group)
1297 {
1298 	struct vfio_group *g;
1299 
1300 	list_for_each_entry(g, &domain->group_list, next) {
1301 		if (g->iommu_group == iommu_group)
1302 			return g;
1303 	}
1304 
1305 	return NULL;
1306 }
1307 
1308 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1309 				  phys_addr_t *base)
1310 {
1311 	struct iommu_resv_region *region;
1312 	bool ret = false;
1313 
1314 	list_for_each_entry(region, group_resv_regions, list) {
1315 		/*
1316 		 * The presence of any 'real' MSI regions should take
1317 		 * precedence over the software-managed one if the
1318 		 * IOMMU driver happens to advertise both types.
1319 		 */
1320 		if (region->type == IOMMU_RESV_MSI) {
1321 			ret = false;
1322 			break;
1323 		}
1324 
1325 		if (region->type == IOMMU_RESV_SW_MSI) {
1326 			*base = region->start;
1327 			ret = true;
1328 		}
1329 	}
1330 
1331 	return ret;
1332 }
1333 
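/*
 * Resolve the physical, IOMMU-backed device, if any, that the mdev vendor
 * driver registered for this mdev.  symbol_get() avoids a hard module
 * dependency on mdev.
 */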
1334 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1335 {
1336 	struct device *(*fn)(struct device *dev);
1337 	struct device *iommu_device;
1338 
1339 	fn = symbol_get(mdev_get_iommu_device);
1340 	if (fn) {
1341 		iommu_device = fn(dev);
1342 		symbol_put(mdev_get_iommu_device);
1343 
1344 		return iommu_device;
1345 	}
1346 
1347 	return NULL;
1348 }
1349 
1350 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1351 {
1352 	struct iommu_domain *domain = data;
1353 	struct device *iommu_device;
1354 
1355 	iommu_device = vfio_mdev_get_iommu_device(dev);
1356 	if (iommu_device) {
1357 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1358 			return iommu_aux_attach_device(domain, iommu_device);
1359 		else
1360 			return iommu_attach_device(domain, iommu_device);
1361 	}
1362 
1363 	return -EINVAL;
1364 }
1365 
1366 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1367 {
1368 	struct iommu_domain *domain = data;
1369 	struct device *iommu_device;
1370 
1371 	iommu_device = vfio_mdev_get_iommu_device(dev);
1372 	if (iommu_device) {
1373 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1374 			iommu_aux_detach_device(domain, iommu_device);
1375 		else
1376 			iommu_detach_device(domain, iommu_device);
1377 	}
1378 
1379 	return 0;
1380 }
1381 
1382 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1383 				   struct vfio_group *group)
1384 {
1385 	if (group->mdev_group)
1386 		return iommu_group_for_each_dev(group->iommu_group,
1387 						domain->domain,
1388 						vfio_mdev_attach_domain);
1389 	else
1390 		return iommu_attach_group(domain->domain, group->iommu_group);
1391 }
1392 
1393 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1394 				    struct vfio_group *group)
1395 {
1396 	if (group->mdev_group)
1397 		iommu_group_for_each_dev(group->iommu_group, domain->domain,
1398 					 vfio_mdev_detach_domain);
1399 	else
1400 		iommu_detach_group(domain->domain, group->iommu_group);
1401 }
1402 
1403 static bool vfio_bus_is_mdev(struct bus_type *bus)
1404 {
1405 	struct bus_type *mdev_bus;
1406 	bool ret = false;
1407 
1408 	mdev_bus = symbol_get(mdev_bus_type);
1409 	if (mdev_bus) {
1410 		ret = (bus == mdev_bus);
1411 		symbol_put(mdev_bus_type);
1412 	}
1413 
1414 	return ret;
1415 }
1416 
1417 static int vfio_mdev_iommu_device(struct device *dev, void *data)
1418 {
1419 	struct device **old = data, *new;
1420 
1421 	new = vfio_mdev_get_iommu_device(dev);
1422 	if (!new || (*old && *old != new))
1423 		return -EINVAL;
1424 
1425 	*old = new;
1426 
1427 	return 0;
1428 }
1429 
1430 /*
1431  * This is a helper function to insert an address range to iova list.
1432  * The list is initially created with a single entry corresponding to
1433  * the IOMMU domain geometry to which the device group is attached.
1434  * The list aperture gets modified when a new domain is added to the
1435  * container if the new aperture doesn't conflict with the current one
1436  * or with any existing dma mappings. The list is also modified to
1437  * exclude any reserved regions associated with the device group.
1438  */
1439 static int vfio_iommu_iova_insert(struct list_head *head,
1440 				  dma_addr_t start, dma_addr_t end)
1441 {
1442 	struct vfio_iova *region;
1443 
1444 	region = kmalloc(sizeof(*region), GFP_KERNEL);
1445 	if (!region)
1446 		return -ENOMEM;
1447 
1448 	INIT_LIST_HEAD(&region->list);
1449 	region->start = start;
1450 	region->end = end;
1451 
1452 	list_add_tail(&region->list, head);
1453 	return 0;
1454 }
1455 
1456 /*
1457  * Check whether the new iommu aperture conflicts with the existing
1458  * aperture or with any existing dma mappings.
1459  */
1460 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1461 				     dma_addr_t start, dma_addr_t end)
1462 {
1463 	struct vfio_iova *first, *last;
1464 	struct list_head *iova = &iommu->iova_list;
1465 
1466 	if (list_empty(iova))
1467 		return false;
1468 
1469 	/* Disjoint sets, return conflict */
1470 	first = list_first_entry(iova, struct vfio_iova, list);
1471 	last = list_last_entry(iova, struct vfio_iova, list);
1472 	if (start > last->end || end < first->start)
1473 		return true;
1474 
1475 	/* Check for any existing dma mappings below the new start */
1476 	if (start > first->start) {
1477 		if (vfio_find_dma(iommu, first->start, start - first->start))
1478 			return true;
1479 	}
1480 
1481 	/* Check for any existing dma mappings beyond the new end */
1482 	if (end < last->end) {
1483 		if (vfio_find_dma(iommu, end + 1, last->end - end))
1484 			return true;
1485 	}
1486 
1487 	return false;
1488 }
1489 
1490 /*
1491  * Resize iommu iova aperture window. This is called only if the new
1492  * aperture has no conflict with existing aperture and dma mappings.
1493  */
1494 static int vfio_iommu_aper_resize(struct list_head *iova,
1495 				  dma_addr_t start, dma_addr_t end)
1496 {
1497 	struct vfio_iova *node, *next;
1498 
1499 	if (list_empty(iova))
1500 		return vfio_iommu_iova_insert(iova, start, end);
1501 
1502 	/* Adjust iova list start */
1503 	list_for_each_entry_safe(node, next, iova, list) {
1504 		if (start < node->start)
1505 			break;
1506 		if (start >= node->start && start < node->end) {
1507 			node->start = start;
1508 			break;
1509 		}
1510 		/* Delete nodes before new start */
1511 		list_del(&node->list);
1512 		kfree(node);
1513 	}
1514 
1515 	/* Adjust iova list end */
1516 	list_for_each_entry_safe(node, next, iova, list) {
1517 		if (end > node->end)
1518 			continue;
1519 		if (end > node->start && end <= node->end) {
1520 			node->end = end;
1521 			continue;
1522 		}
1523 		/* Delete nodes after new end */
1524 		list_del(&node->list);
1525 		kfree(node);
1526 	}
1527 
1528 	return 0;
1529 }
1530 
1531 /*
1532  * Check reserved region conflicts with existing dma mappings
1533  */
1534 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1535 				     struct list_head *resv_regions)
1536 {
1537 	struct iommu_resv_region *region;
1538 
1539 	/* Check for conflict with existing dma mappings */
1540 	list_for_each_entry(region, resv_regions, list) {
1541 		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1542 			continue;
1543 
1544 		if (vfio_find_dma(iommu, region->start, region->length))
1545 			return true;
1546 	}
1547 
1548 	return false;
1549 }
1550 
1551 /*
1552  * Check iova regions for overlap with reserved regions and
1553  * exclude the overlapping ranges from the iommu iova range
1554  */
1555 static int vfio_iommu_resv_exclude(struct list_head *iova,
1556 				   struct list_head *resv_regions)
1557 {
1558 	struct iommu_resv_region *resv;
1559 	struct vfio_iova *n, *next;
1560 
1561 	list_for_each_entry(resv, resv_regions, list) {
1562 		phys_addr_t start, end;
1563 
1564 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1565 			continue;
1566 
1567 		start = resv->start;
1568 		end = resv->start + resv->length - 1;
1569 
1570 		list_for_each_entry_safe(n, next, iova, list) {
1571 			int ret = 0;
1572 
1573 			/* No overlap */
1574 			if (start > n->end || end < n->start)
1575 				continue;
1576 			/*
1577 			 * Insert a new node if the current node overlaps with the
1578 			 * reserved region, to exclude it from the valid iova range.
1579 			 * Note that the new node is inserted before the current
1580 			 * node and the current node is finally deleted, keeping
1581 			 * the list updated and sorted.
1582 			 */
1583 			if (start > n->start)
1584 				ret = vfio_iommu_iova_insert(&n->list, n->start,
1585 							     start - 1);
1586 			if (!ret && end < n->end)
1587 				ret = vfio_iommu_iova_insert(&n->list, end + 1,
1588 							     n->end);
1589 			if (ret)
1590 				return ret;
1591 
1592 			list_del(&n->list);
1593 			kfree(n);
1594 		}
1595 	}
1596 
1597 	if (list_empty(iova))
1598 		return -EINVAL;
1599 
1600 	return 0;
1601 }
1602 
1603 static void vfio_iommu_resv_free(struct list_head *resv_regions)
1604 {
1605 	struct iommu_resv_region *n, *next;
1606 
1607 	list_for_each_entry_safe(n, next, resv_regions, list) {
1608 		list_del(&n->list);
1609 		kfree(n);
1610 	}
1611 }
1612 
1613 static void vfio_iommu_iova_free(struct list_head *iova)
1614 {
1615 	struct vfio_iova *n, *next;
1616 
1617 	list_for_each_entry_safe(n, next, iova, list) {
1618 		list_del(&n->list);
1619 		kfree(n);
1620 	}
1621 }
1622 
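/*
 * Duplicate the container's iova list so changes can be staged on the copy
 * and either committed via vfio_iommu_iova_insert_copy() or discarded on
 * failure.
 */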
1623 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
1624 				    struct list_head *iova_copy)
1625 {
1626 	struct list_head *iova = &iommu->iova_list;
1627 	struct vfio_iova *n;
1628 	int ret;
1629 
1630 	list_for_each_entry(n, iova, list) {
1631 		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
1632 		if (ret)
1633 			goto out_free;
1634 	}
1635 
1636 	return 0;
1637 
1638 out_free:
1639 	vfio_iommu_iova_free(iova_copy);
1640 	return ret;
1641 }
1642 
1643 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
1644 					struct list_head *iova_copy)
1645 {
1646 	struct list_head *iova = &iommu->iova_list;
1647 
1648 	vfio_iommu_iova_free(iova);
1649 
1650 	list_splice_tail(iova_copy, iova);
1651 }
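
/*
 * Attach an iommu_group to the container: allocate (or reuse a compatible)
 * IOMMU domain, validate the resulting IOVA aperture and reserved regions
 * against existing mappings, and replay current DMA mappings into the new
 * domain.  mdev groups without a backing IOMMU device are tracked in the
 * external_domain instead.
 */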
1652 static int vfio_iommu_type1_attach_group(void *iommu_data,
1653 					 struct iommu_group *iommu_group)
1654 {
1655 	struct vfio_iommu *iommu = iommu_data;
1656 	struct vfio_group *group;
1657 	struct vfio_domain *domain, *d;
1658 	struct bus_type *bus = NULL;
1659 	int ret;
1660 	bool resv_msi, msi_remap;
1661 	phys_addr_t resv_msi_base = 0;
1662 	struct iommu_domain_geometry geo;
1663 	LIST_HEAD(iova_copy);
1664 	LIST_HEAD(group_resv_regions);
1665 
1666 	mutex_lock(&iommu->lock);
1667 
1668 	list_for_each_entry(d, &iommu->domain_list, next) {
1669 		if (find_iommu_group(d, iommu_group)) {
1670 			mutex_unlock(&iommu->lock);
1671 			return -EINVAL;
1672 		}
1673 	}
1674 
1675 	if (iommu->external_domain) {
1676 		if (find_iommu_group(iommu->external_domain, iommu_group)) {
1677 			mutex_unlock(&iommu->lock);
1678 			return -EINVAL;
1679 		}
1680 	}
1681 
1682 	group = kzalloc(sizeof(*group), GFP_KERNEL);
1683 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1684 	if (!group || !domain) {
1685 		ret = -ENOMEM;
1686 		goto out_free;
1687 	}
1688 
1689 	group->iommu_group = iommu_group;
1690 
1691 	/* Determine bus_type in order to allocate a domain */
1692 	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
1693 	if (ret)
1694 		goto out_free;
1695 
1696 	if (vfio_bus_is_mdev(bus)) {
1697 		struct device *iommu_device = NULL;
1698 
1699 		group->mdev_group = true;
1700 
1701 		/* Determine the isolation type */
1702 		ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
1703 					       vfio_mdev_iommu_device);
1704 		if (ret || !iommu_device) {
1705 			if (!iommu->external_domain) {
1706 				INIT_LIST_HEAD(&domain->group_list);
1707 				iommu->external_domain = domain;
1708 			} else {
1709 				kfree(domain);
1710 			}
1711 
1712 			list_add(&group->next,
1713 				 &iommu->external_domain->group_list);
1714 			mutex_unlock(&iommu->lock);
1715 
1716 			return 0;
1717 		}
1718 
1719 		bus = iommu_device->bus;
1720 	}
1721 
1722 	domain->domain = iommu_domain_alloc(bus);
1723 	if (!domain->domain) {
1724 		ret = -EIO;
1725 		goto out_free;
1726 	}
1727 
1728 	if (iommu->nesting) {
1729 		int attr = 1;
1730 
1731 		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
1732 					    &attr);
1733 		if (ret)
1734 			goto out_domain;
1735 	}
1736 
1737 	ret = vfio_iommu_attach_group(domain, group);
1738 	if (ret)
1739 		goto out_domain;
1740 
1741 	/* Get aperture info */
1742 	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
1743 
1744 	if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
1745 				     geo.aperture_end)) {
1746 		ret = -EINVAL;
1747 		goto out_detach;
1748 	}
1749 
1750 	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
1751 	if (ret)
1752 		goto out_detach;
1753 
1754 	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
1755 		ret = -EINVAL;
1756 		goto out_detach;
1757 	}
1758 
1759 	/*
1760 	 * We don't want to work on the original iova list as the list
1761 	 * gets modified and in case of failure we have to retain the
1762 	 * original list. Get a copy here.
1763 	 */
1764 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
1765 	if (ret)
1766 		goto out_detach;
1767 
1768 	ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
1769 				     geo.aperture_end);
1770 	if (ret)
1771 		goto out_detach;
1772 
1773 	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
1774 	if (ret)
1775 		goto out_detach;
1776 
1777 	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
1778 
1779 	INIT_LIST_HEAD(&domain->group_list);
1780 	list_add(&group->next, &domain->group_list);
1781 
1782 	msi_remap = irq_domain_check_msi_remap() ||
1783 		    iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
1784 
1785 	if (!allow_unsafe_interrupts && !msi_remap) {
1786 		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
1787 		       __func__);
1788 		ret = -EPERM;
1789 		goto out_detach;
1790 	}
1791 
1792 	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
1793 		domain->prot |= IOMMU_CACHE;
1794 
1795 	/*
1796 	 * Try to match an existing compatible domain.  We don't want to
1797 	 * preclude an IOMMU driver supporting multiple bus_types and being
1798 	 * able to include different bus_types in the same IOMMU domain, so
1799 	 * we test whether the domains use the same iommu_ops rather than
1800 	 * testing if they're on the same bus_type.
1801 	 */
1802 	list_for_each_entry(d, &iommu->domain_list, next) {
1803 		if (d->domain->ops == domain->domain->ops &&
1804 		    d->prot == domain->prot) {
1805 			vfio_iommu_detach_group(domain, group);
1806 			if (!vfio_iommu_attach_group(d, group)) {
1807 				list_add(&group->next, &d->group_list);
1808 				iommu_domain_free(domain->domain);
1809 				kfree(domain);
1810 				goto done;
1811 			}
1812 
1813 			ret = vfio_iommu_attach_group(domain, group);
1814 			if (ret)
1815 				goto out_domain;
1816 		}
1817 	}
1818 
1819 	vfio_test_domain_fgsp(domain);
1820 
1821 	/* replay mappings on new domains */
1822 	ret = vfio_iommu_replay(iommu, domain);
1823 	if (ret)
1824 		goto out_detach;
1825 
1826 	if (resv_msi) {
1827 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
1828 		if (ret)
1829 			goto out_detach;
1830 	}
1831 
1832 	list_add(&domain->next, &iommu->domain_list);
1833 done:
1834 	/* Replace the old iova list with the updated copy */
1835 	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
1836 	mutex_unlock(&iommu->lock);
1837 	vfio_iommu_resv_free(&group_resv_regions);
1838 
1839 	return 0;
1840 
1841 out_detach:
1842 	vfio_iommu_detach_group(domain, group);
1843 out_domain:
1844 	iommu_domain_free(domain->domain);
1845 	vfio_iommu_iova_free(&iova_copy);
1846 	vfio_iommu_resv_free(&group_resv_regions);
1847 out_free:
1848 	kfree(domain);
1849 	kfree(group);
1850 	mutex_unlock(&iommu->lock);
1851 	return ret;
1852 }
1853 
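/*
 * Tear down every DMA mapping tracked by the container: walk the dma_list
 * rb-tree and unmap, unpin and free each entry.
 */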
1854 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
1855 {
1856 	struct rb_node *node;
1857 
1858 	while ((node = rb_first(&iommu->dma_list)))
1859 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
1860 }
1861 
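/*
 * Used when the last IOMMU-backed domain goes away while an external
 * (mdev) domain remains: unmap and unpin the IOMMU mappings of each
 * vfio_dma, then re-account so that pages still pinned on behalf of the
 * external user stay charged against the locked memory limit.
 */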
1862 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
1863 {
1864 	struct rb_node *n, *p;
1865 
1866 	n = rb_first(&iommu->dma_list);
1867 	for (; n; n = rb_next(n)) {
1868 		struct vfio_dma *dma;
1869 		long locked = 0, unlocked = 0;
1870 
1871 		dma = rb_entry(n, struct vfio_dma, node);
1872 		unlocked += vfio_unmap_unpin(iommu, dma, false);
1873 		p = rb_first(&dma->pfn_list);
1874 		for (; p; p = rb_next(p)) {
1875 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
1876 							 node);
1877 
1878 			if (!is_invalid_reserved_pfn(vpfn->pfn))
1879 				locked++;
1880 		}
1881 		vfio_lock_acct(dma, locked - unlocked, true);
1882 	}
1883 }
1884 
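/*
 * All externally pinned pages should have been unpinned by the time the
 * external domain is torn down; warn if any vfio_dma still has entries
 * on its pfn_list or if a notifier is still registered.
 */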
1885 static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
1886 {
1887 	struct rb_node *n;
1888 
1889 	n = rb_first(&iommu->dma_list);
1890 	for (; n; n = rb_next(n)) {
1891 		struct vfio_dma *dma;
1892 
1893 		dma = rb_entry(n, struct vfio_dma, node);
1894 
1895 		if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
1896 			break;
1897 	}
1898 	/* mdev vendor driver must unregister notifier */
1899 	WARN_ON(iommu->notifier.head);
1900 }
1901 
1902 /*
1903  * Called when a domain is removed during detach. The removed domain
1904  * may have dictated the current iova aperture window, so recompute
1905  * the aperture as the intersection of the remaining domains' windows.
1906  */
1907 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
1908 				   struct list_head *iova_copy)
1909 {
1910 	struct vfio_domain *domain;
1911 	struct iommu_domain_geometry geo;
1912 	struct vfio_iova *node;
1913 	dma_addr_t start = 0;
1914 	dma_addr_t end = (dma_addr_t)~0;
1915 
1916 	if (list_empty(iova_copy))
1917 		return;
1918 
1919 	list_for_each_entry(domain, &iommu->domain_list, next) {
1920 		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
1921 				      &geo);
1922 		if (geo.aperture_start > start)
1923 			start = geo.aperture_start;
1924 		if (geo.aperture_end < end)
1925 			end = geo.aperture_end;
1926 	}
1927 
1928 	/* Modify aperture limits. The new aperture is the same or bigger */
1929 	node = list_first_entry(iova_copy, struct vfio_iova, list);
1930 	node->start = start;
1931 	node = list_last_entry(iova_copy, struct vfio_iova, list);
1932 	node->end = end;
1933 }
1934 
1935 /*
1936  * Called when a group is detached. The reserved regions for that
1937  * group may now become valid iova. But since reserved regions can be
1938  * shared among groups, rebuild the valid iova regions list from the
1939  * reserved regions of all remaining groups.
1940  */
1941 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
1942 				   struct list_head *iova_copy)
1943 {
1944 	struct vfio_domain *d;
1945 	struct vfio_group *g;
1946 	struct vfio_iova *node;
1947 	dma_addr_t start, end;
1948 	LIST_HEAD(resv_regions);
1949 	int ret;
1950 
1951 	if (list_empty(iova_copy))
1952 		return -EINVAL;
1953 
1954 	list_for_each_entry(d, &iommu->domain_list, next) {
1955 		list_for_each_entry(g, &d->group_list, next) {
1956 			ret = iommu_get_group_resv_regions(g->iommu_group,
1957 							   &resv_regions);
1958 			if (ret)
1959 				goto done;
1960 		}
1961 	}
1962 
1963 	node = list_first_entry(iova_copy, struct vfio_iova, list);
1964 	start = node->start;
1965 	node = list_last_entry(iova_copy, struct vfio_iova, list);
1966 	end = node->end;
1967 
1968 	/* Purge the iova list and create a new one */
1969 	vfio_iommu_iova_free(iova_copy);
1970 
1971 	ret = vfio_iommu_aper_resize(iova_copy, start, end);
1972 	if (ret)
1973 		goto done;
1974 
1975 	/* Exclude current reserved regions from iova ranges */
1976 	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
1977 done:
1978 	vfio_iommu_resv_free(&resv_regions);
1979 	return ret;
1980 }
1981 
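/*
 * Remove an iommu_group from the container. Mdev groups are simply
 * dropped from the external domain; for IOMMU-backed groups the group
 * is detached from its domain, empty domains are freed, and the valid
 * iova list is recomputed from the remaining domains and groups.
 */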
1982 static void vfio_iommu_type1_detach_group(void *iommu_data,
1983 					  struct iommu_group *iommu_group)
1984 {
1985 	struct vfio_iommu *iommu = iommu_data;
1986 	struct vfio_domain *domain;
1987 	struct vfio_group *group;
1988 	LIST_HEAD(iova_copy);
1989 
1990 	mutex_lock(&iommu->lock);
1991 
1992 	if (iommu->external_domain) {
1993 		group = find_iommu_group(iommu->external_domain, iommu_group);
1994 		if (group) {
1995 			list_del(&group->next);
1996 			kfree(group);
1997 
1998 			if (list_empty(&iommu->external_domain->group_list)) {
1999 				vfio_sanity_check_pfn_list(iommu);
2000 
2001 				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
2002 					vfio_iommu_unmap_unpin_all(iommu);
2003 
2004 				kfree(iommu->external_domain);
2005 				iommu->external_domain = NULL;
2006 			}
2007 			goto detach_group_done;
2008 		}
2009 	}
2010 
2011 	/*
2012 	 * Get a copy of the iova list. The copy is updated below and, on
2013 	 * success, replaces the current list; if the update fails, the
2014 	 * original list is left untouched.
2015 	 */
2016 	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2017 
2018 	list_for_each_entry(domain, &iommu->domain_list, next) {
2019 		group = find_iommu_group(domain, iommu_group);
2020 		if (!group)
2021 			continue;
2022 
2023 		vfio_iommu_detach_group(domain, group);
2024 		list_del(&group->next);
2025 		kfree(group);
2026 		/*
2027 		 * Group ownership provides privilege; if the group list is
2028 		 * empty, the domain goes away. If it's the last IOMMU-backed
2029 		 * domain and no external domain exists, all the mappings go
2030 		 * away too. If it's the last IOMMU-backed domain and an
2031 		 * external domain does exist, only update the accounting.
2032 		 */
2033 		if (list_empty(&domain->group_list)) {
2034 			if (list_is_singular(&iommu->domain_list)) {
2035 				if (!iommu->external_domain)
2036 					vfio_iommu_unmap_unpin_all(iommu);
2037 				else
2038 					vfio_iommu_unmap_unpin_reaccount(iommu);
2039 			}
2040 			iommu_domain_free(domain->domain);
2041 			list_del(&domain->next);
2042 			kfree(domain);
2043 			vfio_iommu_aper_expand(iommu, &iova_copy);
2044 		}
2045 		break;
2046 	}
2047 
2048 	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2049 		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2050 	else
2051 		vfio_iommu_iova_free(&iova_copy);
2052 
2053 detach_group_done:
2054 	mutex_unlock(&iommu->lock);
2055 }
2056 
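/*
 * Allocate and initialize the per-container state. Called when userspace
 * selects this backend via VFIO_SET_IOMMU; @arg is the requested IOMMU
 * model (VFIO_TYPE1_IOMMU, VFIO_TYPE1v2_IOMMU or VFIO_TYPE1_NESTING_IOMMU).
 */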
2057 static void *vfio_iommu_type1_open(unsigned long arg)
2058 {
2059 	struct vfio_iommu *iommu;
2060 
2061 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2062 	if (!iommu)
2063 		return ERR_PTR(-ENOMEM);
2064 
2065 	switch (arg) {
2066 	case VFIO_TYPE1_IOMMU:
2067 		break;
2068 	case VFIO_TYPE1_NESTING_IOMMU:
2069 		iommu->nesting = true;
2070 		/* fall through */
2071 	case VFIO_TYPE1v2_IOMMU:
2072 		iommu->v2 = true;
2073 		break;
2074 	default:
2075 		kfree(iommu);
2076 		return ERR_PTR(-EINVAL);
2077 	}
2078 
2079 	INIT_LIST_HEAD(&iommu->domain_list);
2080 	INIT_LIST_HEAD(&iommu->iova_list);
2081 	iommu->dma_list = RB_ROOT;
2082 	iommu->dma_avail = dma_entry_limit;
2083 	mutex_init(&iommu->lock);
2084 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2085 
2086 	return iommu;
2087 }
2088 
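/*
 * Release all groups in a domain. For IOMMU-backed domains (@external is
 * false) each group is detached and the iommu_domain itself is freed;
 * external (mdev) domains only need their group list emptied.
 */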
2089 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2090 {
2091 	struct vfio_group *group, *group_tmp;
2092 
2093 	list_for_each_entry_safe(group, group_tmp,
2094 				 &domain->group_list, next) {
2095 		if (!external)
2096 			vfio_iommu_detach_group(domain, group);
2097 		list_del(&group->next);
2098 		kfree(group);
2099 	}
2100 
2101 	if (!external)
2102 		iommu_domain_free(domain->domain);
2103 }
2104 
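/*
 * Final teardown when the container is closed: release the external
 * domain (if any), unmap and unpin all DMA mappings, then free every
 * remaining domain and the valid iova list.
 */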
2105 static void vfio_iommu_type1_release(void *iommu_data)
2106 {
2107 	struct vfio_iommu *iommu = iommu_data;
2108 	struct vfio_domain *domain, *domain_tmp;
2109 
2110 	if (iommu->external_domain) {
2111 		vfio_release_domain(iommu->external_domain, true);
2112 		vfio_sanity_check_pfn_list(iommu);
2113 		kfree(iommu->external_domain);
2114 	}
2115 
2116 	vfio_iommu_unmap_unpin_all(iommu);
2117 
2118 	list_for_each_entry_safe(domain, domain_tmp,
2119 				 &iommu->domain_list, next) {
2120 		vfio_release_domain(domain, false);
2121 		list_del(&domain->next);
2122 		kfree(domain);
2123 	}
2124 
2125 	vfio_iommu_iova_free(&iommu->iova_list);
2126 
2127 	kfree(iommu);
2128 }
2129 
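/*
 * Returns 1 only if every IOMMU-backed domain in the container was set
 * up with IOMMU_CACHE (i.e. DMA is cache coherent), 0 otherwise. Backs
 * the VFIO_DMA_CC_IOMMU extension check.
 */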
2130 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2131 {
2132 	struct vfio_domain *domain;
2133 	int ret = 1;
2134 
2135 	mutex_lock(&iommu->lock);
2136 	list_for_each_entry(domain, &iommu->domain_list, next) {
2137 		if (!(domain->prot & IOMMU_CACHE)) {
2138 			ret = 0;
2139 			break;
2140 		}
2141 	}
2142 	mutex_unlock(&iommu->lock);
2143 
2144 	return ret;
2145 }
2146 
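/*
 * Copy a prepared iova range capability into the caps buffer reported
 * through VFIO_IOMMU_GET_INFO.
 */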
2147 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2148 		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2149 		 size_t size)
2150 {
2151 	struct vfio_info_cap_header *header;
2152 	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2153 
2154 	header = vfio_info_cap_add(caps, size,
2155 				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2156 	if (IS_ERR(header))
2157 		return PTR_ERR(header);
2158 
2159 	iova_cap = container_of(header,
2160 				struct vfio_iommu_type1_info_cap_iova_range,
2161 				header);
2162 	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2163 	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2164 	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2165 	return 0;
2166 }
2167 
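/*
 * Build the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability from the
 * container's current valid iova list.
 */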
2168 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2169 				      struct vfio_info_cap *caps)
2170 {
2171 	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2172 	struct vfio_iova *iova;
2173 	size_t size;
2174 	int iovas = 0, i = 0, ret;
2175 
2176 	mutex_lock(&iommu->lock);
2177 
2178 	list_for_each_entry(iova, &iommu->iova_list, list)
2179 		iovas++;
2180 
2181 	if (!iovas) {
2182 		/*
2183 		 * Return 0, as a container backed only by mdev devices
2184 		 * will have an empty iova list.
2185 		 */
2186 		ret = 0;
2187 		goto out_unlock;
2188 	}
2189 
2190 	size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2191 
2192 	cap_iovas = kzalloc(size, GFP_KERNEL);
2193 	if (!cap_iovas) {
2194 		ret = -ENOMEM;
2195 		goto out_unlock;
2196 	}
2197 
2198 	cap_iovas->nr_iovas = iovas;
2199 
2200 	list_for_each_entry(iova, &iommu->iova_list, list) {
2201 		cap_iovas->iova_ranges[i].start = iova->start;
2202 		cap_iovas->iova_ranges[i].end = iova->end;
2203 		i++;
2204 	}
2205 
2206 	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2207 
2208 	kfree(cap_iovas);
2209 out_unlock:
2210 	mutex_unlock(&iommu->lock);
2211 	return ret;
2212 }
2213 
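/*
 * ioctl handler for the container fd: extension checks (which may arrive
 * before an IOMMU is set, hence the !iommu test below), VFIO_IOMMU_GET_INFO
 * and the DMA map/unmap ioctls are dispatched from here.
 *
 * Illustrative only (not part of this driver): a userspace caller might
 * query the info ioctl and walk the capability chain roughly as below;
 * container_fd is a placeholder and error handling is omitted.
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(container_fd, VFIO_IOMMU_GET_INFO, &info);
 *	// If info.argsz came back larger, re-issue GET_INFO with a buffer
 *	// of that size; capabilities then start at info.cap_offset and are
 *	// chained via struct vfio_info_cap_header.next offsets, e.g.
 *	// VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE for the valid iova ranges.
 */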
2214 static long vfio_iommu_type1_ioctl(void *iommu_data,
2215 				   unsigned int cmd, unsigned long arg)
2216 {
2217 	struct vfio_iommu *iommu = iommu_data;
2218 	unsigned long minsz;
2219 
2220 	if (cmd == VFIO_CHECK_EXTENSION) {
2221 		switch (arg) {
2222 		case VFIO_TYPE1_IOMMU:
2223 		case VFIO_TYPE1v2_IOMMU:
2224 		case VFIO_TYPE1_NESTING_IOMMU:
2225 			return 1;
2226 		case VFIO_DMA_CC_IOMMU:
2227 			if (!iommu)
2228 				return 0;
2229 			return vfio_domains_have_iommu_cache(iommu);
2230 		default:
2231 			return 0;
2232 		}
2233 	} else if (cmd == VFIO_IOMMU_GET_INFO) {
2234 		struct vfio_iommu_type1_info info;
2235 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2236 		unsigned long capsz;
2237 		int ret;
2238 
2239 		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2240 
2241 		/* For backward compatibility, cannot require this */
2242 		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2243 
2244 		if (copy_from_user(&info, (void __user *)arg, minsz))
2245 			return -EFAULT;
2246 
2247 		if (info.argsz < minsz)
2248 			return -EINVAL;
2249 
2250 		if (info.argsz >= capsz) {
2251 			minsz = capsz;
2252 			info.cap_offset = 0; /* output, no-recopy necessary */
2253 		}
2254 
2255 		info.flags = VFIO_IOMMU_INFO_PGSIZES;
2256 
2257 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
2258 
2259 		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2260 		if (ret)
2261 			return ret;
2262 
2263 		if (caps.size) {
2264 			info.flags |= VFIO_IOMMU_INFO_CAPS;
2265 
2266 			if (info.argsz < sizeof(info) + caps.size) {
2267 				info.argsz = sizeof(info) + caps.size;
2268 			} else {
2269 				vfio_info_cap_shift(&caps, sizeof(info));
2270 				if (copy_to_user((void __user *)arg +
2271 						sizeof(info), caps.buf,
2272 						caps.size)) {
2273 					kfree(caps.buf);
2274 					return -EFAULT;
2275 				}
2276 				info.cap_offset = sizeof(info);
2277 			}
2278 
2279 			kfree(caps.buf);
2280 		}
2281 
2282 		return copy_to_user((void __user *)arg, &info, minsz) ?
2283 			-EFAULT : 0;
2284 
2285 	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
2286 		struct vfio_iommu_type1_dma_map map;
2287 		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
2288 				VFIO_DMA_MAP_FLAG_WRITE;
2289 
2290 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2291 
2292 		if (copy_from_user(&map, (void __user *)arg, minsz))
2293 			return -EFAULT;
2294 
2295 		if (map.argsz < minsz || map.flags & ~mask)
2296 			return -EINVAL;
2297 
2298 		return vfio_dma_do_map(iommu, &map);
2299 
2300 	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
2301 		struct vfio_iommu_type1_dma_unmap unmap;
2302 		long ret;
2303 
2304 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2305 
2306 		if (copy_from_user(&unmap, (void __user *)arg, minsz))
2307 			return -EFAULT;
2308 
2309 		if (unmap.argsz < minsz || unmap.flags)
2310 			return -EINVAL;
2311 
2312 		ret = vfio_dma_do_unmap(iommu, &unmap);
2313 		if (ret)
2314 			return ret;
2315 
2316 		return copy_to_user((void __user *)arg, &unmap, minsz) ?
2317 			-EFAULT : 0;
2318 	}
2319 
2320 	return -ENOTTY;
2321 }
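/*
 * Illustrative only (not part of this driver): the DMA map/unmap path
 * above is typically driven from userspace along these lines; container_fd,
 * buffer and length are placeholders and error handling is omitted.
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (uintptr_t)buffer,	// process virtual address
 *		.iova  = 0x100000,		// IO virtual address for the device
 *		.size  = length,		// bytes, aligned to a supported IOMMU page size
 *	};
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = length,
 *	};
 *
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *	// ... device performs DMA to/from the iova range ...
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */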
2322 
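/*
 * Register a notifier on the container. Only VFIO_IOMMU_NOTIFY_DMA_UNMAP
 * is supported, so callers requesting any other event are rejected.
 * Typically used by mdev vendor drivers to learn about userspace DMA
 * unmaps so they can unpin the affected pages.
 */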
2323 static int vfio_iommu_type1_register_notifier(void *iommu_data,
2324 					      unsigned long *events,
2325 					      struct notifier_block *nb)
2326 {
2327 	struct vfio_iommu *iommu = iommu_data;
2328 
2329 	/* clear known events */
2330 	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2331 
2332 	/* refuse to register if any unsupported events remain */
2333 	if (*events)
2334 		return -EINVAL;
2335 
2336 	return blocking_notifier_chain_register(&iommu->notifier, nb);
2337 }
2338 
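/* Remove a previously registered notifier from the container. */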
2339 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2340 						struct notifier_block *nb)
2341 {
2342 	struct vfio_iommu *iommu = iommu_data;
2343 
2344 	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2345 }
2346 
2347 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
2348 	.name			= "vfio-iommu-type1",
2349 	.owner			= THIS_MODULE,
2350 	.open			= vfio_iommu_type1_open,
2351 	.release		= vfio_iommu_type1_release,
2352 	.ioctl			= vfio_iommu_type1_ioctl,
2353 	.attach_group		= vfio_iommu_type1_attach_group,
2354 	.detach_group		= vfio_iommu_type1_detach_group,
2355 	.pin_pages		= vfio_iommu_type1_pin_pages,
2356 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
2357 	.register_notifier	= vfio_iommu_type1_register_notifier,
2358 	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
2359 };
2360 
2361 static int __init vfio_iommu_type1_init(void)
2362 {
2363 	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
2364 }
2365 
2366 static void __exit vfio_iommu_type1_cleanup(void)
2367 {
2368 	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
2369 }
2370 
2371 module_init(vfio_iommu_type1_init);
2372 module_exit(vfio_iommu_type1_cleanup);
2373 
2374 MODULE_VERSION(DRIVER_VERSION);
2375 MODULE_LICENSE("GPL v2");
2376 MODULE_AUTHOR(DRIVER_AUTHOR);
2377 MODULE_DESCRIPTION(DRIVER_DESC);
2378