// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	unsigned int			dev_counter;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);

int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
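
/*
 * Usage sketch (not part of this file): a vfio sub-driver whose devices
 * must be reset as a unit would typically pass a stable, shared set_id,
 * e.g. the pci_slot or pci_bus that scopes the reset, so that all such
 * devices land in one vfio_device_set.  The probe below is hypothetical
 * and only illustrates the call:
 *
 *	static int hypothetical_probe(struct pci_dev *pdev)
 *	{
 *		struct my_vfio_pci_device *vdev = ...;
 *		int ret;
 *
 *		// functions sharing a slot reset share one device set
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *		if (ret)
 *			return ret;
 *		...
 *	}
 */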

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions; any use other than acquiring the first reference
 * for the purpose of calling vfio_register_group_dev(), or removing that
 * symmetric reference after vfio_unregister_group_dev(), should use the
 * raw iommu_group_{get,put} functions.  In particular,
 * vfio_iommu_group_put() removes the device from the dummy group and
 * cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on its
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
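
/*
 * The pairing described above, sketched for a hypothetical bus driver
 * (only the reference taken for registration goes through these
 * wrappers; all other lookups use the raw iommu_group_{get,put}):
 *
 *	probe:
 *		group = vfio_iommu_group_get(dev);	// may create a noiommu group
 *		if (!group)
 *			return -EINVAL;
 *		... vfio_init_group_dev() / vfio_register_group_dev() ...
 *
 *	remove:
 *		vfio_unregister_group_dev(&vdev->vdev);
 *		vfio_iommu_group_put(group, dev);	// drops the probe reference
 */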

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif


/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static bool vfio_device_try_get(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev && vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_allowed[] = { "pci-stub" };

static bool vfio_dev_driver_allowed(struct device *dev,
				    struct device_driver *drv)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	return match_string(vfio_driver_allowed,
			    ARRAY_SIZE(vfio_driver_allowed),
			    drv->name) >= 0;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to an otherwise allowed driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = READ_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	dev_WARN(dev, "Device added to live group %d!\n",
		 iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
			iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
			iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
			__func__, iommu_group_id(group->iommu_group),
			dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
			iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
			 const struct vfio_device_ops *ops)
{
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
}
EXPORT_SYMBOL_GPL(vfio_init_group_dev);

void vfio_uninit_group_dev(struct vfio_device *device)
{
	vfio_release_device_set(device);
}
EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);

int vfio_register_group_dev(struct vfio_device *device)
{
	struct vfio_device *existing_device;
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	iommu_group = iommu_group_get(device->dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	existing_device = vfio_group_get_device(group, device->dev);
	if (existing_device) {
		dev_WARN(device->dev, "Device already exists on group %d\n",
			 iommu_group_id(iommu_group));
		vfio_device_put(existing_device);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Our reference on group is moved to the device */
	device->group = group;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	group->dev_counter++;
	mutex_unlock(&group->device_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
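
/*
 * For reference, the expected registration flow in a sub-driver's probe,
 * sketched with hypothetical driver names (the remove path mirrors it
 * with vfio_unregister_group_dev() and vfio_uninit_group_dev()):
 *
 *	vfio_init_group_dev(&vdev->vdev, &pdev->dev, &my_vfio_dev_ops);
 *	ret = my_driver_hw_setup(vdev);
 *	if (ret)
 *		goto out_uninit;
 *	ret = vfio_register_group_dev(&vdev->vdev);
 *	if (ret)
 *		goto out_teardown;
 *	dev_set_drvdata(&pdev->dev, vdev);
 *	return 0;
 * out_teardown:
 *	my_driver_hw_teardown(vdev);
 * out_uninit:
 *	vfio_uninit_group_dev(&vdev->vdev);
 *	return ret;
 */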

/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = ERR_PTR(-ENODEV);

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		int ret;

		if (it->ops->match) {
			ret = it->ops->match(it, buf);
			if (ret < 0) {
				device = ERR_PTR(ret);
				break;
			}
		} else {
			ret = !strcmp(dev_name(it->dev), buf);
		}

		if (ret && vfio_device_try_get(it)) {
			device = it;
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = device->dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	mutex_lock(&group->device_lock);
	list_del(&device->group_next);
	group->dev_counter--;
	mutex_unlock(&group->device_lock);

	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	/* Matches the get in vfio_register_group_dev() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

/**
 * VFIO base fd, /dev/vfio/vfio
 */
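/*
 * For orientation, the userspace sequence these file operations back is
 * roughly the following (error handling omitted; the group number and
 * device name are examples):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(container, VFIO_GET_API_VERSION);		// VFIO_API_VERSION
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);	// expect VIABLE
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */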
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (driver && driver->ops->notify)
		driver->ops->notify(container->iommu_data,
				    VFIO_IOMMU_CONTAINER_CLOSE);

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int fdno;
	int ret = 0;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (IS_ERR(device))
		return PTR_ERR(device);

	if (!try_module_get(device->dev->driver->owner)) {
		ret = -ENODEV;
		goto err_device_put;
	}

	mutex_lock(&device->dev_set->lock);
	device->open_count++;
	if (device->open_count == 1 && device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_undo_count;
	}
	mutex_unlock(&device->dev_set->lock);

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	fdno = ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0)
		goto err_close_device;

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_fd;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(fdno, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));
	return fdno;

err_fd:
	put_unused_fd(fdno);
err_close_device:
	mutex_lock(&device->dev_set->lock);
	if (device->open_count == 1 && device->ops->close_device)
		device->ops->close_device(device);
err_undo_count:
	device->open_count--;
	mutex_unlock(&device->dev_set->lock);
	module_put(device->dev->driver->owner);
err_device_put:
	vfio_device_put(device);
	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	mutex_lock(&device->dev_set->lock);
	if (!--device->open_count && device->ops->close_device)
		device->ops->close_device(device);
	mutex_unlock(&device->dev_set->lock);

	module_put(device->dev->driver->owner);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks pass, vfio_group_get_external_user()
 * increments the container user counter to prevent the VFIO
 * group from being disposed of before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to obtain the IOMMU group ID.
 *
 * 4. When the external user (e.g. KVM) finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
vfio_group_get_external_user(struct file * filep)1748 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1749 {
1750 	struct vfio_group *group = filep->private_data;
1751 	int ret;
1752 
1753 	if (filep->f_op != &vfio_group_fops)
1754 		return ERR_PTR(-EINVAL);
1755 
1756 	ret = vfio_group_add_container_user(group);
1757 	if (ret)
1758 		return ERR_PTR(ret);
1759 
1760 	vfio_group_get(group);
1761 
1762 	return group;
1763 }
1764 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
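
/*
 * Example (illustrative sketch, not part of this driver): an external user
 * module such as KVM typically resolves a group file descriptor handed in
 * from user space and pairs the get with a put on teardown.  "group_fd" and
 * the surrounding error handling are hypothetical glue code:
 *
 *	struct fd f = fdget(group_fd);
 *	struct vfio_group *group;
 *
 *	if (!f.file)
 *		return -EBADF;
 *	group = vfio_group_get_external_user(f.file);
 *	fdput(f);
 *	if (IS_ERR(group))
 *		return PTR_ERR(group);
 *
 *	... use the group (IOMMU group ID, extensions, pinning) ...
 *
 *	vfio_group_put_external_user(group);
 */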

/**
 * External user API, exported by symbols to be linked dynamically.
 * The external user passes in a device pointer
 * to verify that:
 *	- a VFIO group is associated with the device;
 *	- IOMMU is set for the group.
 * If both checks pass, vfio_group_get_external_user_from_dev()
 * increments the container user counter to prevent the VFIO group
 * from being disposed of before the external user exits, and returns
 * the pointer to the VFIO group.
 *
 * When the external user finishes using the VFIO group, it calls
 * vfio_group_put_external_user() to release the VFIO group and
 * decrement the container user counter.
 *
 * @dev [in]	: device
 * Return error PTR or pointer to VFIO group.
 */

struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
{
	struct vfio_group *group;
	int ret;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return ERR_PTR(-ENODEV);

	ret = vfio_group_add_container_user(group);
	if (ret) {
		vfio_group_put(group);
		return ERR_PTR(ret);
	}

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
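
/*
 * Example (illustrative sketch): once an external user holds a group
 * reference, it can query the IOMMU group ID and container extensions, for
 * instance to learn whether the container enforces cache coherency.  The
 * locals below are hypothetical:
 *
 *	int id = vfio_external_user_iommu_id(group);
 *	bool coherent =
 *		vfio_external_check_extension(group, VFIO_DMA_CC_IOMMU) > 0;
 */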

/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate the buffer with additional @size, filling in the @id and
 * @version of the new capability.  A pointer to the new capability is
 * returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
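
/*
 * Example (illustrative sketch): a bus driver building a *_INFO ioctl reply
 * typically accumulates capabilities in a local chain, shifts the offsets
 * past the fixed-size info struct and then copies the chain out behind it.
 * "info", "arg" and the filled-in "cap" are hypothetical locals of such a
 * handler:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */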

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
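
/*
 * Example (illustrative sketch): a bus driver's VFIO_DEVICE_SET_IRQS ioctl
 * handler validates the user-supplied header with this helper and, when the
 * data type carries a payload, copies exactly *data_size bytes that follow
 * the header.  "arg" and "max" (the IRQ count for the selected index) are
 * hypothetical locals:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */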

/*
 * Pin a set of guest PFNs and return their associated host PFNs for the
 * local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	if (group->dev_counter > 1) {
		ret = -EINVAL;
		goto err_pin_pages;
	}

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
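
/*
 * Example (illustrative sketch): an mdev vendor driver pins a single guest
 * page before programming it into device state and unpins it when the
 * mapping is torn down.  "mdev" and "gfn" are hypothetical locals:
 *
 *	unsigned long pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	... use pfn ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 */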

/*
 * Unpin a set of host PFNs for the local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
 * VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * When done with the VFIO group, the caller needs to release it by calling
 * vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]		: protection flags
 * @phys_pfn [out]	: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_group_pin_pages(struct vfio_group *group,
			 unsigned long *user_iova_pfn, int npage,
			 int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (group->dev_counter > 1)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data,
					     group->iommu_group, user_iova_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_pin_pages);

/*
 * Unpin a set of guest IOVA PFNs for a VFIO group.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * When done with the VFIO group, the caller needs to release it by calling
 * vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
 * @npage [in]		: count of elements in user_iova_pfn array.
 *			  This count should not be greater than
 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_group_unpin_pages(struct vfio_group *group,
			   unsigned long *user_iova_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!group || !user_iova_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data,
					       user_iova_pfn, npage);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_group_unpin_pages);
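
/*
 * Example (illustrative sketch): the group-based variants avoid a device
 * lookup on every call; the caller holds an external-user group reference
 * for the whole sequence.  "group", "iova_pfn" and "phys_pfn" are
 * hypothetical locals:
 *
 *	ret = vfio_group_pin_pages(group, &iova_pfn, 1,
 *				   IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	if (ret == 1) {
 *		... access the page ...
 *		vfio_group_unpin_pages(group, &iova_pfn, 1);
 *	}
 */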

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read from or write to a range of IOVAs, which map user space memory,
 * into or from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * The caller needs to call vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev() prior to calling this interface,
 * so as to prevent the VFIO group from being disposed of in the middle of
 * the call.  The caller may keep the reference to the VFIO group across
 * several calls into this interface.
 * When done with the VFIO group, the caller needs to release it by calling
 * vfio_group_put_external_user().
 *
 * @group [in]		: VFIO group
 * @user_iova [in]	: base IOVA of a user space buffer
 * @data [in]		: pointer to kernel buffer
 * @len [in]		: kernel buffer length
 * @write		: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
		void *data, size_t len, bool write)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (!group || !data || len <= 0)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;

	if (likely(driver && driver->ops->dma_rw))
		ret = driver->ops->dma_rw(container->iommu_data,
					  user_iova, data, len, write);
	else
		ret = -ENOTTY;

	return ret;
}
EXPORT_SYMBOL(vfio_dma_rw);
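
/*
 * Example (illustrative sketch): copying a guest-visible structure out of an
 * IOVA range without pinning, as a mediated driver might do while holding an
 * external-user group reference.  "struct my_desc", "group" and "gpa" are
 * hypothetical:
 *
 *	struct my_desc desc;
 *
 *	ret = vfio_dma_rw(group, gpa, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */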

static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);

static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if any unknown events remain */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The kvm pointer may already have been attached to the group,
	 * so replay the event once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					 struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}

int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
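
/*
 * Example (illustrative sketch): a vendor driver that needs the KVM pointer
 * registers a group notifier; the VFIO_GROUP_NOTIFY_SET_KVM event is
 * replayed at registration time if KVM was already attached.  "struct
 * my_state" and its fields are hypothetical:
 *
 *	static int my_group_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		struct my_state *s = container_of(nb, struct my_state, nb);
 *
 *		if (action == VFIO_GROUP_NOTIFY_SET_KVM)
 *			s->kvm = data;
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
 *
 *	s->nb.notifier_call = my_group_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events, &s->nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_GROUP_NOTIFY, &s->nb);
 */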

struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;

	if (!group)
		return ERR_PTR(-EINVAL);

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->group_iommu_domain))
		return driver->ops->group_iommu_domain(container->iommu_data,
						       group->iommu_group);

	return ERR_PTR(-ENOTTY);
}
EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");