// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
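
/*
 * Illustrative sketch (not part of this file): a VFIO bus driver is expected
 * to pair these helpers in its probe and remove paths, holding the group
 * reference only across the vfio_add_group_dev()/vfio_del_group_dev() window,
 * much as vfio-pci does.  The my_vfio_drv_* names and ops are hypothetical.
 *
 *	static int my_vfio_drv_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_drv_ops, my_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void my_vfio_drv_remove(struct device *dev)
 *	{
 *		void *my_data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *	}
 */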

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif


/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
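
/*
 * Illustrative sketch (not part of this file): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops once at module load and unregisters
 * them at unload.  The my_iommu_* names are hypothetical; the ops layout
 * mirrors vfio_noiommu_ops above.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_driver_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_cleanup(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */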

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	return match_string(vfio_driver_whitelist,
			    ARRAY_SIZE(vfio_driver_whitelist),
			    drv->name) >= 0;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = READ_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	dev_WARN(dev, "Device added to live group %d!\n",
		 iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
			iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
			iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
			__func__, iommu_group_id(group->iommu_group),
			dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
			iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		dev_WARN(dev, "Device already exists on group %d\n",
			 iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
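
/*
 * Illustrative sketch (not part of this file): the ops passed to
 * vfio_add_group_dev() back the device file descriptor handed out by
 * VFIO_GROUP_GET_DEVICE_FD, and device_data is passed back to each
 * callback.  The my_* names below are hypothetical.
 *
 *	static const struct vfio_device_ops my_vfio_dev_ops = {
 *		.name		= "my-vfio-dev",
 *		.open		= my_open,
 *		.release	= my_release,
 *		.ioctl		= my_ioctl,
 *		.read		= my_read,
 *		.write		= my_write,
 *		.mmap		= my_mmap,
 *		.request	= my_request,
 *	};
 *
 *	ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_device_data);
 */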

/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	add_wait_queue(&vfio.release_q, &wait);

	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
		} else {
			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
			if (signal_pending(current)) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}

	} while (1);

	remove_wait_queue(&vfio.release_q, &wait);
	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
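
/*
 * Illustrative sketch (not part of this file): while vfio_del_group_dev()
 * waits above, the bus driver's .request callback is invoked with an
 * escalating count and typically relays the request to userspace, e.g.
 * via an eventfd, similar to what vfio-pci does.  The my_device fields
 * are hypothetical.
 *
 *	static void my_request(void *device_data, unsigned int count)
 *	{
 *		struct my_device *mdev = device_data;
 *
 *		if (mdev->req_trigger) {
 *			if (!(count % 10))
 *				dev_notice(mdev->dev,
 *					   "Relaying device request to user (#%u)\n",
 *					   count);
 *			eventfd_signal(mdev->req_trigger, 1);
 *		} else if (count == 0) {
 *			dev_warn(mdev->dev,
 *				 "No device request channel registered, blocked until released by user\n");
 *		}
 *	}
 */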

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
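
/*
 * Illustrative sketch (not part of this file): minimal userspace use of the
 * container fd, assuming the type1 IOMMU backend is registered.  Error
 * handling is omitted; see the group fd sketch further down for the
 * VFIO_SET_IOMMU step, which requires an attached group.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;
 */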

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
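
/*
 * Illustrative sketch (not part of this file): typical userspace flow for
 * the group fd, continuing the container sketch above; the VFIO
 * documentation describes the full sequence.  The group number and device
 * name are hypothetical and error handling is omitted.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */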

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};

/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1783 
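/*
 * Illustrative sketch of the external user protocol above (not part of the
 * original file; how the group file reference is obtained is an assumption
 * for illustration only).  Given a struct file for /dev/vfio/$GROUP, an
 * external user such as KVM would do roughly:
 *
 *	struct vfio_group *group;
 *	int iommu_group_id;
 *
 *	group = vfio_group_get_external_user(group_file);
 *	if (IS_ERR(group))
 *		return PTR_ERR(group);
 *
 *	iommu_group_id = vfio_external_user_iommu_id(group);
 *
 *	(use the group; the container cannot be torn down while the
 *	 reference is held)
 *
 *	vfio_group_put_external_user(group);
 *
 * The get/put pair must stay balanced: each successful
 * vfio_group_get_external_user() holds both a group reference and a
 * container user, and vfio_group_put_external_user() drops both.
 */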
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities: allocate or
 * reallocate a buffer with an additional @size bytes, filling in the @id and
 * @version of the new capability.  A pointer to the new capability is
 * returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail; vfio_info_cap_shift() should be called to fix up the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

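/*
 * Illustrative sketch (not part of the original file) of how a bus driver's
 * *_GET_INFO ioctl path typically consumes the helpers above; "info" stands
 * for the fixed-size info structure being returned to user space and is an
 * assumption for illustration:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	(append capabilities with vfio_info_cap_add() or
 *	 vfio_info_add_capability(); each call may krealloc() caps.buf)
 *
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		(the chain begins right after the fixed info struct in the
 *		 user buffer, so shift every "next" offset by sizeof(info))
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		info.cap_offset = sizeof(info);
 *	}
 *	kfree(caps.buf);
 */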
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

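/*
 * Illustrative sketch (not part of the original file) of a bus driver's
 * VFIO_DEVICE_SET_IRQS handler using the helper above, in the style of
 * vfio-pci; "max_irqs" (the IRQ count for hdr.index) and the use of
 * VFIO_PCI_NUM_IRQS are assumptions tied to the PCI case:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irqs,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */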
/*
 * Pin a set of guest PFNs and return their associated host PFNs for the local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in the user_pfn array.  This count should
 *		   not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin a set of host PFNs for the local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.
 * @npage [in]   : count of elements in the user_pfn array.  This count should
 *		   not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);

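/*
 * Illustrative sketch (not part of the original file) of the paired use of
 * vfio_pin_pages()/vfio_unpin_pages() from a vendor (mdev) driver; the array
 * names and the batch size NR are assumptions, with NR no greater than
 * VFIO_PIN_PAGES_MAX_ENTRIES, and mdev_dev() assumed to yield the mdev's
 * struct device:
 *
 *	unsigned long user_pfns[NR], host_pfns[NR];
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), user_pfns, NR,
 *			     IOMMU_READ | IOMMU_WRITE, host_pfns);
 *	if (ret != NR) {
 *		if (ret > 0)
 *			vfio_unpin_pages(mdev_dev(mdev), user_pfns, ret);
 *		return ret >= 0 ? -EFAULT : ret;
 *	}
 *
 *	(access guest memory through host_pfns ...)
 *
 *	vfio_unpin_pages(mdev_dev(mdev), user_pfns, NR);
 */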
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);

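/*
 * Illustrative note (not part of the original file): the expected usage of
 * vfio_group_set_kvm() is a symmetric pair of calls from the external user
 * (e.g. the KVM-VFIO pseudo device):
 *
 *	vfio_group_set_kvm(group, kvm);		when the group is attached
 *	...
 *	vfio_group_set_kvm(group, NULL);	when it is detached
 *
 * Consumers that registered for VFIO_GROUP_NOTIFY_SET_KVM observe both
 * transitions through the group notifier chain.
 */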
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if any unknown events remain */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attach of KVM to the vfio_group may already have happened, so
	 * replay the event here once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					 struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}

int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);

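/*
 * Illustrative sketch (not part of the original file) of a vendor driver
 * registering for IOMMU DMA-unmap notifications; the callback name is an
 * assumption for illustration.  The notifier payload for
 * VFIO_IOMMU_NOTIFY_DMA_UNMAP describes the unmapped IOVA range:
 *
 *	static int my_dma_unmap_cb(struct notifier_block *nb,
 *				   unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *			(unpin anything pinned within [iova, iova + size))
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	struct notifier_block nb = { .notifier_call = my_dma_unmap_cb };
 *
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &nb);
 */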
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");