1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kvm_host.h>
3
4 #include <asm/irq_remapping.h>
5 #include <asm/cpu.h>
6
7 #include "lapic.h"
8 #include "posted_intr.h"
9 #include "trace.h"
10 #include "vmx.h"
11
/*
 * We maintain a per-CPU linked list of blocked vCPUs, so that
 * pi_wakeup_handler() can find which vCPU should be woken up.
 * pi_pre_block() adds a vCPU to the list of the CPU recorded in
 * vcpu->pre_pcpu and __pi_post_block() removes it again.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
/* Taken around every add, delete and walk of the same CPU's list above. */
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
18
vcpu_to_pi_desc(struct kvm_vcpu * vcpu)19 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
20 {
21 return &(to_vmx(vcpu)->pi_desc);
22 }
23
vmx_vcpu_pi_load(struct kvm_vcpu * vcpu,int cpu)24 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
25 {
26 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
27 struct pi_desc old, new;
28 unsigned int dest;
29
30 /*
31 * In case of hot-plug or hot-unplug, we may have to undo
32 * vmx_vcpu_pi_put even if there is no assigned device. And we
33 * always keep PI.NDST up to date for simplicity: it makes the
34 * code easier, and CPU migration is not a fast path.
35 */
36 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
37 return;
38
39 /*
40 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
41 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
42 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
43 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
44 * correctly.
45 */
46 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
47 pi_clear_sn(pi_desc);
48 goto after_clear_sn;
49 }
50
51 /* The full case. */
52 do {
53 old.control = new.control = pi_desc->control;
54
55 dest = cpu_physical_id(cpu);
56
57 if (x2apic_mode)
58 new.ndst = dest;
59 else
60 new.ndst = (dest << 8) & 0xFF00;
61
62 new.sn = 0;
63 } while (cmpxchg64(&pi_desc->control, old.control,
64 new.control) != old.control);
65
66 after_clear_sn:
67
68 /*
69 * Clear SN before reading the bitmap. The VT-d firmware
70 * writes the bitmap and reads SN atomically (5.2.3 in the
71 * spec), so it doesn't really have a memory barrier that
72 * pairs with this, but we cannot do that and we need one.
73 */
74 smp_mb__after_atomic();
75
76 if (!pi_is_pir_empty(pi_desc))
77 pi_set_on(pi_desc);
78 }
79
vmx_vcpu_pi_put(struct kvm_vcpu * vcpu)80 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
81 {
82 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
83
84 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
85 !irq_remapping_cap(IRQ_POSTING_CAP) ||
86 !kvm_vcpu_apicv_active(vcpu))
87 return;
88
89 /* Set SN when the vCPU is preempted */
90 if (vcpu->preempted)
91 pi_set_sn(pi_desc);
92 }
93
__pi_post_block(struct kvm_vcpu * vcpu)94 static void __pi_post_block(struct kvm_vcpu *vcpu)
95 {
96 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
97 struct pi_desc old, new;
98 unsigned int dest;
99
100 do {
101 old.control = new.control = pi_desc->control;
102 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
103 "Wakeup handler not enabled while the VCPU is blocked\n");
104
105 dest = cpu_physical_id(vcpu->cpu);
106
107 if (x2apic_mode)
108 new.ndst = dest;
109 else
110 new.ndst = (dest << 8) & 0xFF00;
111
112 /* set 'NV' to 'notification vector' */
113 new.nv = POSTED_INTR_VECTOR;
114 } while (cmpxchg64(&pi_desc->control, old.control,
115 new.control) != old.control);
116
117 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
118 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
119 list_del(&vcpu->blocked_vcpu_list);
120 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
121 vcpu->pre_pcpu = -1;
122 }
123 }
124
125 /*
126 * This routine does the following things for vCPU which is going
127 * to be blocked if VT-d PI is enabled.
128 * - Store the vCPU to the wakeup list, so when interrupts happen
129 * we can find the right vCPU to wake up.
130 * - Change the Posted-interrupt descriptor as below:
131 * 'NDST' <-- vcpu->pre_pcpu
132 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
133 * - If 'ON' is set during this process, which means at least one
134 * interrupt is posted for this vCPU, we cannot block it, in
135 * this case, return 1, otherwise, return 0.
136 *
137 */
pi_pre_block(struct kvm_vcpu * vcpu)138 int pi_pre_block(struct kvm_vcpu *vcpu)
139 {
140 unsigned int dest;
141 struct pi_desc old, new;
142 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
143
144 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
145 !irq_remapping_cap(IRQ_POSTING_CAP) ||
146 !kvm_vcpu_apicv_active(vcpu))
147 return 0;
148
149 WARN_ON(irqs_disabled());
150 local_irq_disable();
151 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
152 vcpu->pre_pcpu = vcpu->cpu;
153 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
154 list_add_tail(&vcpu->blocked_vcpu_list,
155 &per_cpu(blocked_vcpu_on_cpu,
156 vcpu->pre_pcpu));
157 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
158 }
159
160 do {
161 old.control = new.control = pi_desc->control;
162
163 WARN((pi_desc->sn == 1),
164 "Warning: SN field of posted-interrupts "
165 "is set before blocking\n");
166
167 /*
168 * Since vCPU can be preempted during this process,
169 * vcpu->cpu could be different with pre_pcpu, we
170 * need to set pre_pcpu as the destination of wakeup
171 * notification event, then we can find the right vCPU
172 * to wakeup in wakeup handler if interrupts happen
173 * when the vCPU is in blocked state.
174 */
175 dest = cpu_physical_id(vcpu->pre_pcpu);
176
177 if (x2apic_mode)
178 new.ndst = dest;
179 else
180 new.ndst = (dest << 8) & 0xFF00;
181
182 /* set 'NV' to 'wakeup vector' */
183 new.nv = POSTED_INTR_WAKEUP_VECTOR;
184 } while (cmpxchg64(&pi_desc->control, old.control,
185 new.control) != old.control);
186
187 /* We should not block the vCPU if an interrupt is posted for it. */
188 if (pi_test_on(pi_desc) == 1)
189 __pi_post_block(vcpu);
190
191 local_irq_enable();
192 return (vcpu->pre_pcpu == -1);
193 }
194
pi_post_block(struct kvm_vcpu * vcpu)195 void pi_post_block(struct kvm_vcpu *vcpu)
196 {
197 if (vcpu->pre_pcpu == -1)
198 return;
199
200 WARN_ON(irqs_disabled());
201 local_irq_disable();
202 __pi_post_block(vcpu);
203 local_irq_enable();
204 }
205
206 /*
207 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
208 */
pi_wakeup_handler(void)209 void pi_wakeup_handler(void)
210 {
211 struct kvm_vcpu *vcpu;
212 int cpu = smp_processor_id();
213
214 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
215 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
216 blocked_vcpu_list) {
217 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
218
219 if (pi_test_on(pi_desc) == 1)
220 kvm_vcpu_kick(vcpu);
221 }
222 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
223 }
224
pi_init_cpu(int cpu)225 void __init pi_init_cpu(int cpu)
226 {
227 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
228 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
229 }
230
/*
 * Report whether @vcpu's posted-interrupt descriptor indicates a pending
 * interrupt: either ON is set, or SN is set while the PIR is not empty.
 */
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (pi_test_on(pi_desc))
		return true;

	/* With SN set, pending bits can sit in the PIR without ON. */
	return pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc);
}
238
239
240 /*
241 * Bail out of the block loop if the VM has an assigned
242 * device, but the blocking vCPU didn't reconfigure the
243 * PI.NV to the wakeup vector, i.e. the assigned device
244 * came along after the initial check in pi_pre_block().
245 */
vmx_pi_start_assignment(struct kvm * kvm)246 void vmx_pi_start_assignment(struct kvm *kvm)
247 {
248 if (!irq_remapping_cap(IRQ_POSTING_CAP))
249 return;
250
251 kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
252 }
253
254 /*
255 * pi_update_irte - set IRTE for Posted-Interrupts
256 *
257 * @kvm: kvm
258 * @host_irq: host irq of the interrupt
259 * @guest_irq: gsi of the interrupt
260 * @set: set or unset PI
261 * returns 0 on success, < 0 on failure
262 */
pi_update_irte(struct kvm * kvm,unsigned int host_irq,uint32_t guest_irq,bool set)263 int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
264 bool set)
265 {
266 struct kvm_kernel_irq_routing_entry *e;
267 struct kvm_irq_routing_table *irq_rt;
268 struct kvm_lapic_irq irq;
269 struct kvm_vcpu *vcpu;
270 struct vcpu_data vcpu_info;
271 int idx, ret = 0;
272
273 if (!kvm_arch_has_assigned_device(kvm) ||
274 !irq_remapping_cap(IRQ_POSTING_CAP) ||
275 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
276 return 0;
277
278 idx = srcu_read_lock(&kvm->irq_srcu);
279 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
280 if (guest_irq >= irq_rt->nr_rt_entries ||
281 hlist_empty(&irq_rt->map[guest_irq])) {
282 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
283 guest_irq, irq_rt->nr_rt_entries);
284 goto out;
285 }
286
287 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
288 if (e->type != KVM_IRQ_ROUTING_MSI)
289 continue;
290 /*
291 * VT-d PI cannot support posting multicast/broadcast
292 * interrupts to a vCPU, we still use interrupt remapping
293 * for these kind of interrupts.
294 *
295 * For lowest-priority interrupts, we only support
296 * those with single CPU as the destination, e.g. user
297 * configures the interrupts via /proc/irq or uses
298 * irqbalance to make the interrupts single-CPU.
299 *
300 * We will support full lowest-priority interrupt later.
301 *
302 * In addition, we can only inject generic interrupts using
303 * the PI mechanism, refuse to route others through it.
304 */
305
306 kvm_set_msi_irq(kvm, e, &irq);
307 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
308 !kvm_irq_is_postable(&irq)) {
309 /*
310 * Make sure the IRTE is in remapped mode if
311 * we don't handle it in posted mode.
312 */
313 ret = irq_set_vcpu_affinity(host_irq, NULL);
314 if (ret < 0) {
315 printk(KERN_INFO
316 "failed to back to remapped mode, irq: %u\n",
317 host_irq);
318 goto out;
319 }
320
321 continue;
322 }
323
324 vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
325 vcpu_info.vector = irq.vector;
326
327 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
328 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
329
330 if (set)
331 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
332 else
333 ret = irq_set_vcpu_affinity(host_irq, NULL);
334
335 if (ret < 0) {
336 printk(KERN_INFO "%s: failed to update PI IRTE\n",
337 __func__);
338 goto out;
339 }
340 }
341
342 ret = 0;
343 out:
344 srcu_read_unlock(&kvm->irq_srcu, idx);
345 return ret;
346 }
347