// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

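/* The xsk_{set,clear}_{rx,tx}_need_wakeup() helpers below are intended for
 * zero-copy drivers to toggle the NEED_WAKEUP flag on the fill and Tx
 * rings. When the flag is set, userspace has to kick the kernel (e.g. via
 * poll() or sendto()) for further progress to be made; when it is clear,
 * the driver proceeds on its own. cached_need_wakeup avoids touching the
 * shared rings when the state has not changed.
 */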
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

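/* Return the xdp_buff_xsk container to the pool's free list so it can be
 * reused for a future buffer allocation.
 */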
void xp_release(struct xdp_buff_xsk *xskb)
{
	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
}

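/* Build the 64-bit address handle posted on the Rx ring: the chunk's base
 * address plus the offset of the packet data (including pool headroom).
 * In unaligned mode the offset is stored in the upper bits of the handle,
 * shifted by XSK_UNALIGNED_BUF_OFFSET_SHIFT.
 */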
static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
{
	u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;

	offset += xskb->pool->headroom;
	if (!xskb->pool->unaligned)
		return xskb->orig_addr + offset;
	return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}

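/* Zero-copy receive: the packet data already lives in the umem, so all
 * that is needed is to post an (addr, len) descriptor on the Rx ring and
 * release the buffer container back to the pool.
 */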
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

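/* Copy-mode receive: allocate a buffer from the pool, copy the packet
 * (and any XDP metadata) into it and hand it to __xsk_rcv_zc() to be
 * posted on the Rx ring. Frames larger than the pool's Rx frame size are
 * dropped.
 */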
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
		     bool explicit_free)
{
	struct xdp_buff *xsk_xdp;
	int err;

	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->pool);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	if (explicit_free)
		xdp_return_buff(xdp);
	return 0;
}

static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
		   bool explicit_free)
{
	u32 len;

	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
		__xsk_rcv_zc(xs, xdp, len) :
		__xsk_rcv(xs, xdp, len, explicit_free);
}

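/* Publish the descriptors queued on the Rx ring to userspace, update the
 * fill ring's consumer pointer and wake up any process sleeping on the
 * socket.
 */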
static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv(xs, xdp, false);
	xsk_flush(xs);
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp, true);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

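/* Flush all sockets on this CPU's flush list. Called from the XDP
 * redirect flush path once a batch of packets has been processed.
 */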
void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

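/* Used by zero-copy drivers to fetch the next descriptor to transmit from
 * any of the sockets sharing this buffer pool.
 */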
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

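/* skb destructor for copy-mode Tx: once the skb has been consumed by the
 * driver, publish the corresponding completion ring entry so userspace
 * can reuse the umem buffer.
 */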
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	xskq_prod_submit_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

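/* Copy-mode Tx: for each descriptor on the Tx ring, allocate an skb, copy
 * the frame out of the umem and send it directly on the socket's queue,
 * processing at most TX_BATCH_SIZE descriptors per call.
 */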
static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb))
			goto out;

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xsk_buff_raw_get_data(xs->pool, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
		skb->destructor = xsk_destruct_skb;

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			skb->destructor = sock_wfree;
			/* Free skb without triggering the perf drop trace */
			consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

	xs->tx->queue_empty_descs++;

out:
	if (sent_frame)
		if (xsk_tx_writeable(xs))
			sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	return __xsk_sendmsg(sk);
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

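/* Ring sizes must be a non-zero power of two. The queue pointer is
 * published with a write barrier so that lock-free readers such as mmap()
 * observe a fully initialized ring.
 */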
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		WARN_ON(xsk_map_inc(node->map));
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		xsk_map_put(map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

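/* A socket that brings its own umem must have registered both a fill ring
 * and a completion ring before bind().
 */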
static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
						   dev, qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

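/* Earlier layout of struct xdp_umem_reg, from before the flags field was
 * added to the uapi. Kept so that XDP_UMEM_REG still accepts the shorter
 * optlen passed by older applications.
 */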
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

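/* Earlier layout of struct xdp_statistics, without the ring-full and
 * empty-descriptor counters. XDP_STATISTICS uses it to serve applications
 * that pass the shorter optlen.
 */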
struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

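/* Map one of the four rings into userspace. The page offset encoded in
 * vm_pgoff selects the Rx, Tx, fill or completion ring via the
 * XDP_PGOFF_* / XDP_UMEM_PGOFF_* constants.
 */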
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(xs->fq_tmp);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(xs->cq_tmp);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

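/* Netdevice notifier: when a device is unregistered, report ENETDOWN on
 * every socket bound to it and unbind them so the device can go away.
 */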
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references. */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	if (!xp_put_pool(xs->pool))
		xdp_put_umem(xs->umem, !xs->pool);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);