1  /*
2   * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
3   *
4   * This software is available to you under a choice of one of two
5   * licenses.  You may choose to be licensed under the terms of the GNU
6   * General Public License (GPL) Version 2, available from the file
7   * COPYING in the main directory of this source tree, or the
8   * OpenIB.org BSD license below:
9   *
10   *     Redistribution and use in source and binary forms, with or
11   *     without modification, are permitted provided that the following
12   *     conditions are met:
13   *
14   *      - Redistributions of source code must retain the above
15   *        copyright notice, this list of conditions and the following
16   *        disclaimer.
17   *
18   *      - Redistributions in binary form must reproduce the above
19   *        copyright notice, this list of conditions and the following
20   *        disclaimer in the documentation and/or other materials
21   *        provided with the distribution.
22   *
23   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30   * SOFTWARE.
31   *
32   */
33  
34  #include <linux/bpf.h>
35  #include <linux/bpf_trace.h>
36  #include <linux/mlx4/cq.h>
37  #include <linux/slab.h>
38  #include <linux/mlx4/qp.h>
39  #include <linux/skbuff.h>
40  #include <linux/rculist.h>
41  #include <linux/if_ether.h>
42  #include <linux/if_vlan.h>
43  #include <linux/vmalloc.h>
44  #include <linux/irq.h>
45  
46  #include <net/ip.h>
47  #if IS_ENABLED(CONFIG_IPV6)
48  #include <net/ip6_checksum.h>
49  #endif
50  
51  #include "mlx4_en.h"
52  
static int mlx4_alloc_page(struct mlx4_en_priv *priv,
			   struct mlx4_en_rx_alloc *frag,
			   gfp_t gfp)
56  {
57  	struct page *page;
58  	dma_addr_t dma;
59  
60  	page = alloc_page(gfp);
61  	if (unlikely(!page))
62  		return -ENOMEM;
63  	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
64  	if (unlikely(dma_mapping_error(priv->ddev, dma))) {
65  		__free_page(page);
66  		return -ENOMEM;
67  	}
68  	frag->page = page;
69  	frag->dma = dma;
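	/* rx_headroom is XDP_PACKET_HEADROOM when an XDP program is attached,
	 * 0 otherwise (see mlx4_en_calc_rx_buf()).
	 */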
70  	frag->page_offset = priv->rx_headroom;
71  	return 0;
72  }
73  
static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
			       struct mlx4_en_rx_ring *ring,
			       struct mlx4_en_rx_desc *rx_desc,
			       struct mlx4_en_rx_alloc *frags,
			       gfp_t gfp)
79  {
80  	int i;
81  
82  	for (i = 0; i < priv->num_frags; i++, frags++) {
83  		if (!frags->page) {
84  			if (mlx4_alloc_page(priv, frags, gfp))
85  				return -ENOMEM;
86  			ring->rx_alloc_pages++;
87  		}
88  		rx_desc->data[i].addr = cpu_to_be64(frags->dma +
89  						    frags->page_offset);
90  	}
91  	return 0;
92  }
93  
static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
			      struct mlx4_en_rx_alloc *frag)
96  {
97  	if (frag->page) {
98  		dma_unmap_page(priv->ddev, frag->dma,
99  			       PAGE_SIZE, priv->dma_dir);
100  		__free_page(frag->page);
101  	}
	/* We need to clear all fields, otherwise a change of priv->log_rx_info
	 * could lead us to see garbage in frag->page later.
	 */
105  	memset(frag, 0, sizeof(*frag));
106  }
107  
static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring, int index)
110  {
111  	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
112  	int possible_frags;
113  	int i;
114  
115  	/* Set size and memtype fields */
116  	for (i = 0; i < priv->num_frags; i++) {
117  		rx_desc->data[i].byte_count =
118  			cpu_to_be32(priv->frag_info[i].frag_size);
119  		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
120  	}
121  
122  	/* If the number of used fragments does not fill up the ring stride,
123  	 * remaining (unused) fragments must be padded with null address/size
124  	 * and a special memory key */
125  	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
126  	for (i = priv->num_frags; i < possible_frags; i++) {
127  		rx_desc->data[i].byte_count = 0;
128  		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
129  		rx_desc->data[i].addr = 0;
130  	}
131  }
132  
static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
				   struct mlx4_en_rx_ring *ring, int index,
				   gfp_t gfp)
136  {
137  	struct mlx4_en_rx_desc *rx_desc = ring->buf +
138  		(index << ring->log_stride);
139  	struct mlx4_en_rx_alloc *frags = ring->rx_info +
140  					(index << priv->log_rx_info);
141  	if (likely(ring->page_cache.index > 0)) {
142  		/* XDP uses a single page per frame */
143  		if (!frags->page) {
144  			ring->page_cache.index--;
145  			frags->page = ring->page_cache.buf[ring->page_cache.index].page;
146  			frags->dma  = ring->page_cache.buf[ring->page_cache.index].dma;
147  		}
148  		frags->page_offset = XDP_PACKET_HEADROOM;
149  		rx_desc->data[0].addr = cpu_to_be64(frags->dma +
150  						    XDP_PACKET_HEADROOM);
151  		return 0;
152  	}
153  
154  	return mlx4_en_alloc_frags(priv, ring, rx_desc, frags, gfp);
155  }
156  
static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
158  {
159  	return ring->prod == ring->cons;
160  }
161  
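/* Publish the ring's producer index to the doorbell record (low 16 bits
 * only, matching the HW counter width) so the device sees newly posted
 * RX buffers.
 */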
static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
163  {
164  	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
165  }
166  
167  /* slow path */
static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring,
				 int index)
171  {
172  	struct mlx4_en_rx_alloc *frags;
173  	int nr;
174  
175  	frags = ring->rx_info + (index << priv->log_rx_info);
176  	for (nr = 0; nr < priv->num_frags; nr++) {
177  		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
178  		mlx4_en_free_frag(priv, frags + nr);
179  	}
180  }
181  
182  /* Function not in fast-path */
static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
184  {
185  	struct mlx4_en_rx_ring *ring;
186  	int ring_ind;
187  	int buf_ind;
188  	int new_size;
189  
190  	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
191  		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
192  			ring = priv->rx_ring[ring_ind];
193  
194  			if (mlx4_en_prepare_rx_desc(priv, ring,
195  						    ring->actual_size,
196  						    GFP_KERNEL)) {
197  				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
198  					en_err(priv, "Failed to allocate enough rx buffers\n");
199  					return -ENOMEM;
200  				} else {
201  					new_size = rounddown_pow_of_two(ring->actual_size);
					en_warn(priv, "Only %d buffers allocated, reducing ring size to %d\n",
203  						ring->actual_size, new_size);
204  					goto reduce_rings;
205  				}
206  			}
207  			ring->actual_size++;
208  			ring->prod++;
209  		}
210  	}
211  	return 0;
212  
213  reduce_rings:
214  	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
215  		ring = priv->rx_ring[ring_ind];
216  		while (ring->actual_size > new_size) {
217  			ring->actual_size--;
218  			ring->prod--;
219  			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
220  		}
221  	}
222  
223  	return 0;
224  }
225  
static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
				struct mlx4_en_rx_ring *ring)
228  {
229  	int index;
230  
231  	en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
232  	       ring->cons, ring->prod);
233  
234  	/* Unmap and free Rx buffers */
235  	for (index = 0; index < ring->size; index++) {
236  		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
237  		mlx4_en_free_rx_desc(priv, ring, index);
238  	}
239  	ring->cons = 0;
240  	ring->prod = 0;
241  }
242  
void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
244  {
245  	int i;
246  	int num_of_eqs;
247  	int num_rx_rings;
248  	struct mlx4_dev *dev = mdev->dev;
249  
250  	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
251  		num_of_eqs = max_t(int, MIN_RX_RINGS,
252  				   min_t(int,
253  					 mlx4_get_eqs_per_port(mdev->dev, i),
254  					 DEF_RX_RINGS));
255  
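		/* The final ring count is rounded down to a power of two; the
		 * RSS indirection setup (ilog2(rss_rings) in
		 * mlx4_en_config_rss_steer()) assumes a power-of-two count.
		 */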
256  		num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
257  			min_t(int, num_of_eqs, num_online_cpus());
258  		mdev->profile.prof[i].rx_ring_num =
259  			rounddown_pow_of_two(num_rx_rings);
260  	}
261  }
262  
int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_rx_ring **pring,
			   u32 size, u16 stride, int node, int queue_index)
266  {
267  	struct mlx4_en_dev *mdev = priv->mdev;
268  	struct mlx4_en_rx_ring *ring;
269  	int err = -ENOMEM;
270  	int tmp;
271  
272  	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
273  	if (!ring) {
274  		en_err(priv, "Failed to allocate RX ring structure\n");
275  		return -ENOMEM;
276  	}
277  
278  	ring->prod = 0;
279  	ring->cons = 0;
280  	ring->size = size;
281  	ring->size_mask = size - 1;
282  	ring->stride = stride;
283  	ring->log_stride = ffs(ring->stride) - 1;
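	/* The extra TXBB_SIZE accounts for the leading TXBB that gets stamped
	 * and skipped in mlx4_en_activate_rx_rings() when the stride is small
	 * enough (ring->stride <= TXBB_SIZE).
	 */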
284  	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
285  
286  	if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0)
287  		goto err_ring;
288  
289  	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
290  					sizeof(struct mlx4_en_rx_alloc));
291  	ring->rx_info = kvzalloc_node(tmp, GFP_KERNEL, node);
292  	if (!ring->rx_info) {
293  		err = -ENOMEM;
294  		goto err_xdp_info;
295  	}
296  
297  	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
298  		 ring->rx_info, tmp);
299  
300  	/* Allocate HW buffers on provided NUMA node */
301  	set_dev_node(&mdev->dev->persist->pdev->dev, node);
302  	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
303  	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
304  	if (err)
305  		goto err_info;
306  
307  	ring->buf = ring->wqres.buf.direct.buf;
308  
309  	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;
310  
311  	*pring = ring;
312  	return 0;
313  
314  err_info:
315  	kvfree(ring->rx_info);
316  	ring->rx_info = NULL;
317  err_xdp_info:
318  	xdp_rxq_info_unreg(&ring->xdp_rxq);
319  err_ring:
320  	kfree(ring);
321  	*pring = NULL;
322  
323  	return err;
324  }
325  
int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
327  {
328  	struct mlx4_en_rx_ring *ring;
329  	int i;
330  	int ring_ind;
331  	int err;
332  	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
333  					DS_SIZE * priv->num_frags);
334  
335  	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
336  		ring = priv->rx_ring[ring_ind];
337  
338  		ring->prod = 0;
339  		ring->cons = 0;
340  		ring->actual_size = 0;
341  		ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;
342  
343  		ring->stride = stride;
344  		if (ring->stride <= TXBB_SIZE) {
345  			/* Stamp first unused send wqe */
346  			__be32 *ptr = (__be32 *)ring->buf;
347  			__be32 stamp = cpu_to_be32(1 << STAMP_SHIFT);
348  			*ptr = stamp;
349  			/* Move pointer to start of rx section */
350  			ring->buf += TXBB_SIZE;
351  		}
352  
353  		ring->log_stride = ffs(ring->stride) - 1;
354  		ring->buf_size = ring->size * ring->stride;
355  
356  		memset(ring->buf, 0, ring->buf_size);
357  		mlx4_en_update_rx_prod_db(ring);
358  
359  		/* Initialize all descriptors */
360  		for (i = 0; i < ring->size; i++)
361  			mlx4_en_init_rx_desc(priv, ring, i);
362  	}
363  	err = mlx4_en_fill_rx_buffers(priv);
364  	if (err)
365  		goto err_buffers;
366  
367  	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
368  		ring = priv->rx_ring[ring_ind];
369  
370  		ring->size_mask = ring->actual_size - 1;
371  		mlx4_en_update_rx_prod_db(ring);
372  	}
373  
374  	return 0;
375  
376  err_buffers:
377  	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
378  		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
379  
380  	ring_ind = priv->rx_ring_num - 1;
381  	while (ring_ind >= 0) {
382  		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
383  			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
384  		ring_ind--;
385  	}
386  	return err;
387  }
388  
/* We recover from out of memory by scheduling our napi poll
 * function (mlx4_en_poll_rx_cq), which ends up calling
 * mlx4_en_process_rx_cq() and refills missing RX buffers via
 * mlx4_en_refill_rx_buffers().
 */
void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
394  {
395  	int ring;
396  
397  	if (!priv->port_up)
398  		return;
399  
400  	for (ring = 0; ring < priv->rx_ring_num; ring++) {
401  		if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) {
402  			local_bh_disable();
403  			napi_reschedule(&priv->rx_cq[ring]->napi);
404  			local_bh_enable();
405  		}
406  	}
407  }
408  
409  /* When the rx ring is running in page-per-packet mode, a released frame can go
410   * directly into a small cache, to avoid unmapping or touching the page
411   * allocator. In bpf prog performance scenarios, buffers are either forwarded
412   * or dropped, never converted to skbs, so every page can come directly from
413   * this cache when it is sized to be a multiple of the napi budget.
414   */
bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
			struct mlx4_en_rx_alloc *frame)
417  {
418  	struct mlx4_en_page_cache *cache = &ring->page_cache;
419  
420  	if (cache->index >= MLX4_EN_CACHE_SIZE)
421  		return false;
422  
423  	cache->buf[cache->index].page = frame->page;
424  	cache->buf[cache->index].dma = frame->dma;
425  	cache->index++;
426  	return true;
427  }
428  
void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_rx_ring **pring,
			     u32 size, u16 stride)
432  {
433  	struct mlx4_en_dev *mdev = priv->mdev;
434  	struct mlx4_en_rx_ring *ring = *pring;
435  	struct bpf_prog *old_prog;
436  
437  	old_prog = rcu_dereference_protected(
438  					ring->xdp_prog,
439  					lockdep_is_held(&mdev->state_lock));
440  	if (old_prog)
441  		bpf_prog_put(old_prog);
442  	xdp_rxq_info_unreg(&ring->xdp_rxq);
443  	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
444  	kvfree(ring->rx_info);
445  	ring->rx_info = NULL;
446  	kfree(ring);
447  	*pring = NULL;
448  }
449  
void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_rx_ring *ring)
452  {
453  	int i;
454  
455  	for (i = 0; i < ring->page_cache.index; i++) {
456  		dma_unmap_page(priv->ddev, ring->page_cache.buf[i].dma,
457  			       PAGE_SIZE, priv->dma_dir);
458  		put_page(ring->page_cache.buf[i].page);
459  	}
460  	ring->page_cache.index = 0;
461  	mlx4_en_free_rx_buf(priv, ring);
462  	if (ring->stride <= TXBB_SIZE)
463  		ring->buf -= TXBB_SIZE;
464  }
465  
466  
static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
				    struct mlx4_en_rx_alloc *frags,
				    struct sk_buff *skb,
				    int length)
471  {
472  	const struct mlx4_en_frag_info *frag_info = priv->frag_info;
473  	unsigned int truesize = 0;
474  	bool release = true;
475  	int nr, frag_size;
476  	struct page *page;
477  	dma_addr_t dma;
478  
479  	/* Collect used fragments while replacing them in the HW descriptors */
480  	for (nr = 0;; frags++) {
481  		frag_size = min_t(int, length, frag_info->frag_size);
482  
483  		page = frags->page;
484  		if (unlikely(!page))
485  			goto fail;
486  
487  		dma = frags->dma;
488  		dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset,
489  					      frag_size, priv->dma_dir);
490  
491  		__skb_fill_page_desc(skb, nr, page, frags->page_offset,
492  				     frag_size);
493  
494  		truesize += frag_info->frag_stride;
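		/* Decide whether the ring can keep (part of) this page for a
		 * future descriptor. For half-page strides we flip between the
		 * two halves and only reuse the page if we are its sole user,
		 * it is not a pfmemalloc emergency page and it is on the local
		 * NUMA node. Otherwise the page is unmapped here and its last
		 * reference is handed to the skb.
		 */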
495  		if (frag_info->frag_stride == PAGE_SIZE / 2) {
496  			frags->page_offset ^= PAGE_SIZE / 2;
497  			release = page_count(page) != 1 ||
498  				  page_is_pfmemalloc(page) ||
499  				  page_to_nid(page) != numa_mem_id();
500  		} else if (!priv->rx_headroom) {
501  			/* rx_headroom for non XDP setup is always 0.
502  			 * When XDP is set, the above condition will
503  			 * guarantee page is always released.
504  			 */
505  			u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES);
506  
507  			frags->page_offset += sz_align;
508  			release = frags->page_offset + frag_info->frag_size > PAGE_SIZE;
509  		}
510  		if (release) {
511  			dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
512  			frags->page = NULL;
513  		} else {
514  			page_ref_inc(page);
515  		}
516  
517  		nr++;
518  		length -= frag_size;
519  		if (!length)
520  			break;
521  		frag_info++;
522  	}
523  	skb->truesize += truesize;
524  	return nr;
525  
526  fail:
527  	while (nr > 0) {
528  		nr--;
529  		__skb_frag_unref(skb_shinfo(skb)->frags + nr, false);
530  	}
531  	return 0;
532  }
533  
static void validate_loopback(struct mlx4_en_priv *priv, void *va)
535  {
536  	const unsigned char *data = va + ETH_HLEN;
537  	int i;
538  
539  	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++) {
540  		if (data[i] != (unsigned char)i)
541  			return;
542  	}
543  	/* Loopback found */
544  	priv->loopback_ok = 1;
545  }
546  
static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
				      struct mlx4_en_rx_ring *ring)
549  {
550  	u32 missing = ring->actual_size - (ring->prod - ring->cons);
551  
552  	/* Try to batch allocations, but not too much. */
553  	if (missing < 8)
554  		return;
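	/* We run in napi context, so refill with atomic allocations and allow
	 * access to memory reserves (__GFP_MEMALLOC) to keep the RX ring
	 * populated under memory pressure.
	 */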
555  	do {
556  		if (mlx4_en_prepare_rx_desc(priv, ring,
557  					    ring->prod & ring->size_mask,
558  					    GFP_ATOMIC | __GFP_MEMALLOC))
559  			break;
560  		ring->prod++;
561  	} while (likely(--missing));
562  
563  	mlx4_en_update_rx_prod_db(ring);
564  }
565  
566  /* When hardware doesn't strip the vlan, we need to calculate the checksum
567   * over it and add it to the hardware's checksum calculation
568   */
static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum,
					 struct vlan_hdr *vlanh)
571  {
572  	return csum_add(hw_checksum, *(__wsum *)vlanh);
573  }
574  
/* Although the stack expects a checksum which doesn't include the pseudo
 * header, the HW adds it. To address that, we subtract the pseudo
 * header checksum from the checksum value provided by the HW.
 */
static int get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
			       struct iphdr *iph)
581  {
582  	__u16 length_for_csum = 0;
583  	__wsum csum_pseudo_header = 0;
584  	__u8 ipproto = iph->protocol;
585  
586  	if (unlikely(ipproto == IPPROTO_SCTP))
587  		return -1;
588  
589  	length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2));
590  	csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr,
591  						length_for_csum, ipproto, 0);
592  	skb->csum = csum_sub(hw_checksum, csum_pseudo_header);
593  	return 0;
594  }
595  
596  #if IS_ENABLED(CONFIG_IPV6)
/* In IPv6 packets, hw_checksum lacks 6 bytes from the IPv6 header:
 * the first 4 bytes (priority, version, flow_lbl)
 * and 2 additional bytes (nexthdr, hop_limit).
 */
static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
			       struct ipv6hdr *ipv6h)
603  {
604  	__u8 nexthdr = ipv6h->nexthdr;
605  	__wsum temp;
606  
607  	if (unlikely(nexthdr == IPPROTO_FRAGMENT ||
608  		     nexthdr == IPPROTO_HOPOPTS ||
609  		     nexthdr == IPPROTO_SCTP))
610  		return -1;
611  
612  	/* priority, version, flow_lbl */
613  	temp = csum_add(hw_checksum, *(__wsum *)ipv6h);
614  	/* nexthdr and hop_limit */
615  	skb->csum = csum_add(temp, (__force __wsum)*(__be16 *)&ipv6h->nexthdr);
616  	return 0;
617  }
618  #endif
619  
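/* Minimum-size Ethernet frames (at most 64 bytes including FCS) may carry
 * padding octets; see the comment in check_csum() below.
 */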
620  #define short_frame(size) ((size) <= ETH_ZLEN + ETH_FCS_LEN)
621  
/* We reach this function only after checking that at least one of
 * the (IPv4 | IPv6) bits is set in cqe->status.
 */
static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
		      netdev_features_t dev_features)
627  {
628  	__wsum hw_checksum = 0;
629  	void *hdr;
630  
631  	/* CQE csum doesn't cover padding octets in short ethernet
632  	 * frames. And the pad field is appended prior to calculating
633  	 * and appending the FCS field.
634  	 *
	 * Detecting these padded frames requires verifying and parsing
636  	 * IP headers, so we simply force all those small frames to skip
637  	 * checksum complete.
638  	 */
639  	if (short_frame(skb->len))
640  		return -EINVAL;
641  
642  	hdr = (u8 *)va + sizeof(struct ethhdr);
643  	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
644  
645  	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) &&
646  	    !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) {
647  		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
648  		hdr += sizeof(struct vlan_hdr);
649  	}
650  
651  #if IS_ENABLED(CONFIG_IPV6)
652  	if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
653  		return get_fixed_ipv6_csum(hw_checksum, skb, hdr);
654  #endif
655  	return get_fixed_ipv4_csum(hw_checksum, skb, hdr);
656  }
657  
658  #if IS_ENABLED(CONFIG_IPV6)
659  #define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV6)
660  #else
661  #define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4)
662  #endif
663  
664  struct mlx4_en_xdp_buff {
665  	struct xdp_buff xdp;
666  	struct mlx4_cqe *cqe;
667  	struct mlx4_en_dev *mdev;
668  	struct mlx4_en_rx_ring *ring;
669  	struct net_device *dev;
670  };
671  
int mlx4_en_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
673  {
674  	struct mlx4_en_xdp_buff *_ctx = (void *)ctx;
675  
676  	if (unlikely(_ctx->ring->hwtstamp_rx_filter != HWTSTAMP_FILTER_ALL))
677  		return -ENODATA;
678  
679  	*timestamp = mlx4_en_get_hwtstamp(_ctx->mdev,
680  					  mlx4_en_get_cqe_ts(_ctx->cqe));
681  	return 0;
682  }
683  
int mlx4_en_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
			enum xdp_rss_hash_type *rss_type)
686  {
687  	struct mlx4_en_xdp_buff *_ctx = (void *)ctx;
688  	struct mlx4_cqe *cqe = _ctx->cqe;
689  	enum xdp_rss_hash_type xht = 0;
690  	__be16 status;
691  
692  	if (unlikely(!(_ctx->dev->features & NETIF_F_RXHASH)))
693  		return -ENODATA;
694  
695  	*hash = be32_to_cpu(cqe->immed_rss_invalid);
696  	status = cqe->status;
697  	if (status & cpu_to_be16(MLX4_CQE_STATUS_TCP))
698  		xht = XDP_RSS_L4_TCP;
699  	if (status & cpu_to_be16(MLX4_CQE_STATUS_UDP))
700  		xht = XDP_RSS_L4_UDP;
701  	if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV4F))
702  		xht |= XDP_RSS_L3_IPV4;
703  	if (status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) {
704  		xht |= XDP_RSS_L3_IPV6;
705  		if (cqe->ipv6_ext_mask)
706  			xht |= XDP_RSS_L3_DYNHDR;
707  	}
708  	*rss_type = xht;
709  
710  	return 0;
711  }
712  
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
714  {
715  	struct mlx4_en_priv *priv = netdev_priv(dev);
716  	struct mlx4_en_xdp_buff mxbuf = {};
717  	int factor = priv->cqe_factor;
718  	struct mlx4_en_rx_ring *ring;
719  	struct bpf_prog *xdp_prog;
720  	int cq_ring = cq->ring;
721  	bool doorbell_pending;
722  	bool xdp_redir_flush;
723  	struct mlx4_cqe *cqe;
724  	int polled = 0;
725  	int index;
726  
727  	if (unlikely(!priv->port_up || budget <= 0))
728  		return 0;
729  
730  	ring = priv->rx_ring[cq_ring];
731  
732  	xdp_prog = rcu_dereference_bh(ring->xdp_prog);
733  	xdp_init_buff(&mxbuf.xdp, priv->frag_info[0].frag_stride, &ring->xdp_rxq);
734  	doorbell_pending = false;
735  	xdp_redir_flush = false;
736  
737  	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
738  	 * descriptor offset can be deduced from the CQE index instead of
739  	 * reading 'cqe->index' */
740  	index = cq->mcq.cons_index & ring->size_mask;
741  	cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
742  
743  	/* Process all completed CQEs */
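	/* CQE ownership: the HW flips the owner bit on every pass over the CQ,
	 * so a CQE belongs to software when its owner bit matches the parity
	 * of (cons_index & cq->size) - hence the XNOR test below.
	 */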
744  	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
745  		    cq->mcq.cons_index & cq->size)) {
746  		struct mlx4_en_rx_alloc *frags;
747  		enum pkt_hash_types hash_type;
748  		struct sk_buff *skb;
749  		unsigned int length;
750  		int ip_summed;
751  		void *va;
752  		int nr;
753  
754  		frags = ring->rx_info + (index << priv->log_rx_info);
755  		va = page_address(frags[0].page) + frags[0].page_offset;
756  		net_prefetchw(va);
757  		/*
758  		 * make sure we read the CQE after we read the ownership bit
759  		 */
760  		dma_rmb();
761  
762  		/* Drop packet on bad receive or bad checksum */
763  		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
764  						MLX4_CQE_OPCODE_ERROR)) {
			en_err(priv, "CQE completed in error - vendor syndrome:%d syndrome:%d\n",
766  			       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
767  			       ((struct mlx4_err_cqe *)cqe)->syndrome);
768  			goto next;
769  		}
770  		if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
771  			en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
772  			goto next;
773  		}
774  
		/* Check if we need to drop the packet: SRIOV is not enabled
		 * and we are not performing the selftest, or flb is disabled.
		 */
778  		if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) {
779  			const struct ethhdr *ethh = va;
780  			dma_addr_t dma;
			/* Get a pointer to the first fragment, since we don't
			 * have an skb yet, and cast it to an ethhdr struct.
			 */
784  			dma = frags[0].dma + frags[0].page_offset;
785  			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
786  						DMA_FROM_DEVICE);
787  
788  			if (is_multicast_ether_addr(ethh->h_dest)) {
789  				struct mlx4_mac_entry *entry;
790  				struct hlist_head *bucket;
791  				unsigned int mac_hash;
792  
				/* Drop the packet, since the HW looped it back */
794  				mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX];
795  				bucket = &priv->mac_hash[mac_hash];
796  				hlist_for_each_entry_rcu_bh(entry, bucket, hlist) {
797  					if (ether_addr_equal_64bits(entry->mac,
798  								    ethh->h_source))
799  						goto next;
800  				}
801  			}
802  		}
803  
804  		if (unlikely(priv->validate_loopback)) {
805  			validate_loopback(priv, va);
806  			goto next;
807  		}
808  
809  		/*
810  		 * Packet is OK - process it.
811  		 */
812  		length = be32_to_cpu(cqe->byte_cnt);
813  		length -= ring->fcs_del;
814  
		/* A bpf program gets the first chance to drop the packet. It may
816  		 * read bytes but not past the end of the frag.
817  		 */
818  		if (xdp_prog) {
819  			dma_addr_t dma;
820  			void *orig_data;
821  			u32 act;
822  
823  			dma = frags[0].dma + frags[0].page_offset;
824  			dma_sync_single_for_cpu(priv->ddev, dma,
825  						priv->frag_info[0].frag_size,
826  						DMA_FROM_DEVICE);
827  
828  			xdp_prepare_buff(&mxbuf.xdp, va - frags[0].page_offset,
829  					 frags[0].page_offset, length, true);
830  			orig_data = mxbuf.xdp.data;
831  			mxbuf.cqe = cqe;
832  			mxbuf.mdev = priv->mdev;
833  			mxbuf.ring = ring;
834  			mxbuf.dev = dev;
835  
836  			act = bpf_prog_run_xdp(xdp_prog, &mxbuf.xdp);
837  
838  			length = mxbuf.xdp.data_end - mxbuf.xdp.data;
839  			if (mxbuf.xdp.data != orig_data) {
840  				frags[0].page_offset = mxbuf.xdp.data -
841  					mxbuf.xdp.data_hard_start;
842  				va = mxbuf.xdp.data;
843  			}
844  
845  			switch (act) {
846  			case XDP_PASS:
847  				break;
848  			case XDP_REDIRECT:
849  				if (likely(!xdp_do_redirect(dev, &mxbuf.xdp, xdp_prog))) {
850  					ring->xdp_redirect++;
851  					xdp_redir_flush = true;
852  					frags[0].page = NULL;
853  					goto next;
854  				}
855  				ring->xdp_redirect_fail++;
856  				trace_xdp_exception(dev, xdp_prog, act);
857  				goto xdp_drop_no_cnt;
858  			case XDP_TX:
859  				if (likely(!mlx4_en_xmit_frame(ring, frags, priv,
860  							length, cq_ring,
861  							&doorbell_pending))) {
862  					frags[0].page = NULL;
863  					goto next;
864  				}
865  				trace_xdp_exception(dev, xdp_prog, act);
866  				goto xdp_drop_no_cnt; /* Drop on xmit failure */
867  			default:
868  				bpf_warn_invalid_xdp_action(dev, xdp_prog, act);
869  				fallthrough;
870  			case XDP_ABORTED:
871  				trace_xdp_exception(dev, xdp_prog, act);
872  				fallthrough;
873  			case XDP_DROP:
874  				ring->xdp_drop++;
875  xdp_drop_no_cnt:
876  				goto next;
877  			}
878  		}
879  
880  		ring->bytes += length;
881  		ring->packets++;
882  
883  		skb = napi_get_frags(&cq->napi);
884  		if (unlikely(!skb))
885  			goto next;
886  
887  		if (unlikely(ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL)) {
888  			u64 timestamp = mlx4_en_get_cqe_ts(cqe);
889  
890  			mlx4_en_fill_hwtstamps(priv->mdev, skb_hwtstamps(skb),
891  					       timestamp);
892  		}
893  		skb_record_rx_queue(skb, cq_ring);
894  
895  		if (likely(dev->features & NETIF_F_RXCSUM)) {
			/* TODO: For non-TCP/UDP IP packets, when checksum complete is
			 * not an option (not supported or any other reason), we could
			 * check the CQE IPOK status bit and report
			 * CHECKSUM_UNNECESSARY rather than CHECKSUM_NONE.
			 */
901  			if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
902  						       MLX4_CQE_STATUS_UDP)) &&
903  			    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
904  			    cqe->checksum == cpu_to_be16(0xffff)) {
905  				bool l2_tunnel;
906  
907  				l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
908  					(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
909  				ip_summed = CHECKSUM_UNNECESSARY;
910  				hash_type = PKT_HASH_TYPE_L4;
911  				if (l2_tunnel)
912  					skb->csum_level = 1;
913  				ring->csum_ok++;
914  			} else {
915  				if (!(priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
916  				      (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IP_ANY))))
917  					goto csum_none;
918  				if (check_csum(cqe, skb, va, dev->features))
919  					goto csum_none;
920  				ip_summed = CHECKSUM_COMPLETE;
921  				hash_type = PKT_HASH_TYPE_L3;
922  				ring->csum_complete++;
923  			}
924  		} else {
925  csum_none:
926  			ip_summed = CHECKSUM_NONE;
927  			hash_type = PKT_HASH_TYPE_L3;
928  			ring->csum_none++;
929  		}
930  		skb->ip_summed = ip_summed;
931  		if (dev->features & NETIF_F_RXHASH)
932  			skb_set_hash(skb,
933  				     be32_to_cpu(cqe->immed_rss_invalid),
934  				     hash_type);
935  
936  		if ((cqe->vlan_my_qpn &
937  		     cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
938  		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
939  			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
940  					       be16_to_cpu(cqe->sl_vid));
941  		else if ((cqe->vlan_my_qpn &
942  			  cpu_to_be32(MLX4_CQE_SVLAN_PRESENT_MASK)) &&
943  			 (dev->features & NETIF_F_HW_VLAN_STAG_RX))
944  			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD),
945  					       be16_to_cpu(cqe->sl_vid));
946  
947  		nr = mlx4_en_complete_rx_desc(priv, frags, skb, length);
948  		if (likely(nr)) {
949  			skb_shinfo(skb)->nr_frags = nr;
950  			skb->len = length;
951  			skb->data_len = length;
952  			napi_gro_frags(&cq->napi);
953  		} else {
954  			__vlan_hwaccel_clear_tag(skb);
955  			skb_clear_hash(skb);
956  		}
957  next:
958  		++cq->mcq.cons_index;
959  		index = (cq->mcq.cons_index) & ring->size_mask;
960  		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
961  		if (unlikely(++polled == budget))
962  			break;
963  	}
964  
965  	if (xdp_redir_flush)
966  		xdp_do_flush();
967  
968  	if (likely(polled)) {
969  		if (doorbell_pending) {
970  			priv->tx_cq[TX_XDP][cq_ring]->xdp_busy = true;
971  			mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq_ring]);
972  		}
973  
974  		mlx4_cq_set_ci(&cq->mcq);
975  		wmb(); /* ensure HW sees CQ consumer before we post new buffers */
976  		ring->cons = cq->mcq.cons_index;
977  	}
978  
979  	mlx4_en_refill_rx_buffers(priv, ring);
980  
981  	return polled;
982  }
983  
984  
void mlx4_en_rx_irq(struct mlx4_cq *mcq)
986  {
987  	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
988  	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
989  
990  	if (likely(priv->port_up))
991  		napi_schedule_irqoff(&cq->napi);
992  	else
993  		mlx4_en_arm_cq(priv, cq);
994  }
995  
996  /* Rx CQ polling - called by NAPI */
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
998  {
999  	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
1000  	struct net_device *dev = cq->dev;
1001  	struct mlx4_en_priv *priv = netdev_priv(dev);
1002  	struct mlx4_en_cq *xdp_tx_cq = NULL;
1003  	bool clean_complete = true;
1004  	int done;
1005  
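	/* A zero budget means the caller (e.g. netpoll) does not want any RX
	 * processing done here.
	 */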
1006  	if (!budget)
1007  		return 0;
1008  
1009  	if (priv->tx_ring_num[TX_XDP]) {
1010  		xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring];
1011  		if (xdp_tx_cq->xdp_busy) {
1012  			clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq,
1013  							       budget) < budget;
1014  			xdp_tx_cq->xdp_busy = !clean_complete;
1015  		}
1016  	}
1017  
1018  	done = mlx4_en_process_rx_cq(dev, cq, budget);
1019  
1020  	/* If we used up all the quota - we're probably not done yet... */
1021  	if (done == budget || !clean_complete) {
1022  		int cpu_curr;
1023  
1024  		/* in case we got here because of !clean_complete */
1025  		done = budget;
1026  
1027  		cpu_curr = smp_processor_id();
1028  
1029  		if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask)))
1030  			return budget;
1031  
		/* The current CPU does not match the IRQ affinity mask -
		 * affinity has probably changed. Stop this NAPI poll and
		 * restart it on the right CPU.
		 * Try to avoid returning a too small value (like 0),
		 * to not fool net_rx_action() and its netdev_budget.
		 */
1038  		if (done)
1039  			done--;
1040  	}
1041  	/* Done for now */
1042  	if (likely(napi_complete_done(napi, done)))
1043  		mlx4_en_arm_cq(priv, cq);
1044  	return done;
1045  }
1046  
void mlx4_en_calc_rx_buf(struct net_device *dev)
1048  {
1049  	struct mlx4_en_priv *priv = netdev_priv(dev);
1050  	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
1051  	int i = 0;
1052  
1053  	/* bpf requires buffers to be set up as 1 packet per page.
1054  	 * This only works when num_frags == 1.
1055  	 */
1056  	if (priv->tx_ring_num[TX_XDP]) {
1057  		priv->frag_info[0].frag_size = eff_mtu;
1058  		/* This will gain efficient xdp frame recycling at the
1059  		 * expense of more costly truesize accounting
1060  		 */
1061  		priv->frag_info[0].frag_stride = PAGE_SIZE;
1062  		priv->dma_dir = DMA_BIDIRECTIONAL;
1063  		priv->rx_headroom = XDP_PACKET_HEADROOM;
1064  		i = 1;
1065  	} else {
1066  		int frag_size_max = 2048, buf_size = 0;
1067  
1068  		/* should not happen, right ? */
1069  		if (eff_mtu > PAGE_SIZE + (MLX4_EN_MAX_RX_FRAGS - 1) * 2048)
1070  			frag_size_max = PAGE_SIZE;
1071  
1072  		while (buf_size < eff_mtu) {
1073  			int frag_stride, frag_size = eff_mtu - buf_size;
1074  			int pad, nb;
1075  
1076  			if (i < MLX4_EN_MAX_RX_FRAGS - 1)
1077  				frag_size = min(frag_size, frag_size_max);
1078  
1079  			priv->frag_info[i].frag_size = frag_size;
1080  			frag_stride = ALIGN(frag_size, SMP_CACHE_BYTES);
			/* We can only pack two 1536-byte frames in one 4K page.
			 * Therefore, each frame consumes more bytes (truesize).
			 */
1084  			nb = PAGE_SIZE / frag_stride;
1085  			pad = (PAGE_SIZE - nb * frag_stride) / nb;
1086  			pad &= ~(SMP_CACHE_BYTES - 1);
1087  			priv->frag_info[i].frag_stride = frag_stride + pad;
1088  
1089  			buf_size += frag_size;
1090  			i++;
1091  		}
1092  		priv->dma_dir = DMA_FROM_DEVICE;
1093  		priv->rx_headroom = 0;
1094  	}
1095  
1096  	priv->num_frags = i;
1097  	priv->rx_skb_size = eff_mtu;
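	/* log_rx_info is the log2 of the per-descriptor rx_info stride, so the
	 * frag array of descriptor 'index' is found with
	 * ring->rx_info + (index << priv->log_rx_info).
	 */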
1098  	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc));
1099  
1100  	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n",
1101  	       eff_mtu, priv->num_frags);
1102  	for (i = 0; i < priv->num_frags; i++) {
1103  		en_dbg(DRV,
1104  		       priv,
1105  		       "  frag:%d - size:%d stride:%d\n",
1106  		       i,
1107  		       priv->frag_info[i].frag_size,
1108  		       priv->frag_info[i].frag_stride);
1109  	}
1110  }
1111  
1112  /* RSS related functions */
1113  
static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
				 struct mlx4_en_rx_ring *ring,
				 enum mlx4_qp_state *state,
				 struct mlx4_qp *qp)
1118  {
1119  	struct mlx4_en_dev *mdev = priv->mdev;
1120  	struct mlx4_qp_context *context;
1121  	int err = 0;
1122  
1123  	context = kzalloc(sizeof(*context), GFP_KERNEL);
1124  	if (!context)
1125  		return -ENOMEM;
1126  
1127  	err = mlx4_qp_alloc(mdev->dev, qpn, qp);
1128  	if (err) {
1129  		en_err(priv, "Failed to allocate qp #%x\n", qpn);
1130  		goto out;
1131  	}
1132  	qp->event = mlx4_en_sqp_event;
1133  
1134  	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
1135  				qpn, ring->cqn, -1, context);
1136  	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
1137  
1138  	/* Cancel FCS removal if FW allows */
1139  	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) {
1140  		context->param3 |= cpu_to_be32(1 << 29);
1141  		if (priv->dev->features & NETIF_F_RXFCS)
1142  			ring->fcs_del = 0;
1143  		else
1144  			ring->fcs_del = ETH_FCS_LEN;
1145  	} else
1146  		ring->fcs_del = 0;
1147  
1148  	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
1149  	if (err) {
1150  		mlx4_qp_remove(mdev->dev, qp);
1151  		mlx4_qp_free(mdev->dev, qp);
1152  	}
1153  	mlx4_en_update_rx_prod_db(ring);
1154  out:
1155  	kfree(context);
1156  	return err;
1157  }
1158  
int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
1160  {
1161  	int err;
1162  	u32 qpn;
1163  
1164  	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
1165  				    MLX4_RESERVE_A0_QP,
1166  				    MLX4_RES_USAGE_DRIVER);
1167  	if (err) {
1168  		en_err(priv, "Failed reserving drop qpn\n");
1169  		return err;
1170  	}
1171  	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp);
1172  	if (err) {
1173  		en_err(priv, "Failed allocating drop qp\n");
1174  		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
1175  		return err;
1176  	}
1177  
1178  	return 0;
1179  }
1180  
void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
1182  {
1183  	u32 qpn;
1184  
1185  	qpn = priv->drop_qp.qpn;
1186  	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
1187  	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
1188  	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
1189  }
1190  
/* Allocate RX QPs and configure them according to the RSS map */
int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
1193  {
1194  	struct mlx4_en_dev *mdev = priv->mdev;
1195  	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
1196  	struct mlx4_qp_context context;
1197  	struct mlx4_rss_context *rss_context;
1198  	int rss_rings;
1199  	void *ptr;
1200  	u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 |
1201  			MLX4_RSS_TCP_IPV6);
1202  	int i, qpn;
1203  	int err = 0;
1204  	int good_qps = 0;
1205  	u8 flags;
1206  
1207  	en_dbg(DRV, priv, "Configuring rss steering\n");
1208  
1209  	flags = priv->rx_ring_num == 1 ? MLX4_RESERVE_A0_QP : 0;
1210  	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
1211  				    priv->rx_ring_num,
1212  				    &rss_map->base_qpn, flags,
1213  				    MLX4_RES_USAGE_DRIVER);
1214  	if (err) {
1215  		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
1216  		return err;
1217  	}
1218  
1219  	for (i = 0; i < priv->rx_ring_num; i++) {
1220  		qpn = rss_map->base_qpn + i;
1221  		err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i],
1222  					    &rss_map->state[i],
1223  					    &rss_map->qps[i]);
1224  		if (err)
1225  			goto rss_err;
1226  
1227  		++good_qps;
1228  	}
1229  
1230  	if (priv->rx_ring_num == 1) {
1231  		rss_map->indir_qp = &rss_map->qps[0];
1232  		priv->base_qpn = rss_map->indir_qp->qpn;
1233  		en_info(priv, "Optimized Non-RSS steering\n");
1234  		return 0;
1235  	}
1236  
1237  	rss_map->indir_qp = kzalloc(sizeof(*rss_map->indir_qp), GFP_KERNEL);
1238  	if (!rss_map->indir_qp) {
1239  		err = -ENOMEM;
1240  		goto rss_err;
1241  	}
1242  
1243  	/* Configure RSS indirection qp */
1244  	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, rss_map->indir_qp);
1245  	if (err) {
1246  		en_err(priv, "Failed to allocate RSS indirection QP\n");
1247  		goto qp_alloc_err;
1248  	}
1249  
1250  	rss_map->indir_qp->event = mlx4_en_sqp_event;
1251  	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
1252  				priv->rx_ring[0]->cqn, -1, &context);
1253  
1254  	if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num)
1255  		rss_rings = priv->rx_ring_num;
1256  	else
1257  		rss_rings = priv->prof->rss_rings;
1258  
1259  	ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path)
1260  					+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
1261  	rss_context = ptr;
1262  	rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 |
1263  					    (rss_map->base_qpn));
1264  	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
1265  	if (priv->mdev->profile.udp_rss) {
1266  		rss_mask |=  MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6;
1267  		rss_context->base_qpn_udp = rss_context->default_qpn;
1268  	}
1269  
1270  	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
1271  		en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n");
1272  		rss_mask |= MLX4_RSS_BY_INNER_HEADERS;
1273  	}
1274  
1275  	rss_context->flags = rss_mask;
1276  	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
1277  	if (priv->rss_hash_fn == ETH_RSS_HASH_XOR) {
1278  		rss_context->hash_fn = MLX4_RSS_HASH_XOR;
1279  	} else if (priv->rss_hash_fn == ETH_RSS_HASH_TOP) {
1280  		rss_context->hash_fn = MLX4_RSS_HASH_TOP;
1281  		memcpy(rss_context->rss_key, priv->rss_key,
1282  		       MLX4_EN_RSS_KEY_SIZE);
1283  	} else {
1284  		en_err(priv, "Unknown RSS hash function requested\n");
1285  		err = -EINVAL;
1286  		goto indir_err;
1287  	}
1288  
1289  	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
1290  			       rss_map->indir_qp, &rss_map->indir_state);
1291  	if (err)
1292  		goto indir_err;
1293  
1294  	return 0;
1295  
1296  indir_err:
1297  	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
1298  		       MLX4_QP_STATE_RST, NULL, 0, 0, rss_map->indir_qp);
1299  	mlx4_qp_remove(mdev->dev, rss_map->indir_qp);
1300  	mlx4_qp_free(mdev->dev, rss_map->indir_qp);
1301  qp_alloc_err:
1302  	kfree(rss_map->indir_qp);
1303  	rss_map->indir_qp = NULL;
1304  rss_err:
1305  	for (i = 0; i < good_qps; i++) {
1306  		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
1307  			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
1308  		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
1309  		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
1310  	}
1311  	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
1312  	return err;
1313  }
1314  
void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
1316  {
1317  	struct mlx4_en_dev *mdev = priv->mdev;
1318  	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
1319  	int i;
1320  
1321  	if (priv->rx_ring_num > 1) {
1322  		mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
1323  			       MLX4_QP_STATE_RST, NULL, 0, 0,
1324  			       rss_map->indir_qp);
1325  		mlx4_qp_remove(mdev->dev, rss_map->indir_qp);
1326  		mlx4_qp_free(mdev->dev, rss_map->indir_qp);
1327  		kfree(rss_map->indir_qp);
1328  		rss_map->indir_qp = NULL;
1329  	}
1330  
1331  	for (i = 0; i < priv->rx_ring_num; i++) {
1332  		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
1333  			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
1334  		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
1335  		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
1336  	}
1337  	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
1338  }