// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
	/* Check TX path's list. */
	if (tx->dqo_tx.free_pending_packets != -1)
		return true;

	/* Check completion handler's list. */
	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
		return true;

	return false;
}

static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 index;

	index = tx->dqo_tx.free_pending_packets;

	/* No pending_packets available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_pending_packets =
			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
		index = tx->dqo_tx.free_pending_packets;

		if (unlikely(index == -1))
			return NULL;
	}

	pending_packet = &tx->dqo.pending_packets[index];

	/* Remove pending_packet from free list */
	tx->dqo_tx.free_pending_packets = pending_packet->next;
	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

	return pending_packet;
}

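/* Return a pending packet to the free list. The TX path and the completion
 * handler each keep their own free list so they never need to lock against
 * each other: freeing always pushes onto the completion handler's atomic
 * list with a cmpxchg loop, and the TX path later steals that whole list in
 * a single atomic_xchg (see gve_alloc_pending_packet()).
 */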
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 index = pending_packet - tx->dqo.pending_packets;

	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

		pending_packet->next = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
				   old_head, index) == old_head) {
			break;
		}
	}
}

/* gve_tx_clean_pending_packets - Cleans up all pending tx requests and buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
	int i;

	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
		struct gve_tx_pending_packet_dqo *cur_state =
			&tx->dqo.pending_packets[i];
		int j;

		for (j = 0; j < cur_state->num_bufs; j++) {
			struct gve_tx_dma_buf *buf = &cur_state->bufs[j];

			if (j == 0) {
				dma_unmap_single(tx->dev,
						 dma_unmap_addr(buf, dma),
						 dma_unmap_len(buf, len),
						 DMA_TO_DEVICE);
			} else {
				dma_unmap_page(tx->dev,
					       dma_unmap_addr(buf, dma),
					       dma_unmap_len(buf, len),
					       DMA_TO_DEVICE);
			}
		}
		if (cur_state->skb) {
			dev_consume_skb_any(cur_state->skb);
			cur_state->skb = NULL;
		}
	}
}

static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	gve_tx_remove_from_block(priv, idx);

	if (tx->q_resources) {
		dma_free_coherent(hdev, sizeof(*tx->q_resources),
				  tx->q_resources, tx->q_resources_bus);
		tx->q_resources = NULL;
	}

	if (tx->dqo.compl_ring) {
		bytes = sizeof(tx->dqo.compl_ring[0]) *
			(tx->dqo.complq_mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
				  tx->complq_bus_dqo);
		tx->dqo.compl_ring = NULL;
	}

	if (tx->dqo.tx_ring) {
		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
		tx->dqo.tx_ring = NULL;
	}

	kvfree(tx->dqo.pending_packets);
	tx->dqo.pending_packets = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	int num_pending_packets;
	size_t bytes;
	int i;

	memset(tx, 0, sizeof(*tx));
	tx->q_num = idx;
	tx->dev = &priv->pdev->dev;
	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

	/* Queue sizes must be a power of 2 */
	tx->mask = priv->tx_desc_cnt - 1;
	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

	/* The max number of pending packets determines the maximum number of
	 * descriptors which may be written to the completion queue.
	 *
	 * We must set the number small enough to make sure we never overrun the
	 * completion queue.
	 */
	num_pending_packets = tx->dqo.complq_mask + 1;

	/* Reserve space for descriptor completions, which will be reported at
	 * most every GVE_TX_MIN_RE_INTERVAL packets.
	 */
	num_pending_packets -=
		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

	/* Each packet may have at most 2 buffer completions if it receives both
	 * a miss and reinjection completion.
	 */
	num_pending_packets /= 2;

	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
					   sizeof(tx->dqo.pending_packets[0]),
					   GFP_KERNEL);
	if (!tx->dqo.pending_packets)
		goto err;

	/* Set up linked list of pending packets */
	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
		tx->dqo.pending_packets[i].next = i + 1;

	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
	tx->dqo_compl.miss_completions.head = -1;
	tx->dqo_compl.miss_completions.tail = -1;
	tx->dqo_compl.timed_out_completions.head = -1;
	tx->dqo_compl.timed_out_completions.tail = -1;

	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->dqo.tx_ring)
		goto err;

	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
						&tx->complq_bus_dqo,
						GFP_KERNEL);
	if (!tx->dqo.compl_ring)
		goto err;

	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
					     &tx->q_resources_bus, GFP_KERNEL);
	if (!tx->q_resources)
		goto err;

	gve_tx_add_to_block(priv, idx);

	return 0;

err:
	gve_tx_free_ring_dqo(priv, idx);
	return -ENOMEM;
}

int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	return 0;

err:
	for (i--; i >= 0; i--)
		gve_tx_free_ring_dqo(priv, i);

	return err;
}

void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		struct gve_tx_ring *tx = &priv->tx[i];

		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
		netdev_tx_reset_queue(tx->netdev_txq);
		gve_tx_clean_pending_packets(tx);

		gve_tx_free_ring_dqo(priv, i);
	}
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

	return tx->mask - num_used;
}

/* Stops the queue if the number of available descriptors is less than 'count'.
 * Return: 0 if stop is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* Update cached TX head pointer */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* No space, so stop the queue */
	tx->stop_queue++;
	netif_tx_stop_queue(tx->netdev_txq);

	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
	mb();

	/* After stopping queue, check if we can transmit again in order to
	 * avoid TOCTOU bug.
	 */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(!gve_has_pending_packet(tx) ||
		   num_avail_tx_slots(tx) < count))
		return -EBUSY;

	netif_tx_start_queue(tx->netdev_txq);
	tx->wake_queue++;
	return 0;
}

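/* Build the TX metadata carried in the context descriptors. The L4 hash, if
 * present, is folded into a non-zero 15-bit path hash for the device.
 */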
static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
					struct gve_tx_metadata_dqo *metadata)
{
	memset(metadata, 0, sizeof(*metadata));
	metadata->version = GVE_TX_METADATA_VERSION_DQO;

	if (skb->l4_hash) {
		u16 path_hash = skb->hash ^ (skb->hash >> 16);

		path_hash &= (1 << 15) - 1;
		if (unlikely(path_hash == 0))
			path_hash = ~path_hash;

		metadata->path_hash = path_hash;
	}
}

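/* Write one or more packet descriptors for a single mapped buffer. A buffer
 * longer than GVE_TX_MAX_BUF_SIZE_DQO is split across as many descriptors as
 * needed; only the final descriptor of the packet carries end_of_packet.
 */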
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
				     struct sk_buff *skb, u32 len, u64 addr,
				     s16 compl_tag, bool eop, bool is_gso)
{
	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

	while (len > 0) {
		struct gve_tx_pkt_desc_dqo *desc =
			&tx->dqo.tx_ring[*desc_idx].pkt;
		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
		bool cur_eop = eop && cur_len == len;

		*desc = (struct gve_tx_pkt_desc_dqo){
			.buf_addr = cpu_to_le64(addr),
			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
			.end_of_packet = cur_eop,
			.checksum_offload_enable = checksum_offload_en,
			.compl_tag = cpu_to_le16(compl_tag),
			.buf_size = cur_len,
		};

		addr += cur_len;
		len -= cur_len;
		*desc_idx = (*desc_idx + 1) & tx->mask;
	}
}

/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
	struct tcphdr *tcp;
	int header_len;
	u32 paylen;
	int err;

	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
	 * of the TSO to be <= 262143.
	 *
	 * However, we don't validate these because:
	 * - Hypervisor enforces a limit of 9K MTU
	 * - Kernel will not produce a TSO larger than 64k
	 */

	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
		return -1;

	/* Needed because we will modify header. */
	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	tcp = tcp_hdr(skb);

	/* Remove payload length from checksum. */
	paylen = skb->len - skb_transport_offset(skb);

	switch (skb_shinfo(skb)->gso_type) {
	case SKB_GSO_TCPV4:
	case SKB_GSO_TCPV6:
		csum_replace_by_diff(&tcp->check,
				     (__force __wsum)htonl(paylen));

		/* Compute length of segmentation header. */
		header_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
		break;
	default:
		return -EINVAL;
	}

	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
		return -EINVAL;

	return header_len;
}

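/* Fill the TSO context descriptor: header length, total payload length and
 * MSS, plus the metadata bytes carried in the descriptor's flex fields.
 */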
static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
				     const struct sk_buff *skb,
				     const struct gve_tx_metadata_dqo *metadata,
				     int header_len)
{
	*desc = (struct gve_tx_tso_context_desc_dqo){
		.header_len = header_len,
		.cmd_dtype = {
			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
			.tso = 1,
		},
		.flex0 = metadata->bytes[0],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
	};
	desc->tso_total_len = skb->len - header_len;
	desc->mss = skb_shinfo(skb)->gso_size;
}

static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
			     const struct gve_tx_metadata_dqo *metadata)
{
	*desc = (struct gve_tx_general_context_desc_dqo){
		.flex0 = metadata->bytes[0],
		.flex1 = metadata->bytes[1],
		.flex2 = metadata->bytes[2],
		.flex3 = metadata->bytes[3],
		.flex4 = metadata->bytes[4],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
	};
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
				      struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const bool is_gso = skb_is_gso(skb);
	u32 desc_idx = tx->dqo_tx.tail;

	struct gve_tx_pending_packet_dqo *pending_packet;
	struct gve_tx_metadata_dqo metadata;
	s16 completion_tag;
	int i;

	pending_packet = gve_alloc_pending_packet(tx);
	pending_packet->skb = skb;
	pending_packet->num_bufs = 0;
	completion_tag = pending_packet - tx->dqo.pending_packets;

	gve_extract_tx_metadata_dqo(skb, &metadata);
	if (is_gso) {
		int header_len = gve_prep_tso(skb);

		if (unlikely(header_len < 0))
			goto err;

		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
					 skb, &metadata, header_len);
		desc_idx = (desc_idx + 1) & tx->mask;
	}

	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
				     &metadata);
	desc_idx = (desc_idx + 1) & tx->mask;

	/* Note: HW requires that the size of a non-TSO packet be within the
	 * range of [17, 9728].
	 *
	 * We don't double check because
	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
	 * - Hypervisor won't allow MTU larger than 9216.
	 */

	/* Map the linear portion of skb */
	{
		struct gve_tx_dma_buf *buf =
			&pending_packet->bufs[pending_packet->num_bufs];
		u32 len = skb_headlen(skb);
		dma_addr_t addr;

		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(buf, len, len);
		dma_unmap_addr_set(buf, dma, addr);
		++pending_packet->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag,
					 /*eop=*/shinfo->nr_frags == 0, is_gso);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		struct gve_tx_dma_buf *buf =
			&pending_packet->bufs[pending_packet->num_bufs];
		const skb_frag_t *frag = &shinfo->frags[i];
		bool is_eop = i == (shinfo->nr_frags - 1);
		u32 len = skb_frag_size(frag);
		dma_addr_t addr;

		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(buf, len, len);
		dma_unmap_addr_set(buf, dma, addr);
		++pending_packet->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag, is_eop, is_gso);
	}

	/* Commit the changes to our state */
	tx->dqo_tx.tail = desc_idx;

	/* Request a descriptor completion on the last descriptor of the
	 * packet if we are allowed to by the HW enforced interval.
	 */
	{
		u32 last_desc_idx = (desc_idx - 1) & tx->mask;
		u32 last_report_event_interval =
			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

		if (unlikely(last_report_event_interval >=
			     GVE_TX_MIN_RE_INTERVAL)) {
			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
			tx->dqo_tx.last_re_idx = last_desc_idx;
		}
	}

	return 0;

err:
	for (i = 0; i < pending_packet->num_bufs; i++) {
		struct gve_tx_dma_buf *buf = &pending_packet->bufs[i];

		if (i == 0) {
			dma_unmap_single(tx->dev, dma_unmap_addr(buf, dma),
					 dma_unmap_len(buf, len),
					 DMA_TO_DEVICE);
		} else {
			dma_unmap_page(tx->dev, dma_unmap_addr(buf, dma),
				       dma_unmap_len(buf, len), DMA_TO_DEVICE);
		}
	}

	pending_packet->skb = NULL;
	pending_packet->num_bufs = 0;
	gve_free_pending_packet(tx, pending_packet);

	return -1;
}

static int gve_num_descs_per_buf(size_t size)
{
	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int num_descs;
	int i;

	num_descs = gve_num_descs_per_buf(skb_headlen(skb));

	for (i = 0; i < shinfo->nr_frags; i++) {
		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

		num_descs += gve_num_descs_per_buf(frag_size);
	}

	return num_descs;
}

/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
	const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb);
	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const int gso_size = shinfo->gso_size;
	int cur_seg_num_bufs;
	int cur_seg_size;
	int i;

	cur_seg_size = skb_headlen(skb) - header_len;
	cur_seg_num_bufs = cur_seg_size > 0;

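	/* Walk the frags, tracking how many buffers the current segment
	 * spans. Once a segment's worth of payload (gso_size) has been
	 * consumed, start accounting for the next segment; leftover bytes
	 * that spill into it count as one buffer there as well.
	 */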
	for (i = 0; i < shinfo->nr_frags; i++) {
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;
		}

		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
			return false;

		cur_seg_size += skb_frag_size(&shinfo->frags[i]);
	}

	return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct sk_buff *skb)
{
	int num_buffer_descs;
	int total_num_descs;

	if (skb_is_gso(skb)) {
		/* If TSO doesn't meet HW requirements, attempt to linearize the
		 * packet.
		 */
		if (unlikely(!gve_can_send_tso(skb) &&
			     skb_linearize(skb) < 0)) {
			net_err_ratelimited("%s: Failed to transmit TSO packet\n",
					    priv->dev->name);
			goto drop;
		}

		num_buffer_descs = gve_num_buffer_descs_needed(skb);
	} else {
		num_buffer_descs = gve_num_buffer_descs_needed(skb);

		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
			if (unlikely(skb_linearize(skb) < 0))
				goto drop;

			num_buffer_descs = 1;
		}
	}

	/* Metadata + (optional TSO) + data descriptors. */
	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
			GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
		return -1;
	}

	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
		goto drop;

	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
	skb_tx_timestamp(skb);
	return 0;

drop:
	tx->dropped_pkt++;
	dev_kfree_skb_any(skb);
	return 0;
}

/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;

	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		return NETDEV_TX_BUSY;
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
	return NETDEV_TX_OK;
}

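/* The miss_completions and timed_out_completions lists are doubly-linked
 * lists threaded through the pending_packets array by index, with -1 used as
 * the list terminator.
 */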
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 old_tail, index;

	index = pending_packet - tx->dqo.pending_packets;
	old_tail = list->tail;
	list->tail = index;
	if (old_tail == -1)
		list->head = index;
	else
		tx->dqo.pending_packets[old_tail].next = index;

	pending_packet->next = -1;
	pending_packet->prev = old_tail;
}

static void remove_from_list(struct gve_tx_ring *tx,
			     struct gve_index_list *list,
			     struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 prev_index, next_index;

	prev_index = pending_packet->prev;
	next_index = pending_packet->next;

	if (prev_index == -1) {
		/* Node is head */
		list->head = next_index;
	} else {
		tx->dqo.pending_packets[prev_index].next = next_index;
	}
	if (next_index == -1) {
		/* Node is tail */
		list->tail = prev_index;
	} else {
		tx->dqo.pending_packets[next_index].prev = prev_index;
	}
}

static void gve_unmap_packet(struct device *dev,
			     struct gve_tx_pending_packet_dqo *pending_packet)
{
	struct gve_tx_dma_buf *buf;
	int i;

	/* SKB linear portion is guaranteed to be mapped */
	buf = &pending_packet->bufs[0];
	dma_unmap_single(dev, dma_unmap_addr(buf, dma),
			 dma_unmap_len(buf, len), DMA_TO_DEVICE);
	for (i = 1; i < pending_packet->num_bufs; i++) {
		buf = &pending_packet->bufs[i];
		dma_unmap_page(dev, dma_unmap_addr(buf, dma),
			       dma_unmap_len(buf, len), DMA_TO_DEVICE);
	}
	pending_packet->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
					 struct gve_tx_ring *tx, bool is_napi,
					 u16 compl_tag, u64 *bytes, u64 *pkts,
					 bool is_reinjection)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];

	if (unlikely(is_reinjection)) {
		if (unlikely(pending_packet->state ==
			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
					    priv->dev->name, (int)compl_tag);
			/* Packet was already completed as a result of timeout,
			 * so just remove from list and free pending packet.
			 */
			remove_from_list(tx,
					 &tx->dqo_compl.timed_out_completions,
					 pending_packet);
			gve_free_pending_packet(tx, pending_packet);
			return;
		}
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
			/* No outstanding miss completion but packet allocated
			 * implies packet receives a re-injection completion
			 * without a prior miss completion. Return without
			 * completing the packet.
			 */
			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
	} else {
		/* Packet is allocated but not a pending data completion. */
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
			net_err_ratelimited("%s: No pending data completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
	}
	gve_unmap_packet(tx->dev, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
	napi_consume_skb(pending_packet->skb, is_napi);
	pending_packet->skb = NULL;
	gve_free_pending_packet(tx, pending_packet);
}

static void gve_handle_miss_completion(struct gve_priv *priv,
				       struct gve_tx_ring *tx, u16 compl_tag,
				       u64 *bytes, u64 *pkts)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];
	if (unlikely(pending_packet->state !=
		     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
				    priv->dev->name, (int)pending_packet->state,
				    (int)compl_tag);
		return;
	}

	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
	/* jiffies can wrap around but time comparisons can handle overflows. */
	pending_packet->timeout_jiffies =
			jiffies +
			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
					 MSEC_PER_SEC);
	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
}

static void remove_miss_completions(struct gve_priv *priv,
				    struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.miss_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should timeout in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
		/* Unmap buffers and free skb but do not unallocate packet i.e.
		 * the completion tag is not freed to ensure that the driver
		 * can take appropriate action if a corresponding valid
		 * completion is received later.
		 */
		gve_unmap_packet(tx->dev, pending_packet);
		/* This indicates the packet was dropped. */
		dev_kfree_skb_any(pending_packet->skb);
		pending_packet->skb = NULL;
		tx->dropped_pkt++;
		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
				    priv->dev->name,
				    (int)(pending_packet - tx->dqo.pending_packets));

		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
		pending_packet->timeout_jiffies =
				jiffies +
				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
						 MSEC_PER_SEC);
		/* Maintain pending packet in another list so the packet can be
		 * unallocated at a later time.
		 */
		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
			    pending_packet);
	}
}

static void remove_timed_out_completions(struct gve_priv *priv,
					 struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.timed_out_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should timeout in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
				 pending_packet);
		gve_free_pending_packet(tx, pending_packet);
	}
}

int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct napi_struct *napi)
{
	u64 reinject_compl_bytes = 0;
	u64 reinject_compl_pkts = 0;
	int num_descs_cleaned = 0;
	u64 miss_compl_bytes = 0;
	u64 miss_compl_pkts = 0;
	u64 pkt_compl_bytes = 0;
	u64 pkt_compl_pkts = 0;

	/* Limit in order to avoid blocking for too long */
	while (!napi || pkt_compl_pkts < napi->weight) {
		struct gve_tx_compl_desc *compl_desc =
			&tx->dqo.compl_ring[tx->dqo_compl.head];
		u16 type;

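		/* A descriptor is ready once HW has written it with a
		 * generation bit that differs from cur_gen_bit; stop at the
		 * first descriptor that still matches.
		 */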
		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
			break;

		/* Prefetch the next descriptor. */
		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
				tx->dqo.complq_mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();
		type = compl_desc->type;

		if (type == GVE_COMPL_TYPE_DQO_DESC) {
			/* This is the last descriptor fetched by HW plus one */
			u16 tx_head = le16_to_cpu(compl_desc->tx_head);

			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &pkt_compl_bytes,
						     &pkt_compl_pkts,
						     /*is_reinjection=*/false);
		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_miss_completion(priv, tx, compl_tag,
						   &miss_compl_bytes,
						   &miss_compl_pkts);
		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &reinject_compl_bytes,
						     &reinject_compl_pkts,
						     /*is_reinjection=*/true);
		}

		tx->dqo_compl.head =
			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
		/* Flip the generation bit when we wrap around */
		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
		num_descs_cleaned++;
	}

	netdev_tx_completed_queue(tx->netdev_txq,
				  pkt_compl_pkts + miss_compl_pkts,
				  pkt_compl_bytes + miss_compl_bytes);

	remove_miss_completions(priv, tx);
	remove_timed_out_completions(priv, tx);

	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
	u64_stats_update_end(&tx->statss);
	return num_descs_cleaned;
}

bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
	struct gve_tx_compl_desc *compl_desc;
	struct gve_tx_ring *tx = block->tx;
	struct gve_priv *priv = block->priv;

	if (do_clean) {
		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
							      &block->napi);

		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
		mb();

		if (netif_tx_queue_stopped(tx->netdev_txq) &&
		    num_descs_cleaned > 0) {
			tx->wake_queue++;
			netif_tx_wake_queue(tx->netdev_txq);
		}
	}

	/* Return true if we still have work. */
	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}