/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/delay.h>
#include <linux/moduleparam.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>

#include <linux/ip.h>
#include <linux/tcp.h>
#include <rdma/ib_cache.h>

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

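/*
 * Allocate an ipoib_ah and create the underlying IB address handle.
 * On failure the ERR_PTR from rdma_create_ah() is propagated to the
 * caller inside the returned pointer.
 */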
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
				 struct ib_pd *pd, struct rdma_ah_attr *attr)
{
	struct ipoib_ah *ah;
	struct ib_ah *vah;

	ah = kmalloc(sizeof(*ah), GFP_KERNEL);
	if (!ah)
		return ERR_PTR(-ENOMEM);

	ah->dev       = dev;
	ah->last_send = 0;
	kref_init(&ah->ref);

	vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
	if (IS_ERR(vah)) {
		kfree(ah);
		ah = (struct ipoib_ah *)vah;
	} else {
		ah->ah = vah;
		ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah);
	}

	return ah;
}

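/*
 * kref release callback: the address handle cannot be destroyed
 * immediately because sends referencing it may still be in flight,
 * so queue it on the dead_ahs list for the AH reaper to clean up.
 */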
void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ipoib_priv(ah->dev);

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
				  u64 mapping[IPOIB_UD_RX_SG])
{
	ib_dma_unmap_single(priv->ca, mapping[0],
			    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
			    DMA_FROM_DEVICE);
}

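/*
 * Post the receive buffer at ring slot @id back to the QP.  On failure
 * the buffer is unmapped and freed so the slot is left empty.
 */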
static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];

	ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
		dev_kfree_skb_any(priv->rx_ring[id].skb);
		priv->rx_ring[id].skb = NULL;
	}

	return ret;
}

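/*
 * Allocate and DMA-map a receive skb for ring slot @id, storing the
 * skb and its mapping in priv->rx_ring[id].
 */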
static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sk_buff *skb;
	int buf_size;
	u64 *mapping;

	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);

	skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
	if (unlikely(!skb))
		return NULL;

	/*
	 * The IP header will be at IPOIB_HARD_LEN + IB_GRH_BYTES, which is
	 * 64-byte aligned.
	 */
	skb_reserve(skb, sizeof(struct ipoib_pseudo_header));

	mapping = priv->rx_ring[id].mapping;
	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
		goto error;

	priv->rx_ring[id].skb = skb;
	return skb;
error:
	dev_kfree_skb_any(skb);
	return NULL;
}

static int ipoib_ib_post_receives(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_skb(dev, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

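/*
 * Handle a single UD receive completion: replenish the ring slot,
 * classify the packet (host/broadcast/multicast), drop our own
 * HCA-replicated multicasts and hand the skb to GRO.
 */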
static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct sk_buff *skb;
	u64 mapping[IPOIB_UD_RX_SG];
	union ib_gid *dgid;
	union ib_gid *sgid;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	skb  = priv->rx_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			ipoib_warn(priv,
				   "failed recv event (status=%d, wrid=%d vend_err %#x)\n",
				   wc->status, wr_id, wc->vendor_err);
		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
		dev_kfree_skb_any(skb);
		priv->rx_ring[wr_id].skb = NULL;
		return;
	}

	memcpy(mapping, priv->rx_ring[wr_id].mapping,
	       IPOIB_UD_RX_SG * sizeof(*mapping));

	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_ud_dma_unmap_rx(priv, mapping);

	skb_put(skb, wc->byte_len);

	/* First byte of dgid signals multicast when 0xff */
	dgid = &((struct ib_grh *)skb->data)->dgid;

	if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
		skb->pkt_type = PACKET_HOST;
	else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
		skb->pkt_type = PACKET_BROADCAST;
	else
		skb->pkt_type = PACKET_MULTICAST;

	sgid = &((struct ib_grh *)skb->data)->sgid;

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) {
		int need_repost = 1;

		if ((wc->wc_flags & IB_WC_GRH) &&
		    sgid->global.interface_id != priv->local_gid.global.interface_id)
			need_repost = 0;

		if (need_repost) {
			dev_kfree_skb_any(skb);
			goto repost;
		}
	}

	skb_pull(skb, IB_GRH_BYTES);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_add_pseudo_hdr(skb);

	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;
	if (skb->pkt_type == PACKET_MULTICAST)
		dev->stats.multicast++;

	skb->dev = dev;
	if ((dev->features & NETIF_F_RXCSUM) &&
			likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	napi_gro_receive(&priv->recv_napi, skb);

repost:
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n",
			   wr_id);
}

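/*
 * DMA-map the linear part and all page fragments of a send skb into
 * tx_req->mapping.  On a partial failure everything already mapped is
 * unmapped again and -EIO is returned.
 */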
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
					       DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
			return -EIO;

		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		mapping[i + off] = ib_dma_map_page(ca,
						 skb_frag_page(frag),
						 skb_frag_off(frag),
						 skb_frag_size(frag),
						 DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
			goto partial_error;
	}
	return 0;

partial_error:
	for (; i > 0; --i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
	}

	if (off)
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);

	return -EIO;
}

void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
			struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
				    DMA_TO_DEVICE);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		ib_dma_unmap_page(priv->ca, mapping[i + off],
				  skb_frag_size(frag), DMA_TO_DEVICE);
	}
}

/*
 * As a result of a completion error the QP can be transitioned to the SQE
 * state.  This function checks whether the (send) QP is in the SQE state
 * and, if so, moves it back to RTS so that it is functional again.
 */
static void ipoib_qp_state_validate_work(struct work_struct *work)
{
	struct ipoib_qp_state_validate *qp_work =
		container_of(work, struct ipoib_qp_state_validate, work);

	struct ipoib_dev_priv *priv = qp_work->priv;
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
			   __func__, ret);
		goto free_res;
	}
	pr_info("%s: QP: 0x%x is in state: %d\n",
		__func__, priv->qp->qp_num, qp_attr.qp_state);

	/* currently we only support the SQE->RTS transition */
	if (qp_attr.qp_state == IB_QPS_SQE) {
		qp_attr.qp_state = IB_QPS_RTS;

		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
		if (ret) {
			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
				ret, priv->qp->qp_num);
			goto free_res;
		}
		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
			__func__, priv->qp->qp_num);
	} else {
		pr_warn("QP (%d) will stay in state: %d\n",
			priv->qp->qp_num, qp_attr.qp_state);
	}

free_res:
	kfree(qp_work);
}

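/*
 * Handle a single UD send completion: unmap and free the skb, advance
 * the TX tail, wake the netdev queue when enough space is available
 * and, on an error completion, schedule QP state validation work.
 */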
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv, tx_req);

	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	++priv->tx_tail;
	++priv->global_tx_tail;

	if (unlikely(netif_queue_stopped(dev) &&
		     ((priv->global_tx_head - priv->global_tx_tail) <=
		      ipoib_sendq_size >> 1) &&
		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
		netif_wake_queue(dev);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_qp_state_validate *qp_work;

		ipoib_warn(priv,
			   "failed send event (status=%d, wrid=%d vend_err %#x)\n",
			   wc->status, wr_id, wc->vendor_err);
		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
		if (!qp_work)
			return;

		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
		qp_work->priv = priv;
		queue_work(priv->wq, &qp_work->work);
	}
}

static int poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;
	struct ib_wc *wc;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i);
		else
			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
	}
	return n == MAX_SEND_CQE;
}

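/*
 * NAPI poll handler for the receive CQ; drains up to @budget receive
 * completions and re-arms the CQ when the budget is not exhausted.
 */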
int ipoib_rx_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv =
		container_of(napi, struct ipoib_dev_priv, recv_napi);
	struct net_device *dev = priv->dev;
	int done;
	int t;
	int n, i;

	done  = 0;

poll_more:
	while (done < budget) {
		int max = (budget - done);

		t = min(IPOIB_NUM_WC, max);
		n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if (wc->wr_id & IPOIB_OP_RECV) {
				++done;
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, wc);
				else
					ipoib_ib_handle_rx_wc(dev, wc);
			} else {
				pr_warn("%s: Got unexpected wqe id\n", __func__);
			}
		}

		if (n != t)
			break;
	}

	if (done < budget) {
		napi_complete(napi);
		if (unlikely(ib_req_notify_cq(priv->recv_cq,
					      IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    napi_reschedule(napi))
			goto poll_more;
	}

	return done;
}

int ipoib_tx_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv,
						   send_napi);
	struct net_device *dev = priv->dev;
	int n, i;
	struct ib_wc *wc;

poll_more:
	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);

	for (i = 0; i < n; i++) {
		wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(dev, wc);
		else
			ipoib_ib_handle_tx_wc(dev, wc);
	}

	if (n < budget) {
		napi_complete(napi);
		if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    napi_reschedule(napi))
			goto poll_more;
	}
	return n < 0 ? 0 : n;
}

void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr)
{
	struct ipoib_dev_priv *priv = ctx_ptr;

	napi_schedule(&priv->recv_napi);
}

void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr)
{
	struct ipoib_dev_priv *priv = ctx_ptr;

	napi_schedule(&priv->send_napi);
}

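/*
 * Fill in the shared UD send WR (address handle, remote QPN, optional
 * LSO header) and post it on the send queue.
 */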
static inline int post_send(struct ipoib_dev_priv *priv,
			    unsigned int wr_id,
			    struct ib_ah *address, u32 dqpn,
			    struct ipoib_tx_buf *tx_req,
			    void *head, int hlen)
{
	struct sk_buff *skb = tx_req->skb;

	ipoib_build_sge(priv, tx_req);

	priv->tx_wr.wr.wr_id	= wr_id;
	priv->tx_wr.remote_qpn	= dqpn;
	priv->tx_wr.ah		= address;

	if (head) {
		priv->tx_wr.mss		= skb_shinfo(skb)->gso_size;
		priv->tx_wr.header	= head;
		priv->tx_wr.hlen	= hlen;
		priv->tx_wr.wr.opcode	= IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode	= IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL);
}

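/*
 * Transmit one skb over the UD QP.  Returns the tx_head value used for
 * this send on success, 0 if the packet was dropped on a post failure,
 * and -1 if it was dropped before posting.
 */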
int ipoib_send(struct net_device *dev, struct sk_buff *skb,
	       struct ib_ah *address, u32 dqpn)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int hlen, rc;
	void *phead;
	unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb);

	if (skb_is_gso(skb)) {
		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
		phead = skb->data;
		if (unlikely(!skb_pull(skb, hlen))) {
			ipoib_warn(priv, "linear data too small\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	} else {
		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
			return -1;
		}
		phead = NULL;
		hlen  = 0;
	}
	if (skb_shinfo(skb)->nr_frags > usable_sge) {
		if (skb_linearize(skb) < 0) {
			ipoib_warn(priv, "skb could not be linearized\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
		/* Did skb_linearize succeed without reducing nr_frags? */
		if (skb_shinfo(skb)->nr_frags > usable_sge) {
			ipoib_warn(priv, "too many frags after skb linearize\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return -1;
		}
	}

	ipoib_dbg_data(priv,
		       "sending packet, length=%d address=%p dqpn=0x%06x\n",
		       skb->len, address, dqpn);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return -1;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
	/* tx_head is advanced only after a successful send, but is used here
	 * to track the queue state.
	 */
	if ((priv->global_tx_head - priv->global_tx_tail) ==
	    ipoib_sendq_size - 1) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		netif_stop_queue(dev);
	}

	skb_orphan(skb);
	skb_dst_drop(skb);

	if (netif_queue_stopped(dev))
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
				     IB_CQ_REPORT_MISSED_EVENTS) < 0)
			ipoib_warn(priv, "request notify on send CQ failed\n");

	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
		       address, dqpn, tx_req, phead, hlen);
	if (unlikely(rc)) {
		ipoib_warn(priv, "post_send failed, error %d\n", rc);
		++dev->stats.tx_errors;
		ipoib_dma_unmap_tx(priv, tx_req);
		dev_kfree_skb_any(skb);
		if (netif_queue_stopped(dev))
			netif_wake_queue(dev);
		rc = 0;
	} else {
		netif_trans_update(dev);

		rc = priv->tx_head;
		++priv->tx_head;
		++priv->global_tx_head;
	}
	return rc;
}

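/*
 * Destroy any address handles on the dead_ahs list whose last send
 * has already completed (tx_tail has passed ah->last_send).
 */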
static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv)
{
	struct ipoib_ah *ah, *tah;
	unsigned long flags;

	netif_tx_lock_bh(priv->dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			rdma_destroy_ah(ah->ah, 0);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(priv->dev);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);

	ipoib_reap_dead_ahs(priv);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(priv->wq, &priv->ah_reap_task,
				   round_jiffies_relative(HZ));
}

static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv)
{
	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(priv->wq, &priv->ah_reap_task,
			   round_jiffies_relative(HZ));
}

static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv)
{
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	/*
	 * After ipoib_stop_ah_reaper() we always go through
	 * ipoib_reap_dead_ahs(), which ensures the work is really stopped and
	 * does a final flush of the dead_ahs list.
	 */
}

static int recvs_pending(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].skb)
			++pending;

	return pending;
}

static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
					struct ib_qp *qp,
					enum ib_qp_state new_state)
{
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP\n", __func__);
		return;
	}
	/* print according to the new state and the previous state */
	if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET)
		ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n");
	else
		ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n",
			   new_state, qp_attr.qp_state);
}

static void ipoib_napi_enable(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	napi_enable(&priv->recv_napi);
	napi_enable(&priv->send_napi);
}

static void ipoib_napi_disable(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	napi_disable(&priv->recv_napi);
	napi_disable(&priv->send_napi);
}

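/*
 * Default stop path for the UD QP: disable NAPI, move the QP to the
 * error state, wait (up to 5 seconds) for outstanding work requests
 * to drain, then reset the QP.
 */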
int ipoib_ib_dev_stop_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		ipoib_napi_disable(dev);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize it
	 * when all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv,
				   "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail,
				   recvs_pending(dev));

			/*
			 * Assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int)priv->tx_tail - (int)priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv, tx_req);
				dev_kfree_skb_any(tx_req->skb);
				++priv->tx_tail;
				++priv->global_tx_tail;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->skb)
					continue;
				ipoib_ud_dma_unmap_rx(priv,
						      priv->rx_ring[i].mapping);
				dev_kfree_skb_any(rx_req->skb);
				rx_req->skb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(dev);

		usleep_range(1000, 2000);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_open_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	ret = ipoib_init_qp(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		goto out;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		goto out;
	}

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
		ipoib_napi_enable(dev);

	return 0;
out:
	return -1;
}

int ipoib_ib_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
			   (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
		return -1;
	}

	ipoib_start_ah_reaper(priv);
	if (priv->rn_ops->ndo_open(dev)) {
		pr_warn("%s: Failed to open dev\n", dev->name);
		goto dev_stop;
	}

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;

dev_stop:
	ipoib_stop_ah_reaper(priv);
	return -1;
}

void ipoib_ib_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	priv->rn_ops->ndo_stop(dev);

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
	ipoib_stop_ah_reaper(priv);
}

void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);

	if (!(priv->pkey & 0x7fff) ||
	    ib_find_pkey(priv->ca, priv->port, priv->pkey,
			 &priv->pkey_index)) {
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	} else {
		if (rn->set_id)
			rn->set_id(dev, priv->pkey_index);
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	}
}

void ipoib_ib_dev_up(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	ipoib_mcast_start_thread(dev);
}

void ipoib_ib_dev_down(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	netif_carrier_off(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	ipoib_flush_paths(dev);
}

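/*
 * Poll both CQs to completion outside of NAPI, converting successful
 * receive completions to flush errors so that no packets are passed
 * up the stack while the device is being brought down.
 */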
void ipoib_drain_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i, n;

	/*
	 * We call completion handling routines that expect to be
	 * called from the BH-disabled NAPI poll context, so disable
	 * BHs here too.
	 */
	local_bh_disable();

	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
			} else {
				pr_warn("%s: Got unexpected wqe id\n", __func__);
			}
		}
	} while (n == IPOIB_NUM_WC);

	while (poll_tx(priv))
		; /* nothing */

	local_bh_enable();
}

/*
 * Takes whatever value is in pkey index 0 and updates priv->pkey;
 * returns 0 if the pkey value was changed.
 */
static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
{
	int result;
	u16 prev_pkey;

	prev_pkey = priv->pkey;
	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
	if (result) {
		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
			   priv->port, result);
		return result;
	}

	priv->pkey |= 0x8000;

	if (prev_pkey != priv->pkey) {
		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
			  prev_pkey, priv->pkey);
		/*
		 * Update the pkey in the broadcast address, while making sure to set
		 * the full membership bit, so that we join the right broadcast group.
		 */
		priv->dev->broadcast[8] = priv->pkey >> 8;
		priv->dev->broadcast[9] = priv->pkey & 0xff;
		return 0;
	}

	return 1;
}

/*
 * Returns 0 if the pkey value was found in a different slot.
 */
static inline int update_child_pkey(struct ipoib_dev_priv *priv)
{
	u16 old_index = priv->pkey_index;

	priv->pkey_index = 0;
	ipoib_pkey_dev_check_presence(priv->dev);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
	    (old_index == priv->pkey_index))
		return 1;
	return 0;
}

/*
 * Returns true if the device address of the ipoib interface has changed and
 * the new address is a valid one (i.e. present in the GID table), false
 * otherwise.
 */
static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
{
	union ib_gid search_gid;
	union ib_gid gid0;
	union ib_gid *netdev_gid;
	int err;
	u16 index;
	u8 port;
	bool ret = false;

	netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
	if (rdma_query_gid(priv->ca, priv->port, 0, &gid0))
		return false;

	netif_addr_lock_bh(priv->dev);

	/* The subnet prefix may have changed, update it now so we won't have
	 * to do it later
	 */
	priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
	netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
	search_gid.global.subnet_prefix = gid0.global.subnet_prefix;

	search_gid.global.interface_id = priv->local_gid.global.interface_id;

	netif_addr_unlock_bh(priv->dev);

	err = ib_find_gid(priv->ca, &search_gid, &port, &index);

	netif_addr_lock_bh(priv->dev);

	if (search_gid.global.interface_id !=
	    priv->local_gid.global.interface_id)
		/* There was a change while we were looking up the gid, bail
		 * here and let the next work sort this out
		 */
		goto out;

	/* The next section of code needs some background:
	 * Per the IB spec the port GUID can't change while the HCA is powered
	 * on.  The port GUID is the basis for the GID at index 0, which is the
	 * basis for the default device address of an ipoib interface.
	 *
	 * So it seems the flow should be:
	 * if user_changed_dev_addr && gid in gid tbl
	 *	set bit dev_addr_set
	 *	return true
	 * else
	 *	return false
	 *
	 * The issue is that there are devices that don't follow the spec:
	 * they change the port GUID when the HCA is powered on.  In order not
	 * to break userspace applications, we need to check whether the user
	 * wanted to control the device address, and we assume that if he sets
	 * the device address back to be based on GID index 0, he no longer
	 * wishes to control it.
	 *
	 * If the user doesn't control the device address, IPOIB_FLAG_DEV_ADDR_SET
	 * is set, and ib_find_gid failed, it means the port GUID has changed
	 * and the GID at index 0 has changed, so we need to change
	 * priv->local_gid and priv->dev->dev_addr to reflect the new GID.
	 */
	if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
		if (!err && port == priv->port) {
			set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
			if (index == 0)
				clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
					  &priv->flags);
			else
				set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
			ret = true;
		} else {
			ret = false;
		}
	} else {
		if (!err && port == priv->port) {
			ret = true;
		} else {
			if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
				memcpy(&priv->local_gid, &gid0,
				       sizeof(priv->local_gid));
				memcpy(priv->dev->dev_addr + 4, &gid0,
				       sizeof(priv->local_gid));
				ret = true;
			}
		}
	}

out:
	netif_addr_unlock_bh(priv->dev);

	return ret;
}

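/*
 * Core of the light/normal/heavy flush paths.  Recurses into child
 * interfaces, refreshes the P_Key and device address as needed, and
 * for heavier flush levels restarts the IB device and multicast state.
 */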
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				enum ipoib_flush_level level,
				int nesting)
{
	struct ipoib_dev_priv *cpriv;
	struct net_device *dev = priv->dev;
	int result;

	down_read_nested(&priv->vlan_rwsem, nesting);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level, nesting + 1);

	up_read(&priv->vlan_rwsem);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
	    level != IPOIB_FLUSH_HEAVY) {
		/* Make sure the dev_addr is set even if not flushing */
		if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		/* interface is down; update the pkey and leave */
		if (level == IPOIB_FLUSH_HEAVY) {
			if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
				update_parent_pkey(priv);
			else
				update_child_pkey(priv);
		} else if (level == IPOIB_FLUSH_LIGHT)
			ipoib_dev_addr_changed_valid(priv);
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		/* Child devices chase their origin pkey value, while non-child
		 * (parent) devices should always take what is present in pkey
		 * index 0.
		 */
		if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
			result = update_child_pkey(priv);
			if (result) {
				/* restart QP only if P_Key index is changed */
				ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
				return;
			}

		} else {
			result = update_parent_pkey(priv);
			/* restart QP only if P_Key value changed */
			if (result) {
				ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
				return;
			}
		}
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		int oper_up;

		ipoib_mark_paths_invalid(dev);
		/* Mark IPoIB operation as down to prevent races between the
		 * flush flow, which leaves the MCG, and on-the-fly joins that
		 * can happen during that time.  The mcast restart task should
		 * deal with join requests we missed.
		 */
		oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_mcast_dev_flush(dev);
		if (oper_up)
			set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
		ipoib_reap_dead_ahs(priv);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(dev);

	if (level == IPOIB_FLUSH_HEAVY) {
		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
			ipoib_ib_dev_stop(dev);

		if (ipoib_ib_dev_open(dev))
			return;

		if (netif_queue_stopped(dev))
			netif_start_queue(dev);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(dev);
		if (ipoib_dev_addr_changed_valid(priv))
			ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	rtnl_lock();
	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
	rtnl_unlock();
}

void ipoib_ib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "cleaning up ib_dev\n");
	/*
	 * We must make sure there are no more (path) completions
	 * that may wish to touch priv fields that are no longer valid
	 */
	ipoib_flush_paths(dev);

	ipoib_mcast_stop_thread(dev);
	ipoib_mcast_dev_flush(dev);

	/*
	 * None of our ah references are freed until after
	 * ipoib_mcast_dev_flush(), ipoib_flush_paths() and the neighbor
	 * garbage collection have stopped and been reaped.  That should
	 * all be done now, so make a final ah flush.
	 */
	ipoib_reap_dead_ahs(priv);

	clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	priv->rn_ops->ndo_uninit(dev);

	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}
}