1 /*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * All rights reserved.
10 *
11 * Portions of this software were developed at the Centre for Advanced Internet
12 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
13 * James Healy and David Hayes, made possible in part by a grant from the Cisco
14 * University Research Program Fund at Community Foundation Silicon Valley.
15 *
16 * Portions of this software were developed at the Centre for Advanced
17 * Internet Architectures, Swinburne University of Technology, Melbourne,
18 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
19 *
20 * Portions of this software were developed by Robert N. M. Watson under
21 * contract to Juniper Networks, Inc.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the above copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 4. Neither the name of the University nor the names of its contributors
32 * may be used to endorse or promote products derived from this software
33 * without specific prior written permission.
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45 * SUCH DAMAGE.
46 *
47 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
48 */
49
50
51 /*
52 * Determine a reasonable value for maxseg size.
53 * If the route is known, check route for mtu.
54 * If none, use an mss that can be handled on the outgoing interface
55 * without forcing IP to fragment. If no route is found, route has no mtu,
56 * or the destination isn't local, use a default, hopefully conservative
57 * size (usually 512 or the default IP max size, but no more than the mtu
58 * of the interface), as we can't discover anything about intervening
59 * gateways or networks. We also initialize the congestion/slow start
60 * window to be a single segment if the destination isn't local.
61 * While looking at the routing entry, we also initialize other path-dependent
62 * parameters from pre-set or cached values in the routing entry.
63 *
64 * Also take into account the space needed for options that we
65 * send regularly. Make maxseg shorter by that amount to assure
66 * that we can send maxseg amount of data even when the options
67 * are present. Store the upper limit of the length of options plus
68 * data in maxopd.
69 *
70 * NOTE that this routine is only called when we process an incoming
71 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
72 * settings are handled in tcp_mssopt().
73 */
74
75 #include <errno.h>
76 #include <string.h>
77 #include <strings.h>
78
79 #include "tcp.h"
80 #include "tcp_fsm.h"
81 #include "tcp_seq.h"
82 #include "tcp_timer.h"
83 #include "tcp_var.h"
84 #include "../lib/bitmap.h"
85 #include "../lib/cbuf.h"
86 #include "icmp_var.h"
87 #include "ip.h"
88 #include "ip6.h"
89 #include "sys/queue.h"
90
91 #include "tcp_const.h"
92
93 /* samkumar: Copied from in.h */
94 #define IPPROTO_DONE 267
95
96 /* samkumar: Copied from sys/libkern.h */
/* Return the larger of two ints. */
static int
imax(int a, int b)
{
    if (a > b) {
        return a;
    }
    return b;
}
/* Return the smaller of two ints. */
static int
imin(int a, int b)
{
    if (a < b) {
        return a;
    }
    return b;
}
99
/* Return the smaller of two ints; yields the same result as imin(). */
static int
min(int a, int b)
{
    return (b < a) ? b : a;
}
101
102 static void tcp_dooptions(struct tcpopt *, uint8_t *, int, int);
103 static void
104 tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg,
105 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
106 struct tcplp_signals* sig);
107 static void tcp_xmit_timer(struct tcpcb *, int);
108 void tcp_hc_get(/*struct in_conninfo *inc*/ struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite);
109 static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
110
111 /*
112 * CC wrapper hook functions
113 */
114 static inline void
cc_ack_received(struct tcpcb * tp,struct tcphdr * th,uint16_t type)115 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
116 {
117 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
118 if (tp->snd_cwnd <= tp->snd_wnd)
119 tp->ccv->flags |= CCF_CWND_LIMITED;
120 else
121 tp->ccv->flags &= ~CCF_CWND_LIMITED;
122
123 if (type == CC_ACK) {
124 if (tp->snd_cwnd > tp->snd_ssthresh) {
125 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
126 V_tcp_abc_l_var * tp->t_maxseg);
127 if (tp->t_bytes_acked >= tp->snd_cwnd) {
128 tp->t_bytes_acked -= tp->snd_cwnd;
129 tp->ccv->flags |= CCF_ABC_SENTAWND;
130 }
131 } else {
132 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
133 tp->t_bytes_acked = 0;
134 }
135 }
136
137 if (CC_ALGO(tp)->ack_received != NULL) {
138 /* XXXLAS: Find a way to live without this */
139 tp->ccv->curack = th->th_ack;
140 CC_ALGO(tp)->ack_received(tp->ccv, type);
141 }
142 }
143
144 static inline void
cc_conn_init(struct tcpcb * tp)145 cc_conn_init(struct tcpcb *tp)
146 {
147 struct hc_metrics_lite metrics;
148 int rtt;
149
150 /*
151 * samkumar: remove locks, inpcb, and stats.
152 */
153
154 /* samkumar: Used to take &inp->inp_inc as an argument. */
155 tcp_hc_get(tp, &metrics);
156
157 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
158 tp->t_srtt = rtt;
159 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
160 if (metrics.rmx_rttvar) {
161 tp->t_rttvar = metrics.rmx_rttvar;
162 } else {
163 /* default variation is +- 1 rtt */
164 tp->t_rttvar =
165 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
166 }
167 TCPT_RANGESET(tp->t_rxtcur,
168 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
169 tp->t_rttmin, TCPTV_REXMTMAX);
170 }
171 if (metrics.rmx_ssthresh) {
172 /*
173 * There's some sort of gateway or interface
174 * buffer limit on the path. Use this to set
175 * the slow start threshhold, but set the
176 * threshold to no less than 2*mss.
177 */
178 tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
179 }
180
181 /*
182 * Set the initial slow-start flight size.
183 *
184 * RFC5681 Section 3.1 specifies the default conservative values.
185 * RFC3390 specifies slightly more aggressive values.
186 * RFC6928 increases it to ten segments.
187 * Support for user specified value for initial flight size.
188 *
189 * If a SYN or SYN/ACK was lost and retransmitted, we have to
190 * reduce the initial CWND to one segment as congestion is likely
191 * requiring us to be cautious.
192 */
193 if (tp->snd_cwnd == 1)
194 tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
195 else if (V_tcp_initcwnd_segments)
196 tp->snd_cwnd = min(V_tcp_initcwnd_segments * tp->t_maxseg,
197 max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
198 else if (V_tcp_do_rfc3390)
199 tp->snd_cwnd = min(4 * tp->t_maxseg,
200 max(2 * tp->t_maxseg, 4380));
201 else {
202 /* Per RFC5681 Section 3.1 */
203 if (tp->t_maxseg > 2190)
204 tp->snd_cwnd = 2 * tp->t_maxseg;
205 else if (tp->t_maxseg > 1095)
206 tp->snd_cwnd = 3 * tp->t_maxseg;
207 else
208 tp->snd_cwnd = 4 * tp->t_maxseg;
209 }
210
211 if (CC_ALGO(tp)->conn_init != NULL)
212 CC_ALGO(tp)->conn_init(tp->ccv);
213
214 /* samkumar: print statement for debugging. Resurrect with DEBUG macro? */
215 #ifdef INSTRUMENT_TCP
216 tcplp_sys_log("TCP CC_INIT %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
217 #endif
218 }
219
220 inline void
cc_cong_signal(struct tcpcb * tp,struct tcphdr * th,uint32_t type)221 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
222 {
223 /* samkumar: Remove locks and stats from this function. */
224
225 switch(type) {
226 case CC_NDUPACK:
227 if (!IN_FASTRECOVERY(tp->t_flags)) {
228 tp->snd_recover = tp->snd_max;
229 if (tp->t_flags & TF_ECN_PERMIT)
230 tp->t_flags |= TF_ECN_SND_CWR;
231 }
232 break;
233 case CC_ECN:
234 if (!IN_CONGRECOVERY(tp->t_flags)) {
235 tp->snd_recover = tp->snd_max;
236 if (tp->t_flags & TF_ECN_PERMIT)
237 tp->t_flags |= TF_ECN_SND_CWR;
238 }
239 break;
240 case CC_RTO:
241 tp->t_dupacks = 0;
242 tp->t_bytes_acked = 0;
243 EXIT_RECOVERY(tp->t_flags);
244 /*
245 * samkumar: I added the cast to uint64_t below to fix an OpenThread
246 * code scanning alert relating to integer overflow in multiplication.
247 */
248 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
249 tp->t_maxseg) * ((uint64_t) tp->t_maxseg);
250 tp->snd_cwnd = tp->t_maxseg;
251
252 /*
253 * samkumar: Stats for TCPlp: count the number of timeouts (RTOs).
254 * I've commented this out (with #if 0) because it isn't part of TCP
255 * functionality. At some point, we may want to bring it back to
256 * measure performance.
257 */
258 #if 0
259 tcplp_timeoutRexmitCnt++;
260 #endif
261 #ifdef INSTRUMENT_TCP
262 tcplp_sys_log("TCP CC_RTO %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
263 #endif
264 break;
265 case CC_RTO_ERR:
266 /* RTO was unnecessary, so reset everything. */
267 tp->snd_cwnd = tp->snd_cwnd_prev;
268 tp->snd_ssthresh = tp->snd_ssthresh_prev;
269 tp->snd_recover = tp->snd_recover_prev;
270 if (tp->t_flags & TF_WASFRECOVERY)
271 ENTER_FASTRECOVERY(tp->t_flags);
272 if (tp->t_flags & TF_WASCRECOVERY)
273 ENTER_CONGRECOVERY(tp->t_flags);
274 tp->snd_nxt = tp->snd_max;
275 tp->t_flags &= ~TF_PREVVALID;
276 tp->t_badrxtwin = 0;
277 #ifdef INSTRUMENT_TCP
278 tcplp_sys_log("TCP CC_RTO_ERR %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
279 #endif
280 break;
281 }
282
283 if (CC_ALGO(tp)->cong_signal != NULL) {
284 if (th != NULL)
285 tp->ccv->curack = th->th_ack;
286 CC_ALGO(tp)->cong_signal(tp->ccv, type);
287 }
288 }
289
290 static inline void
cc_post_recovery(struct tcpcb * tp,struct tcphdr * th)291 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
292 {
293 /* samkumar: remove lock */
294
295 /* XXXLAS: KASSERT that we're in recovery? */
296 if (CC_ALGO(tp)->post_recovery != NULL) {
297 tp->ccv->curack = th->th_ack;
298 CC_ALGO(tp)->post_recovery(tp->ccv);
299 }
300 /* XXXLAS: EXIT_RECOVERY ? */
301 tp->t_bytes_acked = 0;
302 }
303
304
305 /*
306 * Indicate whether this ack should be delayed. We can delay the ack if
307 * following conditions are met:
308 * - There is no delayed ack timer in progress.
309 * - Our last ack wasn't a 0-sized window. We never want to delay
310 * the ack that opens up a 0-sized window.
311 * - LRO wasn't used for this segment. We make sure by checking that the
312 * segment size is not larger than the MSS.
313 * - Delayed acks are enabled or this is a half-synchronized T/TCP
314 * connection.
315 */
/*
 * DELAY_ACK(tp, tlen): non-zero when the ACK for a segment of length tlen
 * received on connection tp may be delayed.
 *
 * Both parameters are parenthesised in the expansion so that arbitrary
 * expressions (pointer arithmetic, conditional expressions, ...) can be
 * passed without changing how the condition parses. tp is still evaluated
 * more than once, so it must not have side effects.
 */
#define DELAY_ACK(tp, tlen)                                     \
    ((!tcp_timer_active((tp), TT_DELACK) &&                     \
      ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&                  \
     ((tlen) <= (tp)->t_maxopd) &&                              \
     (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
321
322 static inline void
cc_ecnpkt_handler(struct tcpcb * tp,struct tcphdr * th,uint8_t iptos)323 cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
324 {
325 /* samkumar: remove lock */
326
327 if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
328 switch (iptos & IPTOS_ECN_MASK) {
329 case IPTOS_ECN_CE:
330 tp->ccv->flags |= CCF_IPHDR_CE;
331 break;
332 case IPTOS_ECN_ECT0:
333 tp->ccv->flags &= ~CCF_IPHDR_CE;
334 break;
335 case IPTOS_ECN_ECT1:
336 tp->ccv->flags &= ~CCF_IPHDR_CE;
337 break;
338 }
339
340 if (th->th_flags & TH_CWR)
341 tp->ccv->flags |= CCF_TCPHDR_CWR;
342 else
343 tp->ccv->flags &= ~CCF_TCPHDR_CWR;
344
345 if (tp->t_flags & TF_DELACK)
346 tp->ccv->flags |= CCF_DELACK;
347 else
348 tp->ccv->flags &= ~CCF_DELACK;
349
350 CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
351
352 if (tp->ccv->flags & CCF_ACKNOW)
353 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
354 }
355 }
356
357 /*
358 * External function: look up an entry in the hostcache and fill out the
 * supplied TCP metrics structure. Fills in zeros when no entry was found or
 * a value is not set.
361 */
362 /*
363 * samkumar: This function is taken from tcp_hostcache.c. We have no host cache
364 * in TCPlp, so I changed this to always act as if there is a miss. I removed
 * the first argument, formerly "struct in_conninfo *inc".
366 */
367 void
tcp_hc_get(struct tcpcb * tp,struct hc_metrics_lite * hc_metrics_lite)368 tcp_hc_get(struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite)
369 {
370 bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
371 }
372
373 /*
374 * External function: look up an entry in the hostcache and return the
 * discovered path MTU. Returns 0 if no entry is found or the value is not
 * set.
377 */
378 /*
379 * samkumar: This function is taken from tcp_hostcache.c. We have no host cache
380 * in TCPlp, so I changed this to always act as if there is a miss.
381 */
/*
 * Host-cache MTU lookup stub: always reports 0.
 *
 * tp  connection being looked up; not read by this function
 */
uint64_t
tcp_hc_getmtu(struct tcpcb *tp)
{
    (void) tp;

    return 0;
}
387
388
389 /*
390 * Issue RST and make ACK acceptable to originator of segment.
391 * The mbuf must still include the original packet header.
392 * tp may be NULL.
393 */
394 /*
395 * samkumar: Original signature was:
396 * static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
397 * int tlen, int rstreason)
398 */
399 void
tcp_dropwithreset(struct ip6_hdr * ip6,struct tcphdr * th,struct tcpcb * tp,otInstance * instance,int tlen,int rstreason)400 tcp_dropwithreset(struct ip6_hdr* ip6, struct tcphdr *th, struct tcpcb *tp, otInstance* instance,
401 int tlen, int rstreason)
402 {
403 /*
404 * samkumar: I removed logic to skip this for broadcast or multicast
405 * packets. In the FreeBSD version of this function, it would just
406 * call m_freem(m), if m->m_flags has M_BCAST or M_MCAST set, and not
407 * send a response packet.
408 * I also removed bandwidth limiting.
409 */
410 if (th->th_flags & TH_RST)
411 return;
412
413 /* tcp_respond consumes the mbuf chain. */
414 if (th->th_flags & TH_ACK) {
415 tcp_respond(tp, instance, ip6, th, (tcp_seq) 0, th->th_ack, TH_RST);
416 } else {
417 if (th->th_flags & TH_SYN)
418 tlen++;
419 tcp_respond(tp, instance, ip6, th, th->th_seq + tlen, (tcp_seq) 0, TH_RST | TH_ACK);
420 }
421 return;
422 }
423
424 /*
425 * TCP input handling is split into multiple parts:
426 * tcp6_input is a thin wrapper around tcp_input for the extended
427 * ip6_protox[] call format in ip6_input
428 * tcp_input handles primary segment validation, inpcb lookup and
429 * SYN processing on listen sockets
430 * tcp_do_segment processes the ACK and text of the segment for
431 * establishing, established and closing connections
432 */
433 /* samkumar: The signature of this function was originally:
434 tcp_input(struct mbuf **mp, int *offp, int proto) */
435 /* NOTE: tcp_fields_to_host(th) must be called before this function is called. */
436 int
tcp_input(struct ip6_hdr * ip6,struct tcphdr * th,otMessage * msg,struct tcpcb * tp,struct tcpcb_listen * tpl,struct tcplp_signals * sig)437 tcp_input(struct ip6_hdr* ip6, struct tcphdr* th, otMessage* msg, struct tcpcb* tp, struct tcpcb_listen* tpl,
438 struct tcplp_signals* sig)
439 {
440 /*
441 * samkumar: I significantly modified this function, compared to the
     * FreeBSD version. This function used to be responsible for matching an
443 * incoming TCP segment to its TCB. That functionality is now done by
444 * TCPlp, and this function is only called once a match has been
445 * identified.
446 *
447 * The tp and tpl arguments are used to indicate the match. Exactly one of
448 * them must be NULL, and the other must be set. If tp is non-NULL, then
449 * this function assumes that the packet was matched to an active socket
450 * (connection endpoint). If tpl is non-NULL, then this function assumes
451 * that this packet is a candidate match for a passive socket (listener)
452 * and attempts to set up a new connection if the flags, sequence numbers,
453 * etc. look OK.
454 *
455 * TCPlp assumes that the packets are IPv6, so I removed any logic specific
456 * to IPv4.
457 *
458 * And of course, all code pertaining to locks and stats has been removed.
459 */
460 int tlen = 0, off;
461 int thflags;
462 uint8_t iptos = 0;
463 int drop_hdrlen;
464 int rstreason = 0;
465 struct tcpopt to; /* options in this segment */
466 uint8_t* optp = NULL;
467 int optlen = 0;
468 to.to_flags = 0;
469 KASSERT(tp || tpl, ("One of tp and tpl must be positive"));
470
471 /*
472 * samkumar: Here, there used to be code that handled preprocessing:
473 * calling m_pullup(m, sizeof(*ip6) + sizeof(*th)) to get the headers
474 * contiguous in memory, setting the ip6 and th pointers, validating the
475 * checksum, and dropping packets with unspecified source address. In
476 * TCPlp, all of this is done for a packet before this function is called.
477 */
478
479 tlen = ntohs(ip6->ip6_plen); // assume *off == sizeof(*ip6)
480
481 /*
482 * samkumar: Logic that handled IPv4 was deleted below. I won't add a
483 * comment every time this is done, but I'm putting it here (one of the
484 * first instances of this) for clarity.
485 */
486 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
487
488 /*
489 * Check that TCP offset makes sense,
490 * pull out TCP options and adjust length. XXX
491 */
492 off = (th->th_off_x2 >> TH_OFF_SHIFT) << 2;
493 if (off < sizeof (struct tcphdr) || off > tlen) {
494 goto drop;
495 }
496 tlen -= off; /* tlen is used instead of ti->ti_len */
497 /* samkumar: now, tlen is the length of the data */
498
499 if (off > sizeof (struct tcphdr)) {
500 /*
501 * samkumar: I removed a call to IP6_EXTHDR_CHECK, which I believe
502 * checks for IPv6 extension headers. In TCPlp, we assume that these
503 * are handled elsewhere in the networking stack, before the incoming
504 * packet is processed at the TCP layer. I also removed the followup
505 * calls to reassign the ip6 and th pointers.
506 */
507 optlen = off - sizeof (struct tcphdr);
508 optp = (uint8_t *)(th + 1);
509 }
510
511 thflags = th->th_flags;
512
513 /*
514 * samkumar: There used to be a call here to tcp_fields_to_host(th), which
515 * changes the byte order of various fields to host format. I removed this
516 * call from there and handle it in TCPlp, before calling this. The reason
517 * is that it's possible for this function to be called twice by TCPlp's
518 * logic (e.g., if the packet matches a TIME-WAIT socket this function
519 * returns early, and the packet may then match a listening socket, at
     * which point this function will be called again). Thus, any operations
521 * like this, which mutate the packet itself, need to happen before calling
522 * this function.
523 */
524
525 /*
526 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
527 *
528 * samkumar: My TCP header is in a different buffer from the IP header.
529 * drop_hdrlen is only meaningful as an offset into the TCP buffer,
530 * because it is used to determine how much of the packet to discard
531 * before copying it into the receive buffer. Therefore, my offset does
532 * not include the length of IP header and options, only the length of
533 * the TCP header and options.
534 */
535 drop_hdrlen = /*off0 +*/ off;
536
537 /*
538 * Locate pcb for segment; if we're likely to add or remove a
539 * connection then first acquire pcbinfo lock. There are three cases
540 * where we might discover later we need a write lock despite the
541 * flags: ACKs moving a connection out of the syncache, ACKs for a
542 * connection in TIMEWAIT and SYNs not targeting a listening socket.
543 */
544
545 /*
546 * samkumar: Locking code is removed, invalidating most of the above
547 * comment.
548 */
549
550 /*
     * samkumar: The FreeBSD code had logic here to check m->m_flags for the
552 * M_IP6_NEXTHOP flag, and search for the PACKET_TAG_IPFORWARD tag and
553 * store it in fwd_tag if so. In TCPlp, we assume that the IPv6 layer of
554 * the host network stack handles this kind of IPv6-related functionality,
555 * so this logic has been removed.
556 */
557
558 /*
559 * samkumar: Here, there was code to match the packet to an inpcb and reply
560 * with an RST segment if no match is found. This included taking the
561 * fwd_tag into account, if set above (see the previous comment). I removed
562 * this code because, in TCPlp, this is done before calling this function.
563 */
564
565 /*
566 * A previous connection in TIMEWAIT state is supposed to catch stray
567 * or duplicate segments arriving late. If this segment was a
568 * legitimate new connection attempt, the old INPCB gets removed and
569 * we can try again to find a listening socket.
570 *
571 * At this point, due to earlier optimism, we may hold only an inpcb
572 * lock, and not the inpcbinfo write lock. If so, we need to try to
573 * acquire it, or if that fails, acquire a reference on the inpcb,
574 * drop all locks, acquire a global write lock, and then re-acquire
575 * the inpcb lock. We may at that point discover that another thread
576 * has tried to free the inpcb, in which case we need to loop back
577 * and try to find a new inpcb to deliver to.
578 *
579 * XXXRW: It may be time to rethink timewait locking.
580 */
581 /*
582 * samkumar: The original code checked inp->inp_flags & INP_TIMEWAIT. I
583 * changed it to instead check tp->t_state, since we don't use inpcbs in
584 * TCPlp.
585 */
586 if (tp && tp->t_state == TCP6S_TIME_WAIT) {
587 /*
         * samkumar: There's nothing wrong with the call to tcp_dooptions
589 * that I've commented out below; it's just that the modified
590 * "tcp_twcheck" function no longer needs the options structure, so
591 * I figured that there's no longer a good reason to parse the options.
592 * In fact, this call was probably unnecessary even in the original
593 * FreeBSD TCP code, since tcp_twcheck, even without my modifications,
594 * did not use the pointer to the options structure!
595 */
596 //if (thflags & TH_SYN)
597 //tcp_dooptions(&to, optp, optlen, TO_SYN);
598 /*
599 * samkumar: The original code would "goto findpcb;" if this branch is
600 * taken. Matching with a TCB is done outside of this function in
601 * TCPlp, so we instead return a special value so that the caller knows
602 * to try re-matching this packet to a socket.
603 */
604 if (tcp_twcheck(tp,/*inp, &to,*/ th, /*m,*/ tlen))
605 return (RELOOKUP_REQUIRED);
606 return (IPPROTO_DONE);
607 }
608 /*
609 * The TCPCB may no longer exist if the connection is winding
610 * down or it is in the CLOSED state. Either way we drop the
611 * segment and send an appropriate response.
612 */
613 /*
614 * samkumar: There used to be code here that grabs the tp from the inpcb
615 * and drops with reset if the connection is in the closed state or if
616 * the tp is NULL. In TCPlp, the equivalent logic is done before entering
617 * this function. There was also code here to handle TCP offload, which
618 * TCPlp does not handle.
619 */
620
621 /*
622 * We've identified a valid inpcb, but it could be that we need an
623 * inpcbinfo write lock but don't hold it. In this case, attempt to
624 * acquire using the same strategy as the TIMEWAIT case above. If we
625 * relock, we have to jump back to 'relocked' as the connection might
626 * now be in TIMEWAIT.
627 */
628 /*
629 * samkumar: There used to be some code here for synchronization, MAC
630 * management, and debugging.
631 */
632
633 /*
634 * When the socket is accepting connections (the INPCB is in LISTEN
635 * state) we look into the SYN cache if this is a new connection
636 * attempt or the completion of a previous one. Instead of checking
637 * so->so_options to check if the socket is listening, we rely on the
638 * arguments passed to this function (if tp == NULL, then tpl is not NULL
639 * and is the matching listen socket).
640 */
641
642 if (/*so->so_options & SO_ACCEPTCONN*/tp == NULL) {
643 /* samkumar: NULL check isn't needed but prevents a compiler warning */
644 KASSERT(tpl != NULL && tpl->t_state == TCP6S_LISTEN, ("listen socket must be in listening state!"));
645
646 /*
647 * samkumar: There used to be some code here that checks if the
648 * received segment is an ACK, and if so, searches the SYN cache to
649 * find an entry whose connection establishment handshake this segment
650 * completes. If such an entry is found, then a socket is created and
651 * then tcp_do_segment is called to actually run the code to mark the
652 * connection as established. If the received segment is an RST, then
653 * that is processed in the syncache as well. In TCPlp we do not use a
654 * SYN cache, so I've removed that code. The actual connection
655 * establishment/processing logic happens in tcp_do_segment anyway,
656 * which is called at the bottom of this function, so there's no need
657 * to rewrite this code with special-case logic for that.
658 */
659
660 /*
661 * We can't do anything without SYN.
662 */
663 if ((thflags & TH_SYN) == 0) {
664 /*
665 * samkumar: Here, and in several other instances, the FreeBSD
666 * code would call tcp_log_addrs. Improving logging in these
667 * edge cases in TCPlp is left for the future --- for now, I just
668 * put "<addrs go here>" where the address string would go.
669 */
670 tcplp_sys_log("%s; %s: Listen socket: "
671 "SYN is missing, segment ignored",
672 "<addrs go here>", __func__);
673 goto dropunlock;
674 }
675 /*
676 * (SYN|ACK) is bogus on a listen socket.
677 */
678 if (thflags & TH_ACK) {
679 /* samkumar: See above comment regarding tcp_log_addrs. */
680 tcplp_sys_log("%s; %s: Listen socket: "
681 "SYN|ACK invalid, segment rejected",
682 "<addrs go here>", __func__);
683 /* samkumar: Removed call to syncache_badack(&inc); */
684 rstreason = BANDLIM_RST_OPENPORT;
685 goto dropwithreset;
686 }
687 /*
688 * If the drop_synfin option is enabled, drop all
689 * segments with both the SYN and FIN bits set.
690 * This prevents e.g. nmap from identifying the
691 * TCP/IP stack.
692 * XXX: Poor reasoning. nmap has other methods
693 * and is constantly refining its stack detection
694 * strategies.
695 * XXX: This is a violation of the TCP specification
696 * and was used by RFC1644.
697 */
698 if ((thflags & TH_FIN) && V_drop_synfin) {
699 /* samkumar: See above comment regarding tcp_log_addrs. */
700 tcplp_sys_log("%s; %s: Listen socket: "
701 "SYN|FIN segment ignored (based on "
702 "sysctl setting)", "<addrs go here>", __func__);
703 goto dropunlock;
704 }
705 /*
706 * Segment's flags are (SYN) or (SYN|FIN).
707 *
708 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
709 * as they do not affect the state of the TCP FSM.
710 * The data pointed to by TH_URG and th_urp is ignored.
711 */
712 KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
713 ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
714 KASSERT(thflags & (TH_SYN),
715 ("%s: Listen socket: TH_SYN not set", __func__));
716
717 /*
718 * samkumar: There used to be some code here to reject incoming
719 * SYN packets for deprecated interface addresses unless
720 * V_ip6_use_deprecated is true. Rejecting the packet, in this case,
721 * means to "goto dropwithreset". I removed this functionality.
722 */
723
724 /*
725 * Basic sanity checks on incoming SYN requests:
726 * Don't respond if the destination is a link layer
727 * broadcast according to RFC1122 4.2.3.10, p. 104.
728 * If it is from this socket it must be forged.
729 * Don't respond if the source or destination is a
730 * global or subnet broad- or multicast address.
731 * Note that it is quite possible to receive unicast
732 * link-layer packets with a broadcast IP address. Use
733 * in_broadcast() to find them.
734 */
735
736 /*
737 * samkumar: There used to be a sanity check that drops (via
738 * "goto dropunlock") any broadcast or multicast packets. This check is
         * done by checking m->m_flags for (M_BCAST|M_MCAST). The original
740 * FreeBSD code for this has been removed (since checking m->m_flags
741 * isn't really useful to us anyway). Note that other FreeBSD code that
742 * checks for multicast source/destination addresses is retained below
743 * (but only for the IPv6 case; the original FreeBSD code also handled
744 * it for IPv4 addresses).
745 */
746
747 if (th->th_dport == th->th_sport &&
748 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
749 /* samkumar: See above comment regarding tcp_log_addrs. */
750 tcplp_sys_log("%s; %s: Listen socket: "
751 "Connection attempt to/from self "
752 "ignored", "<addrs go here>", __func__);
753 goto dropunlock;
754 }
755 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
756 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
757 /* samkumar: See above comment regarding tcp_log_addrs. */
758 tcplp_sys_log("%s; %s: Listen socket: "
759 "Connection attempt from/to multicast "
760 "address ignored", "<addrs go here>", __func__);
761 goto dropunlock;
762 }
763
764 /*
765 * samkumar: The FreeBSD code would call
766 * syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
767 * to add an entry to the SYN cache at this point. TCPlp doesn't use a
768 * syncache, so we initialize the new socket right away. The code to
769 * initialize the socket is taken from the syncache_socket function.
770 */
771
772 tcp_dooptions(&to, optp, optlen, TO_SYN);
773 tp = tcplp_sys_accept_ready(tpl, &ip6->ip6_src, th->th_sport); // Try to allocate an active socket to accept into
774 if (tp == NULL) {
775 /* If we couldn't allocate, just ignore the SYN. */
776 return IPPROTO_DONE;
777 }
778 if (tp == (struct tcpcb *) -1) {
779 rstreason = ECONNREFUSED;
780 tp = NULL;
781 goto dropwithreset;
782 }
783 tcp_state_change(tp, TCPS_SYN_RECEIVED);
784 tpmarkpassiveopen(tp);
785 tp->t_flags |= TF_ACKNOW; // samkumar: my addition
786 tp->iss = tcp_new_isn(tp);
787 tp->irs = th->th_seq;
788 tcp_rcvseqinit(tp);
789 tcp_sendseqinit(tp);
790 tp->snd_wl1 = th->th_seq;
791 tp->snd_max = tp->iss/* + 1*/;
792 tp->snd_nxt = tp->iss/* + 1*/;
793 tp->rcv_up = th->th_seq + 1;
794 tp->rcv_wnd = imin(imax(cbuf_free_space(&tp->recvbuf), 0), TCP_MAXWIN);
795 tp->rcv_adv += tp->rcv_wnd;
796 tp->last_ack_sent = tp->rcv_nxt;
797 memcpy(&tp->laddr, &ip6->ip6_dst, sizeof(tp->laddr));
798 memcpy(&tp->faddr, &ip6->ip6_src, sizeof(tp->faddr));
799 tp->fport = th->th_sport;
800 tp->lport = tpl->lport;
801
802 /*
803 * samkumar: Several of the checks below (taken from syncache_socket!)
804 * check for flags in sc->sc_flags. They have been written to directly
805 * check for the conditions on the TCP options structure or in the TCP
806 * header that would ordinarily be used to set flags in sc->sc_flags
807 * when adding an entry to the SYN cache.
808 *
809 * In effect, we combine the logic in syncache_add to set elements of
810 * sc with the logic in syncache_socket to transfer state from sc
811 * to the socket, but short-circuit the process to avoid ever storing
812 * data in sc. Since this isn't just adding or deleting code, I decided
813 * that it's better to keep comments indicating exactly how I composed
814 * these two functions.
815 */
816 tp->t_flags = tp->t_flags & (TF_NOPUSH | TF_NODELAY | TF_NOOPT);
817 // tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
818 // if (sc->sc_flags & SCF_NOOPT)
819 // tp->t_flags |= TF_NOOPT;
820 // else {
821 if (!(tp->t_flags & TF_NOOPT) && V_tcp_do_rfc1323) {
822 if (/*sc->sc_flags & SCF_WINSCALE*/to.to_flags & TOF_SCALE) {
823 int wscale = 0;
824
825 /*
826 * Pick the smallest possible scaling factor that
827 * will still allow us to scale up to sb_max, aka
828 * kern.ipc.maxsockbuf.
829 *
830 * We do this because there are broken firewalls that
831 * will corrupt the window scale option, leading to
832 * the other endpoint believing that our advertised
833 * window is unscaled. At scale factors larger than
834 * 5 the unscaled window will drop below 1500 bytes,
835 * leading to serious problems when traversing these
836 * broken firewalls.
837 *
838 * With the default maxsockbuf of 256K, a scale factor
839 * of 3 will be chosen by this algorithm. Those who
840 * choose a larger maxsockbuf should watch out
841 * for the compatibility problems mentioned above.
842 *
843 * RFC1323: The Window field in a SYN (i.e., a <SYN>
844 * or <SYN,ACK>) segment itself is never scaled.
845 */
846
847 /*
848 * samkumar: The original logic, taken from syncache_add, is
849 * listed below, commented out. In practice, we just use
850 * wscale = 0 because in TCPlp we assume that the buffers
851 * aren't big enough for window scaling to be all that useful.
852 */
853 #if 0
854 while (wscale < TCP_MAX_WINSHIFT &&
855 (TCP_MAXWIN << wscale) < sb_max)
856 wscale++;
857 #endif
858
859 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
860 tp->snd_scale = /*sc->sc_requested_s_scale*/to.to_wscale;
861 tp->request_r_scale = wscale;
862 }
863 if (/*sc->sc_flags & SCF_TIMESTAMP*/to.to_flags & TOF_TS) {
864 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
865 tp->ts_recent = /*sc->sc_tsreflect*/to.to_tsval;
866 tp->ts_recent_age = tcp_ts_getticks();
867 tp->ts_offset = /*sc->sc_tsoff*/0; // No syncookies, so this should always be 0
868 }
869
870 /*
871 * samkumar: there used to be code here that would set the
872 * TF_SIGNATURE flag on tp->t_flags if SCF_SIGNATURE is set on
873 * sc->sc_flags. I've left it in below, commented out.
874 */
875 #if 0
876 #ifdef TCP_SIGNATURE
877 if (sc->sc_flags & SCF_SIGNATURE)
878 tp->t_flags |= TF_SIGNATURE;
879 #endif
880 #endif
881 if (/*sc->sc_flags & SCF_SACK*/ to.to_flags & TOF_SACKPERM)
882 tp->t_flags |= TF_SACK_PERMIT;
883 }
884 if (/*sc->sc_flags & SCF_ECN*/(th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
885 tp->t_flags |= TF_ECN_PERMIT;
886
887 /*
888 * Set up MSS and get cached values from tcp_hostcache.
889 * This might overwrite some of the defaults we just set.
890 */
891 tcp_mss(tp, /*sc->sc_peer_mss*/(to.to_flags & TOF_MSS) ? to.to_mss : 0);
892
893 tcp_output(tp); // to send the SYN-ACK
894
895 tp->accepted_from = tpl;
896 return (IPPROTO_DONE);
897 } else if (tp->t_state == TCPS_LISTEN) {
898 /*
899 * When a listen socket is torn down the SO_ACCEPTCONN
900 * flag is removed first while connections are drained
901 * from the accept queue in an unlock/lock cycle of the
902 * ACCEPT_LOCK, opening a race condition allowing a SYN
903 * attempt to go through unhandled.
904 */
905 goto dropunlock;
906 }
907
908 KASSERT(tp, ("tp is still NULL!"));
909
910 /*
911 * samkumar: There used to be code here to verify TCP signatures. We don't
912 * support TCP signatures in TCPlp.
913 */
914
915 /*
916 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
917 * state. tcp_do_segment() always consumes the mbuf chain, unlocks
918 * the inpcb, and unlocks pcbinfo.
919 */
920 tcp_do_segment(ip6, th, msg, tp, drop_hdrlen, tlen, iptos, sig);
921 return (IPPROTO_DONE);
922
923 /*
924 * samkumar: Removed some locking and debugging code under all three of
925 * these labels: dropwithreset, dropunlock, and drop. I also removed some
926 * memory management code (e.g., calling m_freem(m) if m != NULL) since
927 * the caller of this function will take care of that kind of memory
928 * management in TCPlp.
929 */
930 dropwithreset:
931
932 /*
933 * samkumar: The check against inp != NULL is now a check on tp != NULL.
934 */
935 if (tp != NULL) {
936 tcp_dropwithreset(ip6, th, tp, tp->instance, tlen, rstreason);
937 } else
938 tcp_dropwithreset(ip6, th, NULL, tpl->instance, tlen, rstreason);
939 goto drop;
940
941 dropunlock:
942 drop:
943 return (IPPROTO_DONE);
944 }
945
946 /*
947 * samkumar: Original signature
948 * static void
949 * tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
950 * struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
951 * int ti_locked)
952 */
953 static void
tcp_do_segment(struct ip6_hdr * ip6,struct tcphdr * th,otMessage * msg,struct tcpcb * tp,int drop_hdrlen,int tlen,uint8_t iptos,struct tcplp_signals * sig)954 tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg,
955 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
956 struct tcplp_signals* sig)
957 {
958 /*
959 * samkumar: All code pertaining to locks, stats, and debug has been
960 * removed from this function.
961 */
962
963 int thflags, acked, ourfinisacked, needoutput = 0;
964 int rstreason, todrop, win;
965 uint64_t tiwin;
966 struct tcpopt to;
967 uint32_t ticks = tcplp_sys_get_ticks();
968 otInstance* instance = tp->instance;
969 thflags = th->th_flags;
970 tp->sackhint.last_sack_ack = 0;
971
972 /*
973 * If this is either a state-changing packet or current state isn't
974 * established, we require a write lock on tcbinfo. Otherwise, we
975 * allow the tcbinfo to be either locked or unlocked, as the
976 * caller may have unnecessarily acquired a write lock due to a race.
977 */
978
979 /* samkumar: There used to be synchronization code here. */
980 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
981 __func__));
982 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
983 __func__));
984
985 /*
986 * Segment received on connection.
987 * Reset idle time and keep-alive timer.
988 * XXX: This should be done after segment
989 * validation to ignore broken/spoofed segs.
990 */
991 tp->t_rcvtime = ticks;
992 if (TCPS_HAVEESTABLISHED(tp->t_state))
993 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
994
995 /*
996 * Scale up the window into a 32-bit value.
997 * For the SYN_SENT state the scale is zero.
998 */
999 tiwin = th->th_win << tp->snd_scale;
1000
1001 /*
1002 * TCP ECN processing.
1003 */
1004 /*
1005 * samkumar: I intentionally left the TCPSTAT_INC lines below commented
1006 * out, to avoid altering the structure of the code too much by
1007 * reorganizing the switch statement.
1008 */
1009 if (tp->t_flags & TF_ECN_PERMIT) {
1010 if (thflags & TH_CWR)
1011 tp->t_flags &= ~TF_ECN_SND_ECE;
1012 switch (iptos & IPTOS_ECN_MASK) {
1013 case IPTOS_ECN_CE:
1014 tp->t_flags |= TF_ECN_SND_ECE;
1015 //TCPSTAT_INC(tcps_ecn_ce);
1016 break;
1017 case IPTOS_ECN_ECT0:
1018 //TCPSTAT_INC(tcps_ecn_ect0);
1019 break;
1020 case IPTOS_ECN_ECT1:
1021 //TCPSTAT_INC(tcps_ecn_ect1);
1022 break;
1023 }
1024
1025 /* Process a packet differently from RFC3168. */
1026 cc_ecnpkt_handler(tp, th, iptos);
1027
1028 /* Congestion experienced. */
1029 if (thflags & TH_ECE) {
1030 cc_cong_signal(tp, th, CC_ECN);
1031 }
1032 }
1033
1034 /*
1035 * Parse options on any incoming segment.
1036 */
1037 tcp_dooptions(&to, (uint8_t *)(th + 1),
1038 ((th->th_off_x2 >> TH_OFF_SHIFT) << 2) - sizeof(struct tcphdr),
1039 (thflags & TH_SYN) ? TO_SYN : 0);
1040
1041 /*
1042 * If echoed timestamp is later than the current time,
1043 * fall back to non RFC1323 RTT calculation. Normalize
1044 * timestamp if syncookies were used when this connection
1045 * was established.
1046 */
1047
1048 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1049 to.to_tsecr -= tp->ts_offset;
1050 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1051 to.to_tsecr = 0;
1052 }
1053 /*
1054 * If timestamps were negotiated during SYN/ACK they should
1055 * appear on every segment during this session and vice versa.
1056 */
1057 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1058 /* samkumar: See above comment regarding tcp_log_addrs. */
1059 tcplp_sys_log("%s; %s: Timestamp missing, "
1060 "no action", "<addrs go here>", __func__);
1061 }
1062 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1063 /* samkumar: See above comment regarding tcp_log_addrs. */
1064 tcplp_sys_log("%s; %s: Timestamp not expected, "
1065 "no action", "<addrs go here>", __func__);
1066 }
1067
1068 /*
1069 * Process options only when we get SYN/ACK back. The SYN case
1070 * for incoming connections is handled in tcp_syncache.
1071 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1072 * or <SYN,ACK>) segment itself is never scaled.
1073 * XXX this is traditional behavior, may need to be cleaned up.
1074 */
1075 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1076 if ((to.to_flags & TOF_SCALE) &&
1077 (tp->t_flags & TF_REQ_SCALE)) {
1078 tp->t_flags |= TF_RCVD_SCALE;
1079 tp->snd_scale = to.to_wscale;
1080 }
1081 /*
1082 * Initial send window. It will be updated with
1083 * the next incoming segment to the scaled value.
1084 */
1085 tp->snd_wnd = th->th_win;
1086 if (to.to_flags & TOF_TS) {
1087 tp->t_flags |= TF_RCVD_TSTMP;
1088 tp->ts_recent = to.to_tsval;
1089 tp->ts_recent_age = tcp_ts_getticks();
1090 }
1091 if (to.to_flags & TOF_MSS)
1092 tcp_mss(tp, to.to_mss);
1093 if ((tp->t_flags & TF_SACK_PERMIT) &&
1094 (to.to_flags & TOF_SACKPERM) == 0)
1095 tp->t_flags &= ~TF_SACK_PERMIT;
1096 }
1097 /*
1098 * Header prediction: check for the two common cases
1099 * of a uni-directional data xfer. If the packet has
1100 * no control flags, is in-sequence, the window didn't
1101 * change and we're not retransmitting, it's a
1102 * candidate. If the length is zero and the ack moved
1103 * forward, we're the sender side of the xfer. Just
1104 * free the data acked & wake any higher level process
1105 * that was blocked waiting for space. If the length
1106 * is non-zero and the ack didn't move, we're the
1107 * receiver side. If we're getting packets in-order
1108 * (the reassembly queue is empty), add the data to
1109 * the socket buffer and note that we need a delayed ack.
1110 * Make sure that the hidden state-flags are also off.
1111 * Since we check for TCPS_ESTABLISHED first, it can only
1112 * be TF_NEEDSYN.
1113 */
1114 /*
1115 * samkumar: Replaced LIST_EMPTY(&tp->tsegq) with the call to bmp_isempty.
1116 */
1117 if (tp->t_state == TCPS_ESTABLISHED &&
1118 th->th_seq == tp->rcv_nxt &&
1119 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1120 tp->snd_nxt == tp->snd_max &&
1121 tiwin && tiwin == tp->snd_wnd &&
1122 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1123 bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp)) &&
1124 ((to.to_flags & TOF_TS) == 0 ||
1125 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
1126
1127 /*
1128 * If last ACK falls within this segment's sequence numbers,
1129 * record the timestamp.
1130 * NOTE that the test is modified according to the latest
1131 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1132 */
1133 if ((to.to_flags & TOF_TS) != 0 &&
1134 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1135 tp->ts_recent_age = tcp_ts_getticks();
1136 tp->ts_recent = to.to_tsval;
1137 }
1138
1139 if (tlen == 0) {
1140 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1141 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1142 !IN_RECOVERY(tp->t_flags) &&
1143 (to.to_flags & TOF_SACK) == 0 &&
1144 TAILQ_EMPTY(&tp->snd_holes)) {
1145 /*
1146 * This is a pure ack for outstanding data.
1147 */
1148
1149 /*
1150 * "bad retransmit" recovery.
1151 */
1152 if (tp->t_rxtshift == 1 &&
1153 tp->t_flags & TF_PREVVALID &&
1154 (int)(ticks - tp->t_badrxtwin) < 0) {
1155 cc_cong_signal(tp, th, CC_RTO_ERR);
1156 }
1157
1158 /*
1159 * Recalculate the transmit timer / rtt.
1160 *
1161 * Some boxes send broken timestamp replies
1162 * during the SYN+ACK phase, ignore
1163 * timestamps of 0 or we could calculate a
1164 * huge RTT and blow up the retransmit timer.
1165 */
1166
1167 if ((to.to_flags & TOF_TS) != 0 &&
1168 to.to_tsecr) {
1169 uint32_t t;
1170
1171 t = tcp_ts_getticks() - to.to_tsecr;
1172 if (!tp->t_rttlow || tp->t_rttlow > t)
1173 tp->t_rttlow = t;
1174 tcp_xmit_timer(tp,
1175 TCP_TS_TO_TICKS(t) + 1);
1176 } else if (tp->t_rtttime &&
1177 SEQ_GT(th->th_ack, tp->t_rtseq)) {
1178 if (!tp->t_rttlow ||
1179 tp->t_rttlow > ticks - tp->t_rtttime)
1180 tp->t_rttlow = ticks - tp->t_rtttime;
1181 tcp_xmit_timer(tp,
1182 ticks - tp->t_rtttime);
1183 }
1184
1185 acked = BYTES_THIS_ACK(tp, th);
1186
1187 /*
1188 * samkumar: Replaced sbdrop(&so->so_snd, acked) with this call
1189 * to lbuf_pop.
1190 */
1191 {
1192 uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped);
1193 KASSERT(poppedbytes == acked, ("More bytes were acked than are in the send buffer"));
1194 sig->bytes_acked += poppedbytes;
1195 }
1196 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1197 SEQ_LEQ(th->th_ack, tp->snd_recover))
1198 tp->snd_recover = th->th_ack - 1;
1199
1200 /*
1201 * Let the congestion control algorithm update
1202 * congestion control related information. This
1203 * typically means increasing the congestion
1204 * window.
1205 */
1206 cc_ack_received(tp, th, CC_ACK);
1207
1208 tp->snd_una = th->th_ack;
1209 /*
1210 * Pull snd_wl2 up to prevent seq wrap relative
1211 * to th_ack.
1212 */
1213 tp->snd_wl2 = th->th_ack;
1214 tp->t_dupacks = 0;
1215
1216 /*
1217 * If all outstanding data are acked, stop
1218 * retransmit timer, otherwise restart timer
1219 * using current (possibly backed-off) value.
1220 * If process is waiting for space,
1221 * wakeup/selwakeup/signal. If data
1222 * are ready to send, let tcp_output
1223 * decide between more output or persist.
1224 */
1225
1226 if (tp->snd_una == tp->snd_max)
1227 tcp_timer_activate(tp, TT_REXMT, 0);
1228 else if (!tcp_timer_active(tp, TT_PERSIST))
1229 tcp_timer_activate(tp, TT_REXMT,
1230 tp->t_rxtcur);
1231
1232 /*
1233 * samkumar: There used to be a call to sowwakeup(so); here,
1234 * which wakes up any threads waiting for the socket to
1235 * become ready for writing. TCPlp handles its send buffer
1236 * differently so we do not need to replace this call with
1237 * specialized code to handle this.
1238 */
1239
1240 /*
1241 * samkumar: Replaced sbavail(&so->so_snd) with this call to
1242 * lbuf_used_space.
1243 */
1244 if (lbuf_used_space(&tp->sendbuf))
1245 (void) tcp_output(tp);
1246 goto check_delack;
1247 }
1248 } else if (th->th_ack == tp->snd_una &&
1249 /*
1250 * samkumar: Replaced sbspace(&so->so_rcv) with this call to
1251 * cbuf_free_space.
1252 */
1253 tlen <= cbuf_free_space(&tp->recvbuf)) {
1254
1255 /*
1256 * This is a pure, in-sequence data packet with
1257 * nothing on the reassembly queue and we have enough
1258 * buffer space to take it.
1259 */
1260 /* Clean receiver SACK report if present */
1261 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
1262 tcp_clean_sackreport(tp);
1263
1264 tp->rcv_nxt += tlen;
1265 /*
1266 * Pull snd_wl1 up to prevent seq wrap relative to
1267 * th_seq.
1268 */
1269 tp->snd_wl1 = th->th_seq;
1270 /*
1271 * Pull rcv_up up to prevent seq wrap relative to
1272 * rcv_nxt.
1273 */
1274 tp->rcv_up = tp->rcv_nxt;
1275
1276 /*
1277 * Automatic sizing of receive socket buffer. Often the send
1278 * buffer size is not optimally adjusted to the actual network
1279 * conditions at hand (delay bandwidth product). Setting the
1280 * buffer size too small limits throughput on links with high
1281 * bandwidth and high delay (eg. trans-continental/oceanic links).
1282 *
1283 * On the receive side the socket buffer memory is only rarely
1284 * used to any significant extent. This allows us to be much
1285 * more aggressive in scaling the receive socket buffer. For
1286 * the case that the buffer space is actually used to a large
1287 * extent and we run out of kernel memory we can simply drop
1288 * the new segments; TCP on the sender will just retransmit it
1289 * later. Setting the buffer size too big may only consume too
1290 * much kernel memory if the application doesn't read() from
1291 * the socket or packet loss or reordering makes use of the
1292 * reassembly queue.
1293 *
1294 * The criteria to step up the receive buffer one notch are:
1295 * 1. Application has not set receive buffer size with
1296 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
1297 * 2. the number of bytes received during the time it takes
1298 * one timestamp to be reflected back to us (the RTT);
1299 * 3. received bytes per RTT is within seven eighth of the
1300 * current socket buffer size;
1301 * 4. receive buffer size has not hit maximal automatic size;
1302 *
1303 * This algorithm does one step per RTT at most and only if
1304 * we receive a bulk stream w/o packet losses or reorderings.
1305 * Shrinking the buffer during idle times is not necessary as
1306 * it doesn't consume any memory when idle.
1307 *
1308 * TODO: Only step up if the application is actually serving
1309 * the buffer to better manage the socket buffer resources.
1310 */
1311
1312 /*
1313 * samkumar: There used to be code here to dynamically size the
1314 * receive buffer (tp->rfbuf_ts, tp->rfbuf_cnt, and the local
1315 * newsize variable). In TCPlp, we don't support this, as the user
1316 * allocates the receive buffer and its size can't be changed here.
1317 * Therefore, I removed the code that does this. Note that the
1318 * actual resizing of the buffer is done using sbreserve_locked,
1319 * whose call comes later (not exactly where this comment is).
1320 */
1321
1322 /* Add data to socket buffer. */
1323
1324 /*
1325 * samkumar: The code that was here would just free the mbuf
1326 * (with m_freem(m)) if SBS_CANTRCVMORE is set in
1327 * so->so_rcv.sb_state. Otherwise, it would cut drop_hdrlen bytes
1328 * from the mbuf (using m_adj(m, drop_hdrlen)) to discard the
1329 * headers and then append the mbuf to the receive buffer using
1330 * sbappendstream_locked(&so->so_rcv, m, 0). I've rewritten this
1331 * to work the TCPlp way. The check to so->so_rcv.sb_state is
1332 * replaced by a tpiscantrcv call, and we copy bytes into
1333 * TCPlp's circular buffer (since we designed it to avoid
1334 * having dynamically-allocated memory for the receive buffer).
1335 */
1336
1337 if (!tpiscantrcv(tp)) {
1338 cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message);
1339 if (tlen > 0) {
1340 sig->recvbuf_added = true;
1341 }
1342 } else {
1343 /*
1344 * samkumar: We already know tlen != 0, so if we got here, then
1345 * it means that we got data after we called SHUT_RD, or after
1346 * receiving a FIN. I'm going to drop the connection in this
1347 * case. I think FreeBSD might have just dropped the packet
1348 * silently, but Linux handles it this way; this seems to be
1349 * the right approach to me.
1350 */
1351 tcp_drop(tp, ECONNABORTED);
1352 goto drop;
1353 }
1354 /* NB: sorwakeup_locked() does an implicit unlock. */
1355 /*
1356 * samkumar: There used to be a call to sorwakeup_locked(so); here,
1357 * which wakes up any threads waiting for the socket to become
1358 * ready for reading. TCPlp handles its buffering
1359 * differently so we do not need to replace this call with
1360 * specialized code to handle this.
1361 */
1362 if (DELAY_ACK(tp, tlen)) {
1363 tp->t_flags |= TF_DELACK;
1364 } else {
1365 tp->t_flags |= TF_ACKNOW;
1366 tcp_output(tp);
1367 }
1368 goto check_delack;
1369 }
1370 }
1371
1372 /*
1373 * Calculate amount of space in receive window,
1374 * and then do TCP input processing.
1375 * Receive window is amount of space in rcv queue,
1376 * but not less than advertised window.
1377 */
1378 /* samkumar: Replaced sbspace(&so->so_rcv) with call to cbuf_free_space. */
1379 win = cbuf_free_space(&tp->recvbuf);
1380 if (win < 0)
1381 win = 0;
1382 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1383
1384 /* Reset receive buffer auto scaling when not in bulk receive mode. */
1385 /* samkumar: Removed this receive buffer autoscaling code. */
1386
1387 switch (tp->t_state) {
1388
1389 /*
1390 * If the state is SYN_RECEIVED:
1391 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1392 * (Added by Sam) if seg is resending the original SYN, resend the SYN/ACK
1393 */
1394 /*
1395 * samkumar: If we receive a retransmission of the original SYN, then
1396 * resend the SYN/ACK segment. This case was probably handled by the
1397 * SYN cache. Because TCPlp does not use a SYN cache, we need to write
1398 * custom logic for it. It is handled in the "else if" clause below.
1399 */
1400 case TCPS_SYN_RECEIVED:
1401 if ((thflags & TH_ACK) &&
1402 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1403 SEQ_GT(th->th_ack, tp->snd_max))) {
1404 rstreason = BANDLIM_RST_OPENPORT;
1405 goto dropwithreset;
1406 } else if ((thflags & TH_SYN) && !(thflags & TH_ACK) && (th->th_seq == tp->irs)) {
1407 tp->t_flags |= TF_ACKNOW;
1408 }
1409 break;
1410
1411 /*
1412 * If the state is SYN_SENT:
1413 * if seg contains an ACK, but not for our SYN, drop the input.
1414 * if seg contains a RST, then drop the connection.
1415 * if seg does not contain SYN, then drop it.
1416 * Otherwise this is an acceptable SYN segment
1417 * initialize tp->rcv_nxt and tp->irs
1418 * if seg contains ack then advance tp->snd_una
1419 * if seg contains an ECE and ECN support is enabled, the stream
1420 * is ECN capable.
1421 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1422 * arrange for segment to be acked (eventually)
1423 * continue processing rest of data/controls, beginning with URG
1424 */
1425 case TCPS_SYN_SENT:
1426 if ((thflags & TH_ACK) &&
1427 (SEQ_LEQ(th->th_ack, tp->iss) ||
1428 SEQ_GT(th->th_ack, tp->snd_max))) {
1429 rstreason = BANDLIM_UNLIMITED;
1430 goto dropwithreset;
1431 }
1432 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
1433 tp = tcp_drop(tp, ECONNREFUSED);
1434 }
1435 if (thflags & TH_RST)
1436 goto drop;
1437 if (!(thflags & TH_SYN))
1438 goto drop;
1439
1440 tp->irs = th->th_seq;
1441 tcp_rcvseqinit(tp);
1442 if (thflags & TH_ACK) {
1443 /*
1444 * samkumar: Removed call to soisconnected(so), since TCPlp has its
1445 * own buffering.
1446 */
1447
1448 /* Do window scaling on this connection? */
1449 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1450 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1451 tp->rcv_scale = tp->request_r_scale;
1452 }
1453 tp->rcv_adv += imin(tp->rcv_wnd,
1454 TCP_MAXWIN << tp->rcv_scale);
1455 tp->snd_una++; /* SYN is acked */
1456 /*
1457 * If there's data, delay ACK; if there's also a FIN
1458 * ACKNOW will be turned on later.
1459 */
1460 if (DELAY_ACK(tp, tlen) && tlen != 0)
1461 tcp_timer_activate(tp, TT_DELACK,
1462 tcp_delacktime);
1463 else
1464 tp->t_flags |= TF_ACKNOW;
1465
1466 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
1467 tp->t_flags |= TF_ECN_PERMIT;
1468 }
1469
1470 /*
1471 * Received <SYN,ACK> in SYN_SENT[*] state.
1472 * Transitions:
1473 * SYN_SENT --> ESTABLISHED
1474 * SYN_SENT* --> FIN_WAIT_1
1475 */
1476 tp->t_starttime = ticks;
1477 if (tp->t_flags & TF_NEEDFIN) {
1478 tcp_state_change(tp, TCPS_FIN_WAIT_1);
1479 tp->t_flags &= ~TF_NEEDFIN;
1480 thflags &= ~TH_SYN;
1481 } else {
1482 tcp_state_change(tp, TCPS_ESTABLISHED);
1483 /* samkumar: Set conn_established signal for TCPlp. */
1484 sig->conn_established = true;
1485 cc_conn_init(tp);
1486 tcp_timer_activate(tp, TT_KEEP,
1487 TP_KEEPIDLE(tp));
1488 }
1489 } else {
1490 /*
1491 * Received initial SYN in SYN-SENT[*] state =>
1492 * simultaneous open.
1493 * If it succeeds, connection is half-synchronized.
1494 * Otherwise, do 3-way handshake:
1495 * SYN-SENT -> SYN-RECEIVED
1496 * SYN-SENT* -> SYN-RECEIVED*
1497 */
1498 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1499 tcp_timer_activate(tp, TT_REXMT, 0);
1500 tcp_state_change(tp, TCPS_SYN_RECEIVED);
1501 /*
1502 * samkumar: We would have incremented snd_nxt in tcp_output when
1503 * we sent the original SYN, so decrement it here. (Another
1504 * consequence of removing the SYN cache.)
1505 */
1506 tp->snd_nxt--;
1507 }
1508
1509 /*
1510 * Advance th->th_seq to correspond to first data byte.
1511 * If data, trim to stay within window,
1512 * dropping FIN if necessary.
1513 */
1514 th->th_seq++;
1515 if (tlen > tp->rcv_wnd) {
1516 todrop = tlen - tp->rcv_wnd;
1517 /*
1518 * samkumar: I removed a call to m_adj(m, -todrop), which intends
1519 * to trim the data so it fits in the window. We can just read less
1520 * when copying into the receive buffer in TCPlp, so we don't need
1521 * to do this.
1522 */
1523 (void) todrop; /* samkumar: Prevent a compiler warning */
1524 tlen = tp->rcv_wnd;
1525 thflags &= ~TH_FIN;
1526 }
1527 tp->snd_wl1 = th->th_seq - 1;
1528 tp->rcv_up = th->th_seq;
1529 /*
1530 * Client side of transaction: already sent SYN and data.
1531 * If the remote host used T/TCP to validate the SYN,
1532 * our data will be ACK'd; if so, enter normal data segment
1533 * processing in the middle of step 5, ack processing.
1534 * Otherwise, goto step 6.
1535 */
1536 if (thflags & TH_ACK)
1537 goto process_ACK;
1538
1539 goto step6;
1540
1541 /*
1542 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1543 * do normal processing.
1544 *
1545 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
1546 */
1547 case TCPS_LAST_ACK:
1548 case TCPS_CLOSING:
1549 break; /* continue normal processing */
1550 }
1551
1552 /*
1553 * States other than LISTEN or SYN_SENT.
1554 * First check the RST flag and sequence number since reset segments
1555 * are exempt from the timestamp and connection count tests. This
1556 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1557 * below which allowed reset segments in half the sequence space
1558 * to fall through and be processed (which gives forged reset
1559 * segments with a random sequence number a 50 percent chance of
1560 * killing a connection).
1561 * Then check timestamp, if present.
1562 * Then check the connection count, if present.
1563 * Then check that at least some bytes of segment are within
1564 * receive window. If segment begins before rcv_nxt,
1565 * drop leading data (and SYN); if nothing left, just ack.
1566 */
1567 if (thflags & TH_RST) {
1568 /*
1569 * RFC5961 Section 3.2
1570 *
1571 * - RST drops connection only if SEG.SEQ == RCV.NXT.
1572 * - If RST is in window, we send challenge ACK.
1573 *
1574 * Note: to take into account delayed ACKs, we should
1575 * test against last_ack_sent instead of rcv_nxt.
1576 * Note 2: we handle special case of closed window, not
1577 * covered by the RFC.
1578 */
1579 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1580 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1581 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1582
1583 /*
1584 * samkumar: This if statement used to also be prefaced with
1585 * "V_tcp_insecure_rst ||". But I removed it, since there's no
1586 * reason to support an insecure option in TCPlp (my guess is that
1587 * FreeBSD supported it for legacy reasons).
1588 */
1589 if (tp->last_ack_sent == th->th_seq) {
1590 /*
1591 * samkumar: Normally, the error number would be stored in
1592 * so->so_error. Instead, we put it in this "droperror" local
1593 * variable and then pass it to tcplp_sys_connection_lost.
1594 */
1595 int droperror = 0;
1596 /* Drop the connection. */
1597 switch (tp->t_state) {
1598 case TCPS_SYN_RECEIVED:
1599 droperror = ECONNREFUSED;
1600 goto close;
1601 case TCPS_ESTABLISHED:
1602 case TCPS_FIN_WAIT_1:
1603 case TCPS_FIN_WAIT_2:
1604 case TCPS_CLOSE_WAIT:
1605 droperror = ECONNRESET;
1606 close:
1607 tcp_state_change(tp, TCPS_CLOSED);
1608 /* FALLTHROUGH */
1609 default:
1610 tp = tcp_close(tp);
1611 tcplp_sys_connection_lost(tp, droperror);
1612 }
1613 } else {
1614 /* Send challenge ACK. */
1615 tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1616 tp->last_ack_sent = tp->rcv_nxt;
1617 }
1618 }
1619 goto drop;
1620 }
1621
1622 /*
1623 * RFC5961 Section 4.2
1624 * Send challenge ACK for any SYN in synchronized state.
1625 */
1626 /*
1627 * samkumar: I added the check for the SYN-RECEIVED state in this if
1628 * statement (another consequence of removing the SYN cache).
1629 */
1630 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCP6S_SYN_RECEIVED) {
1631 /*
1632 * samkumar: The modern way to handle this is to send a Challenge ACK.
1633 * FreeBSD supports this, but it also has this V_tcp_insecure_syn
1634 * option that will cause it to drop the connection if the SYN falls
1635 * in the receive window. In TCPlp we *only* support Challenge ACKs
1636 * (the secure way of doing it), so I've removed code for the insecure
1637 * way. (Presumably the reason why FreeBSD supports the insecure way is
1638 * for legacy code, which we don't really care about in TCPlp).
1639 */
1640 /* Send challenge ACK. */
1641 tcplp_sys_log("Sending challenge ACK");
1642 tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1643 tp->last_ack_sent = tp->rcv_nxt;
1644 goto drop;
1645 }
1646
1647 /*
1648 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1649 * and it's less than ts_recent, drop it.
1650 */
1651 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
1652 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1653
1654 /* Check to see if ts_recent is over 24 days old. */
1655 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1656 /*
1657 * Invalidate ts_recent. If this segment updates
1658 * ts_recent, the age will be reset later and ts_recent
1659 * will get a valid value. If it does not, setting
1660 * ts_recent to zero will at least satisfy the
1661 * requirement that zero be placed in the timestamp
1662 * echo reply when ts_recent isn't valid. The
1663 * age isn't reset until we get a valid ts_recent
1664 * because we don't want out-of-order segments to be
1665 * dropped when ts_recent is old.
1666 */
1667 tp->ts_recent = 0;
1668 } else {
1669 if (tlen)
1670 goto dropafterack;
1671 goto drop;
1672 }
1673 }
1674
1675 /*
1676 * In the SYN-RECEIVED state, validate that the packet belongs to
1677 * this connection before trimming the data to fit the receive
1678 * window. Check the sequence number versus IRS since we know
1679 * the sequence numbers haven't wrapped. This is a partial fix
1680 * for the "LAND" DoS attack.
1681 */
1682 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1683 rstreason = BANDLIM_RST_OPENPORT;
1684 goto dropwithreset;
1685 }
1686
1687 todrop = tp->rcv_nxt - th->th_seq;
1688 if (todrop > 0) {
1689 if (thflags & TH_SYN) {
1690 thflags &= ~TH_SYN;
1691 th->th_seq++;
1692 if (th->th_urp > 1)
1693 th->th_urp--;
1694 else
1695 thflags &= ~TH_URG;
1696 todrop--;
1697 }
1698 /*
1699 * Following if statement from Stevens, vol. 2, p. 960.
1700 */
1701 if (todrop > tlen
1702 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1703 /*
1704 * Any valid FIN must be to the left of the window.
1705 * At this point the FIN must be a duplicate or out
1706 * of sequence; drop it.
1707 */
1708 thflags &= ~TH_FIN;
1709
1710 /*
1711 * Send an ACK to resynchronize and drop any data.
1712 * But keep on processing for RST or ACK.
1713 */
1714 tp->t_flags |= TF_ACKNOW;
1715 todrop = tlen;
1716 }
1717 /* samkumar: There was an else case that only collected stats. */
1718 drop_hdrlen += todrop; /* drop from the top afterwards */
1719 th->th_seq += todrop;
1720 tlen -= todrop;
1721 if (th->th_urp > todrop)
1722 th->th_urp -= todrop;
1723 else {
1724 thflags &= ~TH_URG;
1725 th->th_urp = 0;
1726 }
1727 }
1728
1729 /*
1730 * If new data are received on a connection after the
1731 * user processes are gone, then RST the other end.
1732 */
1733 /*
1734 * samkumar: TCPlp is designed for embedded systems where there is no
1735 * concept of a "process" that has allocated a TCP socket. Therefore, we
1736 * do not implement the functionality in the above comment (the code for
1737 * it used to be here, and I removed it).
1738 */
1739 /*
1740 * If segment ends after window, drop trailing data
1741 * (and PUSH and FIN); if nothing left, just ACK.
1742 */
1743 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1744 if (todrop > 0) {
1745 if (todrop >= tlen) {
1746 /*
1747 * If window is closed can only take segments at
1748 * window edge, and have to drop data and PUSH from
1749 * incoming segments. Continue processing, but
1750 * remember to ack. Otherwise, drop segment
1751 * and ack.
1752 */
1753 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1754 tp->t_flags |= TF_ACKNOW;
1755 } else
1756 goto dropafterack;
1757 }
1758 /*
1759 * samkumar: I removed a call to m_adj(m, -todrop), which intends
1760 * to trim the data so it fits in the window. We can just read less
1761 * when copying into the receive buffer in TCPlp, so we don't need
1762 * to do this. Subtracting it from tlen gives us enough information to
1763 * do this later. In FreeBSD, this isn't possible because the mbuf
1764 * itself becomes part of the receive buffer, so the mbuf has to be
1765 * trimmed in order for this to work out.
1766 */
1767 tlen -= todrop;
1768 thflags &= ~(TH_PUSH|TH_FIN);
1769 }
1770
1771 /*
1772 * If last ACK falls within this segment's sequence numbers,
1773 * record its timestamp.
1774 * NOTE:
1775 * 1) That the test incorporates suggestions from the latest
1776 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1777 * 2) That updating only on newer timestamps interferes with
1778 * our earlier PAWS tests, so this check should be solely
1779 * predicated on the sequence space of this segment.
1780 * 3) That we modify the segment boundary check to be
1781 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
1782 * instead of RFC1323's
1783 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
1784 * This modified check allows us to overcome RFC1323's
1785 * limitations as described in Stevens TCP/IP Illustrated
1786 * Vol. 2 p.869. In such cases, we can still calculate the
1787 * RTT correctly when RCV.NXT == Last.ACK.Sent.
1788 */
1789
1790 if ((to.to_flags & TOF_TS) != 0 &&
1791 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1792 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1793 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
1794 tp->ts_recent_age = tcp_ts_getticks();
1795 tp->ts_recent = to.to_tsval;
1796 }
1797
1798 /*
1799 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1800 * flag is on (half-synchronized state), then queue data for
1801 * later processing; else drop segment and return.
1802 */
1803 if ((thflags & TH_ACK) == 0) {
1804 if (tp->t_state == TCPS_SYN_RECEIVED ||
1805 (tp->t_flags & TF_NEEDSYN))
1806 goto step6;
1807 else if (tp->t_flags & TF_ACKNOW)
1808 goto dropafterack;
1809 else
1810 goto drop;
1811 }
1812
1813 tcplp_sys_log("Processing ACK");
1814
1815 /*
1816 * Ack processing.
1817 */
1818 switch (tp->t_state) {
1819
1820 /*
1821 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1822 * ESTABLISHED state and continue processing.
1823 * The ACK was checked above.
1824 */
1825 case TCPS_SYN_RECEIVED:
1826 /*
1827 * samkumar: Removed call to soisconnected(so), since TCPlp has its
1828 * own buffering.
1829 */
1830 /* Do window scaling? */
1831 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1832 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1833 tp->rcv_scale = tp->request_r_scale;
1834 tp->snd_wnd = tiwin;
1835 }
1836 /*
1837 * Make transitions:
1838 * SYN-RECEIVED -> ESTABLISHED
1839 * SYN-RECEIVED* -> FIN-WAIT-1
1840 */
1841 tp->t_starttime = ticks;
1842 if (tp->t_flags & TF_NEEDFIN) {
1843 tcp_state_change(tp, TCPS_FIN_WAIT_1);
1844 tp->t_flags &= ~TF_NEEDFIN;
1845 } else {
1846 tcp_state_change(tp, TCPS_ESTABLISHED);
1847 /* samkumar: Set conn_established signal for TCPlp. */
1848 sig->conn_established = true;
1849 cc_conn_init(tp);
1850 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
1851 /*
1852 * samkumar: I added this check to account for simultaneous open.
1853 * If this socket was opened actively, then the fact that we are
1854 * in SYN-RECEIVED indicates that we are in simultaneous open.
1855 * Therefore, don't ACK the SYN-ACK (unless it contains data or
1856 * something, which will be processed later).
1857 */
1858 if (!tpispassiveopen(tp)) {
1859 tp->t_flags &= ~TF_ACKNOW;
1860 } else {
1861 /*
1862 * samkumar: Otherwise, we entered the ESTABLISHED state by
1863 * accepting a connection, so call the appropriate callback in
1864 * TCPlp. TODO: consider using signals to handle this?
1865 */
1866 bool accepted = tcplp_sys_accepted_connection(tp->accepted_from, tp, &ip6->ip6_src, th->th_sport);
1867 if (!accepted) {
1868 rstreason = ECONNREFUSED;
1869 goto dropwithreset;
1870 }
1871 }
1872 }
1873 /*
1874 * If segment contains data or ACK, will call tcp_reass()
1875 * later; if not, do so now to pass queued data to user.
1876 */
1877 if (tlen == 0 && (thflags & TH_FIN) == 0)
1878 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
1879 (otMessage*)0, 0, sig);
1880
1881 tp->snd_wl1 = th->th_seq - 1;
1882 /* FALLTHROUGH */
1883
1884 /*
1885 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1886 * ACKs. If the ack is in the range
1887 * tp->snd_una < th->th_ack <= tp->snd_max
1888 * then advance tp->snd_una to th->th_ack and drop
1889 * data from the retransmission queue. If this ACK reflects
1890 * more up to date window information we update our window information.
1891 */
1892 case TCPS_ESTABLISHED:
1893 case TCPS_FIN_WAIT_1:
1894 case TCPS_FIN_WAIT_2:
1895 case TCPS_CLOSE_WAIT:
1896 case TCPS_CLOSING:
1897 case TCPS_LAST_ACK:
1898 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1899 goto dropafterack;
1900 }
1901
1902 if ((tp->t_flags & TF_SACK_PERMIT) &&
1903 ((to.to_flags & TOF_SACK) ||
1904 !TAILQ_EMPTY(&tp->snd_holes)))
1905 tcp_sack_doack(tp, &to, th->th_ack);
1906
1907 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1908 if (tlen == 0 && tiwin == tp->snd_wnd) {
1909 /*
1910 * If this is the first time we've seen a
1911 * FIN from the remote, this is not a
1912 * duplicate and it needs to be processed
1913 * normally. This happens during a
1914 * simultaneous close.
1915 */
1916 if ((thflags & TH_FIN) &&
1917 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
1918 tp->t_dupacks = 0;
1919 break;
1920 }
1921 /*
1922 * If we have outstanding data (other than
1923 * a window probe), this is a completely
1924 * duplicate ack (ie, window info didn't
1925 * change and FIN isn't set),
1926 * the ack is the biggest we've
1927 * seen and we've seen exactly our rexmt
1928 * threshold of them, assume a packet
1929 * has been dropped and retransmit it.
1930 * Kludge snd_nxt & the congestion
1931 * window so we send only this one
1932 * packet.
1933 *
1934 * We know we're losing at the current
1935 * window size so do congestion avoidance
1936 * (set ssthresh to half the current window
1937 * and pull our congestion window back to
1938 * the new ssthresh).
1939 *
1940 * Dup acks mean that packets have left the
1941 * network (they're now cached at the receiver)
1942 * so bump cwnd by the amount in the receiver
1943 * to keep a constant cwnd packets in the
1944 * network.
1945 *
1946 * When using TCP ECN, notify the peer that
1947 * we reduced the cwnd.
1948 */
1949 if (!tcp_timer_active(tp, TT_REXMT) ||
1950 th->th_ack != tp->snd_una)
1951 tp->t_dupacks = 0;
1952 else if (++tp->t_dupacks > tcprexmtthresh ||
1953 IN_FASTRECOVERY(tp->t_flags)) {
1954 cc_ack_received(tp, th, CC_DUPACK);
1955 if ((tp->t_flags & TF_SACK_PERMIT) &&
1956 IN_FASTRECOVERY(tp->t_flags)) {
1957 int awnd;
1958
1959 /*
1960 * Compute the amount of data in flight first.
1961 * We can inject new data into the pipe iff
1962 * we have less than 1/2 the original window's
1963 * worth of data in flight.
1964 */
1965 awnd = (tp->snd_nxt - tp->snd_fack) +
1966 tp->sackhint.sack_bytes_rexmit;
1967 if (awnd < tp->snd_ssthresh) {
1968 tp->snd_cwnd += tp->t_maxseg;
1969 if (tp->snd_cwnd > tp->snd_ssthresh)
1970 tp->snd_cwnd = tp->snd_ssthresh;
1971 }
1972 } else
1973 tp->snd_cwnd += tp->t_maxseg;
1974 #ifdef INSTRUMENT_TCP
1975 tcplp_sys_log("TCP DUPACK");
1976 #endif
1977 (void) tcp_output(tp);
1978 goto drop;
1979 } else if (tp->t_dupacks == tcprexmtthresh) {
1980 tcp_seq onxt = tp->snd_nxt;
1981
1982 /*
1983 * If we're doing sack, check to
1984 * see if we're already in sack
1985 * recovery. If we're not doing sack,
1986 * check to see if we're in newreno
1987 * recovery.
1988 */
1989 if (tp->t_flags & TF_SACK_PERMIT) {
1990 if (IN_FASTRECOVERY(tp->t_flags)) {
1991 tp->t_dupacks = 0;
1992 break;
1993 }
1994 } else {
1995 if (SEQ_LEQ(th->th_ack,
1996 tp->snd_recover)) {
1997 tp->t_dupacks = 0;
1998 break;
1999 }
2000 }
2001 /* Congestion signal before ack. */
2002 cc_cong_signal(tp, th, CC_NDUPACK);
2003 cc_ack_received(tp, th, CC_DUPACK);
2004 tcp_timer_activate(tp, TT_REXMT, 0);
2005 tp->t_rtttime = 0;
2006
2007 #ifdef INSTRUMENT_TCP
2008 tcplp_sys_log("TCP DUPACK_THRESH");
2009 #endif
2010 if (tp->t_flags & TF_SACK_PERMIT) {
2011 tp->sack_newdata = tp->snd_nxt;
2012 tp->snd_cwnd = tp->t_maxseg;
2013 (void) tcp_output(tp);
2014 goto drop;
2015 }
2016
2017 tp->snd_nxt = th->th_ack;
2018 tp->snd_cwnd = tp->t_maxseg;
2019 (void) tcp_output(tp);
2020 /*
2021 * samkumar: I added casts to uint64_t below to
2022 * fix an OpenThread code scanning alert relating
2023 * to integer overflow in multiplication.
2024 */
2025 tp->snd_cwnd = tp->snd_ssthresh +
2026 ((uint64_t) tp->t_maxseg) *
2027 ((uint64_t) (tp->t_dupacks - tp->snd_limited));
2028 #ifdef INSTRUMENT_TCP
2029 tcplp_sys_log("TCP SET_cwnd %d", (int) tp->snd_cwnd);
2030 #endif
2031 if (SEQ_GT(onxt, tp->snd_nxt))
2032 tp->snd_nxt = onxt;
2033 goto drop;
2034 } else if (V_tcp_do_rfc3042) {
2035 /*
2036 * Process first and second duplicate
2037 * ACKs. Each indicates a segment
2038 * leaving the network, creating room
2039 * for more. Make sure we can send a
2040 * packet on reception of each duplicate
2041 * ACK by increasing snd_cwnd by one
2042 * segment. Restore the original
2043 * snd_cwnd after packet transmission.
2044 */
2045 uint64_t oldcwnd;
2046 tcp_seq oldsndmax;
2047 uint32_t sent;
2048 int avail;
2049 cc_ack_received(tp, th, CC_DUPACK);
2050 oldcwnd = tp->snd_cwnd;
2051 oldsndmax = tp->snd_max;
2052
2053 #ifdef INSTRUMENT_TCP
2054 tcplp_sys_log("TCP LIM_TRANS");
2055 #endif
2056
2057 KASSERT(tp->t_dupacks == 1 ||
2058 tp->t_dupacks == 2,
2059 ("%s: dupacks not 1 or 2",
2060 __func__));
2061 if (tp->t_dupacks == 1)
2062 tp->snd_limited = 0;
2063 tp->snd_cwnd =
2064 (tp->snd_nxt - tp->snd_una) +
2065 (tp->t_dupacks - tp->snd_limited) *
2066 tp->t_maxseg;
2067 /*
2068 * Only call tcp_output when there
2069 * is new data available to be sent.
2070 * Otherwise we would send pure ACKs.
2071 */
2072 /*
2073 * samkumar: Replace sbavail(&so->so_snd) with the call to
2074 * lbuf_used_space.
2075 */
2076 avail = lbuf_used_space(&tp->sendbuf) -
2077 (tp->snd_nxt - tp->snd_una);
2078 if (avail > 0)
2079 (void) tcp_output(tp);
2080 sent = tp->snd_max - oldsndmax;
2081 if (sent > tp->t_maxseg) {
2082 KASSERT((tp->t_dupacks == 2 &&
2083 tp->snd_limited == 0) ||
2084 (sent == tp->t_maxseg + 1 &&
2085 tp->t_flags & TF_SENTFIN),
2086 ("%s: sent too much",
2087 __func__));
2088 tp->snd_limited = 2;
2089 } else if (sent > 0)
2090 ++tp->snd_limited;
2091 tp->snd_cwnd = oldcwnd;
2092 #ifdef INSTRUMENT_TCP
2093 tcplp_sys_log("TCP RESET_cwnd %d", (int) tp->snd_cwnd);
2094 #endif
2095 goto drop;
2096 }
2097 } else
2098 tp->t_dupacks = 0;
2099 break;
2100 }
2101
2102 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2103 ("%s: th_ack <= snd_una", __func__));
2104
2105 /*
2106 * If the congestion window was inflated to account
2107 * for the other side's cached packets, retract it.
2108 */
2109 if (IN_FASTRECOVERY(tp->t_flags)) {
2110 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2111 if (tp->t_flags & TF_SACK_PERMIT)
2112 tcp_sack_partialack(tp, th);
2113 else
2114 tcp_newreno_partial_ack(tp, th);
2115 } else
2116 cc_post_recovery(tp, th);
2117 }
2118
2119 tp->t_dupacks = 0;
2120 /*
2121 * If we reach this point, ACK is not a duplicate,
2122 * i.e., it ACKs something we sent.
2123 */
2124 if (tp->t_flags & TF_NEEDSYN) {
2125 /*
2126 * T/TCP: Connection was half-synchronized, and our
2127 * SYN has been ACK'd (so connection is now fully
2128 * synchronized). Go to non-starred state,
2129 * increment snd_una for ACK of SYN, and check if
2130 * we can do window scaling.
2131 */
2132 tp->t_flags &= ~TF_NEEDSYN;
2133 tp->snd_una++;
2134 /* Do window scaling? */
2135 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2136 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2137 tp->rcv_scale = tp->request_r_scale;
2138 /* Send window already scaled. */
2139 }
2140 }
2141
2142 process_ACK:
2143 acked = BYTES_THIS_ACK(tp, th);
2144
2145 tcplp_sys_log("Bytes acked: %d", acked);
2146 /*
2147 * If we just performed our first retransmit, and the ACK
2148 * arrives within our recovery window, then it was a mistake
2149 * to do the retransmit in the first place. Recover our
2150 * original cwnd and ssthresh, and proceed to transmit where
2151 * we left off.
2152 */
2153 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
2154 (int)(ticks - tp->t_badrxtwin) < 0)
2155 cc_cong_signal(tp, th, CC_RTO_ERR);
2156
2157 /*
2158 * If we have a timestamp reply, update smoothed
2159 * round trip time. If no timestamp is present but
2160 * transmit timer is running and timed sequence
2161 * number was acked, update smoothed round trip time.
2162 * Since we now have an rtt measurement, cancel the
2163 * timer backoff (cf., Phil Karn's retransmit alg.).
2164 * Recompute the initial retransmit timer.
2165 *
2166 * Some boxes send broken timestamp replies
2167 * during the SYN+ACK phase, ignore
2168 * timestamps of 0 or we could calculate a
2169 * huge RTT and blow up the retransmit timer.
2170 */
2171
2172 if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
2173 uint32_t t;
2174
2175 t = tcp_ts_getticks() - to.to_tsecr;
2176 if (!tp->t_rttlow || tp->t_rttlow > t)
2177 tp->t_rttlow = t;
2178 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
2179 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2180 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2181 tp->t_rttlow = ticks - tp->t_rtttime;
2182 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2183 }
2184
2185 /*
2186 * If all outstanding data is acked, stop retransmit
2187 * timer and remember to restart (more output or persist).
2188 * If there is more data to be acked, restart retransmit
2189 * timer, using current (possibly backed-off) value.
2190 */
2191 if (th->th_ack == tp->snd_max) {
2192 tcp_timer_activate(tp, TT_REXMT, 0);
2193 needoutput = 1;
2194 } else if (!tcp_timer_active(tp, TT_PERSIST)) {
2195 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
2196 }
2197
2198 /*
2199 * If no data (only SYN) was ACK'd,
2200 * skip rest of ACK processing.
2201 */
2202 if (acked == 0)
2203 goto step6;
2204
2205 /*
2206 * Let the congestion control algorithm update congestion
2207 * control related information. This typically means increasing
2208 * the congestion window.
2209 */
2210 cc_ack_received(tp, th, CC_ACK);
2211
2212 /*
2213 * samkumar: I replaced the calls to sbavail(&so->so_snd) with new
2214 * calls to lbuf_used_space, and then I modified the code to actually
2215 * remove code from the send buffer, formerly done via
2216 * sbcut_locked(&so->so_send, (int)sbavail(&so->so_snd)) in the if case
2217 * and sbcut_locked(&so->so_snd, acked) in the else case, to use the
2218 * data structures for TCPlp's data buffering.
2219 */
2220 if (acked > lbuf_used_space(&tp->sendbuf)) {
2221 uint32_t poppedbytes;
2222 uint32_t usedspace = lbuf_used_space(&tp->sendbuf);
2223 tp->snd_wnd -= usedspace;
2224 poppedbytes = lbuf_pop(&tp->sendbuf, usedspace, &sig->links_popped);
2225 KASSERT(poppedbytes == usedspace, ("Could not fully empty send buffer"));
2226 sig->bytes_acked += poppedbytes;
2227 ourfinisacked = 1;
2228 } else {
2229 uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped);
2230 KASSERT(poppedbytes == acked, ("Could not remove acked bytes from send buffer"));
2231 sig->bytes_acked += poppedbytes;
2232 tp->snd_wnd -= acked;
2233 ourfinisacked = 0;
2234 }
2235 /* NB: sowwakeup_locked() does an implicit unlock. */
2236 /*
2237 * samkumar: There used to be a call to sowwakeup(so); here,
2238 * which wakes up any threads waiting for the socket to
2239 * become ready for writing. TCPlp handles its send buffer
2240 * differently so we do not need to replace this call with
2241 * specialized code to handle this.
2242 */
2243 /* Detect una wraparound. */
2244 if (!IN_RECOVERY(tp->t_flags) &&
2245 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2246 SEQ_LEQ(th->th_ack, tp->snd_recover))
2247 tp->snd_recover = th->th_ack - 1;
2248 /* XXXLAS: Can this be moved up into cc_post_recovery? */
2249 if (IN_RECOVERY(tp->t_flags) &&
2250 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2251 EXIT_RECOVERY(tp->t_flags);
2252 }
2253 tp->snd_una = th->th_ack;
2254 if (tp->t_flags & TF_SACK_PERMIT) {
2255 if (SEQ_GT(tp->snd_una, tp->snd_recover))
2256 tp->snd_recover = tp->snd_una;
2257 }
2258 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2259 tp->snd_nxt = tp->snd_una;
2260
2261 switch (tp->t_state) {
2262
2263 /*
2264 * In FIN_WAIT_1 STATE in addition to the processing
2265 * for the ESTABLISHED state if our FIN is now acknowledged
2266 * then enter FIN_WAIT_2.
2267 */
2268 case TCPS_FIN_WAIT_1:
2269 if (ourfinisacked) {
2270 /*
2271 * If we can't receive any more
2272 * data, then closing user can proceed.
2273 * Starting the timer is contrary to the
2274 * specification, but if we don't get a FIN
2275 * we'll hang forever.
2276 *
2277 * XXXjl:
2278 * we should release the tp also, and use a
2279 * compressed state.
2280 */
2281 /*
2282 * samkumar: I replaced a check for the SBS_CANTRCVMORE flag
2283 * in so->so_rcv.sb_state with a call to tpiscantrcv.
2284 */
2285 if (tpiscantrcv(tp)) {
2286 /* samkumar: Removed a call to soisdisconnected(so). */
2287 tcp_timer_activate(tp, TT_2MSL,
2288 (tcp_fast_finwait2_recycle ?
2289 tcp_finwait2_timeout :
2290 TP_MAXIDLE(tp)));
2291 }
2292 tcp_state_change(tp, TCPS_FIN_WAIT_2);
2293 }
2294 break;
2295
2296 /*
2297 * In CLOSING STATE in addition to the processing for
2298 * the ESTABLISHED state if the ACK acknowledges our FIN
2299 * then enter the TIME-WAIT state, otherwise ignore
2300 * the segment.
2301 */
2302 case TCPS_CLOSING:
2303 if (ourfinisacked) {
2304 /*
2305 * samkumar: I added the line below. We need to avoid sending
2306 * an ACK in the TIME-WAIT state, since we don't want to
2307 * ACK ACKs. This edge case appears because TCPlp, unlike the
2308 * original FreeBSD code, uses tcpcbs for connections in the
2309 * TIME-WAIT state (FreeBSD uses a different, smaller
2310 * structure).
2311 */
2312 tp->t_flags &= ~TF_ACKNOW;
2313 tcp_twstart(tp);
2314 return;
2315 }
2316 break;
2317
2318 /*
2319 * In LAST_ACK, we may still be waiting for data to drain
2320 * and/or to be acked, as well as for the ack of our FIN.
2321 * If our FIN is now acknowledged, delete the TCB,
2322 * enter the closed state and return.
2323 */
2324 case TCPS_LAST_ACK:
2325 if (ourfinisacked) {
2326 tp = tcp_close(tp);
2327 tcplp_sys_connection_lost(tp, CONN_LOST_NORMAL);
2328 goto drop;
2329 }
2330 break;
2331 }
2332 }
2333
2334 step6:
2335
2336 /*
2337 * Update window information.
2338 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2339 */
2340 if ((thflags & TH_ACK) &&
2341 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2342 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2343 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2344 /* keep track of pure window updates */
2345 /*
2346 * samkumar: There used to be an if statement here that would check if
2347 * this is a "pure" window update (tlen == 0 &&
2348 * tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) and keep
2349 * statistics for how often that happens.
2350 */
2351 tp->snd_wnd = tiwin;
2352 tp->snd_wl1 = th->th_seq;
2353 tp->snd_wl2 = th->th_ack;
2354 if (tp->snd_wnd > tp->max_sndwnd)
2355 tp->max_sndwnd = tp->snd_wnd;
2356 needoutput = 1;
2357 }
2358
2359 /*
2360 * Process segments with URG.
2361 */
2362 /*
2363 * samkumar: TCPlp does not support the urgent pointer, so we omit all
2364 * urgent-pointer-related processing and buffering. The code below is the
2365 * code that was in the "else" case that handles no valid urgent data in
2366 * the received packet.
2367 */
2368 {
2369 /*
2370 * If no out of band data is expected,
2371 * pull receive urgent pointer along
2372 * with the receive window.
2373 */
2374 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2375 tp->rcv_up = tp->rcv_nxt;
2376 }
2377
2378 /*
2379 * Process the segment text, merging it into the TCP sequencing queue,
2380 * and arranging for acknowledgment of receipt if necessary.
2381 * This process logically involves adjusting tp->rcv_wnd as data
2382 * is presented to the user (this happens in tcp_usrreq.c,
2383 * case PRU_RCVD). If a FIN has already been received on this
2384 * connection then we just ignore the text.
2385 */
2386 if ((tlen || (thflags & TH_FIN)) &&
2387 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2388 tcp_seq save_start = th->th_seq;
2389 /*
2390 * samkumar: I removed a call to m_adj(m, drop_hdrlen), which intends
2391 * to drop data from the mbuf so it can be chained into the receive
2392 * header. This is not necessary for TCPlp because we copy the data
2393 * anyway; we just add the offset when copying data into the receive
2394 * buffer.
2395 */
2396 /*
2397 * Insert segment which includes th into TCP reassembly queue
2398 * with control block tp. Set thflags to whether reassembly now
2399 * includes a segment with FIN. This handles the common case
2400 * inline (segment is the next to be received on an established
2401 * connection, and the queue is empty), avoiding linkage into
2402 * and removal from the queue and repetition of various
2403 * conversions.
2404 * Set DELACK for segments received in order, but ack
2405 * immediately when segments are out of order (so
2406 * fast retransmit can work).
2407 */
2408 /*
2409 * samkumar: I replaced LIST_EMPTY(&tp->t_segq) with the calls to
2410 * tpiscantrcv and bmp_isempty on the second line below.
2411 */
2412 if (th->th_seq == tp->rcv_nxt &&
2413 (tpiscantrcv(tp) || bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp))) &&
2414 TCPS_HAVEESTABLISHED(tp->t_state)) {
2415 if (DELAY_ACK(tp, tlen))
2416 tp->t_flags |= TF_DELACK;
2417 else
2418 tp->t_flags |= TF_ACKNOW;
2419 tp->rcv_nxt += tlen;
2420 thflags = th->th_flags & TH_FIN;
2421
2422 /*
2423 * samkumar: I replaced the code that used to be here (which would
2424 * free the mbuf with m_freem(m) if the SBS_CANTRCVMORE flag is set
2425 * on so->so_rcv.sb_state, and otherwise call
2426 * sbappendstream_locked(&so->so_rcv, m, 0);).
2427 */
2428 if (!tpiscantrcv(tp)) {
2429 cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message);
2430 if (tlen > 0) {
2431 sig->recvbuf_added = true;
2432 }
2433 } else if (tlen > 0) {
2434 /*
2435 * samkumar: We already know tlen != 0, so if we got here, then
2436 * it means that we got data after we called SHUT_RD, or after
2437 * receiving a FIN. I'm going to drop the connection in this
2438 * case. I think FreeBSD might have just dropped the packet
2439 * silently, but Linux handles it this way; this seems to be
2440 * the right approach to me.
2441 */
2442 tcp_drop(tp, ECONNABORTED);
2443 goto drop;
2444 }
2445 /* NB: sorwakeup_locked() does an implicit unlock. */
2446 /*
2447 * samkumar: There used to be a call to sorwakeup_locked(so); here,
2448 * which wakes up any threads waiting for the socket to become
2449 * ready for reading. TCPlp handles its buffering
2450 * differently so we do not need to replace this call with
2451 * specialized code to handle this.
2452 */
2453 } else if (tpiscantrcv(tp)) {
2454 /*
2455 * samkumar: We will reach this point if we get out-of-order data
2456 * on a socket which was shut down with SHUT_RD, or where we
2457 * already received a FIN. My response here is to drop the segment
2458 * and send an RST.
2459 */
2460 tcp_drop(tp, ECONNABORTED);
2461 goto drop;
2462 } else {
2463 /*
2464 * XXX: Due to the header drop above "th" is
2465 * theoretically invalid by now. Fortunately
2466 * m_adj() doesn't actually free any mbufs
2467 * when trimming from the head.
2468 */
2469 thflags = tcp_reass(tp, th, &tlen, msg, otMessageGetOffset(msg) + drop_hdrlen, sig);
2470 tp->t_flags |= TF_ACKNOW;
2471 }
2472 // Only place tlen is used after the call to tcp_reass is below
2473 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
2474 tcp_update_sack_list(tp, save_start, save_start + tlen);
2475 /*
2476 * samkumar: This is not me commenting things out; this was already
2477 * commented out in the FreeBSD code.
2478 */
2479 #if 0
2480 /*
2481 * Note the amount of data that peer has sent into
2482 * our window, in order to estimate the sender's
2483 * buffer size.
2484 * XXX: Unused.
2485 */
2486 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
2487 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2488 else
2489 len = so->so_rcv.sb_hiwat;
2490 #endif
2491 } else {
2492 thflags &= ~TH_FIN;
2493 }
2494
2495 /*
2496 * If FIN is received ACK the FIN and let the user know
2497 * that the connection is closing.
2498 */
2499 if (thflags & TH_FIN) {
2500 tcplp_sys_log("FIN Processing start");
2501 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2502 /* samkumar: replace socantrcvmore with tpcantrcvmore */
2503 tpcantrcvmore(tp);
2504 /*
2505 * If connection is half-synchronized
2506 * (ie NEEDSYN flag on) then delay ACK,
2507 * so it may be piggybacked when SYN is sent.
2508 * Otherwise, since we received a FIN then no
2509 * more input can be expected, send ACK now.
2510 */
2511 if (tp->t_flags & TF_NEEDSYN)
2512 tp->t_flags |= TF_DELACK;
2513 else
2514 tp->t_flags |= TF_ACKNOW;
2515 tp->rcv_nxt++;
2516 }
2517 /*
2518 * samkumar: This -2 state is added by me, so that we do not consider
2519 * any more FINs in reassembly.
2520 */
2521 if (tp->reass_fin_index != -2) {
2522 sig->rcvd_fin = true;
2523 tp->reass_fin_index = -2;
2524 }
2525 switch (tp->t_state) {
2526
2527 /*
2528 * In SYN_RECEIVED and ESTABLISHED STATES
2529 * enter the CLOSE_WAIT state.
2530 */
2531 case TCPS_SYN_RECEIVED:
2532 tp->t_starttime = ticks;
2533 /* FALLTHROUGH */
2534 case TCPS_ESTABLISHED:
2535 tcp_state_change(tp, TCPS_CLOSE_WAIT);
2536 break;
2537
2538 /*
2539 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2540 * enter the CLOSING state.
2541 */
2542 case TCPS_FIN_WAIT_1:
2543 tcp_state_change(tp, TCPS_CLOSING);
2544 break;
2545
2546 /*
2547 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2548 * starting the time-wait timer, turning off the other
2549 * standard timers.
2550 */
2551 case TCPS_FIN_WAIT_2:
2552 tcp_twstart(tp);
2553 return;
2554 }
2555 }
2556
2557 /*
2558 * samkumar: Remove code for synchronization and debugging, here and in
2559 * the labels below. I also removed the line to free the mbuf if it hasn't
2560 * been freed already (the line was "m_freem(m)").
2561 */
2562 /*
2563 * Return any desired output.
2564 */
2565 if (needoutput || (tp->t_flags & TF_ACKNOW))
2566 (void) tcp_output(tp);
2567
2568 check_delack:
2569 if (tp->t_flags & TF_DELACK) {
2570 tp->t_flags &= ~TF_DELACK;
2571 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2572 }
2573 return;
2574
2575 dropafterack:
2576 /*
2577 * Generate an ACK dropping incoming segment if it occupies
2578 * sequence space, where the ACK reflects our state.
2579 *
2580 * We can now skip the test for the RST flag since all
2581 * paths to this code happen after packets containing
2582 * RST have been dropped.
2583 *
2584 * In the SYN-RECEIVED state, don't send an ACK unless the
2585 * segment we received passes the SYN-RECEIVED ACK test.
2586 * If it fails send a RST. This breaks the loop in the
2587 * "LAND" DoS attack, and also prevents an ACK storm
2588 * between two listening ports that have been sent forged
2589 * SYN segments, each with the source address of the other.
2590 */
2591 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2592 (SEQ_GT(tp->snd_una, th->th_ack) ||
2593 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2594 rstreason = BANDLIM_RST_OPENPORT;
2595 goto dropwithreset;
2596 }
2597
2598 tp->t_flags |= TF_ACKNOW;
2599 (void) tcp_output(tp);
2600 return;
2601
2602 dropwithreset:
2603 if (tp != NULL) {
2604 tcp_dropwithreset(ip6, th, tp, instance, tlen, rstreason);
2605 } else
2606 tcp_dropwithreset(ip6, th, NULL, instance, tlen, rstreason);
2607 return;
2608
2609 drop:
2610 return;
2611 }
2612
2613 /*
2614 * Parse TCP options and place in tcpopt.
2615 */
2616 static void
tcp_dooptions(struct tcpopt * to,uint8_t * cp,int cnt,int flags)2617 tcp_dooptions(struct tcpopt *to, uint8_t *cp, int cnt, int flags)
2618 {
2619 int opt, optlen;
2620
2621 to->to_flags = 0;
2622 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2623 opt = cp[0];
2624 if (opt == TCPOPT_EOL)
2625 break;
2626 if (opt == TCPOPT_NOP)
2627 optlen = 1;
2628 else {
2629 if (cnt < 2)
2630 break;
2631 optlen = cp[1];
2632 if (optlen < 2 || optlen > cnt)
2633 break;
2634 }
2635 switch (opt) {
2636 case TCPOPT_MAXSEG:
2637 if (optlen != TCPOLEN_MAXSEG)
2638 continue;
2639 if (!(flags & TO_SYN))
2640 continue;
2641 to->to_flags |= TOF_MSS;
2642 bcopy((char *)cp + 2,
2643 (char *)&to->to_mss, sizeof(to->to_mss));
2644 to->to_mss = ntohs(to->to_mss);
2645 break;
2646 case TCPOPT_WINDOW:
2647 if (optlen != TCPOLEN_WINDOW)
2648 continue;
2649 if (!(flags & TO_SYN))
2650 continue;
2651 to->to_flags |= TOF_SCALE;
2652 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
2653 break;
2654 case TCPOPT_TIMESTAMP:
2655 if (optlen != TCPOLEN_TIMESTAMP)
2656 continue;
2657 to->to_flags |= TOF_TS;
2658 bcopy((char *)cp + 2,
2659 (char *)&to->to_tsval, sizeof(to->to_tsval));
2660 to->to_tsval = ntohl(to->to_tsval);
2661 bcopy((char *)cp + 6,
2662 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2663 to->to_tsecr = ntohl(to->to_tsecr);
2664 break;
2665 #ifdef TCP_SIGNATURE
2666 /*
2667 * XXX In order to reply to a host which has set the
2668 * TCP_SIGNATURE option in its initial SYN, we have to
2669 * record the fact that the option was observed here
2670 * for the syncache code to perform the correct response.
2671 */
2672 case TCPOPT_SIGNATURE:
2673 if (optlen != TCPOLEN_SIGNATURE)
2674 continue;
2675 to->to_flags |= TOF_SIGNATURE;
2676 to->to_signature = cp + 2;
2677 break;
2678 #endif
2679 case TCPOPT_SACK_PERMITTED:
2680 if (optlen != TCPOLEN_SACK_PERMITTED)
2681 continue;
2682 if (!(flags & TO_SYN))
2683 continue;
2684 if (!V_tcp_do_sack)
2685 continue;
2686 to->to_flags |= TOF_SACKPERM;
2687 break;
2688 case TCPOPT_SACK:
2689 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2690 continue;
2691 if (flags & TO_SYN)
2692 continue;
2693 to->to_flags |= TOF_SACK;
2694 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
2695 to->to_sacks = cp + 2;
2696 break;
2697 default:
2698 continue;
2699 }
2700 }
2701 }
2702
2703
2704 /*
2705 * Collect new round-trip time estimate
2706 * and update averages and current timeout.
2707 */
2708 static void
tcp_xmit_timer(struct tcpcb * tp,int rtt)2709 tcp_xmit_timer(struct tcpcb *tp, int rtt)
2710 {
2711 int delta;
2712
2713 tp->t_rttupdated++;
2714 if (tp->t_srtt != 0) {
2715 /*
2716 * srtt is stored as fixed point with 5 bits after the
2717 * binary point (i.e., scaled by 8). The following magic
2718 * is equivalent to the smoothing algorithm in rfc793 with
2719 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2720 * point). Adjust rtt to origin 0.
2721 */
2722 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2723 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2724
2725 if ((tp->t_srtt += delta) <= 0)
2726 tp->t_srtt = 1;
2727
2728 /*
2729 * We accumulate a smoothed rtt variance (actually, a
2730 * smoothed mean difference), then set the retransmit
2731 * timer to smoothed rtt + 4 times the smoothed variance.
2732 * rttvar is stored as fixed point with 4 bits after the
2733 * binary point (scaled by 16). The following is
2734 * equivalent to rfc793 smoothing with an alpha of .75
2735 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2736 * rfc793's wired-in beta.
2737 */
2738 if (delta < 0)
2739 delta = -delta;
2740 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2741 if ((tp->t_rttvar += delta) <= 0)
2742 tp->t_rttvar = 1;
2743 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
2744 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2745 } else {
2746 /*
2747 * No rtt measurement yet - use the unsmoothed rtt.
2748 * Set the variance to half the rtt (so our first
2749 * retransmit happens at 3*rtt).
2750 */
2751 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2752 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2753 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2754 }
2755 tp->t_rtttime = 0;
2756 tp->t_rxtshift = 0;
2757
2758 /*
2759 * the retransmit should happen at rtt + 4 * rttvar.
2760 * Because of the way we do the smoothing, srtt and rttvar
2761 * will each average +1/2 tick of bias. When we compute
2762 * the retransmit timer, we want 1/2 tick of rounding and
2763 * 1 extra tick because of +-1/2 tick uncertainty in the
2764 * firing of the timer. The bias will give us exactly the
2765 * 1.5 tick we need. But, because the bias is
2766 * statistical, we have to test that we don't drop below
2767 * the minimum feasible timer (which is 2 ticks).
2768 */
2769 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2770 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2771
2772 #ifdef INSTRUMENT_TCP
2773 tcplp_sys_log("TCP timer %u %d %d %d", (unsigned int) tcplp_sys_get_millis(), rtt, (int) tp->t_srtt, (int) tp->t_rttvar);
2774 #endif
2775
2776
2777 /*
2778 * We received an ack for a packet that wasn't retransmitted;
2779 * it is probably safe to discard any error indications we've
2780 * received recently. This isn't quite right, but close enough
2781 * for now (a route might have failed after we sent a segment,
2782 * and the return path might not be symmetrical).
2783 */
2784 tp->t_softerror = 0;
2785 }
2786
/*
 * samkumar: Adapted from netinet6/in6.c.
 *
 * Report whether the given IPv6 address is obviously local to this host.
 *
 * The original function also walked the addresses assigned to interfaces;
 * that part was removed because it is hard for TCPlp to determine.  What
 * remains is deliberately conservative: only loopback and link-local
 * addresses are treated as local.  The result is used only as a hint, since
 * the MSS is clamped at V_tcp_v6mssdflt for connections to non-local
 * addresses.  A false negative therefore at worst clamps conservatively,
 * which is acceptable for TCPlp; the constants are chosen so that the
 * outcome is the same whether or not the clamp applies.
 *
 * Returns 1 for a loopback or link-local address, 0 otherwise.
 */
int
in6_localaddr(struct in6_addr *in6)
{
	return (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6));
}
2809
2810 /*
2811 * Determine a reasonable value for maxseg size.
2812 * If the route is known, check route for mtu.
2813 * If none, use an mss that can be handled on the outgoing interface
2814 * without forcing IP to fragment. If no route is found, route has no mtu,
2815 * or the destination isn't local, use a default, hopefully conservative
2816 * size (usually 512 or the default IP max size, but no more than the mtu
2817 * of the interface), as we can't discover anything about intervening
2818 * gateways or networks. We also initialize the congestion/slow start
2819 * window to be a single segment if the destination isn't local.
2820 * While looking at the routing entry, we also initialize other path-dependent
2821 * parameters from pre-set or cached values in the routing entry.
2822 *
2823 * Also take into account the space needed for options that we
2824 * send regularly. Make maxseg shorter by that amount to assure
2825 * that we can send maxseg amount of data even when the options
2826 * are present. Store the upper limit of the length of options plus
2827 * data in maxopd.
2828 *
2829 * NOTE that this routine is only called when we process an incoming
2830 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
2831 * settings are handled in tcp_mssopt().
2832 */
2833 /*
2834 * samkumar: Using struct tcpcb instead of the inpcb.
2835 */
2836 void
tcp_mss_update(struct tcpcb * tp,int offer,int mtuoffer,struct hc_metrics_lite * metricptr,struct tcp_ifcap * cap)2837 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
2838 struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
2839 {
2840 /*
2841 * samkumar: I removed all IPv4-specific logic and cases, including logic
2842 * to check for IPv4 vs. IPv6, as well as all locking and debugging code.
2843 */
2844 int mss = 0;
2845 uint64_t maxmtu = 0;
2846 struct hc_metrics_lite metrics;
2847 int origoffer;
2848 size_t min_protoh = IP6HDR_SIZE + sizeof (struct tcphdr);
2849
2850 if (mtuoffer != -1) {
2851 KASSERT(offer == -1, ("%s: conflict", __func__));
2852 offer = mtuoffer - min_protoh;
2853 }
2854 origoffer = offer;
2855
2856 maxmtu = tcp_maxmtu6(tp, cap);
2857 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
2858
2859 /*
2860 * No route to sender, stay with default mss and return.
2861 */
2862 if (maxmtu == 0) {
2863 /*
2864 * In case we return early we need to initialize metrics
2865 * to a defined state as tcp_hc_get() would do for us
2866 * if there was no cache hit.
2867 */
2868 if (metricptr != NULL)
2869 bzero(metricptr, sizeof(struct hc_metrics_lite));
2870 return;
2871 }
2872
2873 /* What have we got? */
2874 switch (offer) {
2875 case 0:
2876 /*
2877 * Offer == 0 means that there was no MSS on the SYN
2878 * segment, in this case we use tcp_mssdflt as
2879 * already assigned to t_maxopd above.
2880 */
2881 offer = tp->t_maxopd;
2882 break;
2883
2884 case -1:
2885 /*
2886 * Offer == -1 means that we didn't receive SYN yet.
2887 */
2888 /* FALLTHROUGH */
2889
2890 default:
2891 /*
2892 * Prevent DoS attack with too small MSS. Round up
2893 * to at least minmss.
2894 */
2895 offer = max(offer, V_tcp_minmss);
2896 }
2897
2898 /*
2899 * rmx information is now retrieved from tcp_hostcache.
2900 */
2901 tcp_hc_get(tp, &metrics);
2902 if (metricptr != NULL)
2903 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
2904
2905 /*
2906 * If there's a discovered mtu in tcp hostcache, use it.
2907 * Else, use the link mtu.
2908 */
2909 if (metrics.rmx_mtu)
2910 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
2911 else {
2912 mss = maxmtu - min_protoh;
2913 if (!V_path_mtu_discovery &&
2914 !in6_localaddr(&tp->faddr))
2915 mss = min(mss, V_tcp_v6mssdflt);
2916 /*
2917 * XXX - The above conditional (mss = maxmtu - min_protoh)
2918 * probably violates the TCP spec.
2919 * The problem is that, since we don't know the
2920 * other end's MSS, we are supposed to use a conservative
2921 * default. But, if we do that, then MTU discovery will
2922 * never actually take place, because the conservative
2923 * default is much less than the MTUs typically seen
2924 * on the Internet today. For the moment, we'll sweep
2925 * this under the carpet.
2926 *
2927 * The conservative default might not actually be a problem
2928 * if the only case this occurs is when sending an initial
2929 * SYN with options and data to a host we've never talked
2930 * to before. Then, they will reply with an MSS value which
2931 * will get recorded and the new parameters should get
2932 * recomputed. For Further Study.
2933 */
2934 }
2935 mss = min(mss, offer);
2936
2937 /*
2938 * Sanity check: make sure that maxopd will be large
2939 * enough to allow some data on segments even if the
2940 * all the option space is used (40bytes). Otherwise
2941 * funny things may happen in tcp_output.
2942 */
2943 /*
2944 * samkumar: When I was experimenting with different MSS values, I had
2945 * changed this to "mss = max(mss, TCP_MAXOLEN + 1);" but I am changing it
2946 * back for the version that will be merged into OpenThread.
2947 */
2948 mss = max(mss, 64);
2949
2950 /*
2951 * maxopd stores the maximum length of data AND options
2952 * in a segment; maxseg is the amount of data in a normal
2953 * segment. We need to store this value (maxopd) apart
2954 * from maxseg, because now every segment carries options
2955 * and thus we normally have somewhat less data in segments.
2956 */
2957 tp->t_maxopd = mss;
2958
2959 /*
2960 * origoffer==-1 indicates that no segments were received yet.
2961 * In this case we just guess.
2962 */
2963 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2964 (origoffer == -1 ||
2965 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2966 mss -= TCPOLEN_TSTAMP_APPA;
2967
2968 tp->t_maxseg = mss;
2969 }
2970
2971 void
tcp_mss(struct tcpcb * tp,int offer)2972 tcp_mss(struct tcpcb *tp, int offer)
2973 {
2974 struct hc_metrics_lite metrics;
2975 struct tcp_ifcap cap;
2976
2977 KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
2978
2979 bzero(&cap, sizeof(cap));
2980 tcp_mss_update(tp, offer, -1, &metrics, &cap);
2981
2982 /*
2983 * samkumar: There used to be code below that might modify the MSS, but I
2984 * removed all of it (see the comments below for the reason). It used to
2985 * read tp->t_maxseg into the local variable mss, modify mss, and then
2986 * reassign tp->t_maxseg to mss. I've kept the assignments, commented out,
2987 * for clarity.
2988 */
2989 //mss = tp->t_maxseg;
2990
2991 /*
2992 * If there's a pipesize, change the socket buffer to that size,
2993 * don't change if sb_hiwat is different than default (then it
2994 * has been changed on purpose with setsockopt).
2995 * Make the socket buffers an integral number of mss units;
2996 * if the mss is larger than the socket buffer, decrease the mss.
2997 */
2998
2999 /*
3000 * samkumar: There used to be code here would would limit the MSS to at
3001 * most the size of the send buffer, and then round up the send buffer to
3002 * a multiple of the MSS using
3003 * "sbreserve_locked(&so->so_snd, bufsize, so, NULL);". With TCPlp, we do
3004 * not do this, because the linked buffer used at the send buffer doesn't
3005 * have a real limit. Had we used a circular buffer, then limiting the MSS
3006 * to the buffer size would have made sense, but we still would not be able
3007 * to resize the send buffer because it is not allocated by TCPlp.
3008 */
3009
3010 /*
3011 * samkumar: See the comment above about me removing code that modifies
3012 * the MSS, making this assignment and the one above both unnecessary.
3013 */
3014 //tp->t_maxseg = mss;
3015
3016 /*
3017 * samkumar: There used to be code here that would round up the receive
3018 * buffer size to a multiple of the MSS, assuming that the receive buffer
3019 * size is bigger than the MSS. The new buffer size is set using
3020 * "sbreserve_locked(&so->so_rcv, bufsize, so, NULL);". In TCPlp, the
3021 * buffer is not allocated by TCPlp so I removed the code for this.
3022 */
3023 /*
3024 * samkumar: There used to be code here to handle TCP Segmentation
3025 * Offloading (TSO); I removed it becuase we don't support that in TCPlp.
3026 */
3027 }
3028
3029 /*
3030 * Determine the MSS option to send on an outgoing SYN.
3031 */
3032 /*
3033 * samkumar: In the signature, changed "struct in_conninfo *inc" to
3034 * "struct tcpcb* tp".
3035 */
3036 int
tcp_mssopt(struct tcpcb * tp)3037 tcp_mssopt(struct tcpcb* tp)
3038 {
3039 /*
3040 * samkumar: I removed all processing code specific to IPv4, or to decide
3041 * between IPv4 and IPv6. This is OK because TCPlp assumes IPv6.
3042 */
3043 int mss = 0;
3044 uint64_t maxmtu = 0;
3045 uint64_t thcmtu = 0;
3046 size_t min_protoh;
3047
3048 KASSERT(tp != NULL, ("tcp_mssopt with NULL tcpcb pointer"));
3049
3050 mss = V_tcp_v6mssdflt;
3051 maxmtu = tcp_maxmtu6(tp, NULL);
3052 min_protoh = IP6HDR_SIZE + sizeof(struct tcphdr);
3053
3054 thcmtu = tcp_hc_getmtu(tp); /* IPv4 and IPv6 */
3055
3056 if (maxmtu && thcmtu)
3057 mss = min(maxmtu, thcmtu) - min_protoh;
3058 else if (maxmtu || thcmtu)
3059 mss = max(maxmtu, thcmtu) - min_protoh;
3060
3061 return (mss);
3062 }
3063
3064 /*
3065 * On a partial ack arrives, force the retransmission of the
3066 * next unacknowledged segment. Do not clear tp->t_dupacks.
3067 * By setting snd_nxt to ti_ack, this forces retransmission timer to
3068 * be started again.
3069 */
3070 static void
tcp_newreno_partial_ack(struct tcpcb * tp,struct tcphdr * th)3071 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
3072 {
3073 tcp_seq onxt = tp->snd_nxt;
3074 uint64_t ocwnd = tp->snd_cwnd;
3075
3076 tcp_timer_activate(tp, TT_REXMT, 0);
3077 tp->t_rtttime = 0;
3078 tp->snd_nxt = th->th_ack;
3079 /*
3080 * Set snd_cwnd to one segment beyond acknowledged offset.
3081 * (tp->snd_una has not yet been updated when this function is called.)
3082 */
3083 tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
3084 tp->t_flags |= TF_ACKNOW;
3085 #ifdef INSTRUMENT_TCP
3086 tcplp_sys_log("TCP Partial_ACK");
3087 #endif
3088 (void) tcp_output(tp);
3089 tp->snd_cwnd = ocwnd;
3090 if (SEQ_GT(onxt, tp->snd_nxt))
3091 tp->snd_nxt = onxt;
3092 /*
3093 * Partial window deflation. Relies on fact that tp->snd_una
3094 * not updated yet.
3095 */
3096 if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
3097 tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
3098 else
3099 tp->snd_cwnd = 0;
3100 tp->snd_cwnd += tp->t_maxseg;
3101 #ifdef INSTRUMENT_TCP
3102 tcplp_sys_log("TCP Partial_ACK_final %d", (int) tp->snd_cwnd);
3103 #endif
3104 }
3105