1 /*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 2007-2008,2010
5 * Swinburne University of Technology, Melbourne, Australia.
6 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010 The FreeBSD Foundation
8 * Copyright (c) 2010-2011 Juniper Networks, Inc.
9 * All rights reserved.
10 *
11 * Portions of this software were developed at the Centre for Advanced Internet
12 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
13 * James Healy and David Hayes, made possible in part by a grant from the Cisco
14 * University Research Program Fund at Community Foundation Silicon Valley.
15 *
16 * Portions of this software were developed at the Centre for Advanced
17 * Internet Architectures, Swinburne University of Technology, Melbourne,
18 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
19 *
20 * Portions of this software were developed by Robert N. M. Watson under
21 * contract to Juniper Networks, Inc.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the above copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 4. Neither the name of the University nor the names of its contributors
32 * may be used to endorse or promote products derived from this software
33 * without specific prior written permission.
34 *
35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45 * SUCH DAMAGE.
46 *
47 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
48 */
49
50
51 /*
52 * Determine a reasonable value for maxseg size.
53 * If the route is known, check route for mtu.
54 * If none, use an mss that can be handled on the outgoing interface
55 * without forcing IP to fragment. If no route is found, route has no mtu,
56 * or the destination isn't local, use a default, hopefully conservative
57 * size (usually 512 or the default IP max size, but no more than the mtu
58 * of the interface), as we can't discover anything about intervening
59 * gateways or networks. We also initialize the congestion/slow start
60 * window to be a single segment if the destination isn't local.
61 * While looking at the routing entry, we also initialize other path-dependent
62 * parameters from pre-set or cached values in the routing entry.
63 *
64 * Also take into account the space needed for options that we
65 * send regularly. Make maxseg shorter by that amount to assure
66 * that we can send maxseg amount of data even when the options
67 * are present. Store the upper limit of the length of options plus
68 * data in maxopd.
69 *
70 * NOTE that this routine is only called when we process an incoming
71 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
72 * settings are handled in tcp_mssopt().
73 */
74
75 #include <errno.h>
76 #include <string.h>
77 #include <strings.h>
78
79 #include "tcp.h"
80 #include "tcp_fsm.h"
81 #include "tcp_seq.h"
82 #include "tcp_timer.h"
83 #include "tcp_var.h"
84 #include "tcp_fastopen.h"
85 #include "../lib/bitmap.h"
86 #include "../lib/cbuf.h"
87 #include "icmp_var.h"
88 #include "ip.h"
89 #include "ip6.h"
90 #include "sys/queue.h"
91
92 #include "tcp_const.h"
93
94 /* samkumar: Copied from in.h */
95 #define IPPROTO_DONE 267
96
97 /* samkumar: Copied from sys/libkern.h */
/* Return the larger of the two signed integers a and b. */
static int
imax(int a, int b)
{
	if (a > b)
		return (a);
	return (b);
}
/* Return the smaller of the two signed integers a and b. */
static int
imin(int a, int b)
{
	if (a < b)
		return (a);
	return (b);
}
100
/*
 * Return the smaller of the two signed integers a and b. Yields the same
 * result as imin(); the comparison is simply written out here.
 */
static int
min(int a, int b)
{
	return (b < a ? b : a);
}
102
103 static void tcp_dooptions(struct tcpopt *, uint8_t *, int, int);
104 static void
105 tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg,
106 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
107 struct tcplp_signals* sig);
108 static void tcp_xmit_timer(struct tcpcb *, int);
109 void tcp_hc_get(/*struct in_conninfo *inc*/ struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite);
110 static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
111
112 /*
113 * CC wrapper hook functions
114 */
115 static inline void
cc_ack_received(struct tcpcb * tp,struct tcphdr * th,uint16_t type)116 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
117 {
118 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
119 if (tp->snd_cwnd <= tp->snd_wnd)
120 tp->ccv->flags |= CCF_CWND_LIMITED;
121 else
122 tp->ccv->flags &= ~CCF_CWND_LIMITED;
123
124 if (type == CC_ACK) {
125 if (tp->snd_cwnd > tp->snd_ssthresh) {
126 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
127 V_tcp_abc_l_var * tp->t_maxseg);
128 if (tp->t_bytes_acked >= tp->snd_cwnd) {
129 tp->t_bytes_acked -= tp->snd_cwnd;
130 tp->ccv->flags |= CCF_ABC_SENTAWND;
131 }
132 } else {
133 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
134 tp->t_bytes_acked = 0;
135 }
136 }
137
138 if (CC_ALGO(tp)->ack_received != NULL) {
139 /* XXXLAS: Find a way to live without this */
140 tp->ccv->curack = th->th_ack;
141 CC_ALGO(tp)->ack_received(tp->ccv, type);
142 }
143 }
144
145 static inline void
cc_conn_init(struct tcpcb * tp)146 cc_conn_init(struct tcpcb *tp)
147 {
148 struct hc_metrics_lite metrics;
149 int rtt;
150
151 /*
152 * samkumar: remove locks, inpcb, and stats.
153 */
154
155 /* samkumar: Used to take &inp->inp_inc as an argument. */
156 tcp_hc_get(tp, &metrics);
157
158 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
159 tp->t_srtt = rtt;
160 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
161 if (metrics.rmx_rttvar) {
162 tp->t_rttvar = metrics.rmx_rttvar;
163 } else {
164 /* default variation is +- 1 rtt */
165 tp->t_rttvar =
166 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
167 }
168 TCPT_RANGESET(tp->t_rxtcur,
169 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
170 tp->t_rttmin, TCPTV_REXMTMAX);
171 }
172 if (metrics.rmx_ssthresh) {
173 /*
174 * There's some sort of gateway or interface
175 * buffer limit on the path. Use this to set
176 * the slow start threshhold, but set the
177 * threshold to no less than 2*mss.
178 */
179 tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
180 }
181
182 /*
183 * Set the initial slow-start flight size.
184 *
185 * RFC5681 Section 3.1 specifies the default conservative values.
186 * RFC3390 specifies slightly more aggressive values.
187 * RFC6928 increases it to ten segments.
188 * Support for user specified value for initial flight size.
189 *
190 * If a SYN or SYN/ACK was lost and retransmitted, we have to
191 * reduce the initial CWND to one segment as congestion is likely
192 * requiring us to be cautious.
193 */
194 if (tp->snd_cwnd == 1)
195 tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
196 else if (V_tcp_initcwnd_segments)
197 tp->snd_cwnd = min(V_tcp_initcwnd_segments * tp->t_maxseg,
198 max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
199 else if (V_tcp_do_rfc3390)
200 tp->snd_cwnd = min(4 * tp->t_maxseg,
201 max(2 * tp->t_maxseg, 4380));
202 else {
203 /* Per RFC5681 Section 3.1 */
204 if (tp->t_maxseg > 2190)
205 tp->snd_cwnd = 2 * tp->t_maxseg;
206 else if (tp->t_maxseg > 1095)
207 tp->snd_cwnd = 3 * tp->t_maxseg;
208 else
209 tp->snd_cwnd = 4 * tp->t_maxseg;
210 }
211
212 if (CC_ALGO(tp)->conn_init != NULL)
213 CC_ALGO(tp)->conn_init(tp->ccv);
214
215 /* samkumar: print statement for debugging. Resurrect with DEBUG macro? */
216 #ifdef INSTRUMENT_TCP
217 tcplp_sys_log("TCP CC_INIT %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
218 #endif
219 }
220
221 inline void
cc_cong_signal(struct tcpcb * tp,struct tcphdr * th,uint32_t type)222 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
223 {
224 /* samkumar: Remove locks and stats from this function. */
225
226 switch(type) {
227 case CC_NDUPACK:
228 if (!IN_FASTRECOVERY(tp->t_flags)) {
229 tp->snd_recover = tp->snd_max;
230 if (tp->t_flags & TF_ECN_PERMIT)
231 tp->t_flags |= TF_ECN_SND_CWR;
232 }
233 break;
234 case CC_ECN:
235 if (!IN_CONGRECOVERY(tp->t_flags)) {
236 tp->snd_recover = tp->snd_max;
237 if (tp->t_flags & TF_ECN_PERMIT)
238 tp->t_flags |= TF_ECN_SND_CWR;
239 }
240 break;
241 case CC_RTO:
242 tp->t_dupacks = 0;
243 tp->t_bytes_acked = 0;
244 EXIT_RECOVERY(tp->t_flags);
245 /*
246 * samkumar: I added the cast to uint64_t below to fix an OpenThread
247 * code scanning alert relating to integer overflow in multiplication.
248 */
249 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
250 tp->t_maxseg) * ((uint64_t) tp->t_maxseg);
251 tp->snd_cwnd = tp->t_maxseg;
252
253 /*
254 * samkumar: Stats for TCPlp: count the number of timeouts (RTOs).
255 * I've commented this out (with #if 0) because it isn't part of TCP
256 * functionality. At some point, we may want to bring it back to
257 * measure performance.
258 */
259 #if 0
260 tcplp_timeoutRexmitCnt++;
261 #endif
262 #ifdef INSTRUMENT_TCP
263 tcplp_sys_log("TCP CC_RTO %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
264 #endif
265 break;
266 case CC_RTO_ERR:
267 /* RTO was unnecessary, so reset everything. */
268 tp->snd_cwnd = tp->snd_cwnd_prev;
269 tp->snd_ssthresh = tp->snd_ssthresh_prev;
270 tp->snd_recover = tp->snd_recover_prev;
271 if (tp->t_flags & TF_WASFRECOVERY)
272 ENTER_FASTRECOVERY(tp->t_flags);
273 if (tp->t_flags & TF_WASCRECOVERY)
274 ENTER_CONGRECOVERY(tp->t_flags);
275 tp->snd_nxt = tp->snd_max;
276 tp->t_flags &= ~TF_PREVVALID;
277 tp->t_badrxtwin = 0;
278 #ifdef INSTRUMENT_TCP
279 tcplp_sys_log("TCP CC_RTO_ERR %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh);
280 #endif
281 break;
282 }
283
284 if (CC_ALGO(tp)->cong_signal != NULL) {
285 if (th != NULL)
286 tp->ccv->curack = th->th_ack;
287 CC_ALGO(tp)->cong_signal(tp->ccv, type);
288 }
289 }
290
291 static inline void
cc_post_recovery(struct tcpcb * tp,struct tcphdr * th)292 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
293 {
294 /* samkumar: remove lock */
295
296 /* XXXLAS: KASSERT that we're in recovery? */
297 if (CC_ALGO(tp)->post_recovery != NULL) {
298 tp->ccv->curack = th->th_ack;
299 CC_ALGO(tp)->post_recovery(tp->ccv);
300 }
301 /* XXXLAS: EXIT_RECOVERY ? */
302 tp->t_bytes_acked = 0;
303 }
304
305
/*
 * Indicate whether this ack should be delayed. We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled or this is a half-synchronized T/TCP
 *	  connection.
 *
 * Both macro parameters are parenthesized at every use so that arbitrary
 * argument expressions expand with the intended precedence. Note that tp
 * appears several times in the expansion, so it must not have side effects.
 */
#define DELAY_ACK(tp, tlen) \
	((!tcp_timer_active((tp), TT_DELACK) && \
	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) && \
	    ((tlen) <= (tp)->t_maxopd) && \
	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
322
323 static inline void
cc_ecnpkt_handler(struct tcpcb * tp,struct tcphdr * th,uint8_t iptos)324 cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
325 {
326 /* samkumar: remove lock */
327
328 if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
329 switch (iptos & IPTOS_ECN_MASK) {
330 case IPTOS_ECN_CE:
331 tp->ccv->flags |= CCF_IPHDR_CE;
332 break;
333 case IPTOS_ECN_ECT0:
334 tp->ccv->flags &= ~CCF_IPHDR_CE;
335 break;
336 case IPTOS_ECN_ECT1:
337 tp->ccv->flags &= ~CCF_IPHDR_CE;
338 break;
339 }
340
341 if (th->th_flags & TH_CWR)
342 tp->ccv->flags |= CCF_TCPHDR_CWR;
343 else
344 tp->ccv->flags &= ~CCF_TCPHDR_CWR;
345
346 if (tp->t_flags & TF_DELACK)
347 tp->ccv->flags |= CCF_DELACK;
348 else
349 tp->ccv->flags &= ~CCF_DELACK;
350
351 CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
352
353 if (tp->ccv->flags & CCF_ACKNOW)
354 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
355 }
356 }
357
358 /*
359 * External function: look up an entry in the hostcache and fill out the
360 * supplied TCP metrics structure. Fills in NULL when no entry was found or
361 * a value is not set.
362 */
363 /*
364 * samkumar: This function is taken from tcp_hostcache.c. We have no host cache
365 * in TCPlp, so I changed this to always act as if there is a miss. I removed
366 * the first argument, formerly "struct in_coninfo *inc".
367 */
368 void
tcp_hc_get(struct tcpcb * tp,struct hc_metrics_lite * hc_metrics_lite)369 tcp_hc_get(struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite)
370 {
371 bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
372 }
373
/*
 * External function: look up an entry in the hostcache and return the
 * discovered path MTU. Returns 0 if no entry is found or the value is not
 * set.
 */
/*
 * samkumar: This function is taken from tcp_hostcache.c. We have no host cache
 * in TCPlp, so I changed this to always act as if there is a miss; the
 * result is therefore always 0 and tp is not consulted.
 */
uint64_t
tcp_hc_getmtu(struct tcpcb *tp)
{
	(void) tp;

	return (0);
}
388
389
390 /*
391 * Issue RST and make ACK acceptable to originator of segment.
392 * The mbuf must still include the original packet header.
393 * tp may be NULL.
394 */
395 /*
396 * samkumar: Original signature was:
397 * static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
398 * int tlen, int rstreason)
399 */
400 void
tcp_dropwithreset(struct ip6_hdr * ip6,struct tcphdr * th,struct tcpcb * tp,otInstance * instance,int tlen,int rstreason)401 tcp_dropwithreset(struct ip6_hdr* ip6, struct tcphdr *th, struct tcpcb *tp, otInstance* instance,
402 int tlen, int rstreason)
403 {
404 /*
405 * samkumar: I removed logic to skip this for broadcast or multicast
406 * packets. In the FreeBSD version of this function, it would just
407 * call m_freem(m), if m->m_flags has M_BCAST or M_MCAST set, and not
408 * send a response packet.
409 * I also removed bandwidth limiting.
410 */
411 if (th->th_flags & TH_RST)
412 return;
413
414 /* tcp_respond consumes the mbuf chain. */
415 if (th->th_flags & TH_ACK) {
416 tcp_respond(tp, instance, ip6, th, (tcp_seq) 0, th->th_ack, TH_RST);
417 } else {
418 if (th->th_flags & TH_SYN)
419 tlen++;
420 tcp_respond(tp, instance, ip6, th, th->th_seq + tlen, (tcp_seq) 0, TH_RST | TH_ACK);
421 }
422 return;
423 }
424
425 /*
426 * TCP input handling is split into multiple parts:
427 * tcp6_input is a thin wrapper around tcp_input for the extended
428 * ip6_protox[] call format in ip6_input
429 * tcp_input handles primary segment validation, inpcb lookup and
430 * SYN processing on listen sockets
431 * tcp_do_segment processes the ACK and text of the segment for
432 * establishing, established and closing connections
433 */
434 /* samkumar: The signature of this function was originally:
435 tcp_input(struct mbuf **mp, int *offp, int proto) */
436 /* NOTE: tcp_fields_to_host(th) must be called before this function is called. */
437 int
tcp_input(struct ip6_hdr * ip6,struct tcphdr * th,otMessage * msg,struct tcpcb * tp,struct tcpcb_listen * tpl,struct tcplp_signals * sig)438 tcp_input(struct ip6_hdr* ip6, struct tcphdr* th, otMessage* msg, struct tcpcb* tp, struct tcpcb_listen* tpl,
439 struct tcplp_signals* sig)
440 {
441 /*
442 * samkumar: I significantly modified this function, compared to the
443 * FreeBSD version. This function used to be responsible for matching an
444 * incoming TCP segment to its TCB. That functionality is now done by
445 * TCPlp, and this function is only called once a match has been
446 * identified.
447 *
448 * The tp and tpl arguments are used to indicate the match. Exactly one of
449 * them must be NULL, and the other must be set. If tp is non-NULL, then
450 * this function assumes that the packet was matched to an active socket
451 * (connection endpoint). If tpl is non-NULL, then this function assumes
452 * that this packet is a candidate match for a passive socket (listener)
453 * and attempts to set up a new connection if the flags, sequence numbers,
454 * etc. look OK.
455 *
456 * TCPlp assumes that the packets are IPv6, so I removed any logic specific
457 * to IPv4.
458 *
459 * And of course, all code pertaining to locks and stats has been removed.
460 */
461 int tlen = 0, off;
462 int thflags;
463 uint8_t iptos = 0;
464 int drop_hdrlen;
465 int rstreason = 0;
466 struct tcpopt to; /* options in this segment */
467 uint8_t* optp = NULL;
468 int optlen = 0;
469 to.to_flags = 0;
470 KASSERT(tp || tpl, ("One of tp and tpl must be positive"));
471
472 /*
473 * samkumar: Here, there used to be code that handled preprocessing:
474 * calling m_pullup(m, sizeof(*ip6) + sizeof(*th)) to get the headers
475 * contiguous in memory, setting the ip6 and th pointers, validating the
476 * checksum, and dropping packets with unspecified source address. In
477 * TCPlp, all of this is done for a packet before this function is called.
478 */
479
480 tlen = ntohs(ip6->ip6_plen); // assume *off == sizeof(*ip6)
481
482 /*
483 * samkumar: Logic that handled IPv4 was deleted below. I won't add a
484 * comment every time this is done, but I'm putting it here (one of the
485 * first instances of this) for clarity.
486 */
487 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
488
489 /*
490 * Check that TCP offset makes sense,
491 * pull out TCP options and adjust length. XXX
492 */
493 off = (th->th_off_x2 >> TH_OFF_SHIFT) << 2;
494 if (off < sizeof (struct tcphdr) || off > tlen) {
495 goto drop;
496 }
497 tlen -= off; /* tlen is used instead of ti->ti_len */
498 /* samkumar: now, tlen is the length of the data */
499
500 if (off > sizeof (struct tcphdr)) {
501 /*
502 * samkumar: I removed a call to IP6_EXTHDR_CHECK, which I believe
503 * checks for IPv6 extension headers. In TCPlp, we assume that these
504 * are handled elsewhere in the networking stack, before the incoming
505 * packet is processed at the TCP layer. I also removed the followup
506 * calls to reassign the ip6 and th pointers.
507 */
508 optlen = off - sizeof (struct tcphdr);
509 optp = (uint8_t *)(th + 1);
510 }
511
512 thflags = th->th_flags;
513
514 /*
515 * samkumar: There used to be a call here to tcp_fields_to_host(th), which
516 * changes the byte order of various fields to host format. I removed this
517 * call from there and handle it in TCPlp, before calling this. The reason
518 * is that it's possible for this function to be called twice by TCPlp's
519 * logic (e.g., if the packet matches a TIME-WAIT socket this function
520 * returns early, and the packet may then match a listening socket, at
521 * which point this function will be called again). Thus, any operations
522 * like this, which mutate the packet itself, need to happen before calling
523 * this function.
524 */
525
526 /*
527 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
528 *
529 * samkumar: My TCP header is in a different buffer from the IP header.
530 * drop_hdrlen is only meaningful as an offset into the TCP buffer,
531 * because it is used to determine how much of the packet to discard
532 * before copying it into the receive buffer. Therefore, my offset does
533 * not include the length of IP header and options, only the length of
534 * the TCP header and options.
535 */
536 drop_hdrlen = /*off0 +*/ off;
537
538 /*
539 * Locate pcb for segment; if we're likely to add or remove a
540 * connection then first acquire pcbinfo lock. There are three cases
541 * where we might discover later we need a write lock despite the
542 * flags: ACKs moving a connection out of the syncache, ACKs for a
543 * connection in TIMEWAIT and SYNs not targeting a listening socket.
544 */
545
546 /*
547 * samkumar: Locking code is removed, invalidating most of the above
548 * comment.
549 */
550
551 /*
552 * samkumar: The FreeBSD code had logic here to check m->m_flags for the
553 * M_IP6_NEXTHOP flag, and search for the PACKET_TAG_IPFORWARD tag and
554 * store it in fwd_tag if so. In TCPlp, we assume that the IPv6 layer of
555 * the host network stack handles this kind of IPv6-related functionality,
556 * so this logic has been removed.
557 */
558
559 /*
560 * samkumar: Here, there was code to match the packet to an inpcb and reply
561 * with an RST segment if no match is found. This included taking the
562 * fwd_tag into account, if set above (see the previous comment). I removed
563 * this code because, in TCPlp, this is done before calling this function.
564 */
565
566 /*
567 * A previous connection in TIMEWAIT state is supposed to catch stray
568 * or duplicate segments arriving late. If this segment was a
569 * legitimate new connection attempt, the old INPCB gets removed and
570 * we can try again to find a listening socket.
571 *
572 * At this point, due to earlier optimism, we may hold only an inpcb
573 * lock, and not the inpcbinfo write lock. If so, we need to try to
574 * acquire it, or if that fails, acquire a reference on the inpcb,
575 * drop all locks, acquire a global write lock, and then re-acquire
576 * the inpcb lock. We may at that point discover that another thread
577 * has tried to free the inpcb, in which case we need to loop back
578 * and try to find a new inpcb to deliver to.
579 *
580 * XXXRW: It may be time to rethink timewait locking.
581 */
582 /*
583 * samkumar: The original code checked inp->inp_flags & INP_TIMEWAIT. I
584 * changed it to instead check tp->t_state, since we don't use inpcbs in
585 * TCPlp.
586 */
587 if (tp && tp->t_state == TCP6S_TIME_WAIT) {
588 /*
589 * samkumar: There's nothing wrong with the tcp_dooptions call
590 * that I've commented out below; it's just that the modified
591 * "tcp_twcheck" function no longer needs the options structure, so
592 * I figured that there's no longer a good reason to parse the options.
593 * In fact, this call was probably unnecessary even in the original
594 * FreeBSD TCP code, since tcp_twcheck, even without my modifications,
595 * did not use the pointer to the options structure!
596 */
597 //if (thflags & TH_SYN)
598 //tcp_dooptions(&to, optp, optlen, TO_SYN);
599 /*
600 * samkumar: The original code would "goto findpcb;" if this branch is
601 * taken. Matching with a TCB is done outside of this function in
602 * TCPlp, so we instead return a special value so that the caller knows
603 * to try re-matching this packet to a socket.
604 */
605 if (tcp_twcheck(tp,/*inp, &to,*/ th, /*m,*/ tlen))
606 return (RELOOKUP_REQUIRED);
607 return (IPPROTO_DONE);
608 }
609 /*
610 * The TCPCB may no longer exist if the connection is winding
611 * down or it is in the CLOSED state. Either way we drop the
612 * segment and send an appropriate response.
613 */
614 /*
615 * samkumar: There used to be code here that grabs the tp from the inpcb
616 * and drops with reset if the connection is in the closed state or if
617 * the tp is NULL. In TCPlp, the equivalent logic is done before entering
618 * this function. There was also code here to handle TCP offload, which
619 * TCPlp does not handle.
620 */
621
622 /*
623 * We've identified a valid inpcb, but it could be that we need an
624 * inpcbinfo write lock but don't hold it. In this case, attempt to
625 * acquire using the same strategy as the TIMEWAIT case above. If we
626 * relock, we have to jump back to 'relocked' as the connection might
627 * now be in TIMEWAIT.
628 */
629 /*
630 * samkumar: There used to be some code here for synchronization, MAC
631 * management, and debugging.
632 */
633
634 /*
635 * When the socket is accepting connections (the INPCB is in LISTEN
636 * state) we look into the SYN cache if this is a new connection
637 * attempt or the completion of a previous one. Instead of checking
638 * so->so_options to check if the socket is listening, we rely on the
639 * arguments passed to this function (if tp == NULL, then tpl is not NULL
640 * and is the matching listen socket).
641 */
642
643 if (/*so->so_options & SO_ACCEPTCONN*/tp == NULL) {
644 int tfo_cookie_valid = 0;
645 uint64_t tfo_response_cookie;
646 // int tfo_response_cookie_valid = 0;
647
648 /* samkumar: NULL check isn't needed but prevents a compiler warning */
649 KASSERT(tpl != NULL && tpl->t_state == TCP6S_LISTEN, ("listen socket must be in listening state!"));
650
651 /*
652 * samkumar: There used to be some code here that checks if the
653 * received segment is an ACK, and if so, searches the SYN cache to
654 * find an entry whose connection establishment handshake this segment
655 * completes. If such an entry is found, then a socket is created and
656 * then tcp_do_segment is called to actually run the code to mark the
657 * connection as established. If the received segment is an RST, then
658 * that is processed in the syncache as well. In TCPlp we do not use a
659 * SYN cache, so I've removed that code. The actual connection
660 * establishment/processing logic happens in tcp_do_segment anyway,
661 * which is called at the bottom of this function, so there's no need
662 * to rewrite this code with special-case logic for that.
663 */
664
665 /*
666 * We can't do anything without SYN.
667 */
668 if ((thflags & TH_SYN) == 0) {
669 /*
670 * samkumar: Here, and in several other instances, the FreeBSD
671 * code would call tcp_log_addrs. Improving logging in these
672 * edge cases in TCPlp is left for the future --- for now, I just
673 * put "<addrs go here>" where the address string would go.
674 */
675 tcplp_sys_log("%s; %s: Listen socket: "
676 "SYN is missing, segment ignored",
677 "<addrs go here>", __func__);
678 goto dropunlock;
679 }
680 /*
681 * (SYN|ACK) is bogus on a listen socket.
682 */
683 if (thflags & TH_ACK) {
684 /* samkumar: See above comment regarding tcp_log_addrs. */
685 tcplp_sys_log("%s; %s: Listen socket: "
686 "SYN|ACK invalid, segment rejected",
687 "<addrs go here>", __func__);
688 /* samkumar: Removed call to syncache_badack(&inc); */
689 rstreason = BANDLIM_RST_OPENPORT;
690 goto dropwithreset;
691 }
692 /*
693 * If the drop_synfin option is enabled, drop all
694 * segments with both the SYN and FIN bits set.
695 * This prevents e.g. nmap from identifying the
696 * TCP/IP stack.
697 * XXX: Poor reasoning. nmap has other methods
698 * and is constantly refining its stack detection
699 * strategies.
700 * XXX: This is a violation of the TCP specification
701 * and was used by RFC1644.
702 */
703 if ((thflags & TH_FIN) && V_drop_synfin) {
704 /* samkumar: See above comment regarding tcp_log_addrs. */
705 tcplp_sys_log("%s; %s: Listen socket: "
706 "SYN|FIN segment ignored (based on "
707 "sysctl setting)", "<addrs go here>", __func__);
708 goto dropunlock;
709 }
710 /*
711 * Segment's flags are (SYN) or (SYN|FIN).
712 *
713 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
714 * as they do not affect the state of the TCP FSM.
715 * The data pointed to by TH_URG and th_urp is ignored.
716 */
717 KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
718 ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
719 KASSERT(thflags & (TH_SYN),
720 ("%s: Listen socket: TH_SYN not set", __func__));
721
722 /*
723 * samkumar: There used to be some code here to reject incoming
724 * SYN packets for deprecated interface addresses unless
725 * V_ip6_use_deprecated is true. Rejecting the packet, in this case,
726 * means to "goto dropwithreset". I removed this functionality.
727 */
728
729 /*
730 * Basic sanity checks on incoming SYN requests:
731 * Don't respond if the destination is a link layer
732 * broadcast according to RFC1122 4.2.3.10, p. 104.
733 * If it is from this socket it must be forged.
734 * Don't respond if the source or destination is a
735 * global or subnet broad- or multicast address.
736 * Note that it is quite possible to receive unicast
737 * link-layer packets with a broadcast IP address. Use
738 * in_broadcast() to find them.
739 */
740
741 /*
742 * samkumar: There used to be a sanity check that drops (via
743 * "goto dropunlock") any broadcast or multicast packets. This check is
744 * done by checking m->m_flags for (M_BCAST|M_MCAST). The original
745 * FreeBSD code for this has been removed (since checking m->m_flags
746 * isn't really useful to us anyway). Note that other FreeBSD code that
747 * checks for multicast source/destination addresses is retained below
748 * (but only for the IPv6 case; the original FreeBSD code also handled
749 * it for IPv4 addresses).
750 */
751
752 if (th->th_dport == th->th_sport &&
753 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
754 /* samkumar: See above comment regarding tcp_log_addrs. */
755 tcplp_sys_log("%s; %s: Listen socket: "
756 "Connection attempt to/from self "
757 "ignored", "<addrs go here>", __func__);
758 goto dropunlock;
759 }
760 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
761 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
762 /* samkumar: See above comment regarding tcp_log_addrs. */
763 tcplp_sys_log("%s; %s: Listen socket: "
764 "Connection attempt from/to multicast "
765 "address ignored", "<addrs go here>", __func__);
766 goto dropunlock;
767 }
768
769 /*
770 * samkumar: The FreeBSD code would call
771 * syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
772 * to add an entry to the SYN cache at this point. TCPlp doesn't use a
773 * syncache, so we initialize the new socket right away. The code to
774 * initialize the socket is taken from the syncache_socket function.
775 */
776 /*
777 * samkumar: As of FreeBSD 10.3, the syncache_add function returns
778 * a flag indicating if a "fast open" code path should be taken.
779 * In that case, there is a "goto" statement to the removed logic
780 * above that calls tcp_do_segment after expanding a syncache entry.
781 * Analogous logic is implemented below.
782 */
783 tcp_dooptions(&to, optp, optlen, TO_SYN);
784
785 /*
786 * samkumar: TCP Fast Open logic taken from syncache_add in
787 * FreeBSD 12.0.
788 */
789 if (V_tcp_fastopen_server_enable && /*IS_FASTOPEN(tp->t_flags) &&
790 (tp->t_tfo_pending != NULL) && */
791 (to.to_flags & TOF_FASTOPEN)) {
792 /*
793 * Limit the number of pending TFO connections to
794 * approximately half of the queue limit. This prevents TFO
795 * SYN floods from starving the service by filling the
796 * listen queue with bogus TFO connections.
797 */
798 /*
799 * samkumar: Since we let the application handle the listen
800 * queue it doesn't make sense to limit the number of pending
801 * TFO connections as above. Long term, I think the best fix
802 * is to let applications know if an incoming connection is
803 * TFO, so that they can handle the case appropriately (e.g.,
804 * by disabling TFO or by declining the connection).
805 */
806 int result = tcp_fastopen_check_cookie(NULL,
807 to.to_tfo_cookie, to.to_tfo_len,
808 &tfo_response_cookie);
809 tfo_cookie_valid = (result > 0);
810 // tfo_response_cookie_valid = (result >= 0);
811 }
812
813 tp = tcplp_sys_accept_ready(tpl, &ip6->ip6_src, th->th_sport); // Try to allocate an active socket to accept into
814 if (tp == NULL) {
815 /* If we couldn't allocate, just ignore the SYN. */
816 return IPPROTO_DONE;
817 }
818 if (tp == (struct tcpcb *) -1) {
819 rstreason = ECONNREFUSED;
820 tp = NULL;
821 goto dropwithreset;
822 }
823 sig->accepted_connection = tp;
824 tcp_state_change(tp, TCPS_SYN_RECEIVED);
825 tpmarkpassiveopen(tp);
826 tp->iss = tcp_new_isn(tp);
827 tp->irs = th->th_seq;
828 tcp_rcvseqinit(tp);
829 tcp_sendseqinit(tp);
830 tp->snd_wl1 = th->th_seq;
831 /*
832 * samkumar: We remove the "+ 1"s below since we use
833 * tcp_output to send the appropriate SYN-ACK. For
834 * example, syncache_tfo_expand eliminates the "+ 1"s
835 * too. My understanding is that syncache_socket has
836 * the "+ 1"s because it's normally called once the
837 * SYN-ACK has already been ACKed, which is not how
838 * TCPlp operates.
839 */
840 tp->snd_max = tp->iss/* + 1*/;
841 tp->snd_nxt = tp->iss/* + 1*/;
842 tp->rcv_up = th->th_seq + 1;
843 tp->rcv_wnd = imin(imax(cbuf_free_space(&tp->recvbuf), 0), TCP_MAXWIN);
844 tp->rcv_adv += tp->rcv_wnd;
845 tp->last_ack_sent = tp->rcv_nxt;
846 memcpy(&tp->laddr, &ip6->ip6_dst, sizeof(tp->laddr));
847 memcpy(&tp->faddr, &ip6->ip6_src, sizeof(tp->faddr));
848 tp->fport = th->th_sport;
849 tp->lport = tpl->lport;
850
851 /*
852 * samkumar: Several of the checks below (taken from syncache_socket!)
853 * check for flags in sc->sc_flags. They have been written to directly
854 * check for the conditions on the TCP options structure or in the TCP
855 * header that would ordinarily be used to set flags in sc->sc_flags
856 * when adding an entry to the SYN cache.
857 *
858 * In effect, we combine the logic in syncache_add to set elements of
859 * sc with the logic in syncache_socket to transfer state from sc
860 * to the socket, but short-circuit the process to avoid ever storing
861 * data in sc. Since this isn't just adding or deleting code, I decided
862 * that it's better to keep comments indicating exactly how I composed
863 * these two functions.
864 */
865 tp->t_flags = tp->t_flags & (TF_NOPUSH | TF_NODELAY | TF_NOOPT);
866 // tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
867 // if (sc->sc_flags & SCF_NOOPT)
868 // tp->t_flags |= TF_NOOPT;
869 // else {
870 if (!(tp->t_flags & TF_NOOPT) && V_tcp_do_rfc1323) {
871 if (/*sc->sc_flags & SCF_WINSCALE*/to.to_flags & TOF_SCALE) {
872 int wscale = 0;
873
874 /*
875 * Pick the smallest possible scaling factor that
876 * will still allow us to scale up to sb_max, aka
877 * kern.ipc.maxsockbuf.
878 *
879 * We do this because there are broken firewalls that
880 * will corrupt the window scale option, leading to
881 * the other endpoint believing that our advertised
882 * window is unscaled. At scale factors larger than
883 * 5 the unscaled window will drop below 1500 bytes,
884 * leading to serious problems when traversing these
885 * broken firewalls.
886 *
887 * With the default maxsockbuf of 256K, a scale factor
888 * of 3 will be chosen by this algorithm. Those who
889 * choose a larger maxsockbuf should watch out
890 * for the compatibility problems mentioned above.
891 *
892 * RFC1323: The Window field in a SYN (i.e., a <SYN>
893 * or <SYN,ACK>) segment itself is never scaled.
894 */
895
896 /*
897 * samkumar: The original logic, taken from syncache_add, is
898 * listed below, commented out. In practice, we just use
899 * wscale = 0 because in TCPlp we assume that the buffers
900 * aren't big enough for window scaling to be all that useful.
901 */
902 #if 0
903 while (wscale < TCP_MAX_WINSHIFT &&
904 (TCP_MAXWIN << wscale) < sb_max)
905 wscale++;
906 #endif
907
908 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
909 tp->snd_scale = /*sc->sc_requested_s_scale*/to.to_wscale;
910 tp->request_r_scale = wscale;
911 }
912 if (/*sc->sc_flags & SCF_TIMESTAMP*/to.to_flags & TOF_TS) {
913 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
914 tp->ts_recent = /*sc->sc_tsreflect*/to.to_tsval;
915 tp->ts_recent_age = tcp_ts_getticks();
916 tp->ts_offset = /*sc->sc_tsoff*/0; // No syncookies, so this should always be 0
917 }
918
919 /*
920 * samkumar: there used to be code here that would set the
921 * TF_SIGNATURE flag on tp->t_flags if SCF_SIGNATURE is set on
922 * sc->sc_flags. I've left it in below, commented out.
923 */
924 #if 0
925 #ifdef TCP_SIGNATURE
926 if (sc->sc_flags & SCF_SIGNATURE)
927 tp->t_flags |= TF_SIGNATURE;
928 #endif
929 #endif
930 if (/*sc->sc_flags & SCF_SACK*/ to.to_flags & TOF_SACKPERM)
931 tp->t_flags |= TF_SACK_PERMIT;
932 }
933 if (/*sc->sc_flags & SCF_ECN*/(th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
934 tp->t_flags |= TF_ECN_PERMIT;
935
936 /*
937 * Set up MSS and get cached values from tcp_hostcache.
938 * This might overwrite some of the defaults we just set.
939 */
940 tcp_mss(tp, /*sc->sc_peer_mss*/(to.to_flags & TOF_MSS) ? to.to_mss : 0);
941
942 if (tfo_cookie_valid) {
943 /*
944 * samkumar: The code below is taken from syncache_tfo_socket.
945 * It calls syncache_socket (upon which the above code is based)
946 * so it makes sense for this logic to go here.
947 */
948 tp->t_flags |= TF_FASTOPEN;
949 tp->t_tfo_cookie.server = tfo_response_cookie;
950 tp->snd_max = tp->iss;
951 tp->snd_nxt = tp->iss;
952 // tp->tfo_pending = pending_counter;
953 /* This would normally "goto" labeled code that calls tcp_do_segment. */
954 tcp_do_segment(ip6, th, msg, tp, drop_hdrlen, tlen, iptos, sig);
955
956 tp->accepted_from = tpl;
957 return (IPPROTO_DONE);
958 } else {
959 tp->t_flags |= TF_ACKNOW; // samkumar: my addition
960 }
961
962 tcp_output(tp); // to send the SYN-ACK
963
964 tp->accepted_from = tpl;
965 return (IPPROTO_DONE);
966 } else if (tp->t_state == TCPS_LISTEN) {
967 /*
968 * When a listen socket is torn down the SO_ACCEPTCONN
969 * flag is removed first while connections are drained
970 * from the accept queue in an unlock/lock cycle of the
971 * ACCEPT_LOCK, opening a race condition allowing a SYN
972 * attempt to go through unhandled.
973 */
974 goto dropunlock;
975 }
976
977 KASSERT(tp, ("tp is still NULL!"));
978
979 /*
980 * samkumar: There used to be code here to verify TCP signatures. We don't
981 * support TCP signatures in TCPlp.
982 */
983
984 /*
985 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
986 * state. tcp_do_segment() always consumes the mbuf chain, unlocks
987 * the inpcb, and unlocks pcbinfo.
988 */
989 tcp_do_segment(ip6, th, msg, tp, drop_hdrlen, tlen, iptos, sig);
990 return (IPPROTO_DONE);
991
992 /*
993 * samkumar: Removed some locking and debugging code under all three of
994 * these labels: dropwithreset, dropunlock, and drop. I also removed some
995 * memory management code (e.g., calling m_freem(m) if m != NULL) since
996 * the caller of this function will take care of that kind of memory
997 * management in TCPlp.
998 */
999 dropwithreset:
1000
1001 /*
1002 * samkumar: The check against inp != NULL is now a check on tp != NULL.
1003 */
1004 if (tp != NULL) {
1005 tcp_dropwithreset(ip6, th, tp, tp->instance, tlen, rstreason);
1006 } else
1007 tcp_dropwithreset(ip6, th, NULL, tpl->instance, tlen, rstreason);
1008 goto drop;
1009
1010 dropunlock:
1011 drop:
1012 return (IPPROTO_DONE);
1013 }
1014
1015 /*
1016 * samkumar: Original signature
1017 * static void
1018 * tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
1019 * struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1020 * int ti_locked)
1021 */
1022 static void
tcp_do_segment(struct ip6_hdr * ip6,struct tcphdr * th,otMessage * msg,struct tcpcb * tp,int drop_hdrlen,int tlen,uint8_t iptos,struct tcplp_signals * sig)1023 tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg,
1024 struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
1025 struct tcplp_signals* sig)
1026 {
1027 /*
1028 * samkumar: All code pertaining to locks, stats, and debug has been
1029 * removed from this function.
1030 */
1031
1032 int thflags, acked, ourfinisacked, needoutput = 0;
1033 int rstreason, todrop, win;
1034 uint64_t tiwin;
1035 struct tcpopt to;
1036 int tfo_syn;
1037 uint32_t ticks = tcplp_sys_get_ticks();
1038 otInstance* instance = tp->instance;
1039 thflags = th->th_flags;
1040 tp->sackhint.last_sack_ack = 0;
1041
1042 /*
1043 * If this is either a state-changing packet or current state isn't
1044 * established, we require a write lock on tcbinfo. Otherwise, we
1045 * allow the tcbinfo to be either locked or unlocked, as the
1046 * caller may have unnecessarily acquired a write lock due to a race.
1047 */
1048
1049 /* samkumar: There used to be synchronization code here. */
1050 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
1051 __func__));
1052 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
1053 __func__));
1054
1055 /*
1056 * Segment received on connection.
1057 * Reset idle time and keep-alive timer.
1058 * XXX: This should be done after segment
1059 * validation to ignore broken/spoofed segs.
1060 */
1061 tp->t_rcvtime = ticks;
1062 if (TCPS_HAVEESTABLISHED(tp->t_state))
1063 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
1064
1065 /*
1066 * Scale up the window into a 32-bit value.
1067 * For the SYN_SENT state the scale is zero.
1068 */
1069 tiwin = th->th_win << tp->snd_scale;
1070
1071 /*
1072 * TCP ECN processing.
1073 */
1074 /*
1075 * samkumar: I intentionally left the TCPSTAT_INC lines below commented
1076 * out, to avoid altering the structure of the code too much by
1077 * reorganizing the switch statement.
1078 */
1079 if (tp->t_flags & TF_ECN_PERMIT) {
1080 if (thflags & TH_CWR)
1081 tp->t_flags &= ~TF_ECN_SND_ECE;
1082 switch (iptos & IPTOS_ECN_MASK) {
1083 case IPTOS_ECN_CE:
1084 tp->t_flags |= TF_ECN_SND_ECE;
1085 //TCPSTAT_INC(tcps_ecn_ce);
1086 break;
1087 case IPTOS_ECN_ECT0:
1088 //TCPSTAT_INC(tcps_ecn_ect0);
1089 break;
1090 case IPTOS_ECN_ECT1:
1091 //TCPSTAT_INC(tcps_ecn_ect1);
1092 break;
1093 }
1094
1095 /* Process a packet differently from RFC3168. */
1096 cc_ecnpkt_handler(tp, th, iptos);
1097
1098 /* Congestion experienced. */
1099 if (thflags & TH_ECE) {
1100 cc_cong_signal(tp, th, CC_ECN);
1101 }
1102 }
1103
1104 /*
1105 * Parse options on any incoming segment.
1106 */
1107 tcp_dooptions(&to, (uint8_t *)(th + 1),
1108 ((th->th_off_x2 >> TH_OFF_SHIFT) << 2) - sizeof(struct tcphdr),
1109 (thflags & TH_SYN) ? TO_SYN : 0);
1110
1111 /*
1112 * If echoed timestamp is later than the current time,
1113 * fall back to non RFC1323 RTT calculation. Normalize
1114 * timestamp if syncookies were used when this connection
1115 * was established.
1116 */
1117
1118 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
1119 to.to_tsecr -= tp->ts_offset;
1120 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
1121 to.to_tsecr = 0;
1122 }
1123 /*
1124 * If timestamps were negotiated during SYN/ACK they should
1125 * appear on every segment during this session and vice versa.
1126 */
1127 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
1128 /* samkumar: See above comment regarding tcp_log_addrs. */
1129 tcplp_sys_log("%s; %s: Timestamp missing, "
1130 "no action", "<addrs go here>", __func__);
1131 }
1132 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
1133 /* samkumar: See above comment regarding tcp_log_addrs. */
1134 tcplp_sys_log("%s; %s: Timestamp not expected, "
1135 "no action", "<addrs go here>", __func__);
1136 }
1137
1138 /*
1139 * Process options only when we get SYN/ACK back. The SYN case
1140 * for incoming connections is handled in tcp_syncache.
1141 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
1142 * or <SYN,ACK>) segment itself is never scaled.
1143 * XXX this is traditional behavior, may need to be cleaned up.
1144 */
1145 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
1146 if ((to.to_flags & TOF_SCALE) &&
1147 (tp->t_flags & TF_REQ_SCALE)) {
1148 tp->t_flags |= TF_RCVD_SCALE;
1149 tp->snd_scale = to.to_wscale;
1150 }
1151 /*
1152 * Initial send window. It will be updated with
1153 * the next incoming segment to the scaled value.
1154 */
1155 tp->snd_wnd = th->th_win;
1156 if (to.to_flags & TOF_TS) {
1157 tp->t_flags |= TF_RCVD_TSTMP;
1158 tp->ts_recent = to.to_tsval;
1159 tp->ts_recent_age = tcp_ts_getticks();
1160 }
1161 if (to.to_flags & TOF_MSS)
1162 tcp_mss(tp, to.to_mss);
1163 if ((tp->t_flags & TF_SACK_PERMIT) &&
1164 (to.to_flags & TOF_SACKPERM) == 0)
1165 tp->t_flags &= ~TF_SACK_PERMIT;
1166 /*
1167 * samkumar: TCP Fast Open logic from FreeBSD 12.0.
1168 */
1169 if (IS_FASTOPEN(tp->t_flags)) {
1170 if (to.to_flags & TOF_FASTOPEN) {
1171 uint16_t mss;
1172
1173 if (to.to_flags & TOF_MSS)
1174 mss = to.to_mss;
1175 else
1176 /*
1177 * samkumar: The original code here would set
1178 * mss to either TCP6_MSS or TCP_MSS depending
1179 * on whether the INP_IPV6 flag is present in
1180 * tp->t_inpcb->inp_vflag. In TCPlp, we always
1181 * assume IPv6.
1182 */
1183 mss = TCP6_MSS;
1184 tcp_fastopen_update_cache(tp, mss,
1185 to.to_tfo_len, to.to_tfo_cookie);
1186 } else
1187 tcp_fastopen_disable_path(tp);
1188 }
1189 }
1190 /*
1191 * Header prediction: check for the two common cases
1192 * of a uni-directional data xfer. If the packet has
1193 * no control flags, is in-sequence, the window didn't
1194 * change and we're not retransmitting, it's a
1195 * candidate. If the length is zero and the ack moved
1196 * forward, we're the sender side of the xfer. Just
1197 * free the data acked & wake any higher level process
1198 * that was blocked waiting for space. If the length
1199 * is non-zero and the ack didn't move, we're the
1200 * receiver side. If we're getting packets in-order
1201 * (the reassembly queue is empty), add the data to
1202 * the socket buffer and note that we need a delayed ack.
1203 * Make sure that the hidden state-flags are also off.
1204 * Since we check for TCPS_ESTABLISHED first, it can only
1205 * be TH_NEEDSYN.
1206 */
1207 /*
1208 * samkumar: Replaced LIST_EMPTY(&tp->tsegq) with the call to bmp_isempty.
1209 */
1210 if (tp->t_state == TCPS_ESTABLISHED &&
1211 th->th_seq == tp->rcv_nxt &&
1212 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1213 tp->snd_nxt == tp->snd_max &&
1214 tiwin && tiwin == tp->snd_wnd &&
1215 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
1216 bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp)) &&
1217 ((to.to_flags & TOF_TS) == 0 ||
1218 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
1219
1220 /*
1221 * If last ACK falls within this segment's sequence numbers,
1222 * record the timestamp.
1223 * NOTE that the test is modified according to the latest
1224 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1225 */
1226 if ((to.to_flags & TOF_TS) != 0 &&
1227 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1228 tp->ts_recent_age = tcp_ts_getticks();
1229 tp->ts_recent = to.to_tsval;
1230 }
1231
1232 if (tlen == 0) {
1233 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1234 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1235 !IN_RECOVERY(tp->t_flags) &&
1236 (to.to_flags & TOF_SACK) == 0 &&
1237 TAILQ_EMPTY(&tp->snd_holes)) {
1238 /*
1239 * This is a pure ack for outstanding data.
1240 */
1241
1242 /*
1243 * "bad retransmit" recovery.
1244 */
1245 if (tp->t_rxtshift == 1 &&
1246 tp->t_flags & TF_PREVVALID &&
1247 (int)(ticks - tp->t_badrxtwin) < 0) {
1248 cc_cong_signal(tp, th, CC_RTO_ERR);
1249 }
1250
1251 /*
1252 * Recalculate the transmit timer / rtt.
1253 *
1254 * Some boxes send broken timestamp replies
1255 * during the SYN+ACK phase, ignore
1256 * timestamps of 0 or we could calculate a
1257 * huge RTT and blow up the retransmit timer.
1258 */
1259
1260 if ((to.to_flags & TOF_TS) != 0 &&
1261 to.to_tsecr) {
1262 uint32_t t;
1263
1264 t = tcp_ts_getticks() - to.to_tsecr;
1265 if (!tp->t_rttlow || tp->t_rttlow > t)
1266 tp->t_rttlow = t;
1267 tcp_xmit_timer(tp,
1268 TCP_TS_TO_TICKS(t) + 1);
1269 } else if (tp->t_rtttime &&
1270 SEQ_GT(th->th_ack, tp->t_rtseq)) {
1271 if (!tp->t_rttlow ||
1272 tp->t_rttlow > ticks - tp->t_rtttime)
1273 tp->t_rttlow = ticks - tp->t_rtttime;
1274 tcp_xmit_timer(tp,
1275 ticks - tp->t_rtttime);
1276 }
1277
1278 acked = BYTES_THIS_ACK(tp, th);
1279
1280 /*
1281 * samkumar: Replaced sbdrop(&so->so_snd, acked) with this call
1282 * to lbuf_pop.
1283 */
1284 {
1285 uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped);
1286 KASSERT(poppedbytes == acked, ("More bytes were acked than are in the send buffer"));
1287 sig->bytes_acked += poppedbytes;
1288 }
1289 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1290 SEQ_LEQ(th->th_ack, tp->snd_recover))
1291 tp->snd_recover = th->th_ack - 1;
1292
1293 /*
1294 * Let the congestion control algorithm update
1295 * congestion control related information. This
1296 * typically means increasing the congestion
1297 * window.
1298 */
1299 cc_ack_received(tp, th, CC_ACK);
1300
1301 tp->snd_una = th->th_ack;
1302 /*
1303 * Pull snd_wl2 up to prevent seq wrap relative
1304 * to th_ack.
1305 */
1306 tp->snd_wl2 = th->th_ack;
1307 tp->t_dupacks = 0;
1308
1309 /*
1310 * If all outstanding data are acked, stop
1311 * retransmit timer, otherwise restart timer
1312 * using current (possibly backed-off) value.
1313 * If process is waiting for space,
1314 * wakeup/selwakeup/signal. If data
1315 * are ready to send, let tcp_output
1316 * decide between more output or persist.
1317 */
1318
1319 if (tp->snd_una == tp->snd_max)
1320 tcp_timer_activate(tp, TT_REXMT, 0);
1321 else if (!tcp_timer_active(tp, TT_PERSIST))
1322 tcp_timer_activate(tp, TT_REXMT,
1323 tp->t_rxtcur);
1324
1325 /*
1326 * samkumar: There used to be a call to sowwakeup(so); here,
1327 * which wakes up any threads waiting for the socket to
1328 * become ready for writing. TCPlp handles its send buffer
1329 * differently so we do not need to replace this call with
1330 * specialized code to handle this.
1331 */
1332
1333 /*
1334 * samkumar: Replaced sbavail(&so->so_snd) with this call to
1335 * lbuf_used_space.
1336 */
1337 if (lbuf_used_space(&tp->sendbuf))
1338 (void) tcp_output(tp);
1339 goto check_delack;
1340 }
1341 } else if (th->th_ack == tp->snd_una &&
1342 /*
1343 * samkumar: Replaced sbspace(&so->so_rcv) with this call to
1344 * cbuf_free_space.
1345 */
1346 tlen <= cbuf_free_space(&tp->recvbuf)) {
1347
1348 /*
1349 * This is a pure, in-sequence data packet with
1350 * nothing on the reassembly queue and we have enough
1351 * buffer space to take it.
1352 */
1353 /* Clean receiver SACK report if present */
1354 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
1355 tcp_clean_sackreport(tp);
1356
1357 tp->rcv_nxt += tlen;
1358 /*
1359 * Pull snd_wl1 up to prevent seq wrap relative to
1360 * th_seq.
1361 */
1362 tp->snd_wl1 = th->th_seq;
1363 /*
1364 * Pull rcv_up up to prevent seq wrap relative to
1365 * rcv_nxt.
1366 */
1367 tp->rcv_up = tp->rcv_nxt;
1368
1369 /*
1370 * Automatic sizing of receive socket buffer. Often the send
1371 * buffer size is not optimally adjusted to the actual network
1372 * conditions at hand (delay bandwidth product). Setting the
1373 * buffer size too small limits throughput on links with high
1374 * bandwidth and high delay (eg. trans-continental/oceanic links).
1375 *
1376 * On the receive side the socket buffer memory is only rarely
1377 * used to any significant extent. This allows us to be much
1378 * more aggressive in scaling the receive socket buffer. For
1379 * the case that the buffer space is actually used to a large
1380 * extent and we run out of kernel memory we can simply drop
1381 * the new segments; TCP on the sender will just retransmit it
1382 * later. Setting the buffer size too big may only consume too
1383 * much kernel memory if the application doesn't read() from
1384 * the socket or packet loss or reordering makes use of the
1385 * reassembly queue.
1386 *
1387 * The criteria to step up the receive buffer one notch are:
1388 * 1. Application has not set receive buffer size with
1389 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
1390 * 2. the number of bytes received during the time it takes
1391 * one timestamp to be reflected back to us (the RTT);
1392 * 3. received bytes per RTT is within seven eighth of the
1393 * current socket buffer size;
1394 * 4. receive buffer size has not hit maximal automatic size;
1395 *
1396 * This algorithm does one step per RTT at most and only if
1397 * we receive a bulk stream w/o packet losses or reorderings.
1398 * Shrinking the buffer during idle times is not necessary as
1399 * it doesn't consume any memory when idle.
1400 *
1401 * TODO: Only step up if the application is actually serving
1402 * the buffer to better manage the socket buffer resources.
1403 */
1404
1405 /*
1406 * samkumar: There used to be code here to dynamically size the
1407 * receive buffer (tp->rfbuf_ts, tp->rfbuf_cnt, and the local
1408 * newsize variable). In TCPlp, we don't support this, as the user
1409 * allocates the receive buffer and its size can't be changed here.
1410 * Therefore, I removed the code that does this. Note that the
1411 * actual resizing of the buffer is done using sbreserve_locked,
1412 * whose call comes later (not exactly where this comment is).
1413 */
1414
1415 /* Add data to socket buffer. */
1416
1417 /*
1418 * samkumar: The code that was here would just free the mbuf
1419 * (with m_freem(m)) if SBS_CANTRCVMORE is set in
1420 * so->so_rcv.sb_state. Otherwise, it would cut drop_hdrlen bytes
1421 * from the mbuf (using m_adj(m, drop_hdrlen)) to discard the
1422 * headers and then append the mbuf to the receive buffer using
1423 * sbappendstream_locked(&so->so_rcv, m, 0). I've rewritten this
1424 * to work the TCPlp way. The check to so->so_rcv.sb_state is
1425 * replaced by a tpiscantrcv call, and we copy bytes into
1426 * TCPlp's circular buffer (since we designed it to avoid
1427 * having dynamically-allocated memory for the receive buffer).
1428 */
1429
1430 if (!tpiscantrcv(tp)) {
1431 cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message);
1432 if (tlen > 0) {
1433 sig->recvbuf_added = true;
1434 }
1435 } else {
1436 /*
1437 * samkumar: We already know tlen != 0, so if we got here, then
1438 * it means that we got data after we called SHUT_RD, or after
1439 * receiving a FIN. I'm going to drop the connection in this
1440 * case. I think FreeBSD might have just dropped the packet
1441 * silently, but Linux handles it this way; this seems to be
1442 * the right approach to me.
1443 */
1444 tcp_drop(tp, ECONNABORTED);
1445 goto drop;
1446 }
1447 /* NB: sorwakeup_locked() does an implicit unlock. */
1448 /*
1449 * samkumar: There used to be a call to sorwakeup_locked(so); here,
1450 * which wakes up any threads waiting for the socket to become
1451 * ready for reading. TCPlp handles its buffering
1452 * differently so we do not need to replace this call with
1453 * specialized code to handle this.
1454 */
1455 if (DELAY_ACK(tp, tlen)) {
1456 tp->t_flags |= TF_DELACK;
1457 } else {
1458 tp->t_flags |= TF_ACKNOW;
1459 tcp_output(tp);
1460 }
1461 goto check_delack;
1462 }
1463 }
1464
1465 /*
1466 * Calculate amount of space in receive window,
1467 * and then do TCP input processing.
1468 * Receive window is amount of space in rcv queue,
1469 * but not less than advertised window.
1470 */
1471 /* samkumar: Replaced sbspace(&so->so_rcv) with call to cbuf_free_space. */
1472 win = cbuf_free_space(&tp->recvbuf);
1473 if (win < 0)
1474 win = 0;
1475 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1476
1477 /* Reset receive buffer auto scaling when not in bulk receive mode. */
1478 /* samkumar: Removed this receive buffer autoscaling code. */
1479
1480 switch (tp->t_state) {
1481
1482 /*
1483 * If the state is SYN_RECEIVED:
1484 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1485 * (Added by Sam) if seg is resending the original SYN, resend the SYN/ACK
1486 */
1487 /*
1488 * samkumar: If we receive a retransmission of the original SYN, then
1489 * resend the SYN/ACK segment. This case was probably handled by the
1490 * SYN cache. Because TCPlp does not use a SYN cache, we need to write
1491 * custom logic for it. It is handled in the "else if" clause below.
1492 */
1493 case TCPS_SYN_RECEIVED:
1494 if ((thflags & TH_ACK) &&
1495 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1496 SEQ_GT(th->th_ack, tp->snd_max))) {
1497 rstreason = BANDLIM_RST_OPENPORT;
1498 goto dropwithreset;
1499 } else if (!IS_FASTOPEN(tp->t_flags) && (thflags & TH_SYN) && !(thflags & TH_ACK) && (th->th_seq == tp->irs)) {
1500 tp->t_flags |= TF_ACKNOW;
1501 }
1502 /*
1503 * samkumar: TCP Fast Open Logic from FreeBSD 12.0.
1504 */
1505 if (IS_FASTOPEN(tp->t_flags)) {
1506 /*
1507 * When a TFO connection is in SYN_RECEIVED, the
1508 * only valid packets are the initial SYN, a
1509 * retransmit/copy of the initial SYN (possibly with
1510 * a subset of the original data), a valid ACK, a
1511 * FIN, or a RST.
1512 */
1513 if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
1514 rstreason = BANDLIM_RST_OPENPORT;
1515 goto dropwithreset;
1516 } else if (thflags & TH_SYN) {
1517 /* non-initial SYN is ignored */
1518 if ((tcp_timer_active(tp, TT_DELACK) ||
1519 tcp_timer_active(tp, TT_REXMT)))
1520 goto drop;
1521 } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
1522 goto drop;
1523 }
1524 }
1525 break;
1526
1527 /*
1528 * If the state is SYN_SENT:
1529 * if seg contains an ACK, but not for our SYN, drop the input.
1530 * if seg contains a RST, then drop the connection.
1531 * if seg does not contain SYN, then drop it.
1532 * Otherwise this is an acceptable SYN segment
1533 * initialize tp->rcv_nxt and tp->irs
1534 * if seg contains ack then advance tp->snd_una
1535 * if seg contains an ECE and ECN support is enabled, the stream
1536 * is ECN capable.
1537 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1538 * arrange for segment to be acked (eventually)
1539 * continue processing rest of data/controls, beginning with URG
1540 */
1541 case TCPS_SYN_SENT:
1542 if ((thflags & TH_ACK) &&
1543 (SEQ_LEQ(th->th_ack, tp->iss) ||
1544 SEQ_GT(th->th_ack, tp->snd_max))) {
1545 rstreason = BANDLIM_UNLIMITED;
1546 goto dropwithreset;
1547 }
1548 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
1549 tp = tcp_drop(tp, ECONNREFUSED);
1550 }
1551 if (thflags & TH_RST)
1552 goto drop;
1553 if (!(thflags & TH_SYN))
1554 goto drop;
1555
1556 tp->irs = th->th_seq;
1557 tcp_rcvseqinit(tp);
1558 if (thflags & TH_ACK) {
1559 int tfo_partial_ack = 0;
1560
1561 /*
1562 * samkumar: Removed call to soisconnected(so), since TCPlp has its
1563 * own buffering.
1564 */
1565
1566 /* Do window scaling on this connection? */
1567 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1568 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1569 tp->rcv_scale = tp->request_r_scale;
1570 }
1571 tp->rcv_adv += imin(tp->rcv_wnd,
1572 TCP_MAXWIN << tp->rcv_scale);
1573 tp->snd_una++; /* SYN is acked */
1574 /*
1575 * If not all the data that was sent in the TFO SYN
1576 * has been acked, resend the remainder right away.
1577 */
1578 if (IS_FASTOPEN(tp->t_flags) &&
1579 (tp->snd_una != tp->snd_max)) {
1580 tp->snd_nxt = th->th_ack;
1581 tfo_partial_ack = 1;
1582 }
1583 /*
1584 * If there's data, delay ACK; if there's also a FIN
1585 * ACKNOW will be turned on later.
1586 */
1587 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
1588 tcp_timer_activate(tp, TT_DELACK,
1589 tcp_delacktime);
1590 else
1591 tp->t_flags |= TF_ACKNOW;
1592
1593 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
1594 tp->t_flags |= TF_ECN_PERMIT;
1595 }
1596
1597 /*
1598 * Received <SYN,ACK> in SYN_SENT[*] state.
1599 * Transitions:
1600 * SYN_SENT --> ESTABLISHED
1601 * SYN_SENT* --> FIN_WAIT_1
1602 */
1603 tp->t_starttime = ticks;
1604 if (tp->t_flags & TF_NEEDFIN) {
1605 tcp_state_change(tp, TCPS_FIN_WAIT_1);
1606 tp->t_flags &= ~TF_NEEDFIN;
1607 thflags &= ~TH_SYN;
1608 } else {
1609 tcp_state_change(tp, TCPS_ESTABLISHED);
1610 /* samkumar: Set conn_established signal for TCPlp. */
1611 sig->conn_established = true;
1612 cc_conn_init(tp);
1613 tcp_timer_activate(tp, TT_KEEP,
1614 TP_KEEPIDLE(tp));
1615 }
1616 } else {
1617 /*
1618 * Received initial SYN in SYN-SENT[*] state =>
1619 * simultaneous open.
1620 * If it succeeds, connection is half-synchronized.
1621 * Otherwise, do 3-way handshake:
1622 * SYN-SENT -> SYN-RECEIVED
1623 * SYN-SENT* -> SYN-RECEIVED*
1624 */
1625 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
1626 tcp_timer_activate(tp, TT_REXMT, 0);
1627 tcp_state_change(tp, TCPS_SYN_RECEIVED);
1628 /*
1629 * samkumar: We would have incremented snd_nxt in tcp_output when
1630 * we sent the original SYN, so decrement it here. (Another
1631 * consequence of removing the SYN cache.)
1632 */
1633 tp->snd_nxt--;
1634 }
1635
1636 /*
1637 * Advance th->th_seq to correspond to first data byte.
1638 * If data, trim to stay within window,
1639 * dropping FIN if necessary.
1640 */
1641 th->th_seq++;
1642 if (tlen > tp->rcv_wnd) {
1643 todrop = tlen - tp->rcv_wnd;
1644 /*
1645 * samkumar: I removed a call to m_adj(m, -todrop), which intends
1646 * to trim the data so it fits in the window. We can just read less
1647 * when copying into the receive buffer in TCPlp, so we don't need
1648 * to do this.
1649 */
1650 (void) todrop; /* samkumar: Prevent a compiler warning */
1651 tlen = tp->rcv_wnd;
1652 thflags &= ~TH_FIN;
1653 }
1654 tp->snd_wl1 = th->th_seq - 1;
1655 tp->rcv_up = th->th_seq;
1656 /*
1657 * Client side of transaction: already sent SYN and data.
1658 * If the remote host used T/TCP to validate the SYN,
1659 * our data will be ACK'd; if so, enter normal data segment
1660 * processing in the middle of step 5, ack processing.
1661 * Otherwise, goto step 6.
1662 */
1663 if (thflags & TH_ACK)
1664 goto process_ACK;
1665
1666 goto step6;
1667
1668 /*
1669 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
1670 * do normal processing.
1671 *
1672 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
1673 */
1674 case TCPS_LAST_ACK:
1675 case TCPS_CLOSING:
1676 break; /* continue normal processing */
1677 }
1678
1679 /*
1680 * States other than LISTEN or SYN_SENT.
1681 * First check the RST flag and sequence number since reset segments
1682 * are exempt from the timestamp and connection count tests. This
1683 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
1684 * below which allowed reset segments in half the sequence space
1685 * to fall through and be processed (which gives forged reset
1686 * segments with a random sequence number a 50 percent chance of
1687 * killing a connection).
1688 * Then check timestamp, if present.
1689 * Then check the connection count, if present.
1690 * Then check that at least some bytes of segment are within
1691 * receive window. If segment begins before rcv_nxt,
1692 * drop leading data (and SYN); if nothing left, just ack.
1693 */
1694 if (thflags & TH_RST) {
1695 /*
1696 * RFC5961 Section 3.2
1697 *
1698 * - RST drops connection only if SEG.SEQ == RCV.NXT.
1699 * - If RST is in window, we send challenge ACK.
1700 *
1701 * Note: to take into account delayed ACKs, we should
1702 * test against last_ack_sent instead of rcv_nxt.
1703 * Note 2: we handle special case of closed window, not
1704 * covered by the RFC.
1705 */
1706 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1707 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1708 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1709
1710 /*
1711 * samkumar: This if statement used to also be prefaced with
1712 * "V_tcp_insecure_rst ||". But I removed it, since there's no
1713 * reason to support an insecure option in TCPlp (my guess is that
1714 * FreeBSD supported it for legacy reasons).
1715 */
1716 if (tp->last_ack_sent == th->th_seq) {
1717 /*
1718 * samkumar: Normally, the error number would be stored in
1719 * so->so_error. Instead, we put it in this "droperror" local
1720 * variable and then pass it to tcplp_sys_connection_lost.
1721 */
1722 int droperror = 0;
1723 /* Drop the connection. */
1724 switch (tp->t_state) {
1725 case TCPS_SYN_RECEIVED:
1726 droperror = ECONNREFUSED;
1727 goto close;
1728 case TCPS_ESTABLISHED:
1729 case TCPS_FIN_WAIT_1:
1730 case TCPS_FIN_WAIT_2:
1731 case TCPS_CLOSE_WAIT:
1732 droperror = ECONNRESET;
1733 close:
1734 tcp_state_change(tp, TCPS_CLOSED);
1735 /* FALLTHROUGH */
1736 default:
1737 tp = tcp_close(tp);
1738 tcplp_sys_connection_lost(tp, droperror);
1739 }
1740 } else {
1741 /* Send challenge ACK. */
1742 tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1743 tp->last_ack_sent = tp->rcv_nxt;
1744 }
1745 }
1746 goto drop;
1747 }
1748
1749 /*
1750 * RFC5961 Section 4.2
1751 * Send challenge ACK for any SYN in synchronized state.
1752 */
1753 /*
1754 * samkumar: I added the check for the SYN-RECEIVED state in this if
1755 * statement (another consequence of removing the SYN cache).
1756 */
1757 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCP6S_SYN_RECEIVED) {
1758 /*
1759 * samkumar: The modern way to handle this is to send a Challenge ACK.
1760 * FreeBSD supports this, but it also has this V_tcp_insecure_syn
1761 * option that will cause it to drop the connection if the SYN falls
1762 * in the receive window. In TCPlp we *only* support Challenge ACKs
1763 * (the secure way of doing it), so I've removed code for the insecure
1764 * way. (Presumably the reason why FreeBSD supports the insecure way is
1765 * for legacy code, which we don't really care about in TCPlp).
1766 */
1767 /* Send challenge ACK. */
1768 tcplp_sys_log("Sending challenge ACK");
1769 tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1770 tp->last_ack_sent = tp->rcv_nxt;
1771 goto drop;
1772 }
1773
1774 /*
1775 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1776 * and it's less than ts_recent, drop it.
1777 */
1778 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
1779 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
1780
1781 /* Check to see if ts_recent is over 24 days old. */
1782 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1783 /*
1784 * Invalidate ts_recent. If this segment updates
1785 * ts_recent, the age will be reset later and ts_recent
1786 * will get a valid value. If it does not, setting
1787 * ts_recent to zero will at least satisfy the
1788 * requirement that zero be placed in the timestamp
1789 * echo reply when ts_recent isn't valid. The
1790 * age isn't reset until we get a valid ts_recent
1791 * because we don't want out-of-order segments to be
1792 * dropped when ts_recent is old.
1793 */
1794 tp->ts_recent = 0;
1795 } else {
1796 if (tlen)
1797 goto dropafterack;
1798 goto drop;
1799 }
1800 }
1801
1802 /*
1803 * In the SYN-RECEIVED state, validate that the packet belongs to
1804 * this connection before trimming the data to fit the receive
1805 * window. Check the sequence number versus IRS since we know
1806 * the sequence numbers haven't wrapped. This is a partial fix
1807 * for the "LAND" DoS attack.
1808 */
1809 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
1810 rstreason = BANDLIM_RST_OPENPORT;
1811 goto dropwithreset;
1812 }
1813
1814 todrop = tp->rcv_nxt - th->th_seq;
1815 if (todrop > 0) {
1816 if (thflags & TH_SYN) {
1817 thflags &= ~TH_SYN;
1818 th->th_seq++;
1819 if (th->th_urp > 1)
1820 th->th_urp--;
1821 else
1822 thflags &= ~TH_URG;
1823 todrop--;
1824 }
1825 /*
1826 * Following if statement from Stevens, vol. 2, p. 960.
1827 */
1828 if (todrop > tlen
1829 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1830 /*
1831 * Any valid FIN must be to the left of the window.
1832 * At this point the FIN must be a duplicate or out
1833 * of sequence; drop it.
1834 */
1835 thflags &= ~TH_FIN;
1836
1837 /*
1838 * Send an ACK to resynchronize and drop any data.
1839 * But keep on processing for RST or ACK.
1840 */
1841 tp->t_flags |= TF_ACKNOW;
1842 todrop = tlen;
1843 }
1844 /* samkumar: There was an else case that only collected stats. */
1845 drop_hdrlen += todrop; /* drop from the top afterwards */
1846 th->th_seq += todrop;
1847 tlen -= todrop;
1848 if (th->th_urp > todrop)
1849 th->th_urp -= todrop;
1850 else {
1851 thflags &= ~TH_URG;
1852 th->th_urp = 0;
1853 }
1854 }
1855
1856 /*
1857 * If new data are received on a connection after the
1858 * user processes are gone, then RST the other end.
1859 */
1860 /*
1861 * samkumar: TCPlp is designed for embedded systems where there is no
1862 * concept of a "process" that has allocated a TCP socket. Therefore, we
1863 * do not implement the functionality in the above comment (the code for
1864 * it used to be here, and I removed it).
1865 */
1866 /*
1867 * If segment ends after window, drop trailing data
1868 * (and PUSH and FIN); if nothing left, just ACK.
1869 */
1870 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1871 if (todrop > 0) {
1872 if (todrop >= tlen) {
1873 /*
1874 * If window is closed can only take segments at
1875 * window edge, and have to drop data and PUSH from
1876 * incoming segments. Continue processing, but
1877 * remember to ack. Otherwise, drop segment
1878 * and ack.
1879 */
1880 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1881 tp->t_flags |= TF_ACKNOW;
1882 } else
1883 goto dropafterack;
1884 }
1885 /*
1886 * samkumar: I removed a call to m_adj(m, -todrop), which intends
1887 * to trim the data so it fits in the window. We can just read less
1888 * when copying into the receive buffer in TCPlp, so we don't need
1889 * to do this. Subtracting it from tlen gives us enough information to
1890 * do this later. In FreeBSD, this isn't possible because the mbuf
1891 * itself becomes part of the receive buffer, so the mbuf has to be
1892 * trimmed in order for this to work out.
1893 */
1894 tlen -= todrop;
1895 thflags &= ~(TH_PUSH|TH_FIN);
1896 }
1897
1898 /*
1899 * If last ACK falls within this segment's sequence numbers,
1900 * record its timestamp.
1901 * NOTE:
1902 * 1) That the test incorporates suggestions from the latest
1903 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
1904 * 2) That updating only on newer timestamps interferes with
1905 * our earlier PAWS tests, so this check should be solely
1906 * predicated on the sequence space of this segment.
1907 * 3) That we modify the segment boundary check to be
1908 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
1909 * instead of RFC1323's
1910 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
1911 * This modified check allows us to overcome RFC1323's
1912 * limitations as described in Stevens TCP/IP Illustrated
1913 * Vol. 2 p.869. In such cases, we can still calculate the
1914 * RTT correctly when RCV.NXT == Last.ACK.Sent.
1915 */
1916
1917 if ((to.to_flags & TOF_TS) != 0 &&
1918 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1919 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1920 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
1921 tp->ts_recent_age = tcp_ts_getticks();
1922 tp->ts_recent = to.to_tsval;
1923 }
1924
1925 /*
1926 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
1927 * flag is on (half-synchronized state), then queue data for
1928 * later processing; else drop segment and return.
1929 */
1930 if ((thflags & TH_ACK) == 0) {
1931 if (tp->t_state == TCPS_SYN_RECEIVED ||
1932 (tp->t_flags & TF_NEEDSYN)) {
1933 if (tp->t_state == TCPS_SYN_RECEIVED &&
1934 IS_FASTOPEN(tp->t_flags)) {
1935 tp->snd_wnd = tiwin;
1936 cc_conn_init(tp);
1937 }
1938 goto step6;
1939 } else if (tp->t_flags & TF_ACKNOW)
1940 goto dropafterack;
1941 else
1942 goto drop;
1943 }
1944
1945 tcplp_sys_log("Processing ACK");
1946
1947 /*
1948 * Ack processing.
1949 */
1950 switch (tp->t_state) {
1951
1952 /*
1953 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1954 * ESTABLISHED state and continue processing.
1955 * The ACK was checked above.
1956 */
1957 case TCPS_SYN_RECEIVED:
1958 /*
1959 * samkumar: Removed call to soisconnected(so), since TCPlp has its
1960 * own buffering.
1961 */
1962 /* Do window scaling? */
1963 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1964 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1965 tp->rcv_scale = tp->request_r_scale;
1966 tp->snd_wnd = tiwin;
1967 }
1968 /*
1969 * Make transitions:
1970 * SYN-RECEIVED -> ESTABLISHED
1971 * SYN-RECEIVED* -> FIN-WAIT-1
1972 */
1973 tp->t_starttime = ticks;
1974 /*
1975 * samkumar: I'm eliminating the TFO pending counter.
1976 */
1977 if (IS_FASTOPEN(tp->t_flags)/* && tp->t_tfo_pending */) {\
1978 /*
1979 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
1980 tp->t_tfo_pending = NULL;
1981 */
1982
1983 /*
1984 * Account for the ACK of our SYN prior to
1985 * regular ACK processing below.
1986 */
1987 tp->snd_una++;
1988 }
1989 if (tp->t_flags & TF_NEEDFIN) {
1990 tcp_state_change(tp, TCPS_FIN_WAIT_1);
1991 tp->t_flags &= ~TF_NEEDFIN;
1992 } else {
1993 tcp_state_change(tp, TCPS_ESTABLISHED);
1994 /* samkumar: Set conn_established signal for TCPlp. */
1995 sig->conn_established = true;
1996 /*
1997 * TFO connections call cc_conn_init() during SYN
1998 * processing. Calling it again here for such
1999 * connections is not harmless as it would undo the
2000 * snd_cwnd reduction that occurs when a TFO SYN|ACK
2001 * is retransmitted.
2002 */
2003 if (!IS_FASTOPEN(tp->t_flags))
2004 cc_conn_init(tp);
2005 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2006 /*
2007 * samkumar: I added this check to account for simultaneous open.
2008 * If this socket was opened actively, then the fact that we are
2009 * in SYN-RECEIVED indicates that we are in simultaneous open.
2010 * Therefore, don't ACK the SYN-ACK (unless it contains data or
2011 * something, which will be processed later).
2012 */
2013 if (!tpispassiveopen(tp)) {
2014 tp->t_flags &= ~TF_ACKNOW;
2015 } else {
2016 /*
2017 * samkumar: Otherwise, we entered the ESTABLISHED state by
2018 * accepting a connection, so call the appropriate callback in
2019 * TCPlp. TODO: consider using signals to handle this?
2020 */
2021 bool accepted = tcplp_sys_accepted_connection(tp->accepted_from, tp, &ip6->ip6_src, th->th_sport);
2022 if (!accepted) {
2023 rstreason = ECONNREFUSED;
2024 goto dropwithreset;
2025 }
2026 }
2027 }
2028 /*
2029 * If segment contains data or ACK, will call tcp_reass()
2030 * later; if not, do so now to pass queued data to user.
2031 */
2032 if (tlen == 0 && (thflags & TH_FIN) == 0)
2033 (void) tcp_reass(tp, (struct tcphdr *)0, 0,
2034 (otMessage*)0, 0, sig);
2035
2036 tp->snd_wl1 = th->th_seq - 1;
2037 /* FALLTHROUGH */
2038
2039 /*
2040 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2041 * ACKs. If the ack is in the range
2042 * tp->snd_una < th->th_ack <= tp->snd_max
2043 * then advance tp->snd_una to th->th_ack and drop
2044 * data from the retransmission queue. If this ACK reflects
2045 * more up to date window information we update our window information.
2046 */
2047 case TCPS_ESTABLISHED:
2048 case TCPS_FIN_WAIT_1:
2049 case TCPS_FIN_WAIT_2:
2050 case TCPS_CLOSE_WAIT:
2051 case TCPS_CLOSING:
2052 case TCPS_LAST_ACK:
2053 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2054 goto dropafterack;
2055 }
2056
2057 if ((tp->t_flags & TF_SACK_PERMIT) &&
2058 ((to.to_flags & TOF_SACK) ||
2059 !TAILQ_EMPTY(&tp->snd_holes)))
2060 tcp_sack_doack(tp, &to, th->th_ack);
2061
2062 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2063 if (tlen == 0 && tiwin == tp->snd_wnd) {
2064 /*
2065 * If this is the first time we've seen a
2066 * FIN from the remote, this is not a
2067 * duplicate and it needs to be processed
2068 * normally. This happens during a
2069 * simultaneous close.
2070 */
2071 if ((thflags & TH_FIN) &&
2072 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
2073 tp->t_dupacks = 0;
2074 break;
2075 }
2076 /*
2077 * If we have outstanding data (other than
2078 * a window probe), this is a completely
2079 * duplicate ack (ie, window info didn't
2080 * change and FIN isn't set),
2081 * the ack is the biggest we've
2082 * seen and we've seen exactly our rexmt
2083 * threshold of them, assume a packet
2084 * has been dropped and retransmit it.
2085 * Kludge snd_nxt & the congestion
2086 * window so we send only this one
2087 * packet.
2088 *
2089 * We know we're losing at the current
2090 * window size so do congestion avoidance
2091 * (set ssthresh to half the current window
2092 * and pull our congestion window back to
2093 * the new ssthresh).
2094 *
2095 * Dup acks mean that packets have left the
2096 * network (they're now cached at the receiver)
2097 * so bump cwnd by the amount in the receiver
2098 * to keep a constant cwnd packets in the
2099 * network.
2100 *
2101 * When using TCP ECN, notify the peer that
2102 * we reduced the cwnd.
2103 */
2104 if (!tcp_timer_active(tp, TT_REXMT) ||
2105 th->th_ack != tp->snd_una)
2106 tp->t_dupacks = 0;
2107 else if (++tp->t_dupacks > tcprexmtthresh ||
2108 IN_FASTRECOVERY(tp->t_flags)) {
2109 cc_ack_received(tp, th, CC_DUPACK);
2110 if ((tp->t_flags & TF_SACK_PERMIT) &&
2111 IN_FASTRECOVERY(tp->t_flags)) {
2112 int awnd;
2113
2114 /*
2115 * Compute the amount of data in flight first.
2116 * We can inject new data into the pipe iff
2117 * we have less than 1/2 the original window's
2118 * worth of data in flight.
2119 */
2120 awnd = (tp->snd_nxt - tp->snd_fack) +
2121 tp->sackhint.sack_bytes_rexmit;
2122 if (awnd < tp->snd_ssthresh) {
2123 tp->snd_cwnd += tp->t_maxseg;
2124 if (tp->snd_cwnd > tp->snd_ssthresh)
2125 tp->snd_cwnd = tp->snd_ssthresh;
2126 }
2127 } else
2128 tp->snd_cwnd += tp->t_maxseg;
2129 #ifdef INSTRUMENT_TCP
2130 tcplp_sys_log("TCP DUPACK");
2131 #endif
2132 (void) tcp_output(tp);
2133 goto drop;
2134 } else if (tp->t_dupacks == tcprexmtthresh) {
2135 tcp_seq onxt = tp->snd_nxt;
2136
2137 /*
2138 * If we're doing sack, check to
2139 * see if we're already in sack
2140 * recovery. If we're not doing sack,
2141 * check to see if we're in newreno
2142 * recovery.
2143 */
2144 if (tp->t_flags & TF_SACK_PERMIT) {
2145 if (IN_FASTRECOVERY(tp->t_flags)) {
2146 tp->t_dupacks = 0;
2147 break;
2148 }
2149 } else {
2150 if (SEQ_LEQ(th->th_ack,
2151 tp->snd_recover)) {
2152 tp->t_dupacks = 0;
2153 break;
2154 }
2155 }
2156 /* Congestion signal before ack. */
2157 cc_cong_signal(tp, th, CC_NDUPACK);
2158 cc_ack_received(tp, th, CC_DUPACK);
2159 tcp_timer_activate(tp, TT_REXMT, 0);
2160 tp->t_rtttime = 0;
2161
2162 #ifdef INSTRUMENT_TCP
2163 tcplp_sys_log("TCP DUPACK_THRESH");
2164 #endif
2165 if (tp->t_flags & TF_SACK_PERMIT) {
2166 tp->sack_newdata = tp->snd_nxt;
2167 tp->snd_cwnd = tp->t_maxseg;
2168 (void) tcp_output(tp);
2169 goto drop;
2170 }
2171
2172 tp->snd_nxt = th->th_ack;
2173 tp->snd_cwnd = tp->t_maxseg;
2174 (void) tcp_output(tp);
2175 /*
2176 * samkumar: I added casts to uint64_t below to
2177 * fix an OpenThread code scanning alert relating
2178 * to integer overflow in multiplication.
2179 */
2180 tp->snd_cwnd = tp->snd_ssthresh +
2181 ((uint64_t) tp->t_maxseg) *
2182 ((uint64_t) (tp->t_dupacks - tp->snd_limited));
2183 #ifdef INSTRUMENT_TCP
2184 tcplp_sys_log("TCP SET_cwnd %d", (int) tp->snd_cwnd);
2185 #endif
2186 if (SEQ_GT(onxt, tp->snd_nxt))
2187 tp->snd_nxt = onxt;
2188 goto drop;
2189 } else if (V_tcp_do_rfc3042) {
2190 /*
2191 * Process first and second duplicate
2192 * ACKs. Each indicates a segment
2193 * leaving the network, creating room
2194 * for more. Make sure we can send a
2195 * packet on reception of each duplicate
2196 * ACK by increasing snd_cwnd by one
2197 * segment. Restore the original
2198 * snd_cwnd after packet transmission.
2199 */
2200 uint64_t oldcwnd;
2201 tcp_seq oldsndmax;
2202 uint32_t sent;
2203 int avail;
2204 cc_ack_received(tp, th, CC_DUPACK);
2205 oldcwnd = tp->snd_cwnd;
2206 oldsndmax = tp->snd_max;
2207
2208 #ifdef INSTRUMENT_TCP
2209 tcplp_sys_log("TCP LIM_TRANS");
2210 #endif
2211
2212 KASSERT(tp->t_dupacks == 1 ||
2213 tp->t_dupacks == 2,
2214 ("%s: dupacks not 1 or 2",
2215 __func__));
2216 if (tp->t_dupacks == 1)
2217 tp->snd_limited = 0;
2218 tp->snd_cwnd =
2219 (tp->snd_nxt - tp->snd_una) +
2220 (tp->t_dupacks - tp->snd_limited) *
2221 tp->t_maxseg;
2222 /*
2223 * Only call tcp_output when there
2224 * is new data available to be sent.
2225 * Otherwise we would send pure ACKs.
2226 */
2227 /*
2228 * samkumar: Replace sbavail(&so->so_snd) with the call to
2229 * lbuf_used_space.
2230 */
2231 avail = lbuf_used_space(&tp->sendbuf) -
2232 (tp->snd_nxt - tp->snd_una);
2233 if (avail > 0)
2234 (void) tcp_output(tp);
2235 sent = tp->snd_max - oldsndmax;
2236 if (sent > tp->t_maxseg) {
2237 KASSERT((tp->t_dupacks == 2 &&
2238 tp->snd_limited == 0) ||
2239 (sent == tp->t_maxseg + 1 &&
2240 tp->t_flags & TF_SENTFIN),
2241 ("%s: sent too much",
2242 __func__));
2243 tp->snd_limited = 2;
2244 } else if (sent > 0)
2245 ++tp->snd_limited;
2246 tp->snd_cwnd = oldcwnd;
2247 #ifdef INSTRUMENT_TCP
2248 tcplp_sys_log("TCP RESET_cwnd %d", (int) tp->snd_cwnd);
2249 #endif
2250 goto drop;
2251 }
2252 } else
2253 tp->t_dupacks = 0;
2254 break;
2255 }
2256
2257 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2258 ("%s: th_ack <= snd_una", __func__));
2259
2260 /*
2261 * If the congestion window was inflated to account
2262 * for the other side's cached packets, retract it.
2263 */
2264 if (IN_FASTRECOVERY(tp->t_flags)) {
2265 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2266 if (tp->t_flags & TF_SACK_PERMIT)
2267 tcp_sack_partialack(tp, th);
2268 else
2269 tcp_newreno_partial_ack(tp, th);
2270 } else
2271 cc_post_recovery(tp, th);
2272 }
2273
2274 tp->t_dupacks = 0;
2275 /*
2276 * If we reach this point, ACK is not a duplicate,
2277 * i.e., it ACKs something we sent.
2278 */
2279 if (tp->t_flags & TF_NEEDSYN) {
2280 /*
2281 * T/TCP: Connection was half-synchronized, and our
2282 * SYN has been ACK'd (so connection is now fully
2283 * synchronized). Go to non-starred state,
2284 * increment snd_una for ACK of SYN, and check if
2285 * we can do window scaling.
2286 */
2287 tp->t_flags &= ~TF_NEEDSYN;
2288 tp->snd_una++;
2289 /* Do window scaling? */
2290 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2291 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2292 tp->rcv_scale = tp->request_r_scale;
2293 /* Send window already scaled. */
2294 }
2295 }
2296
2297 process_ACK:
2298 acked = BYTES_THIS_ACK(tp, th);
2299
2300 tcplp_sys_log("Bytes acked: %d", acked);
2301 /*
2302 * If we just performed our first retransmit, and the ACK
2303 * arrives within our recovery window, then it was a mistake
2304 * to do the retransmit in the first place. Recover our
2305 * original cwnd and ssthresh, and proceed to transmit where
2306 * we left off.
2307 */
2308 if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
2309 (int)(ticks - tp->t_badrxtwin) < 0)
2310 cc_cong_signal(tp, th, CC_RTO_ERR);
2311
2312 /*
2313 * If we have a timestamp reply, update smoothed
2314 * round trip time. If no timestamp is present but
2315 * transmit timer is running and timed sequence
2316 * number was acked, update smoothed round trip time.
2317 * Since we now have an rtt measurement, cancel the
2318 * timer backoff (cf., Phil Karn's retransmit alg.).
2319 * Recompute the initial retransmit timer.
2320 *
2321 * Some boxes send broken timestamp replies
2322 * during the SYN+ACK phase, ignore
2323 * timestamps of 0 or we could calculate a
2324 * huge RTT and blow up the retransmit timer.
2325 */
2326
2327 if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
2328 uint32_t t;
2329
2330 t = tcp_ts_getticks() - to.to_tsecr;
2331 if (!tp->t_rttlow || tp->t_rttlow > t)
2332 tp->t_rttlow = t;
2333 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
2334 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2335 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2336 tp->t_rttlow = ticks - tp->t_rtttime;
2337 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2338 }
2339
2340 /*
2341 * If all outstanding data is acked, stop retransmit
2342 * timer and remember to restart (more output or persist).
2343 * If there is more data to be acked, restart retransmit
2344 * timer, using current (possibly backed-off) value.
2345 */
2346 if (th->th_ack == tp->snd_max) {
2347 tcp_timer_activate(tp, TT_REXMT, 0);
2348 needoutput = 1;
2349 } else if (!tcp_timer_active(tp, TT_PERSIST)) {
2350 tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
2351 }
2352
2353 /*
2354 * If no data (only SYN) was ACK'd,
2355 * skip rest of ACK processing.
2356 */
2357 if (acked == 0)
2358 goto step6;
2359
2360 /*
2361 * Let the congestion control algorithm update congestion
2362 * control related information. This typically means increasing
2363 * the congestion window.
2364 */
2365 cc_ack_received(tp, th, CC_ACK);
2366
2367 /*
2368 * samkumar: I replaced the calls to sbavail(&so->so_snd) with new
2369 * calls to lbuf_used_space, and then I modified the code to actually
2370 * remove data from the send buffer, formerly done via
2371 * sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)) in the if case
2372 * and sbcut_locked(&so->so_snd, acked) in the else case, to use the
2373 * data structures for TCPlp's data buffering.
2374 */
2375 if (acked > lbuf_used_space(&tp->sendbuf)) {
2376 uint32_t poppedbytes;
2377 uint32_t usedspace = lbuf_used_space(&tp->sendbuf);
2378 tp->snd_wnd -= usedspace;
2379 poppedbytes = lbuf_pop(&tp->sendbuf, usedspace, &sig->links_popped);
2380 KASSERT(poppedbytes == usedspace, ("Could not fully empty send buffer"));
2381 sig->bytes_acked += poppedbytes;
2382 ourfinisacked = 1;
2383 } else {
2384 uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped);
2385 KASSERT(poppedbytes == acked, ("Could not remove acked bytes from send buffer"));
2386 sig->bytes_acked += poppedbytes;
2387 tp->snd_wnd -= acked;
2388 ourfinisacked = 0;
2389 }
2390 /* NB: sowwakeup_locked() does an implicit unlock. */
2391 /*
2392 * samkumar: There used to be a call to sowwakeup(so); here,
2393 * which wakes up any threads waiting for the socket to
2394 * become ready for writing. TCPlp handles its send buffer
2395 * differently so we do not need to replace this call with
2396 * specialized code to handle this.
2397 */
2398 /* Detect una wraparound. */
2399 if (!IN_RECOVERY(tp->t_flags) &&
2400 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2401 SEQ_LEQ(th->th_ack, tp->snd_recover))
2402 tp->snd_recover = th->th_ack - 1;
2403 /* XXXLAS: Can this be moved up into cc_post_recovery? */
2404 if (IN_RECOVERY(tp->t_flags) &&
2405 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2406 EXIT_RECOVERY(tp->t_flags);
2407 }
2408 tp->snd_una = th->th_ack;
2409 if (tp->t_flags & TF_SACK_PERMIT) {
2410 if (SEQ_GT(tp->snd_una, tp->snd_recover))
2411 tp->snd_recover = tp->snd_una;
2412 }
2413 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2414 tp->snd_nxt = tp->snd_una;
2415
2416 switch (tp->t_state) {
2417
2418 /*
2419 * In FIN_WAIT_1 STATE in addition to the processing
2420 * for the ESTABLISHED state if our FIN is now acknowledged
2421 * then enter FIN_WAIT_2.
2422 */
2423 case TCPS_FIN_WAIT_1:
2424 if (ourfinisacked) {
2425 /*
2426 * If we can't receive any more
2427 * data, then closing user can proceed.
2428 * Starting the timer is contrary to the
2429 * specification, but if we don't get a FIN
2430 * we'll hang forever.
2431 *
2432 * XXXjl:
2433 * we should release the tp also, and use a
2434 * compressed state.
2435 */
2436 /*
2437 * samkumar: I replaced a check for the SBS_CANTRCVMORE flag
2438 * in so->so_rcv.sb_state with a call to tpiscantrcv.
2439 */
2440 if (tpiscantrcv(tp)) {
2441 /* samkumar: Removed a call to soisdisconnected(so). */
2442 tcp_timer_activate(tp, TT_2MSL,
2443 (tcp_fast_finwait2_recycle ?
2444 tcp_finwait2_timeout :
2445 TP_MAXIDLE(tp)));
2446 }
2447 tcp_state_change(tp, TCPS_FIN_WAIT_2);
2448 }
2449 break;
2450
2451 /*
2452 * In CLOSING STATE in addition to the processing for
2453 * the ESTABLISHED state if the ACK acknowledges our FIN
2454 * then enter the TIME-WAIT state, otherwise ignore
2455 * the segment.
2456 */
2457 case TCPS_CLOSING:
2458 if (ourfinisacked) {
2459 /*
2460 * samkumar: I added the line below. We need to avoid sending
2461 * an ACK in the TIME-WAIT state, since we don't want to
2462 * ACK ACKs. This edge case appears because TCPlp, unlike the
2463 * original FreeBSD code, uses tcpcbs for connections in the
2464 * TIME-WAIT state (FreeBSD uses a different, smaller
2465 * structure).
2466 */
2467 tp->t_flags &= ~TF_ACKNOW;
2468 tcp_twstart(tp);
2469 return;
2470 }
2471 break;
2472
2473 /*
2474 * In LAST_ACK, we may still be waiting for data to drain
2475 * and/or to be acked, as well as for the ack of our FIN.
2476 * If our FIN is now acknowledged, delete the TCB,
2477 * enter the closed state and return.
2478 */
2479 case TCPS_LAST_ACK:
2480 if (ourfinisacked) {
2481 tp = tcp_close(tp);
2482 tcplp_sys_connection_lost(tp, CONN_LOST_NORMAL);
2483 goto drop;
2484 }
2485 break;
2486 }
2487 }
2488
2489 step6:
2490
2491 /*
2492 * Update window information.
2493 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2494 */
2495 if ((thflags & TH_ACK) &&
2496 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2497 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
2498 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
2499 /* keep track of pure window updates */
2500 /*
2501 * samkumar: There used to be an if statement here that would check if
2502 * this is a "pure" window update (tlen == 0 &&
2503 * tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) and keep
2504 * statistics for how often that happens.
2505 */
2506 tp->snd_wnd = tiwin;
2507 tp->snd_wl1 = th->th_seq;
2508 tp->snd_wl2 = th->th_ack;
2509 if (tp->snd_wnd > tp->max_sndwnd)
2510 tp->max_sndwnd = tp->snd_wnd;
2511 needoutput = 1;
2512 }
2513
2514 /*
2515 * Process segments with URG.
2516 */
2517 /*
2518 * samkumar: TCPlp does not support the urgent pointer, so we omit all
2519 * urgent-pointer-related processing and buffering. The code below is the
2520 * code that was in the "else" case that handles no valid urgent data in
2521 * the received packet.
2522 */
2523 {
2524 /*
2525 * If no out of band data is expected,
2526 * pull receive urgent pointer along
2527 * with the receive window.
2528 */
2529 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2530 tp->rcv_up = tp->rcv_nxt;
2531 }
2532
2533 /*
2534 * Process the segment text, merging it into the TCP sequencing queue,
2535 * and arranging for acknowledgment of receipt if necessary.
2536 * This process logically involves adjusting tp->rcv_wnd as data
2537 * is presented to the user (this happens in tcp_usrreq.c,
2538 * case PRU_RCVD). If a FIN has already been received on this
2539 * connection then we just ignore the text.
2540 */
2541 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
2542 IS_FASTOPEN(tp->t_flags));
2543 if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
2544 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2545 tcp_seq save_start = th->th_seq;
2546 /*
2547 * samkumar: I removed a call to m_adj(m, drop_hdrlen), which intends
2548 * to drop data from the mbuf so it can be chained into the receive
2549 * header. This is not necessary for TCPlp because we copy the data
2550 * anyway; we just add the offset when copying data into the receive
2551 * buffer.
2552 */
2553 /*
2554 * Insert segment which includes th into TCP reassembly queue
2555 * with control block tp. Set thflags to whether reassembly now
2556 * includes a segment with FIN. This handles the common case
2557 * inline (segment is the next to be received on an established
2558 * connection, and the queue is empty), avoiding linkage into
2559 * and removal from the queue and repetition of various
2560 * conversions.
2561 * Set DELACK for segments received in order, but ack
2562 * immediately when segments are out of order (so
2563 * fast retransmit can work).
2564 */
2565 /*
2566 * samkumar: I replaced LIST_EMPTY(&tp->t_segq) with the calls to
2567 * tpiscantrcv and bmp_isempty on the second line below.
2568 */
2569 if (th->th_seq == tp->rcv_nxt &&
2570 (tpiscantrcv(tp) || bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp))) &&
2571 (TCPS_HAVEESTABLISHED(tp->t_state) ||
2572 tfo_syn)) {
2573 if (DELAY_ACK(tp, tlen) || tfo_syn)
2574 tp->t_flags |= TF_DELACK;
2575 else
2576 tp->t_flags |= TF_ACKNOW;
2577 tp->rcv_nxt += tlen;
2578 thflags = th->th_flags & TH_FIN;
2579
2580 /*
2581 * samkumar: I replaced the code that used to be here (which would
2582 * free the mbuf with m_freem(m) if the SBS_CANTRCVMORE flag is set
2583 * on so->so_rcv.sb_state, and otherwise call
2584 * sbappendstream_locked(&so->so_rcv, m, 0);).
2585 */
2586 if (!tpiscantrcv(tp)) {
2587 cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message);
2588 if (tlen > 0) {
2589 sig->recvbuf_added = true;
2590 }
2591 } else if (tlen > 0) {
2592 /*
2593 * samkumar: We already know tlen != 0, so if we got here, then
2594 * it means that we got data after we called SHUT_RD, or after
2595 * receiving a FIN. I'm going to drop the connection in this
2596 * case. I think FreeBSD might have just dropped the packet
2597 * silently, but Linux handles it this way; this seems to be
2598 * the right approach to me.
2599 */
2600 tcp_drop(tp, ECONNABORTED);
2601 goto drop;
2602 }
2603 /* NB: sorwakeup_locked() does an implicit unlock. */
2604 /*
2605 * samkumar: There used to be a call to sorwakeup_locked(so); here,
2606 * which wakes up any threads waiting for the socket to become
2607 * ready for reading. TCPlp handles its buffering
2608 * differently so we do not need to replace this call with
2609 * specialized code to handle this.
2610 */
2611 } else if (tpiscantrcv(tp)) {
2612 /*
2613 * samkumar: We will reach this point if we get out-of-order data
2614 * on a socket which was shut down with SHUT_RD, or where we
2615 * already received a FIN. My response here is to drop the segment
2616 * and send an RST.
2617 */
2618 tcp_drop(tp, ECONNABORTED);
2619 goto drop;
2620 } else {
2621 /*
2622 * XXX: Due to the header drop above "th" is
2623 * theoretically invalid by now. Fortunately
2624 * m_adj() doesn't actually free any mbufs
2625 * when trimming from the head.
2626 */
2627 thflags = tcp_reass(tp, th, &tlen, msg, otMessageGetOffset(msg) + drop_hdrlen, sig);
2628 tp->t_flags |= TF_ACKNOW;
2629 }
2630 // Only place tlen is used after the call to tcp_reass is below
2631 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
2632 tcp_update_sack_list(tp, save_start, save_start + tlen);
2633 /*
2634 * samkumar: This is not me commenting things out; this was already
2635 * commented out in the FreeBSD code.
2636 */
2637 #if 0
2638 /*
2639 * Note the amount of data that peer has sent into
2640 * our window, in order to estimate the sender's
2641 * buffer size.
2642 * XXX: Unused.
2643 */
2644 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
2645 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2646 else
2647 len = so->so_rcv.sb_hiwat;
2648 #endif
2649 } else {
2650 thflags &= ~TH_FIN;
2651 }
2652
2653 /*
2654 * If FIN is received ACK the FIN and let the user know
2655 * that the connection is closing.
2656 */
2657 if (thflags & TH_FIN) {
2658 tcplp_sys_log("FIN Processing start");
2659 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2660 /* samkumar: replace socantrcvmore with tpcantrcvmore */
2661 tpcantrcvmore(tp);
2662 /*
2663 * If connection is half-synchronized
2664 * (ie NEEDSYN flag on) then delay ACK,
2665 * so it may be piggybacked when SYN is sent.
2666 * Otherwise, since we received a FIN then no
2667 * more input can be expected, send ACK now.
2668 */
2669 if (tp->t_flags & TF_NEEDSYN)
2670 tp->t_flags |= TF_DELACK;
2671 else
2672 tp->t_flags |= TF_ACKNOW;
2673 tp->rcv_nxt++;
2674 }
2675 /*
2676 * samkumar: This -2 state is added by me, so that we do not consider
2677 * any more FINs in reassembly.
2678 */
2679 if (tp->reass_fin_index != -2) {
2680 sig->rcvd_fin = true;
2681 tp->reass_fin_index = -2;
2682 }
2683 switch (tp->t_state) {
2684
2685 /*
2686 * In SYN_RECEIVED and ESTABLISHED STATES
2687 * enter the CLOSE_WAIT state.
2688 */
2689 case TCPS_SYN_RECEIVED:
2690 tp->t_starttime = ticks;
2691 /* FALLTHROUGH */
2692 case TCPS_ESTABLISHED:
2693 tcp_state_change(tp, TCPS_CLOSE_WAIT);
2694 break;
2695
2696 /*
2697 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2698 * enter the CLOSING state.
2699 */
2700 case TCPS_FIN_WAIT_1:
2701 tcp_state_change(tp, TCPS_CLOSING);
2702 break;
2703
2704 /*
2705 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2706 * starting the time-wait timer, turning off the other
2707 * standard timers.
2708 */
2709 case TCPS_FIN_WAIT_2:
2710 tcp_twstart(tp);
2711 return;
2712 }
2713 }
2714
2715 /*
2716 * samkumar: Remove code for synchronization and debugging, here and in
2717 * the labels below. I also removed the line to free the mbuf if it hasn't
2718 * been freed already (the line was "m_freem(m)").
2719 */
2720 /*
2721 * Return any desired output.
2722 */
2723 if (needoutput || (tp->t_flags & TF_ACKNOW))
2724 (void) tcp_output(tp);
2725
2726 check_delack:
2727 if (tp->t_flags & TF_DELACK) {
2728 tp->t_flags &= ~TF_DELACK;
2729 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
2730 }
2731 return;
2732
2733 dropafterack:
2734 /*
2735 * Generate an ACK dropping incoming segment if it occupies
2736 * sequence space, where the ACK reflects our state.
2737 *
2738 * We can now skip the test for the RST flag since all
2739 * paths to this code happen after packets containing
2740 * RST have been dropped.
2741 *
2742 * In the SYN-RECEIVED state, don't send an ACK unless the
2743 * segment we received passes the SYN-RECEIVED ACK test.
2744 * If it fails send a RST. This breaks the loop in the
2745 * "LAND" DoS attack, and also prevents an ACK storm
2746 * between two listening ports that have been sent forged
2747 * SYN segments, each with the source address of the other.
2748 */
2749 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
2750 (SEQ_GT(tp->snd_una, th->th_ack) ||
2751 SEQ_GT(th->th_ack, tp->snd_max)) ) {
2752 rstreason = BANDLIM_RST_OPENPORT;
2753 goto dropwithreset;
2754 }
2755
2756 tp->t_flags |= TF_ACKNOW;
2757 (void) tcp_output(tp);
2758 return;
2759
2760 dropwithreset:
2761 if (tp != NULL) {
2762 tcp_dropwithreset(ip6, th, tp, instance, tlen, rstreason);
2763 } else
2764 tcp_dropwithreset(ip6, th, NULL, instance, tlen, rstreason);
2765 return;
2766
2767 drop:
2768 return;
2769 }
2770
2771 /*
2772 * Parse TCP options and place in tcpopt.
2773 */
2774 static void
tcp_dooptions(struct tcpopt * to,uint8_t * cp,int cnt,int flags)2775 tcp_dooptions(struct tcpopt *to, uint8_t *cp, int cnt, int flags)
2776 {
2777 int opt, optlen;
2778
2779 to->to_flags = 0;
2780 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2781 opt = cp[0];
2782 if (opt == TCPOPT_EOL)
2783 break;
2784 if (opt == TCPOPT_NOP)
2785 optlen = 1;
2786 else {
2787 if (cnt < 2)
2788 break;
2789 optlen = cp[1];
2790 if (optlen < 2 || optlen > cnt)
2791 break;
2792 }
2793 switch (opt) {
2794 case TCPOPT_MAXSEG:
2795 if (optlen != TCPOLEN_MAXSEG)
2796 continue;
2797 if (!(flags & TO_SYN))
2798 continue;
2799 to->to_flags |= TOF_MSS;
2800 bcopy((char *)cp + 2,
2801 (char *)&to->to_mss, sizeof(to->to_mss));
2802 to->to_mss = ntohs(to->to_mss);
2803 break;
2804 case TCPOPT_WINDOW:
2805 if (optlen != TCPOLEN_WINDOW)
2806 continue;
2807 if (!(flags & TO_SYN))
2808 continue;
2809 to->to_flags |= TOF_SCALE;
2810 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
2811 break;
2812 case TCPOPT_TIMESTAMP:
2813 if (optlen != TCPOLEN_TIMESTAMP)
2814 continue;
2815 to->to_flags |= TOF_TS;
2816 bcopy((char *)cp + 2,
2817 (char *)&to->to_tsval, sizeof(to->to_tsval));
2818 to->to_tsval = ntohl(to->to_tsval);
2819 bcopy((char *)cp + 6,
2820 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
2821 to->to_tsecr = ntohl(to->to_tsecr);
2822 break;
2823 #ifdef TCP_SIGNATURE
2824 /*
2825 * XXX In order to reply to a host which has set the
2826 * TCP_SIGNATURE option in its initial SYN, we have to
2827 * record the fact that the option was observed here
2828 * for the syncache code to perform the correct response.
2829 */
2830 case TCPOPT_SIGNATURE:
2831 if (optlen != TCPOLEN_SIGNATURE)
2832 continue;
2833 to->to_flags |= TOF_SIGNATURE;
2834 to->to_signature = cp + 2;
2835 break;
2836 #endif
2837 case TCPOPT_SACK_PERMITTED:
2838 if (optlen != TCPOLEN_SACK_PERMITTED)
2839 continue;
2840 if (!(flags & TO_SYN))
2841 continue;
2842 if (!V_tcp_do_sack)
2843 continue;
2844 to->to_flags |= TOF_SACKPERM;
2845 break;
2846 case TCPOPT_SACK:
2847 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2848 continue;
2849 if (flags & TO_SYN)
2850 continue;
2851 to->to_flags |= TOF_SACK;
2852 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
2853 to->to_sacks = cp + 2;
2854 break;
2855 case TCPOPT_FAST_OPEN:
2856 /*
2857 * Cookie length validation is performed by the
2858 * server side cookie checking code or the client
2859 * side cookie cache update code.
2860 */
2861 if (!(flags & TO_SYN))
2862 continue;
2863 if (!V_tcp_fastopen_client_enable &&
2864 !V_tcp_fastopen_server_enable)
2865 continue;
2866 to->to_flags |= TOF_FASTOPEN;
2867 to->to_tfo_len = optlen - 2;
2868 to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
2869 break;
2870 default:
2871 continue;
2872 }
2873 }
2874 }
2875
2876
2877 /*
2878 * Collect new round-trip time estimate
2879 * and update averages and current timeout.
2880 */
2881 static void
tcp_xmit_timer(struct tcpcb * tp,int rtt)2882 tcp_xmit_timer(struct tcpcb *tp, int rtt)
2883 {
2884 int delta;
2885
2886 tp->t_rttupdated++;
2887 if (tp->t_srtt != 0) {
2888 /*
2889 * srtt is stored as fixed point with 5 bits after the
2890 * binary point (i.e., scaled by 8). The following magic
2891 * is equivalent to the smoothing algorithm in rfc793 with
2892 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2893 * point). Adjust rtt to origin 0.
2894 */
2895 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
2896 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
2897
2898 if ((tp->t_srtt += delta) <= 0)
2899 tp->t_srtt = 1;
2900
2901 /*
2902 * We accumulate a smoothed rtt variance (actually, a
2903 * smoothed mean difference), then set the retransmit
2904 * timer to smoothed rtt + 4 times the smoothed variance.
2905 * rttvar is stored as fixed point with 4 bits after the
2906 * binary point (scaled by 16). The following is
2907 * equivalent to rfc793 smoothing with an alpha of .75
2908 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2909 * rfc793's wired-in beta.
2910 */
2911 if (delta < 0)
2912 delta = -delta;
2913 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
2914 if ((tp->t_rttvar += delta) <= 0)
2915 tp->t_rttvar = 1;
2916 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
2917 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2918 } else {
2919 /*
2920 * No rtt measurement yet - use the unsmoothed rtt.
2921 * Set the variance to half the rtt (so our first
2922 * retransmit happens at 3*rtt).
2923 */
2924 tp->t_srtt = rtt << TCP_RTT_SHIFT;
2925 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
2926 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
2927 }
2928 tp->t_rtttime = 0;
2929 tp->t_rxtshift = 0;
2930
2931 /*
2932 * the retransmit should happen at rtt + 4 * rttvar.
2933 * Because of the way we do the smoothing, srtt and rttvar
2934 * will each average +1/2 tick of bias. When we compute
2935 * the retransmit timer, we want 1/2 tick of rounding and
2936 * 1 extra tick because of +-1/2 tick uncertainty in the
2937 * firing of the timer. The bias will give us exactly the
2938 * 1.5 tick we need. But, because the bias is
2939 * statistical, we have to test that we don't drop below
2940 * the minimum feasible timer (which is 2 ticks).
2941 */
2942 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
2943 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
2944
2945 #ifdef INSTRUMENT_TCP
2946 tcplp_sys_log("TCP timer %u %d %d %d", (unsigned int) tcplp_sys_get_millis(), rtt, (int) tp->t_srtt, (int) tp->t_rttvar);
2947 #endif
2948
2949
2950 /*
2951 * We received an ack for a packet that wasn't retransmitted;
2952 * it is probably safe to discard any error indications we've
2953 * received recently. This isn't quite right, but close enough
2954 * for now (a route might have failed after we sent a segment,
2955 * and the return path might not be symmetrical).
2956 */
2957 tp->t_softerror = 0;
2958 }
2959
/*
 * samkumar: Adapted from netinet6/in6.c.
 *
 * Report whether an IPv6 address obviously belongs to this host.
 *
 * Returns 1 if in6 is the loopback address or a link-local address, and 0
 * for everything else.
 *
 * The upstream function also checked the addresses assigned to interfaces;
 * that part was removed because it is difficult to determine here whether
 * an address is really ours.  The result is only used as a hint: the MSS is
 * clamped at V_tcp_v6mssdflt for connections to non-local addresses.
 * Returning 0 where upstream would have returned 1 therefore only risks a
 * conservative clamp, which should be OK for TCPlp.  In fact the constants
 * are set such that the outcome is the same whether the clamp is applied or
 * not, so this should not matter at all.
 */
int
in6_localaddr(struct in6_addr *in6)
{
	if (!IN6_IS_ADDR_LOOPBACK(in6) && !IN6_IS_ADDR_LINKLOCAL(in6))
		return (0);
	return (1);
}
2982
2983 /*
2984 * Determine a reasonable value for maxseg size.
2985 * If the route is known, check route for mtu.
2986 * If none, use an mss that can be handled on the outgoing interface
2987 * without forcing IP to fragment. If no route is found, route has no mtu,
2988 * or the destination isn't local, use a default, hopefully conservative
2989 * size (usually 512 or the default IP max size, but no more than the mtu
2990 * of the interface), as we can't discover anything about intervening
2991 * gateways or networks. We also initialize the congestion/slow start
2992 * window to be a single segment if the destination isn't local.
2993 * While looking at the routing entry, we also initialize other path-dependent
2994 * parameters from pre-set or cached values in the routing entry.
2995 *
2996 * Also take into account the space needed for options that we
2997 * send regularly. Make maxseg shorter by that amount to assure
2998 * that we can send maxseg amount of data even when the options
2999 * are present. Store the upper limit of the length of options plus
3000 * data in maxopd.
3001 *
3002 * NOTE that this routine is only called when we process an incoming
3003 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
3004 * settings are handled in tcp_mssopt().
3005 */
3006 /*
3007 * samkumar: Using struct tcpcb instead of the inpcb.
3008 */
3009 void
tcp_mss_update(struct tcpcb * tp,int offer,int mtuoffer,struct hc_metrics_lite * metricptr,struct tcp_ifcap * cap)3010 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
3011 struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
3012 {
3013 /*
3014 * samkumar: I removed all IPv4-specific logic and cases, including logic
3015 * to check for IPv4 vs. IPv6, as well as all locking and debugging code.
3016 */
3017 int mss = 0;
3018 uint64_t maxmtu = 0;
3019 struct hc_metrics_lite metrics;
3020 int origoffer;
3021 size_t min_protoh = IP6HDR_SIZE + sizeof (struct tcphdr);
3022
3023 if (mtuoffer != -1) {
3024 KASSERT(offer == -1, ("%s: conflict", __func__));
3025 offer = mtuoffer - min_protoh;
3026 }
3027 origoffer = offer;
3028
3029 maxmtu = tcp_maxmtu6(tp, cap);
3030 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
3031
3032 /*
3033 * No route to sender, stay with default mss and return.
3034 */
3035 if (maxmtu == 0) {
3036 /*
3037 * In case we return early we need to initialize metrics
3038 * to a defined state as tcp_hc_get() would do for us
3039 * if there was no cache hit.
3040 */
3041 if (metricptr != NULL)
3042 bzero(metricptr, sizeof(struct hc_metrics_lite));
3043 return;
3044 }
3045
3046 /* What have we got? */
3047 switch (offer) {
3048 case 0:
3049 /*
3050 * Offer == 0 means that there was no MSS on the SYN
3051 * segment, in this case we use tcp_mssdflt as
3052 * already assigned to t_maxopd above.
3053 */
3054 offer = tp->t_maxopd;
3055 break;
3056
3057 case -1:
3058 /*
3059 * Offer == -1 means that we didn't receive SYN yet.
3060 */
3061 /* FALLTHROUGH */
3062
3063 default:
3064 /*
3065 * Prevent DoS attack with too small MSS. Round up
3066 * to at least minmss.
3067 */
3068 offer = max(offer, V_tcp_minmss);
3069 }
3070
3071 /*
3072 * rmx information is now retrieved from tcp_hostcache.
3073 */
3074 tcp_hc_get(tp, &metrics);
3075 if (metricptr != NULL)
3076 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
3077
3078 /*
3079 * If there's a discovered mtu in tcp hostcache, use it.
3080 * Else, use the link mtu.
3081 */
3082 if (metrics.rmx_mtu)
3083 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
3084 else {
3085 mss = maxmtu - min_protoh;
3086 if (!V_path_mtu_discovery &&
3087 !in6_localaddr(&tp->faddr))
3088 mss = min(mss, V_tcp_v6mssdflt);
3089 /*
3090 * XXX - The above conditional (mss = maxmtu - min_protoh)
3091 * probably violates the TCP spec.
3092 * The problem is that, since we don't know the
3093 * other end's MSS, we are supposed to use a conservative
3094 * default. But, if we do that, then MTU discovery will
3095 * never actually take place, because the conservative
3096 * default is much less than the MTUs typically seen
3097 * on the Internet today. For the moment, we'll sweep
3098 * this under the carpet.
3099 *
3100 * The conservative default might not actually be a problem
3101 * if the only case this occurs is when sending an initial
3102 * SYN with options and data to a host we've never talked
3103 * to before. Then, they will reply with an MSS value which
3104 * will get recorded and the new parameters should get
3105 * recomputed. For Further Study.
3106 */
3107 }
3108 mss = min(mss, offer);
3109
3110 /*
3111 * Sanity check: make sure that maxopd will be large
3112 * enough to allow some data on segments even if the
3113 * all the option space is used (40bytes). Otherwise
3114 * funny things may happen in tcp_output.
3115 */
3116 /*
3117 * samkumar: When I was experimenting with different MSS values, I had
3118 * changed this to "mss = max(mss, TCP_MAXOLEN + 1);" but I am changing it
3119 * back for the version that will be merged into OpenThread.
3120 */
3121 mss = max(mss, 64);
3122
3123 /*
3124 * maxopd stores the maximum length of data AND options
3125 * in a segment; maxseg is the amount of data in a normal
3126 * segment. We need to store this value (maxopd) apart
3127 * from maxseg, because now every segment carries options
3128 * and thus we normally have somewhat less data in segments.
3129 */
3130 tp->t_maxopd = mss;
3131
3132 /*
3133 * origoffer==-1 indicates that no segments were received yet.
3134 * In this case we just guess.
3135 */
3136 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3137 (origoffer == -1 ||
3138 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
3139 mss -= TCPOLEN_TSTAMP_APPA;
3140
3141 tp->t_maxseg = mss;
3142 }
3143
3144 void
tcp_mss(struct tcpcb * tp,int offer)3145 tcp_mss(struct tcpcb *tp, int offer)
3146 {
3147 struct hc_metrics_lite metrics;
3148 struct tcp_ifcap cap;
3149
3150 KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
3151
3152 bzero(&cap, sizeof(cap));
3153 tcp_mss_update(tp, offer, -1, &metrics, &cap);
3154
3155 /*
3156 * samkumar: There used to be code below that might modify the MSS, but I
3157 * removed all of it (see the comments below for the reason). It used to
3158 * read tp->t_maxseg into the local variable mss, modify mss, and then
3159 * reassign tp->t_maxseg to mss. I've kept the assignments, commented out,
3160 * for clarity.
3161 */
3162 //mss = tp->t_maxseg;
3163
3164 /*
3165 * If there's a pipesize, change the socket buffer to that size,
3166 * don't change if sb_hiwat is different than default (then it
3167 * has been changed on purpose with setsockopt).
3168 * Make the socket buffers an integral number of mss units;
3169 * if the mss is larger than the socket buffer, decrease the mss.
3170 */
3171
3172 /*
3173 * samkumar: There used to be code here would would limit the MSS to at
3174 * most the size of the send buffer, and then round up the send buffer to
3175 * a multiple of the MSS using
3176 * "sbreserve_locked(&so->so_snd, bufsize, so, NULL);". With TCPlp, we do
3177 * not do this, because the linked buffer used at the send buffer doesn't
3178 * have a real limit. Had we used a circular buffer, then limiting the MSS
3179 * to the buffer size would have made sense, but we still would not be able
3180 * to resize the send buffer because it is not allocated by TCPlp.
3181 */
3182
3183 /*
3184 * samkumar: See the comment above about me removing code that modifies
3185 * the MSS, making this assignment and the one above both unnecessary.
3186 */
3187 //tp->t_maxseg = mss;
3188
3189 /*
3190 * samkumar: There used to be code here that would round up the receive
3191 * buffer size to a multiple of the MSS, assuming that the receive buffer
3192 * size is bigger than the MSS. The new buffer size is set using
3193 * "sbreserve_locked(&so->so_rcv, bufsize, so, NULL);". In TCPlp, the
3194 * buffer is not allocated by TCPlp so I removed the code for this.
3195 */
3196 /*
3197 * samkumar: There used to be code here to handle TCP Segmentation
3198 * Offloading (TSO); I removed it becuase we don't support that in TCPlp.
3199 */
3200 }
3201
3202 /*
3203 * Determine the MSS option to send on an outgoing SYN.
3204 */
3205 /*
3206 * samkumar: In the signature, changed "struct in_conninfo *inc" to
3207 * "struct tcpcb* tp".
3208 */
3209 int
tcp_mssopt(struct tcpcb * tp)3210 tcp_mssopt(struct tcpcb* tp)
3211 {
3212 /*
3213 * samkumar: I removed all processing code specific to IPv4, or to decide
3214 * between IPv4 and IPv6. This is OK because TCPlp assumes IPv6.
3215 */
3216 int mss = 0;
3217 uint64_t maxmtu = 0;
3218 uint64_t thcmtu = 0;
3219 size_t min_protoh;
3220
3221 KASSERT(tp != NULL, ("tcp_mssopt with NULL tcpcb pointer"));
3222
3223 mss = V_tcp_v6mssdflt;
3224 maxmtu = tcp_maxmtu6(tp, NULL);
3225 min_protoh = IP6HDR_SIZE + sizeof(struct tcphdr);
3226
3227 thcmtu = tcp_hc_getmtu(tp); /* IPv4 and IPv6 */
3228
3229 if (maxmtu && thcmtu)
3230 mss = min(maxmtu, thcmtu) - min_protoh;
3231 else if (maxmtu || thcmtu)
3232 mss = max(maxmtu, thcmtu) - min_protoh;
3233
3234 return (mss);
3235 }
3236
3237 /*
3238 * On a partial ack arrives, force the retransmission of the
3239 * next unacknowledged segment. Do not clear tp->t_dupacks.
3240 * By setting snd_nxt to ti_ack, this forces retransmission timer to
3241 * be started again.
3242 */
3243 static void
tcp_newreno_partial_ack(struct tcpcb * tp,struct tcphdr * th)3244 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
3245 {
3246 tcp_seq onxt = tp->snd_nxt;
3247 uint64_t ocwnd = tp->snd_cwnd;
3248
3249 tcp_timer_activate(tp, TT_REXMT, 0);
3250 tp->t_rtttime = 0;
3251 tp->snd_nxt = th->th_ack;
3252 /*
3253 * Set snd_cwnd to one segment beyond acknowledged offset.
3254 * (tp->snd_una has not yet been updated when this function is called.)
3255 */
3256 tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
3257 tp->t_flags |= TF_ACKNOW;
3258 #ifdef INSTRUMENT_TCP
3259 tcplp_sys_log("TCP Partial_ACK");
3260 #endif
3261 (void) tcp_output(tp);
3262 tp->snd_cwnd = ocwnd;
3263 if (SEQ_GT(onxt, tp->snd_nxt))
3264 tp->snd_nxt = onxt;
3265 /*
3266 * Partial window deflation. Relies on fact that tp->snd_una
3267 * not updated yet.
3268 */
3269 if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
3270 tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
3271 else
3272 tp->snd_cwnd = 0;
3273 tp->snd_cwnd += tp->t_maxseg;
3274 #ifdef INSTRUMENT_TCP
3275 tcplp_sys_log("TCP Partial_ACK_final %d", (int) tp->snd_cwnd);
3276 #endif
3277 }
3278