FD.io VPP  v20.05.1-6-gf53edbc3b
Vector Packet Processing
tcp_input.c
1 /*
2  * Copyright (c) 2016-2019 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <vppinfra/sparse_vec.h>
17 #include <vnet/fib/ip4_fib.h>
18 #include <vnet/fib/ip6_fib.h>
19 #include <vnet/tcp/tcp.h>
20 #include <vnet/tcp/tcp_inlines.h>
21 #include <vnet/session/session.h>
22 #include <math.h>
23 
24 static char *tcp_error_strings[] = {
25 #define tcp_error(n,s) s,
26 #include <vnet/tcp/tcp_error.def>
27 #undef tcp_error
28 };
29 
30 /* All TCP nodes have the same outgoing arcs */
31 #define foreach_tcp_state_next \
32  _ (DROP4, "ip4-drop") \
33  _ (DROP6, "ip6-drop") \
34  _ (TCP4_OUTPUT, "tcp4-output") \
35  _ (TCP6_OUTPUT, "tcp6-output")
36 
37 typedef enum _tcp_established_next
38 {
39 #define _(s,n) TCP_ESTABLISHED_NEXT_##s,
 40   foreach_tcp_state_next
 41 #undef _
 42   TCP_ESTABLISHED_N_NEXT,
 43 } tcp_established_next_t;
44 
45 typedef enum _tcp_rcv_process_next
46 {
47 #define _(s,n) TCP_RCV_PROCESS_NEXT_##s,
 48   foreach_tcp_state_next
 49 #undef _
 50   TCP_RCV_PROCESS_N_NEXT,
 51 } tcp_rcv_process_next_t;
52 
53 typedef enum _tcp_syn_sent_next
54 {
55 #define _(s,n) TCP_SYN_SENT_NEXT_##s,
 56   foreach_tcp_state_next
 57 #undef _
 58   TCP_SYN_SENT_N_NEXT,
 59 } tcp_syn_sent_next_t;
60 
61 typedef enum _tcp_listen_next
62 {
63 #define _(s,n) TCP_LISTEN_NEXT_##s,
 64   foreach_tcp_state_next
 65 #undef _
 66   TCP_LISTEN_N_NEXT,
 67 } tcp_listen_next_t;
68 
69 /* Generic, state independent indices */
70 typedef enum _tcp_state_next
71 {
72 #define _(s,n) TCP_NEXT_##s,
 73   foreach_tcp_state_next
 74 #undef _
 75   TCP_STATE_N_NEXT,
 76 } tcp_state_next_t;
77 
78 #define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \
79  : TCP_NEXT_TCP6_OUTPUT)
80 
81 #define tcp_next_drop(is_ip4) (is_ip4 ? TCP_NEXT_DROP4 \
82  : TCP_NEXT_DROP6)
83 
84 /**
85  * Validate segment sequence number. As per RFC793:
86  *
 87  * Segment   Receive   Test
 88  * Length    Window
 89  * -------   -------   -------------------------------------------
 90  *    0         0      SEG.SEQ = RCV.NXT
 91  *    0        >0      RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
 92  *   >0         0      not acceptable
 93  *   >0        >0      RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
 94  *                     or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
95  *
96  * This ultimately consists in checking if segment falls within the window.
 97  * The one important difference compared to RFC793 is that we use rcv_las,
 98  * i.e., the rcv_nxt at the last ack sent, instead of rcv_nxt, since that is
 99  * the peer's reference when computing our receive window.
100  *
101  * This:
102  * seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) && seq_geq (seq, tc->rcv_las)
103  * however, is too strict when we have retransmits. Instead we just check that
104  * the seq is not beyond the right edge and that the end of the segment is not
105  * less than the left edge.
106  *
107  * N.B. rcv_nxt and rcv_wnd are both updated in this node if acks are sent, so
108  * use rcv_nxt in the right edge window test instead of rcv_las.
109  *
110  */
 111 static u8
 112 tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq)
 113 {
114  return (seq_geq (end_seq, tc->rcv_las)
115  && seq_leq (seq, tc->rcv_nxt + tc->rcv_wnd));
116 }
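
/* Worked example of the acceptance test above (illustrative values): with
 * rcv_las = 1000, rcv_nxt = 1200 and rcv_wnd = 1000, a segment covering
 * [900, 1100) is accepted because end_seq 1100 >= rcv_las 1000 and
 * seq 900 <= rcv_nxt + rcv_wnd = 2200. A segment entirely below the left
 * edge (end_seq < 1000) or starting past the right edge (seq > 2200) is
 * rejected. */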
117 
118 /**
119  * RFC1323: Check against wrapped sequence numbers (PAWS). If we have
120  * timestamp to echo and it's less than tsval_recent, drop segment
121  * but still send an ACK in order to retain TCP's mechanism for detecting
122  * and recovering from half-open connections
123  *
124  * Or at least that's what the theory says. It seems that this might not work
125  * very well with packet reordering and fast retransmit. XXX
126  */
127 always_inline int
 128 tcp_segment_check_paws (tcp_connection_t * tc)
 129 {
130  return tcp_opts_tstamp (&tc->rcv_opts)
131  && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
132 }
133 
134 /**
135  * Update tsval recent
136  */
137 always_inline void
 138 tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
 139 {
140  /*
141  * RFC1323: If Last.ACK.sent falls within the range of sequence numbers
142  * of an incoming segment:
143  * SEG.SEQ <= Last.ACK.sent < SEG.SEQ + SEG.LEN
144  * then the TSval from the segment is copied to TS.Recent;
145  * otherwise, the TSval is ignored.
146  */
147  if (tcp_opts_tstamp (&tc->rcv_opts) && seq_leq (seq, tc->rcv_las)
148  && seq_leq (tc->rcv_las, seq_end))
149  {
150  ASSERT (timestamp_leq (tc->tsval_recent, tc->rcv_opts.tsval));
151  tc->tsval_recent = tc->rcv_opts.tsval;
152  tc->tsval_recent_age = tcp_time_now_w_thread (tc->c_thread_index);
153  }
154 }
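
/* Worked example of the RFC1323 condition above (illustrative values): with
 * rcv_las = 3000, a segment covering [2950, 3100) satisfies
 * SEG.SEQ <= Last.ACK.sent <= SEG.SEQ + SEG.LEN, so its TSval replaces
 * tsval_recent; a segment covering [3100, 3200) does not, and its TSval is
 * ignored. */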
155 
156 static void
 157 tcp_handle_rst (tcp_connection_t * tc)
 158 {
159  switch (tc->rst_state)
160  {
161  case TCP_STATE_SYN_RCVD:
162  /* Cleanup everything. App wasn't notified yet */
163  session_transport_delete_notify (&tc->connection);
 164  tcp_connection_cleanup (tc);
 165  break;
166  case TCP_STATE_SYN_SENT:
167  session_stream_connect_notify (&tc->connection, 1 /* fail */ );
 168  tcp_connection_cleanup (tc);
 169  break;
170  case TCP_STATE_ESTABLISHED:
171  session_transport_reset_notify (&tc->connection);
172  session_transport_closed_notify (&tc->connection);
173  break;
174  case TCP_STATE_CLOSE_WAIT:
175  case TCP_STATE_FIN_WAIT_1:
176  case TCP_STATE_FIN_WAIT_2:
177  case TCP_STATE_CLOSING:
178  case TCP_STATE_LAST_ACK:
179  session_transport_closed_notify (&tc->connection);
180  break;
181  case TCP_STATE_CLOSED:
182  case TCP_STATE_TIME_WAIT:
183  break;
184  default:
185  TCP_DBG ("reset state: %u", tc->state);
186  }
187 }
188 
189 static void
 190 tcp_program_reset_ntf (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
 191 {
192  if (!tcp_disconnect_pending (tc))
193  {
194  tc->rst_state = tc->state;
195  vec_add1 (wrk->pending_resets, tc->c_c_index);
 196  tcp_disconnect_pending_on (tc);
 197  }
198 }
199 
200 /**
201  * Handle reset packet
202  *
203  * Programs disconnect/reset notification that should be sent
204  * later by calling @ref tcp_handle_disconnects
205  */
206 static void
 207 tcp_rcv_rst (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
 208 {
209  TCP_EVT (TCP_EVT_RST_RCVD, tc);
210  switch (tc->state)
211  {
212  case TCP_STATE_SYN_RCVD:
213  tcp_program_reset_ntf (wrk, tc);
214  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
215  break;
216  case TCP_STATE_SYN_SENT:
217  /* Do not program ntf because the connection is half-open */
218  tcp_handle_rst (tc);
219  break;
220  case TCP_STATE_ESTABLISHED:
 221  tcp_connection_timers_reset (tc);
 222  tcp_cong_recovery_off (tc);
 223  tcp_program_reset_ntf (wrk, tc);
224  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
225  tcp_program_cleanup (wrk, tc);
226  break;
227  case TCP_STATE_CLOSE_WAIT:
228  case TCP_STATE_FIN_WAIT_1:
229  case TCP_STATE_FIN_WAIT_2:
230  case TCP_STATE_CLOSING:
231  case TCP_STATE_LAST_ACK:
 232  tcp_connection_timers_reset (tc);
 233  tcp_cong_recovery_off (tc);
 234  tcp_program_reset_ntf (wrk, tc);
235  /* Make sure we mark the session as closed. In some states we may
 236  * still be trying to send data */
237  tcp_connection_set_state (tc, TCP_STATE_CLOSED);
238  tcp_program_cleanup (wrk, tc);
239  break;
240  case TCP_STATE_CLOSED:
241  case TCP_STATE_TIME_WAIT:
242  break;
243  default:
244  TCP_DBG ("reset state: %u", tc->state);
245  }
246 }
247 
248 /**
249  * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19
250  *
251  * It first verifies if segment has a wrapped sequence number (PAWS) and then
252  * does the processing associated to the first four steps (ignoring security
253  * and precedence): sequence number, rst bit and syn bit checks.
254  *
255  * @return 0 if segments passes validation.
256  */
257 static int
 258 tcp_segment_validate (tcp_worker_ctx_t * wrk, tcp_connection_t * tc0,
 259  vlib_buffer_t * b0, tcp_header_t * th0, u32 * error0)
260 {
261  /* We could get a burst of RSTs interleaved with acks */
262  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
263  {
264  tcp_send_reset (tc0);
265  *error0 = TCP_ERROR_CONNECTION_CLOSED;
266  goto error;
267  }
268 
269  if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
270  {
271  *error0 = TCP_ERROR_SEGMENT_INVALID;
272  goto error;
273  }
274 
275  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts, 0)))
276  {
277  *error0 = TCP_ERROR_OPTIONS;
278  goto error;
279  }
280 
 281  if (PREDICT_FALSE (tcp_segment_check_paws (tc0)))
 282  {
283  *error0 = TCP_ERROR_PAWS;
284  TCP_EVT (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
285  vnet_buffer (b0)->tcp.seq_end);
286 
287  /* If it just so happens that a segment updates tsval_recent for a
288  * segment over 24 days old, invalidate tsval_recent. */
289  if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE,
290  tcp_time_now_w_thread (tc0->c_thread_index)))
291  {
292  tc0->tsval_recent = tc0->rcv_opts.tsval;
293  clib_warning ("paws failed: 24-day old segment");
294  }
295  /* Drop after ack if not rst. Resets can fail paws check as per
296  * RFC 7323 sec. 5.2: When an <RST> segment is received, it MUST NOT
297  * be subjected to the PAWS check by verifying an acceptable value in
298  * SEG.TSval */
299  else if (!tcp_rst (th0))
300  {
301  tcp_program_ack (tc0);
302  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
303  goto error;
304  }
305  }
306 
307  /* 1st: check sequence number */
308  if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number,
309  vnet_buffer (b0)->tcp.seq_end))
310  {
311  /* SYN/SYN-ACK retransmit */
312  if (tcp_syn (th0)
313  && vnet_buffer (b0)->tcp.seq_number == tc0->rcv_nxt - 1)
314  {
315  tcp_options_parse (th0, &tc0->rcv_opts, 1);
316  if (tc0->state == TCP_STATE_SYN_RCVD)
317  {
318  tcp_send_synack (tc0);
319  TCP_EVT (TCP_EVT_SYN_RCVD, tc0, 0);
320  *error0 = TCP_ERROR_SYNS_RCVD;
321  }
322  else
323  {
324  tcp_program_ack (tc0);
325  TCP_EVT (TCP_EVT_SYNACK_RCVD, tc0);
326  *error0 = TCP_ERROR_SYN_ACKS_RCVD;
327  }
328  goto error;
329  }
330 
331  /* If our window is 0 and the packet is in sequence, let it pass
332  * through for ack processing. It should be dropped later. */
333  if (tc0->rcv_wnd < tc0->snd_mss
334  && tc0->rcv_nxt == vnet_buffer (b0)->tcp.seq_number)
335  goto check_reset;
336 
337  /* If we entered recovery and peer did so as well, there's a chance that
338  * dup acks won't be acceptable on either end because seq_end may be less
339  * than rcv_las. This can happen if acks are lost in both directions. */
340  if (tcp_in_recovery (tc0)
341  && seq_geq (vnet_buffer (b0)->tcp.seq_number,
342  tc0->rcv_las - tc0->rcv_wnd)
343  && seq_leq (vnet_buffer (b0)->tcp.seq_end,
344  tc0->rcv_nxt + tc0->rcv_wnd))
345  goto check_reset;
346 
347  *error0 = TCP_ERROR_RCV_WND;
348 
349  /* If we advertised a zero rcv_wnd and the segment is in the past or the
350  * next one that we expect, it is probably a window probe */
351  if ((tc0->flags & TCP_CONN_ZERO_RWND_SENT)
352  && seq_lt (vnet_buffer (b0)->tcp.seq_end,
353  tc0->rcv_las + tc0->rcv_opts.mss))
354  *error0 = TCP_ERROR_ZERO_RWND;
355 
356  tc0->errors.below_data_wnd += seq_lt (vnet_buffer (b0)->tcp.seq_end,
357  tc0->rcv_las);
358 
359  /* If not RST, send dup ack */
360  if (!tcp_rst (th0))
361  {
362  tcp_program_dupack (tc0);
363  TCP_EVT (TCP_EVT_DUPACK_SENT, tc0, vnet_buffer (b0)->tcp);
364  }
365  goto error;
366 
367  check_reset:
368  ;
369  }
370 
371  /* 2nd: check the RST bit */
372  if (PREDICT_FALSE (tcp_rst (th0)))
373  {
374  tcp_rcv_rst (wrk, tc0);
375  *error0 = TCP_ERROR_RST_RCVD;
376  goto error;
377  }
378 
379  /* 3rd: check security and precedence (skip) */
380 
381  /* 4th: check the SYN bit (in window) */
382  if (PREDICT_FALSE (tcp_syn (th0)))
383  {
384  /* As per RFC5961 send challenge ack instead of reset */
385  tcp_program_ack (tc0);
386  *error0 = TCP_ERROR_SPURIOUS_SYN;
387  goto error;
388  }
389 
390  /* If segment in window, save timestamp */
391  tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
392  vnet_buffer (b0)->tcp.seq_end);
393  return 0;
394 
395 error:
396  return -1;
397 }
398 
399 always_inline int
 400 tcp_rcv_ack_no_cc (tcp_connection_t * tc, vlib_buffer_t * b, u32 * error)
 401 {
402  /* SND.UNA =< SEG.ACK =< SND.NXT */
403  if (!(seq_leq (tc->snd_una, vnet_buffer (b)->tcp.ack_number)
404  && seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
405  {
406  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
407  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
408  {
409  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
410  goto acceptable;
411  }
412  *error = TCP_ERROR_ACK_INVALID;
413  return -1;
414  }
415 
416 acceptable:
417  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
418  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
419  *error = TCP_ERROR_ACK_OK;
420  return 0;
421 }
422 
423 /**
424  * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298
425  *
 426  * Note that although in the original article srtt and rttvar are scaled
427  * to minimize round-off errors, here we don't. Instead, we rely on
428  * better precision time measurements.
429  *
430  * TODO support us rtt resolution
431  */
432 static void
 433 tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
 434 {
435  int err, diff;
436 
437  if (tc->srtt != 0)
438  {
439  err = mrtt - tc->srtt;
440 
441  /* XXX Drop in RTT results in RTTVAR increase and bigger RTO.
 442  * The increase should be bounded. */
443  tc->srtt = clib_max ((int) tc->srtt + (err >> 3), 1);
444  diff = (clib_abs (err) - (int) tc->rttvar) >> 2;
445  tc->rttvar = clib_max ((int) tc->rttvar + diff, 1);
446  }
447  else
448  {
449  /* First measurement. */
450  tc->srtt = mrtt;
451  tc->rttvar = mrtt >> 1;
452  }
453 }
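
/* The update above is the RFC6298 smoothing with alpha = 1/8 and beta = 1/4:
 *   srtt   <- srtt + (mrtt - srtt) / 8
 *   rttvar <- rttvar + (|mrtt - srtt| - rttvar) / 4
 * Worked example (illustrative values): srtt = 100, rttvar = 20, mrtt = 60
 * gives err = -40, srtt = 100 + (-40 >> 3) = 95 and
 * rttvar = 20 + ((40 - 20) >> 2) = 25. */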
454 
455 /**
456  * Update RTT estimate and RTO timer
457  *
458  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
459  * timing. Middle boxes are known to fiddle with TCP options so we
460  * should give higher priority to ACK timing.
461  *
462  * This should be called only if previously sent bytes have been acked.
463  *
 464  * Return 1 if the rtt is valid, 0 otherwise
465  */
466 static int
 467 tcp_update_rtt (tcp_connection_t * tc, tcp_rate_sample_t * rs, u32 ack)
 468 {
469  u32 mrtt = 0;
470 
471  /* Karn's rule, part 1. Don't use retransmitted segments to estimate
472  * RTT because they're ambiguous. */
473  if (tcp_in_cong_recovery (tc))
474  {
475  /* Accept rtt estimates for samples that have not been retransmitted */
476  if ((tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
477  && !(rs->flags & TCP_BTS_IS_RXT))
478  {
479  mrtt = rs->rtt_time * THZ;
480  goto estimate_rtt;
481  }
482  goto done;
483  }
484 
485  if (tc->rtt_ts && seq_geq (ack, tc->rtt_seq))
486  {
487  f64 sample = tcp_time_now_us (tc->c_thread_index) - tc->rtt_ts;
488  tc->mrtt_us = tc->mrtt_us + (sample - tc->mrtt_us) * 0.125;
489  mrtt = clib_max ((u32) (sample * THZ), 1);
490  /* Allow measuring of a new RTT */
491  tc->rtt_ts = 0;
492  }
493  /* As per RFC7323 TSecr can be used for RTTM only if the segment advances
494  * snd_una, i.e., the left side of the send window:
495  * seq_lt (tc->snd_una, ack). This is a condition for calling update_rtt */
496  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr)
497  {
498  u32 now = tcp_tstamp (tc);
499  mrtt = clib_max (now - tc->rcv_opts.tsecr, 1);
500  }
501 
502 estimate_rtt:
503 
504  /* Ignore dubious measurements */
505  if (mrtt == 0 || mrtt > TCP_RTT_MAX)
506  goto done;
507 
508  tcp_estimate_rtt (tc, mrtt);
509 
510 done:
511 
512  /* If we got here something must've been ACKed so make sure boff is 0,
513  * even if mrtt is not valid since we update the rto lower */
514  tc->rto_boff = 0;
515  tcp_update_rto (tc);
516 
517  return 0;
518 }
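
/* Example of the measurement sources above (illustrative values): with a
 * tick rate of THZ per second, a 50 ms ACK-timed sample yields
 * mrtt = 0.05 * THZ ticks; if no ACK timing is pending, the echoed tsecr is
 * used instead (now - tsecr); and per Karn's rule, an ack received during
 * congestion recovery produces no measurement unless rate sampling marks the
 * sample as not retransmitted. */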
519 
520 static void
 521 tcp_estimate_initial_rtt (tcp_connection_t * tc)
 522 {
523  u8 thread_index = vlib_num_workers ()? 1 : 0;
524  int mrtt;
525 
526  if (tc->rtt_ts)
527  {
528  tc->mrtt_us = tcp_time_now_us (thread_index) - tc->rtt_ts;
529  tc->mrtt_us = clib_max (tc->mrtt_us, 0.0001);
530  mrtt = clib_max ((u32) (tc->mrtt_us * THZ), 1);
531  tc->rtt_ts = 0;
532  }
533  else
534  {
535  mrtt = tcp_time_now_w_thread (thread_index) - tc->rcv_opts.tsecr;
536  mrtt = clib_max (mrtt, 1);
537  /* Due to retransmits we don't know the initial mrtt */
538  if (tc->rto_boff && mrtt > 1 * THZ)
539  mrtt = 1 * THZ;
540  tc->mrtt_us = (f64) mrtt *TCP_TICK;
541  }
542 
543  if (mrtt > 0 && mrtt < TCP_RTT_MAX)
544  tcp_estimate_rtt (tc, mrtt);
545  tcp_update_rto (tc);
546 }
547 
548 /**
549  * Dequeue bytes for connections that have received acks in last burst
550  */
551 static void
 552 tcp_handle_postponed_dequeues (tcp_worker_ctx_t * wrk)
 553 {
554  u32 thread_index = wrk->vm->thread_index;
555  u32 *pending_deq_acked;
556  tcp_connection_t *tc;
557  int i;
558 
559  if (!vec_len (wrk->pending_deq_acked))
560  return;
561 
562  pending_deq_acked = wrk->pending_deq_acked;
563  for (i = 0; i < vec_len (pending_deq_acked); i++)
564  {
565  tc = tcp_connection_get (pending_deq_acked[i], thread_index);
566  tc->flags &= ~TCP_CONN_DEQ_PENDING;
567 
568  if (PREDICT_FALSE (!tc->burst_acked))
569  continue;
570 
571  /* Dequeue the newly ACKed bytes */
572  session_tx_fifo_dequeue_drop (&tc->connection, tc->burst_acked);
573  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
574 
575  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
576  {
577  if (seq_leq (tc->psh_seq, tc->snd_una))
578  tc->flags &= ~TCP_CONN_PSH_PENDING;
579  }
580 
581  if (tcp_is_descheduled (tc))
582  tcp_reschedule (tc);
583 
584  /* If everything has been acked, stop retransmit timer
585  * otherwise update. */
 586  tcp_retransmit_timer_update (&wrk->timer_wheel, tc);
 587 
588  /* Update pacer based on our new cwnd estimate */
 589  tcp_connection_tx_pacer_update (tc);
 590 
591  tc->burst_acked = 0;
592  }
593  _vec_len (wrk->pending_deq_acked) = 0;
594 }
595 
596 static void
 597 tcp_program_dequeue (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
 598 {
599  if (!(tc->flags & TCP_CONN_DEQ_PENDING))
600  {
601  vec_add1 (wrk->pending_deq_acked, tc->c_c_index);
602  tc->flags |= TCP_CONN_DEQ_PENDING;
603  }
604  tc->burst_acked += tc->bytes_acked;
605 }
606 
607 /**
608  * Try to update snd_wnd based on feedback received from peer.
609  *
610  * If successful, and new window is 'effectively' 0, activate persist
611  * timer.
612  */
613 static void
614 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
615 {
616  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
617  * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
618  if (seq_lt (tc->snd_wl1, seq)
619  || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
620  {
621  tc->snd_wnd = snd_wnd;
622  tc->snd_wl1 = seq;
623  tc->snd_wl2 = ack;
624  TCP_EVT (TCP_EVT_SND_WND, tc);
625 
626  if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
627  {
628  /* Set persist timer if not set and we just got 0 wnd */
629  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
630  && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
631  {
632  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
 633  tcp_persist_timer_set (&wrk->timer_wheel, tc);
 634  }
635  }
636  else
637  {
638  if (PREDICT_FALSE (tcp_timer_is_active (tc, TCP_TIMER_PERSIST)))
639  {
640  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
 641  tcp_persist_timer_reset (&wrk->timer_wheel, tc);
 642  }
643 
 644  if (PREDICT_FALSE (tcp_is_descheduled (tc)))
 645  tcp_reschedule (tc);
646 
647  if (PREDICT_FALSE (!tcp_in_recovery (tc) && tc->rto_boff > 0))
648  {
649  tc->rto_boff = 0;
650  tcp_update_rto (tc);
651  }
652  }
653  }
654 }
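
/* Worked example of the SND.WL1/SND.WL2 test above (illustrative values):
 * with snd_wl1 = 500 and snd_wl2 = 1000, a segment with seq = 600 updates
 * the window (snd_wl1 < seq); one with seq = 500 and ack = 900 does not
 * (ack < snd_wl2), which keeps stale segments from rewinding the window. */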
655 
656 /**
657  * Init loss recovery/fast recovery.
658  *
659  * Triggered by dup acks as opposed to timer timeout. Note that cwnd is
660  * updated in @ref tcp_cc_handle_event after fast retransmit
661  */
662 static void
 663 tcp_cc_init_congestion (tcp_connection_t * tc)
 664 {
665  tcp_fastrecovery_on (tc);
666  tc->snd_congestion = tc->snd_nxt;
667  tc->cwnd_acc_bytes = 0;
668  tc->snd_rxt_bytes = 0;
669  tc->rxt_delivered = 0;
670  tc->prr_delivered = 0;
671  tc->prr_start = tc->snd_una;
672  tc->prev_ssthresh = tc->ssthresh;
673  tc->prev_cwnd = tc->cwnd;
674 
675  tc->snd_rxt_ts = tcp_tstamp (tc);
676  tcp_cc_congestion (tc);
677 
678  /* Post retransmit update cwnd to ssthresh and account for the
679  * three segments that have left the network and should've been
680  * buffered at the receiver XXX */
681  if (!tcp_opts_sack_permitted (&tc->rcv_opts))
682  tc->cwnd += 3 * tc->snd_mss;
683 
684  tc->fr_occurences += 1;
685  TCP_EVT (TCP_EVT_CC_EVT, tc, 4);
686 }
687 
688 static void
 689 tcp_cc_congestion_undo (tcp_connection_t * tc)
 690 {
691  tc->cwnd = tc->prev_cwnd;
692  tc->ssthresh = tc->prev_ssthresh;
 693  tcp_cc_undo_recovery (tc);
 694  ASSERT (tc->rto_boff == 0);
695  TCP_EVT (TCP_EVT_CC_EVT, tc, 5);
696 }
697 
698 static inline u8
 699 tcp_cc_is_spurious_timeout_rxt (tcp_connection_t * tc)
 700 {
701  return (tcp_in_recovery (tc) && tc->rto_boff == 1
702  && tc->snd_rxt_ts
703  && tcp_opts_tstamp (&tc->rcv_opts)
704  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
705 }
706 
707 static inline u8
 708 tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
 709 {
710  return (tcp_cc_is_spurious_timeout_rxt (tc));
711 }
712 
713 static inline u8
 714 tcp_should_fastrecover_sack (tcp_connection_t * tc)
 715 {
716  return (tc->sack_sb.lost_bytes
717  || ((TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
718  < tc->sack_sb.sacked_bytes));
719 }
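
/* Example (illustrative values, assuming TCP_DUPACK_THRESHOLD is 3): with
 * snd_mss = 1460, the test above triggers once more than 2 * 1460 = 2920
 * bytes have been SACKed, or as soon as the scoreboard marks any bytes as
 * lost. */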
720 
721 static inline u8
 722 tcp_should_fastrecover (tcp_connection_t * tc, u8 has_sack)
 723 {
724  if (!has_sack)
725  {
 726  /* If one of the two conditions below holds, reset dupacks because
727  * we're probably after timeout (RFC6582 heuristics).
728  * If Cumulative ack does not cover more than congestion threshold,
729  * and:
730  * 1) The following doesn't hold: The congestion window is greater
731  * than SMSS bytes and the difference between highest_ack
732  * and prev_highest_ack is at most 4*SMSS bytes
733  * 2) Echoed timestamp in the last non-dup ack does not equal the
734  * stored timestamp
735  */
736  if (seq_leq (tc->snd_una, tc->snd_congestion)
737  && ((!(tc->cwnd > tc->snd_mss
738  && tc->bytes_acked <= 4 * tc->snd_mss))
739  || (tc->rcv_opts.tsecr != tc->tsecr_last_ack)))
740  {
741  tc->rcv_dupacks = 0;
742  return 0;
743  }
744  }
745  return ((tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
 746  || tcp_should_fastrecover_sack (tc));
 747 }
748 
749 static int
 750 tcp_cc_recover (tcp_connection_t * tc)
 751 {
 752  sack_scoreboard_hole_t *hole;
 753  u8 is_spurious = 0;
 754 
 755  ASSERT (tcp_in_cong_recovery (tc));
 756 
 757  if (tcp_cc_is_spurious_retransmit (tc))
 758  {
 759  tcp_cc_congestion_undo (tc);
 760  is_spurious = 1;
761  }
762 
763  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
764  tc->rcv_dupacks = 0;
765 
766  /* Previous recovery left us congested. Continue sending as part
767  * of the current recovery event with an updated snd_congestion */
768  if (tc->sack_sb.sacked_bytes)
769  {
770  tc->snd_congestion = tc->snd_nxt;
 771  tcp_program_retransmit (tc);
 772  return is_spurious;
773  }
774 
775  tc->rxt_delivered = 0;
776  tc->snd_rxt_bytes = 0;
777  tc->snd_rxt_ts = 0;
778  tc->prr_delivered = 0;
779  tc->rtt_ts = 0;
780  tc->flags &= ~TCP_CONN_RXT_PENDING;
781 
782  hole = scoreboard_first_hole (&tc->sack_sb);
783  if (hole && hole->start == tc->snd_una && hole->end == tc->snd_nxt)
784  scoreboard_clear (&tc->sack_sb);
785 
786  if (!tcp_in_recovery (tc) && !is_spurious)
787  tcp_cc_recovered (tc);
788 
 789  tcp_fastrecovery_off (tc);
 790  tcp_fastrecovery_first_off (tc);
 791  tcp_recovery_off (tc);
792  TCP_EVT (TCP_EVT_CC_EVT, tc, 3);
793 
794  ASSERT (tc->rto_boff == 0);
 795  ASSERT (!tcp_in_cong_recovery (tc));
 796  ASSERT (tcp_scoreboard_is_sane_post_recovery (tc));
 797 
798  return is_spurious;
799 }
800 
801 static void
 802 tcp_cc_update (tcp_connection_t * tc, tcp_rate_sample_t * rs)
 803 {
 804  ASSERT (!tcp_in_cong_recovery (tc) || tcp_is_lost_fin (tc));
 805 
806  /* Congestion avoidance */
807  tcp_cc_rcv_ack (tc, rs);
808 
809  /* If a cumulative ack, make sure dupacks is 0 */
810  tc->rcv_dupacks = 0;
811 
 812  /* When dupacks hit the threshold we only enter fast retransmit if
813  * cumulative ack covers more than snd_congestion. Should snd_una
814  * wrap this test may fail under otherwise valid circumstances.
815  * Therefore, proactively update snd_congestion when wrap detected. */
816  if (PREDICT_FALSE
817  (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
818  && seq_gt (tc->snd_congestion, tc->snd_una)))
819  tc->snd_congestion = tc->snd_una - 1;
820 }
821 
822 /**
823  * One function to rule them all ... and in the darkness bind them
824  */
825 static void
 826 tcp_cc_handle_event (tcp_connection_t * tc, tcp_rate_sample_t * rs,
 827  u32 is_dack)
828 {
829  u8 has_sack = tcp_opts_sack_permitted (&tc->rcv_opts);
830 
831  /* If reneging, wait for timer based retransmits */
832  if (PREDICT_FALSE (tcp_is_lost_fin (tc) || tc->sack_sb.is_reneging))
833  return;
834 
835  /*
836  * If not in recovery, figure out if we should enter
837  */
838  if (!tcp_in_cong_recovery (tc))
839  {
840  ASSERT (is_dack);
841 
842  tc->rcv_dupacks++;
843  TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
 844  tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
 845 
846  if (tcp_should_fastrecover (tc, has_sack))
847  {
 848  tcp_cc_init_congestion (tc);
 849 
850  if (has_sack)
851  scoreboard_init_rxt (&tc->sack_sb, tc->snd_una);
852 
853  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 0 /* start bucket */ );
 854  tcp_program_retransmit (tc);
 855  }
856 
857  return;
858  }
859 
860  /*
861  * Already in recovery
862  */
863 
864  /*
865  * Process (re)transmit feedback. Output path uses this to decide how much
866  * more data to release into the network
867  */
868  if (has_sack)
869  {
870  if (!tc->bytes_acked && tc->sack_sb.rxt_sacked)
 871  tcp_fastrecovery_first_on (tc);
 872 
873  tc->rxt_delivered += tc->sack_sb.rxt_sacked;
874  tc->prr_delivered += tc->bytes_acked + tc->sack_sb.last_sacked_bytes
875  - tc->sack_sb.last_bytes_delivered;
876  }
877  else
878  {
879  if (is_dack)
880  {
881  tc->rcv_dupacks += 1;
882  TCP_EVT (TCP_EVT_DUPACK_RCVD, tc, 1);
883  }
884  tc->rxt_delivered = clib_min (tc->rxt_delivered + tc->bytes_acked,
885  tc->snd_rxt_bytes);
886  if (is_dack)
887  tc->prr_delivered += clib_min (tc->snd_mss,
888  tc->snd_nxt - tc->snd_una);
889  else
890  tc->prr_delivered += tc->bytes_acked - clib_min (tc->bytes_acked,
891  tc->snd_mss *
892  tc->rcv_dupacks);
893 
894  /* If partial ack, assume that the first un-acked segment was lost */
895  if (tc->bytes_acked || tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
 896  tcp_fastrecovery_first_on (tc);
 897  }
898 
899  /*
900  * See if we can exit and stop retransmitting
901  */
902  if (seq_geq (tc->snd_una, tc->snd_congestion))
903  {
904  /* If spurious return, we've already updated everything */
905  if (tcp_cc_recover (tc))
906  {
907  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
908  return;
909  }
910 
911  /* Treat as congestion avoidance ack */
912  tcp_cc_rcv_ack (tc, rs);
913  return;
914  }
915 
 916  tcp_program_retransmit (tc);
 917 
918  /*
919  * Notify cc of the event
920  */
921 
922  if (!tc->bytes_acked)
923  {
 924  tcp_cc_rcv_cong_ack (tc, TCP_CC_DUPACK, rs);
 925  return;
926  }
927 
928  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
929  * reset dupacks to 0. Also needed if in congestion recovery */
930  tc->rcv_dupacks = 0;
931 
932  if (tcp_in_recovery (tc))
933  tcp_cc_rcv_ack (tc, rs);
934  else
 935  tcp_cc_rcv_cong_ack (tc, TCP_CC_PARTIALACK, rs);
 936 }
937 
938 static void
 939 tcp_handle_old_ack (tcp_connection_t * tc, tcp_rate_sample_t * rs)
 940 {
941  if (!tcp_in_cong_recovery (tc))
942  return;
943 
944  if (tcp_opts_sack_permitted (&tc->rcv_opts))
945  tcp_rcv_sacks (tc, tc->snd_una);
946 
947  tc->bytes_acked = 0;
948 
949  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
 950  tcp_bt_sample_delivery_rate (tc, rs);
 951 
952  tcp_cc_handle_event (tc, rs, 1);
953 }
954 
955 /**
956  * Check if duplicate ack as per RFC5681 Sec. 2
957  */
 958 static u8
 959 tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
 960  u32 prev_snd_una)
961 {
962  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
963  && seq_gt (tc->snd_nxt, tc->snd_una)
964  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
965  && (prev_snd_wnd == tc->snd_wnd));
966 }
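
/* Example (illustrative values): an ack for snd_una = 5000 that carries no
 * payload (seq_end == seq_number), leaves the advertised window unchanged
 * and arrives while snd_nxt > snd_una counts as a duplicate ack per
 * RFC5681 Sec. 2; the caller below additionally treats acks that SACK new
 * data as duplicates (RFC6675). */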
967 
968 /**
969  * Checks if ack is a congestion control event.
970  */
971 static u8
 972 tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
 973  u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
974 {
975  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
976  * defined to be 'duplicate' as well */
977  *is_dack = tc->sack_sb.last_sacked_bytes
978  || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
979 
980  return (*is_dack || tcp_in_cong_recovery (tc));
981 }
982 
983 /**
984  * Process incoming ACK
985  */
986 static int
 987 tcp_rcv_ack (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
 988  tcp_header_t * th, u32 * error)
989 {
990  u32 prev_snd_wnd, prev_snd_una;
991  tcp_rate_sample_t rs = { 0 };
992  u8 is_dack;
993 
994  TCP_EVT (TCP_EVT_CC_STAT, tc);
995 
996  /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
997  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
998  {
999  /* We've probably entered recovery and the peer still has some
1000  * of the data we've sent. Update snd_nxt and accept the ack */
1001  if (seq_leq (vnet_buffer (b)->tcp.ack_number, tc->snd_una_max)
1002  && seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
1003  {
1004  tc->snd_nxt = vnet_buffer (b)->tcp.ack_number;
1005  goto process_ack;
1006  }
1007 
1008  tc->errors.above_ack_wnd += 1;
1009  *error = TCP_ERROR_ACK_FUTURE;
1010  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 0, vnet_buffer (b)->tcp.ack_number);
1011  return -1;
1012  }
1013 
1014  /* If old ACK, probably it's an old dupack */
1015  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
1016  {
1017  tc->errors.below_ack_wnd += 1;
1018  *error = TCP_ERROR_ACK_OLD;
1019  TCP_EVT (TCP_EVT_ACK_RCV_ERR, tc, 1, vnet_buffer (b)->tcp.ack_number);
1020 
1021  if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una - tc->rcv_wnd))
1022  return -1;
1023 
1024  tcp_handle_old_ack (tc, &rs);
1025 
1026  /* Don't drop yet */
1027  return 0;
1028  }
1029 
1030 process_ack:
1031 
1032  /*
1033  * Looks okay, process feedback
1034  */
1035 
1036  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1037  tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
1038 
1039  prev_snd_wnd = tc->snd_wnd;
1040  prev_snd_una = tc->snd_una;
1041  tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
1042  vnet_buffer (b)->tcp.ack_number,
1043  clib_net_to_host_u16 (th->window) << tc->snd_wscale);
1044  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
1045  tc->snd_una = vnet_buffer (b)->tcp.ack_number;
1046  tcp_validate_txf_size (tc, tc->bytes_acked);
1047 
1048  if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
1049  tcp_bt_sample_delivery_rate (tc, &rs);
1050 
1051  if (tc->bytes_acked)
1052  {
1053  tcp_program_dequeue (wrk, tc);
1054  tcp_update_rtt (tc, &rs, vnet_buffer (b)->tcp.ack_number);
1055  }
1056 
1057  TCP_EVT (TCP_EVT_ACK_RCVD, tc);
1058 
1059  /*
1060  * Check if we have congestion event
1061  */
1062 
1063  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
1064  {
1065  tcp_cc_handle_event (tc, &rs, is_dack);
1066  tc->dupacks_in += is_dack;
1067  if (!tcp_in_cong_recovery (tc))
1068  {
1069  *error = TCP_ERROR_ACK_OK;
1070  return 0;
1071  }
1072  *error = TCP_ERROR_ACK_DUP;
1073  if (vnet_buffer (b)->tcp.data_len || tcp_is_fin (th))
1074  return 0;
1075  return -1;
1076  }
1077 
1078  /*
1079  * Update congestion control (slow start/congestion avoidance)
1080  */
1081  tcp_cc_update (tc, &rs);
1082  *error = TCP_ERROR_ACK_OK;
1083  return 0;
1084 }
1085 
1086 static void
 1087 tcp_program_disconnect (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
 1088 {
1089  if (!tcp_disconnect_pending (tc))
1090  {
1091  vec_add1 (wrk->pending_disconnects, tc->c_c_index);
 1092  tcp_disconnect_pending_on (tc);
 1093  }
1094 }
1095 
1096 static void
 1097 tcp_handle_disconnects (tcp_worker_ctx_t * wrk)
 1098 {
1099  u32 thread_index, *pending_disconnects, *pending_resets;
1100  tcp_connection_t *tc;
1101  int i;
1102 
1103  if (vec_len (wrk->pending_disconnects))
1104  {
1105  thread_index = wrk->vm->thread_index;
1106  pending_disconnects = wrk->pending_disconnects;
1107  for (i = 0; i < vec_len (pending_disconnects); i++)
1108  {
1109  tc = tcp_connection_get (pending_disconnects[i], thread_index);
 1110  tcp_disconnect_pending_off (tc);
 1111  session_transport_closing_notify (&tc->connection);
1112  }
1113  _vec_len (wrk->pending_disconnects) = 0;
1114  }
1115 
1116  if (vec_len (wrk->pending_resets))
1117  {
1118  thread_index = wrk->vm->thread_index;
1119  pending_resets = wrk->pending_resets;
1120  for (i = 0; i < vec_len (pending_resets); i++)
1121  {
1122  tc = tcp_connection_get (pending_resets[i], thread_index);
 1123  tcp_disconnect_pending_off (tc);
 1124  tcp_handle_rst (tc);
1125  }
1126  _vec_len (wrk->pending_resets) = 0;
1127  }
1128 }
1129 
1130 static void
 1131 tcp_rcv_fin (tcp_worker_ctx_t * wrk, tcp_connection_t * tc, vlib_buffer_t * b,
 1132  u32 * error)
1133 {
1134  /* Reject out-of-order fins */
1135  if (vnet_buffer (b)->tcp.seq_end != tc->rcv_nxt)
1136  return;
1137 
1138  /* Account for the FIN and send ack */
1139  tc->rcv_nxt += 1;
1140  tc->flags |= TCP_CONN_FINRCVD;
1141  tcp_program_ack (tc);
1142  /* Enter CLOSE-WAIT and notify session. To avoid lingering
1143  * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */
1144  tcp_connection_set_state (tc, TCP_STATE_CLOSE_WAIT);
1145  tcp_program_disconnect (wrk, tc);
1146  tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
1147  tcp_cfg.closewait_time);
1148  TCP_EVT (TCP_EVT_FIN_RCVD, tc);
1149  *error = TCP_ERROR_FIN_RCVD;
1150 }
1151 
1152 /** Enqueue data for delivery to application */
1153 static int
 1154 tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
 1155  u16 data_len)
1156 {
1157  int written, error = TCP_ERROR_ENQUEUED;
1158 
1159  ASSERT (seq_geq (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1160  ASSERT (data_len);
1161  written = session_enqueue_stream_connection (&tc->connection, b, 0,
1162  1 /* queue event */ , 1);
1163  tc->bytes_in += written;
1164 
1165  TCP_EVT (TCP_EVT_INPUT, tc, 0, data_len, written);
1166 
1167  /* Update rcv_nxt */
1168  if (PREDICT_TRUE (written == data_len))
1169  {
1170  tc->rcv_nxt += written;
1171  }
1172  /* If more data written than expected, account for out-of-order bytes. */
1173  else if (written > data_len)
1174  {
1175  tc->rcv_nxt += written;
1176  TCP_EVT (TCP_EVT_CC_INPUT, tc, data_len, written);
1177  }
1178  else if (written > 0)
1179  {
1180  /* We've written something but FIFO is probably full now */
1181  tc->rcv_nxt += written;
1182  error = TCP_ERROR_PARTIALLY_ENQUEUED;
1183  }
1184  else
1185  {
1186  /* Packet made it through for ack processing */
1187  if (tc->rcv_wnd < tc->snd_mss)
1188  return TCP_ERROR_ZERO_RWND;
1189 
1190  return TCP_ERROR_FIFO_FULL;
1191  }
1192 
1193  /* Update SACK list if need be */
1194  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1195  {
1196  /* Remove SACK blocks that have been delivered */
1197  tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
1198  }
1199 
1200  return error;
1201 }
1202 
1203 /** Enqueue out-of-order data */
1204 static int
 1205 tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
 1206  u16 data_len)
1207 {
1208  session_t *s0;
1209  int rv, offset;
1210 
1211  ASSERT (seq_gt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt));
1212  ASSERT (data_len);
1213 
1214  /* Enqueue out-of-order data with relative offset */
1215  rv = session_enqueue_stream_connection (&tc->connection, b,
1216  vnet_buffer (b)->tcp.seq_number -
1217  tc->rcv_nxt, 0 /* queue event */ ,
1218  0);
1219 
1220  /* Nothing written */
1221  if (rv)
1222  {
1223  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, 0);
1224  return TCP_ERROR_FIFO_FULL;
1225  }
1226 
1227  TCP_EVT (TCP_EVT_INPUT, tc, 1, data_len, data_len);
1228  tc->bytes_in += data_len;
1229 
1230  /* Update SACK list if in use */
1231  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1232  {
1233  ooo_segment_t *newest;
1234  u32 start, end;
1235 
1236  s0 = session_get (tc->c_s_index, tc->c_thread_index);
1237 
1238  /* Get the newest segment from the fifo */
1239  newest = svm_fifo_newest_ooo_segment (s0->rx_fifo);
1240  if (newest)
1241  {
1242  offset = ooo_segment_offset_prod (s0->rx_fifo, newest);
1243  ASSERT (offset <= vnet_buffer (b)->tcp.seq_number - tc->rcv_nxt);
1244  start = tc->rcv_nxt + offset;
1245  end = start + ooo_segment_length (s0->rx_fifo, newest);
1246  tcp_update_sack_list (tc, start, end);
 1247  svm_fifo_newest_ooo_segment_reset (s0->rx_fifo);
 1248  TCP_EVT (TCP_EVT_CC_SACKS, tc);
1249  }
1250  }
1251 
1252  return TCP_ERROR_ENQUEUED_OOO;
1253 }
1254 
1255 /**
 1256  * Check if the ACK could be delayed. Returns true if the ack can be
 1257  * delayed until a full frame is received; returns 0 if we always ack.
1258  */
1259 always_inline int
 1260 tcp_can_delack (tcp_connection_t * tc)
 1261 {
1262  /* Send ack if ... */
1263  if (TCP_ALWAYS_ACK
1264  /* just sent a rcv wnd 0
1265  || (tc->flags & TCP_CONN_SENT_RCV_WND0) != 0 */
1266  /* constrained to send ack */
1267  || (tc->flags & TCP_CONN_SNDACK) != 0
1268  /* we're almost out of tx wnd */
1269  || tcp_available_cc_snd_space (tc) < 4 * tc->snd_mss)
1270  return 0;
1271 
1272  return 1;
1273 }
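
/* Example (illustrative values): with snd_mss = 1460, once less than
 * 4 * 1460 = 5840 bytes of congestion-controlled send space remain, or a
 * forced ack is pending (TCP_CONN_SNDACK), the ack is sent immediately
 * instead of being delayed. */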
1274 
1275 static int
 1276 tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop)
 1277 {
1278  u32 discard, first = b->current_length;
1279  vlib_main_t *vm = vlib_get_main ();
1280 
1281  /* Handle multi-buffer segments */
1282  if (n_bytes_to_drop > b->current_length)
1283  {
1284  if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
1285  return -1;
1286  do
1287  {
1288  discard = clib_min (n_bytes_to_drop, b->current_length);
1289  vlib_buffer_advance (b, discard);
1290  b = vlib_get_buffer (vm, b->next_buffer);
1291  n_bytes_to_drop -= discard;
1292  }
1293  while (n_bytes_to_drop);
1294  if (n_bytes_to_drop > first)
1295  b->total_length_not_including_first_buffer -= n_bytes_to_drop - first;
1296  }
1297  else
1298  vlib_buffer_advance (b, n_bytes_to_drop);
1299  vnet_buffer (b)->tcp.data_len -= n_bytes_to_drop;
1300  return 0;
1301 }
1302 
1303 /**
1304  * Receive buffer for connection and handle acks
1305  *
1306  * It handles both in order or out-of-order data.
1307  */
1308 static int
 1309 tcp_segment_rcv (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,
 1310  vlib_buffer_t * b)
1311 {
1312  u32 error, n_bytes_to_drop, n_data_bytes;
1313 
1314  vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset);
1315  n_data_bytes = vnet_buffer (b)->tcp.data_len;
1316  ASSERT (n_data_bytes);
1317  tc->data_segs_in += 1;
1318 
1319  /* Handle out-of-order data */
1320  if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt))
1321  {
1322  /* Old sequence numbers allowed through because they overlapped
1323  * the rx window */
1324  if (seq_lt (vnet_buffer (b)->tcp.seq_number, tc->rcv_nxt))
1325  {
1326  /* Completely in the past (possible retransmit). Ack
1327  * retransmissions since we may not have any data to send */
1328  if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt))
1329  {
1330  tcp_program_dupack (tc);
1331  tc->errors.below_data_wnd++;
1332  error = TCP_ERROR_SEGMENT_OLD;
1333  goto done;
1334  }
1335 
1336  /* Chop off the bytes in the past and see if what is left
1337  * can be enqueued in order */
1338  n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number;
1339  n_data_bytes -= n_bytes_to_drop;
1340  vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt;
1341  if (tcp_buffer_discard_bytes (b, n_bytes_to_drop))
1342  {
1343  error = TCP_ERROR_SEGMENT_OLD;
1344  goto done;
1345  }
1346  goto in_order;
1347  }
1348 
1349  /* RFC2581: Enqueue and send DUPACK for fast retransmit */
1350  error = tcp_session_enqueue_ooo (tc, b, n_data_bytes);
1351  tcp_program_dupack (tc);
1352  TCP_EVT (TCP_EVT_DUPACK_SENT, tc, vnet_buffer (b)->tcp);
1353  tc->errors.above_data_wnd += seq_gt (vnet_buffer (b)->tcp.seq_end,
1354  tc->rcv_las + tc->rcv_wnd);
1355  goto done;
1356  }
1357 
1358 in_order:
1359 
1360  /* In order data, enqueue. Fifo figures out by itself if any out-of-order
1361  * segments can be enqueued after fifo tail offset changes. */
1362  error = tcp_session_enqueue_data (tc, b, n_data_bytes);
1363  if (tcp_can_delack (tc))
1364  {
1365  if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK))
1366  tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_DELACK,
1367  tcp_cfg.delack_time);
1368  goto done;
1369  }
1370 
1371  tcp_program_ack (tc);
1372 
1373 done:
1374  return error;
1375 }
1376 
1377 typedef struct
1378 {
 1379  tcp_header_t tcp_header;
 1380  tcp_connection_t tcp_connection;
 1381 } tcp_rx_trace_t;
1382 
1383 static u8 *
1384 format_tcp_rx_trace (u8 * s, va_list * args)
1385 {
1386  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1387  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1388  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1389  tcp_connection_t *tc = &t->tcp_connection;
1390  u32 indent = format_get_indent (s);
1391 
1392  s = format (s, "%U state %U\n%U%U", format_tcp_connection_id, tc,
1393  format_tcp_state, tc->state, format_white_space, indent,
1394  format_tcp_header, &t->tcp_header, 128);
1395 
1396  return s;
1397 }
1398 
1399 static u8 *
1400 format_tcp_rx_trace_short (u8 * s, va_list * args)
1401 {
1402  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
1403  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
1404  tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *);
1405 
1406  s = format (s, "%d -> %d (%U)",
1407  clib_net_to_host_u16 (t->tcp_header.dst_port),
1408  clib_net_to_host_u16 (t->tcp_header.src_port), format_tcp_state,
1409  t->tcp_connection.state);
1410 
1411  return s;
1412 }
1413 
1414 static void
 1415 tcp_set_rx_trace_data (tcp_rx_trace_t * t0, tcp_connection_t * tc0,
 1416  tcp_header_t * th0, vlib_buffer_t * b0, u8 is_ip4)
1417 {
1418  if (tc0)
1419  {
1420  clib_memcpy_fast (&t0->tcp_connection, tc0,
1421  sizeof (t0->tcp_connection));
1422  }
1423  else
1424  {
1425  th0 = tcp_buffer_hdr (b0);
1426  }
1427  clib_memcpy_fast (&t0->tcp_header, th0, sizeof (t0->tcp_header));
1428 }
1429 
1430 static void
 1431 tcp_established_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
 1432  vlib_frame_t * frame, u8 is_ip4)
1433 {
1434  u32 *from, n_left;
1435 
1436  n_left = frame->n_vectors;
1437  from = vlib_frame_vector_args (frame);
1438 
1439  while (n_left >= 1)
1440  {
1441  tcp_connection_t *tc0;
1442  tcp_rx_trace_t *t0;
1443  tcp_header_t *th0;
1444  vlib_buffer_t *b0;
1445  u32 bi0;
1446 
1447  bi0 = from[0];
1448  b0 = vlib_get_buffer (vm, bi0);
1449 
1450  if (b0->flags & VLIB_BUFFER_IS_TRACED)
1451  {
1452  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
1453  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
1454  vm->thread_index);
1455  th0 = tcp_buffer_hdr (b0);
1456  tcp_set_rx_trace_data (t0, tc0, th0, b0, is_ip4);
1457  }
1458 
1459  from += 1;
1460  n_left -= 1;
1461  }
1462 }
1463 
1464 always_inline void
1465 tcp_node_inc_counter_i (vlib_main_t * vm, u32 tcp4_node, u32 tcp6_node,
1466  u8 is_ip4, u32 evt, u32 val)
1467 {
1468  if (is_ip4)
1469  vlib_node_increment_counter (vm, tcp4_node, evt, val);
1470  else
1471  vlib_node_increment_counter (vm, tcp6_node, evt, val);
1472 }
1473 
1474 #define tcp_maybe_inc_counter(node_id, err, count) \
1475 { \
1476  if (next0 != tcp_next_drop (is_ip4)) \
1477  tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
1478  tcp6_##node_id##_node.index, is_ip4, err, \
1479  1); \
1480 }
1481 #define tcp_inc_counter(node_id, err, count) \
1482  tcp_node_inc_counter_i (vm, tcp4_##node_id##_node.index, \
1483  tcp6_##node_id##_node.index, is_ip4, \
1484  err, count)
1485 #define tcp_maybe_inc_err_counter(cnts, err) \
1486 { \
1487  cnts[err] += (next0 != tcp_next_drop (is_ip4)); \
1488 }
1489 #define tcp_inc_err_counter(cnts, err, val) \
1490 { \
1491  cnts[err] += val; \
1492 }
1493 #define tcp_store_err_counters(node_id, cnts) \
1494 { \
1495  int i; \
1496  for (i = 0; i < TCP_N_ERROR; i++) \
1497  if (cnts[i]) \
1498  tcp_inc_counter(node_id, i, cnts[i]); \
1499 }
1500 
1501 
 1502 always_inline uword
 1503 tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 1504  vlib_frame_t * frame, int is_ip4)
1505 {
1506  u32 thread_index = vm->thread_index, errors = 0;
1507  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
1508  u32 n_left_from, *from, *first_buffer;
1509  u16 err_counters[TCP_N_ERROR] = { 0 };
1510 
1511  if (node->flags & VLIB_NODE_FLAG_TRACE)
1512  tcp_established_trace_frame (vm, node, frame, is_ip4);
1513 
1514  first_buffer = from = vlib_frame_vector_args (frame);
1515  n_left_from = frame->n_vectors;
1516 
1517  while (n_left_from > 0)
1518  {
1519  u32 bi0, error0 = TCP_ERROR_ACK_OK;
1520  vlib_buffer_t *b0;
1521  tcp_header_t *th0;
1522  tcp_connection_t *tc0;
1523 
1524  if (n_left_from > 1)
1525  {
1526  vlib_buffer_t *pb;
1527  pb = vlib_get_buffer (vm, from[1]);
1528  vlib_prefetch_buffer_header (pb, LOAD);
1529  CLIB_PREFETCH (pb->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
1530  }
1531 
1532  bi0 = from[0];
1533  from += 1;
1534  n_left_from -= 1;
1535 
1536  b0 = vlib_get_buffer (vm, bi0);
1537  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
1538  thread_index);
1539 
1540  if (PREDICT_FALSE (tc0 == 0))
1541  {
1542  error0 = TCP_ERROR_INVALID_CONNECTION;
1543  goto done;
1544  }
1545 
1546  th0 = tcp_buffer_hdr (b0);
1547 
1548  /* TODO header prediction fast path */
1549 
1550  /* 1-4: check SEQ, RST, SYN */
1551  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, th0, &error0)))
1552  {
1553  TCP_EVT (TCP_EVT_SEG_INVALID, tc0, vnet_buffer (b0)->tcp);
1554  goto done;
1555  }
1556 
1557  /* 5: check the ACK field */
1558  if (PREDICT_FALSE (tcp_rcv_ack (wrk, tc0, b0, th0, &error0)))
1559  goto done;
1560 
1561  /* 6: check the URG bit TODO */
1562 
1563  /* 7: process the segment text */
1564  if (vnet_buffer (b0)->tcp.data_len)
1565  error0 = tcp_segment_rcv (wrk, tc0, b0);
1566 
1567  /* 8: check the FIN bit */
1568  if (PREDICT_FALSE (tcp_is_fin (th0)))
1569  tcp_rcv_fin (wrk, tc0, b0, &error0);
1570 
1571  done:
1572  tcp_inc_err_counter (err_counters, error0, 1);
1573  }
1574 
1575  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
1576  thread_index);
1577  err_counters[TCP_ERROR_MSG_QUEUE_FULL] = errors;
1578  tcp_store_err_counters (established, err_counters);
 1579  tcp_handle_postponed_dequeues (wrk);
 1580  tcp_handle_disconnects (wrk);
1581  vlib_buffer_free (vm, first_buffer, frame->n_vectors);
1582 
1583  return frame->n_vectors;
1584 }
1585 
 1586 VLIB_NODE_FN (tcp4_established_node) (vlib_main_t * vm,
 1587  vlib_node_runtime_t * node,
 1588  vlib_frame_t * from_frame)
1589 {
1590  return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ );
1591 }
1592 
 1593 VLIB_NODE_FN (tcp6_established_node) (vlib_main_t * vm,
 1594  vlib_node_runtime_t * node,
 1595  vlib_frame_t * from_frame)
1596 {
1597  return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ );
1598 }
1599 
1600 /* *INDENT-OFF* */
 1601 VLIB_REGISTER_NODE (tcp4_established_node) =
 1602 {
1603  .name = "tcp4-established",
1604  /* Takes a vector of packets. */
1605  .vector_size = sizeof (u32),
1606  .n_errors = TCP_N_ERROR,
1607  .error_strings = tcp_error_strings,
1608  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
1609  .next_nodes =
1610  {
1611 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
 1612   foreach_tcp_state_next
 1613 #undef _
1614  },
1615  .format_trace = format_tcp_rx_trace_short,
1616 };
1617 /* *INDENT-ON* */
1618 
1619 /* *INDENT-OFF* */
 1620 VLIB_REGISTER_NODE (tcp6_established_node) =
 1621 {
1622  .name = "tcp6-established",
1623  /* Takes a vector of packets. */
1624  .vector_size = sizeof (u32),
1625  .n_errors = TCP_N_ERROR,
1626  .error_strings = tcp_error_strings,
1627  .n_next_nodes = TCP_ESTABLISHED_N_NEXT,
1628  .next_nodes =
1629  {
1630 #define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n,
 1631   foreach_tcp_state_next
 1632 #undef _
1633  },
1634  .format_trace = format_tcp_rx_trace_short,
1635 };
1636 /* *INDENT-ON* */
1637 
1638 
1639 static u8
 1640 tcp_lookup_is_valid (tcp_connection_t * tc, vlib_buffer_t * b,
 1641  tcp_header_t * hdr)
1642 {
1643  transport_connection_t *tmp = 0;
1644  u64 handle;
1645 
1646  if (!tc)
1647  return 1;
1648 
1649  /* Proxy case */
1650  if (tc->c_lcl_port == 0 && tc->state == TCP_STATE_LISTEN)
1651  return 1;
1652 
1653  u8 is_ip_valid = 0, val_l, val_r;
1654 
1655  if (tc->connection.is_ip4)
1656  {
 1657  ip4_header_t *ip4_hdr = vlib_buffer_get_current (b);
 1658 
1659  val_l = !ip4_address_compare (&ip4_hdr->dst_address,
1660  &tc->connection.lcl_ip.ip4);
1661  val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 1);
1662  val_r = !ip4_address_compare (&ip4_hdr->src_address,
1663  &tc->connection.rmt_ip.ip4);
1664  val_r = val_r || tc->state == TCP_STATE_LISTEN;
1665  is_ip_valid = val_l && val_r;
1666  }
1667  else
1668  {
 1669  ip6_header_t *ip6_hdr = vlib_buffer_get_current (b);
 1670 
1671  val_l = !ip6_address_compare (&ip6_hdr->dst_address,
1672  &tc->connection.lcl_ip.ip6);
1673  val_l = val_l || ip_is_zero (&tc->connection.lcl_ip, 0);
1674  val_r = !ip6_address_compare (&ip6_hdr->src_address,
1675  &tc->connection.rmt_ip.ip6);
1676  val_r = val_r || tc->state == TCP_STATE_LISTEN;
1677  is_ip_valid = val_l && val_r;
1678  }
1679 
1680  u8 is_valid = (tc->c_lcl_port == hdr->dst_port
1681  && (tc->state == TCP_STATE_LISTEN
1682  || tc->c_rmt_port == hdr->src_port) && is_ip_valid);
1683 
1684  if (!is_valid)
1685  {
1686  handle = session_lookup_half_open_handle (&tc->connection);
1687  tmp = session_lookup_half_open_connection (handle & 0xFFFFFFFF,
1688  tc->c_proto, tc->c_is_ip4);
1689 
1690  if (tmp)
1691  {
1692  if (tmp->lcl_port == hdr->dst_port
1693  && tmp->rmt_port == hdr->src_port)
1694  {
1695  TCP_DBG ("half-open is valid!");
1696  is_valid = 1;
1697  }
1698  }
1699  }
1700  return is_valid;
1701 }
1702 
1703 /**
1704  * Lookup transport connection
1705  */
1706 static tcp_connection_t *
1707 tcp_lookup_connection (u32 fib_index, vlib_buffer_t * b, u8 thread_index,
1708  u8 is_ip4)
1709 {
1710  tcp_header_t *tcp;
1711  transport_connection_t *tconn;
1712  tcp_connection_t *tc;
1713  u8 is_filtered = 0;
1714  if (is_ip4)
1715  {
1716  ip4_header_t *ip4;
1717  ip4 = vlib_buffer_get_current (b);
1718  tcp = ip4_next_header (ip4);
1719  tconn = session_lookup_connection_wt4 (fib_index,
1720  &ip4->dst_address,
1721  &ip4->src_address,
1722  tcp->dst_port,
1723  tcp->src_port,
1724  TRANSPORT_PROTO_TCP,
1725  thread_index, &is_filtered);
1726  tc = tcp_get_connection_from_transport (tconn);
1727  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
1728  }
1729  else
1730  {
1731  ip6_header_t *ip6;
1732  ip6 = vlib_buffer_get_current (b);
1733  tcp = ip6_next_header (ip6);
1734  tconn = session_lookup_connection_wt6 (fib_index,
1735  &ip6->dst_address,
1736  &ip6->src_address,
1737  tcp->dst_port,
1738  tcp->src_port,
1739  TRANSPORT_PROTO_TCP,
1740  thread_index, &is_filtered);
1741  tc = tcp_get_connection_from_transport (tconn);
1742  ASSERT (tcp_lookup_is_valid (tc, b, tcp));
1743  }
1744  return tc;
1745 }
1746 
1747 static tcp_connection_t *
1748 tcp_lookup_listener (vlib_buffer_t * b, u32 fib_index, int is_ip4)
1749 {
1750  session_t *s;
1751 
1752  if (is_ip4)
1753  {
 1754  ip4_header_t *ip4 = vlib_buffer_get_current (b);
 1755  tcp_header_t *tcp = tcp_buffer_hdr (b);
1756  s = session_lookup_listener4 (fib_index,
1757  &ip4->dst_address,
1758  tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
1759  }
1760  else
1761  {
 1762  ip6_header_t *ip6 = vlib_buffer_get_current (b);
 1763  tcp_header_t *tcp = tcp_buffer_hdr (b);
1764  s = session_lookup_listener6 (fib_index,
1765  &ip6->dst_address,
1766  tcp->dst_port, TRANSPORT_PROTO_TCP, 1);
1767 
1768  }
1769  if (PREDICT_TRUE (s != 0))
 1770  return tcp_get_connection_from_transport (transport_get_listener
 1771  (TRANSPORT_PROTO_TCP,
1772  s->connection_index));
1773  else
1774  return 0;
1775 }
1776 
1777 always_inline void
 1778 tcp_check_tx_offload (tcp_connection_t * tc, int is_ipv4)
 1779 {
1780  vnet_main_t *vnm = vnet_get_main ();
1781  const dpo_id_t *dpo;
1782  const load_balance_t *lb;
1783  vnet_hw_interface_t *hw_if;
1784  u32 sw_if_idx, lb_idx;
1785 
1786  if (is_ipv4)
1787  {
1788  ip4_address_t *dst_addr = &(tc->c_rmt_ip.ip4);
1789  lb_idx = ip4_fib_forwarding_lookup (tc->c_fib_index, dst_addr);
1790  }
1791  else
1792  {
1793  ip6_address_t *dst_addr = &(tc->c_rmt_ip.ip6);
1794  lb_idx = ip6_fib_table_fwding_lookup (tc->c_fib_index, dst_addr);
1795  }
1796 
1797  lb = load_balance_get (lb_idx);
1798  if (PREDICT_FALSE (lb->lb_n_buckets > 1))
1799  return;
1800  dpo = load_balance_get_bucket_i (lb, 0);
1801 
1802  sw_if_idx = dpo_get_urpf (dpo);
1803  if (PREDICT_FALSE (sw_if_idx == ~0))
1804  return;
1805 
1806  hw_if = vnet_get_sup_hw_interface (vnm, sw_if_idx);
 1807  if (hw_if->caps & VNET_HW_INTERFACE_CAP_SUPPORTS_TCP_GSO)
 1808  tc->cfg_flags |= TCP_CFG_F_TSO;
1809 }
1810 
 1811 always_inline uword
 1812 tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 1813  vlib_frame_t * from_frame, int is_ip4)
1814 {
1815  u32 n_left_from, *from, *first_buffer, errors = 0;
1816  u32 my_thread_index = vm->thread_index;
1817  tcp_worker_ctx_t *wrk = tcp_get_worker (my_thread_index);
1818 
1819  from = first_buffer = vlib_frame_vector_args (from_frame);
1820  n_left_from = from_frame->n_vectors;
1821 
1822  while (n_left_from > 0)
1823  {
1824  u32 bi0, ack0, seq0, error0 = TCP_ERROR_NONE;
1825  tcp_connection_t *tc0, *new_tc0;
1826  tcp_header_t *tcp0 = 0;
1827  tcp_rx_trace_t *t0;
1828  vlib_buffer_t *b0;
1829 
1830  bi0 = from[0];
1831  from += 1;
1832  n_left_from -= 1;
1833 
1834  b0 = vlib_get_buffer (vm, bi0);
1835  tc0 =
1836  tcp_half_open_connection_get (vnet_buffer (b0)->tcp.connection_index);
1837  if (PREDICT_FALSE (tc0 == 0))
1838  {
1839  error0 = TCP_ERROR_INVALID_CONNECTION;
1840  goto drop;
1841  }
1842 
 1843  /* Half-open completed recently but the connection wasn't removed
1844  * yet by the owning thread */
1845  if (PREDICT_FALSE (tc0->flags & TCP_CONN_HALF_OPEN_DONE))
1846  {
1847  /* Make sure the connection actually exists */
1848  ASSERT (tcp_lookup_connection (tc0->c_fib_index, b0,
1849  my_thread_index, is_ip4));
1850  error0 = TCP_ERROR_SPURIOUS_SYN_ACK;
1851  goto drop;
1852  }
1853 
1854  ack0 = vnet_buffer (b0)->tcp.ack_number;
1855  seq0 = vnet_buffer (b0)->tcp.seq_number;
1856  tcp0 = tcp_buffer_hdr (b0);
1857 
1858  /* Crude check to see if the connection handle does not match
1859  * the packet. Probably connection just switched to established */
1860  if (PREDICT_FALSE (tcp0->dst_port != tc0->c_lcl_port
1861  || tcp0->src_port != tc0->c_rmt_port))
1862  {
1863  error0 = TCP_ERROR_INVALID_CONNECTION;
1864  goto drop;
1865  }
1866 
1867  if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0)
1868  && !tcp_syn (tcp0)))
1869  {
1870  error0 = TCP_ERROR_SEGMENT_INVALID;
1871  goto drop;
1872  }
1873 
1874  /* SYNs consume sequence numbers */
1875  vnet_buffer (b0)->tcp.seq_end += tcp_is_syn (tcp0);
1876 
1877  /*
1878  * 1. check the ACK bit
1879  */
1880 
1881  /*
1882  * If the ACK bit is set
1883  * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless
1884  * the RST bit is set, if so drop the segment and return)
1885  * <SEQ=SEG.ACK><CTL=RST>
1886  * and discard the segment. Return.
1887  * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
1888  */
1889  if (tcp_ack (tcp0))
1890  {
1891  if (seq_leq (ack0, tc0->iss) || seq_gt (ack0, tc0->snd_nxt))
1892  {
1893  if (!tcp_rst (tcp0))
1894  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
1895  error0 = TCP_ERROR_RCV_WND;
1896  goto drop;
1897  }
1898 
1899  /* Make sure ACK is valid */
1900  if (seq_gt (tc0->snd_una, ack0))
1901  {
1902  error0 = TCP_ERROR_ACK_INVALID;
1903  goto drop;
1904  }
1905  }
1906 
1907  /*
1908  * 2. check the RST bit
1909  */
1910 
1911  if (tcp_rst (tcp0))
1912  {
1913  /* If ACK is acceptable, signal client that peer is not
 1914  * willing to accept the connection, and drop the connection */
1915  if (tcp_ack (tcp0))
1916  tcp_rcv_rst (wrk, tc0);
1917  error0 = TCP_ERROR_RST_RCVD;
1918  goto drop;
1919  }
1920 
1921  /*
1922  * 3. check the security and precedence (skipped)
1923  */
1924 
1925  /*
1926  * 4. check the SYN bit
1927  */
1928 
1929  /* No SYN flag. Drop. */
1930  if (!tcp_syn (tcp0))
1931  {
1932  error0 = TCP_ERROR_SEGMENT_INVALID;
1933  goto drop;
1934  }
1935 
1936  /* Parse options */
1937  if (tcp_options_parse (tcp0, &tc0->rcv_opts, 1))
1938  {
1939  error0 = TCP_ERROR_OPTIONS;
1940  goto drop;
1941  }
1942 
1943  /* Valid SYN or SYN-ACK. Move connection from half-open pool to
1944  * current thread pool. */
1945  new_tc0 = tcp_connection_alloc_w_base (my_thread_index, tc0);
1946  new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end;
1947  new_tc0->irs = seq0;
1948  new_tc0->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
1949  new_tc0->sw_if_index = vnet_buffer (b0)->sw_if_index[VLIB_RX];
1950 
1951  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
1952  {
1953  new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
1954  new_tc0->tsval_recent_age = tcp_time_now ();
1955  }
1956 
1957  if (tcp_opts_wscale (&new_tc0->rcv_opts))
1958  new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
1959  else
1960  new_tc0->rcv_wscale = 0;
1961 
1962  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
1963  << new_tc0->snd_wscale;
1964  new_tc0->snd_wl1 = seq0;
1965  new_tc0->snd_wl2 = ack0;
1966 
1967  tcp_connection_init_vars (new_tc0);
1968 
1969  /* SYN-ACK: See if we can switch to ESTABLISHED state */
1970  if (PREDICT_TRUE (tcp_ack (tcp0)))
1971  {
1972  /* Our SYN is ACKed: we have iss < ack = snd_una */
1973 
1974  /* TODO Dequeue acknowledged segments if we support Fast Open */
1975  new_tc0->snd_una = ack0;
1976  new_tc0->state = TCP_STATE_ESTABLISHED;
1977 
1978  /* Make sure las is initialized for the wnd computation */
1979  new_tc0->rcv_las = new_tc0->rcv_nxt;
1980 
1981  /* Notify app that we have connection. If session layer can't
1982  * allocate session send reset */
1983  if (session_stream_connect_notify (&new_tc0->connection,
1984  SESSION_E_NONE))
1985  {
1986  tcp_send_reset_w_pkt (new_tc0, b0, my_thread_index, is_ip4);
1987  tcp_connection_cleanup (new_tc0);
1988  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
1989  goto cleanup_ho;
1990  }
1991 
1992  new_tc0->tx_fifo_size =
1993  transport_tx_fifo_size (&new_tc0->connection);
1994  /* Update rtt with the syn-ack sample */
1995  tcp_estimate_initial_rtt (new_tc0);
1996  TCP_EVT (TCP_EVT_SYNACK_RCVD, new_tc0);
1997  error0 = TCP_ERROR_SYN_ACKS_RCVD;
1998  }
1999  /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */
2000  else
2001  {
2002  new_tc0->state = TCP_STATE_SYN_RCVD;
2003 
2004  /* Notify app that we have connection */
2005  if (session_stream_connect_notify (&new_tc0->connection,
2006  SESSION_E_NONE))
2007  {
2008  tcp_connection_cleanup (new_tc0);
2009  tcp_send_reset_w_pkt (tc0, b0, my_thread_index, is_ip4);
2010  TCP_EVT (TCP_EVT_RST_SENT, tc0);
2011  error0 = TCP_ERROR_CREATE_SESSION_FAIL;
2012  goto cleanup_ho;
2013  }
2014 
2015  new_tc0->tx_fifo_size =
2016  transport_tx_fifo_size (&new_tc0->connection);
2017  new_tc0->rtt_ts = 0;
2018  tcp_init_snd_vars (new_tc0);
2019  tcp_send_synack (new_tc0);
2020  error0 = TCP_ERROR_SYNS_RCVD;
2021  goto cleanup_ho;
2022  }
2023 
2024  if (!(new_tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2025  tcp_check_tx_offload (new_tc0, is_ip4);
2026 
2027  /* Read data, if any */
2028  if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len))
2029  {
2030  clib_warning ("rcvd data in syn-sent");
2031  error0 = tcp_segment_rcv (wrk, new_tc0, b0);
2032  if (error0 == TCP_ERROR_ACK_OK)
2033  error0 = TCP_ERROR_SYN_ACKS_RCVD;
2034  }
2035  else
2036  {
2037  /* Send ack now instead of programming it because connection was
2038  * just established and it's not optional. */
2039  tcp_send_ack (new_tc0);
2040  }
2041 
2042  cleanup_ho:
2043 
2044  /* If this is not the owning thread, wait for syn retransmit to
2045  * expire and cleanup then */
 2046  if (tcp_half_open_connection_cleanup (tc0))
 2047  tc0->flags |= TCP_CONN_HALF_OPEN_DONE;
2048 
2049  drop:
2050 
2051  tcp_inc_counter (syn_sent, error0, 1);
2052  if (PREDICT_FALSE ((b0->flags & VLIB_BUFFER_IS_TRACED) && tcp0 != 0))
2053  {
2054  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2055  clib_memcpy_fast (&t0->tcp_header, tcp0, sizeof (t0->tcp_header));
2056  clib_memcpy_fast (&t0->tcp_connection, tc0,
2057  sizeof (t0->tcp_connection));
2058  }
2059  }
2060 
2061  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2062  my_thread_index);
2063  tcp_inc_counter (syn_sent, TCP_ERROR_MSG_QUEUE_FULL, errors);
2064  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
2065  tcp_handle_disconnects (wrk);
2066 
2067  return from_frame->n_vectors;
2068 }
2069 
 2070 VLIB_NODE_FN (tcp4_syn_sent_node) (vlib_main_t * vm,
 2071  vlib_node_runtime_t * node,
 2072  vlib_frame_t * from_frame)
2073 {
2074  return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2075 }
2076 
2077 VLIB_NODE_FN (tcp6_syn_sent_node) (vlib_main_t * vm,
2078  vlib_node_runtime_t * node,
2079  vlib_frame_t * from_frame)
2080 {
2081  return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2082 }
2083 
2084 /* *INDENT-OFF* */
2085 VLIB_REGISTER_NODE (tcp4_syn_sent_node) =
2086 {
2087  .name = "tcp4-syn-sent",
2088  /* Takes a vector of packets. */
2089  .vector_size = sizeof (u32),
2090  .n_errors = TCP_N_ERROR,
2091  .error_strings = tcp_error_strings,
2092  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2093  .next_nodes =
2094  {
2095 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2096  foreach_tcp_state_next
2097 #undef _
2098  },
2099  .format_trace = format_tcp_rx_trace_short,
2100 };
2101 /* *INDENT-ON* */
2102 
2103 /* *INDENT-OFF* */
2104 VLIB_REGISTER_NODE (tcp6_syn_sent_node) =
2105 {
2106  .name = "tcp6-syn-sent",
2107  /* Takes a vector of packets. */
2108  .vector_size = sizeof (u32),
2109  .n_errors = TCP_N_ERROR,
2110  .error_strings = tcp_error_strings,
2111  .n_next_nodes = TCP_SYN_SENT_N_NEXT,
2112  .next_nodes =
2113  {
2114 #define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n,
2115  foreach_tcp_state_next
2116 #undef _
2117  },
2118  .format_trace = format_tcp_rx_trace_short,
2119 };
2120 /* *INDENT-ON* */
2121 
2122 /**
2123  * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED
2124  * as per RFC793 p. 64
2125  */
2126 always_inline uword
2127 tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2128  vlib_frame_t * from_frame, int is_ip4)
2129 {
2130  u32 thread_index = vm->thread_index, errors = 0, *first_buffer;
2131  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
2132  u32 n_left_from, *from, max_dequeue;
2133 
2134  from = first_buffer = vlib_frame_vector_args (from_frame);
2135  n_left_from = from_frame->n_vectors;
2136 
2137  while (n_left_from > 0)
2138  {
2139  u32 bi0, error0 = TCP_ERROR_NONE;
2140  tcp_header_t *tcp0 = 0;
2141  tcp_connection_t *tc0;
2142  vlib_buffer_t *b0;
2143  u8 is_fin0;
2144 
2145  bi0 = from[0];
2146  from += 1;
2147  n_left_from -= 1;
2148 
2149  b0 = vlib_get_buffer (vm, bi0);
2150  tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index,
2151  thread_index);
2152  if (PREDICT_FALSE (tc0 == 0))
2153  {
2154  error0 = TCP_ERROR_INVALID_CONNECTION;
2155  goto drop;
2156  }
2157 
2158  tcp0 = tcp_buffer_hdr (b0);
2159  is_fin0 = tcp_is_fin (tcp0);
2160 
2161  if (CLIB_DEBUG)
2162  {
2163  if (!(tc0->connection.flags & TRANSPORT_CONNECTION_F_NO_LOOKUP))
2164  {
2165  tcp_connection_t *tmp;
2166  tmp = tcp_lookup_connection (tc0->c_fib_index, b0, thread_index,
2167  is_ip4);
2168  if (tmp->state != tc0->state)
2169  {
2170  if (tc0->state != TCP_STATE_CLOSED)
2171  clib_warning ("state changed");
2172  goto drop;
2173  }
2174  }
2175  }
2176 
2177  /*
2178  * Special treatment for CLOSED
2179  */
2180  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
2181  {
2182  error0 = TCP_ERROR_CONNECTION_CLOSED;
2183  goto drop;
2184  }
2185 
2186  /*
2187  * For all other states (except LISTEN)
2188  */
2189 
2190  /* 1-4: check SEQ, RST, SYN */
2191  if (PREDICT_FALSE (tcp_segment_validate (wrk, tc0, b0, tcp0, &error0)))
2192  goto drop;
2193 
2194  /* 5: check the ACK field */
2195  switch (tc0->state)
2196  {
2197  case TCP_STATE_SYN_RCVD:
2198 
2199  /* Make sure the segment is exactly right */
2200  if (tc0->rcv_nxt != vnet_buffer (b0)->tcp.seq_number || is_fin0)
2201  {
2202  tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
2203  error0 = TCP_ERROR_SEGMENT_INVALID;
2204  goto drop;
2205  }
2206 
2207  /*
2208  * If the segment acknowledgment is not acceptable, form a
2209  * reset segment,
2210  * <SEQ=SEG.ACK><CTL=RST>
2211  * and send it.
2212  */
2213  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2214  {
2215  tcp_send_reset_w_pkt (tc0, b0, thread_index, is_ip4);
2216  error0 = TCP_ERROR_SEGMENT_INVALID;
2217  goto drop;
2218  }
2219 
2220  /* Update rtt and rto */
2221  tcp_estimate_initial_rtt (tc0);
2222  tcp_connection_tx_pacer_update (tc0);
2223 
2224  /* Switch state to ESTABLISHED */
2225  tc0->state = TCP_STATE_ESTABLISHED;
2226  TCP_EVT (TCP_EVT_STATE_CHANGE, tc0);
2227 
2228  if (!(tc0->cfg_flags & TCP_CFG_F_NO_TSO))
2229  tcp_check_tx_offload (tc0, is_ip4);
2230 
2231  /* Initialize session variables */
2232  tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
2233  tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
2234  << tc0->rcv_opts.wscale;
2235  tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
2236  tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
2237 
2238  /* Reset SYN-ACK retransmit and SYN_RCV establish timers */
2239  tcp_retransmit_timer_reset (&wrk->timer_wheel, tc0);
2240  if (session_stream_accept_notify (&tc0->connection))
2241  {
2242  error0 = TCP_ERROR_MSG_QUEUE_FULL;
2243  tcp_send_reset (tc0);
2244  session_transport_delete_notify (&tc0->connection);
2245  tcp_connection_cleanup (tc0);
2246  goto drop;
2247  }
2248  error0 = TCP_ERROR_ACK_OK;
2249  break;
2250  case TCP_STATE_ESTABLISHED:
2251  /* We can get packets in established state here because they
2252  * were enqueued before state change */
2253  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2254  goto drop;
2255 
2256  break;
2257  case TCP_STATE_FIN_WAIT_1:
2258  /* In addition to the processing for the ESTABLISHED state, if
2259  * our FIN is now acknowledged then enter FIN-WAIT-2 and
2260  * continue processing in that state. */
2261  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2262  goto drop;
2263 
2264  /* Still have to send the FIN */
2265  if (tc0->flags & TCP_CONN_FINPNDG)
2266  {
2267  /* TX fifo finally drained */
2268  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2269  if (max_dequeue <= tc0->burst_acked)
2270  tcp_send_fin (tc0);
2271  /* If a FIN was received and data was acked, extend the wait */
2272  else if ((tc0->flags & TCP_CONN_FINRCVD) && tc0->bytes_acked)
2273  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2274  tcp_cfg.closewait_time);
2275  }
2276  /* If FIN is ACKed */
2277  else if (tc0->snd_una == tc0->snd_nxt)
2278  {
2279  /* Stop all retransmit timers because we have nothing more
2280  * to send. */
2281  tcp_connection_timers_reset (tc0);
2282 
2283  /* We already have a FIN but didn't transition to CLOSING
2284  * because of outstanding tx data. Close the connection. */
2285  if (tc0->flags & TCP_CONN_FINRCVD)
2286  {
2287  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
2288  session_transport_closed_notify (&tc0->connection);
2289  tcp_program_cleanup (wrk, tc0);
2290  goto drop;
2291  }
2292 
2293  tcp_connection_set_state (tc0, TCP_STATE_FIN_WAIT_2);
2294  /* Enable waitclose because we're willing to wait for peer's
2295  * FIN but not indefinitely. */
2296  tcp_timer_set (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2297  tcp_cfg.finwait2_time);
2298 
2299  /* Don't try to dequeue the acked FIN */
2300  if (tc0->burst_acked > 1)
2301  session_tx_fifo_dequeue_drop (&tc0->connection,
2302  tc0->burst_acked - 1);
2303  tc0->burst_acked = 0;
2304  }
2305  break;
2306  case TCP_STATE_FIN_WAIT_2:
2307  /* In addition to the processing for the ESTABLISHED state, if
2308  * the retransmission queue is empty, the user's CLOSE can be
2309  * acknowledged ("ok") but do not delete the TCB. */
2310  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2311  goto drop;
2312  tc0->burst_acked = 0;
2313  break;
2314  case TCP_STATE_CLOSE_WAIT:
2315  /* Do the same processing as for the ESTABLISHED state. */
2316  if (tcp_rcv_ack (wrk, tc0, b0, tcp0, &error0))
2317  goto drop;
2318 
2319  if (!(tc0->flags & TCP_CONN_FINPNDG))
2320  break;
2321 
2322  /* Still have outstanding tx data */
2323  max_dequeue = transport_max_tx_dequeue (&tc0->connection);
2324  if (max_dequeue > tc0->burst_acked)
2325  break;
2326 
2327  tcp_send_fin (tc0);
2328  tcp_connection_timers_reset (tc0);
2329  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
2330  tcp_timer_set (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2331  tcp_cfg.lastack_time);
2332  break;
2333  case TCP_STATE_CLOSING:
2334  /* In addition to the processing for the ESTABLISHED state, if
2335  * the ACK acknowledges our FIN then enter the TIME-WAIT state,
2336  * otherwise ignore the segment. */
2337  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2338  goto drop;
2339 
2340  if (tc0->snd_una != tc0->snd_nxt)
2341  goto drop;
2342 
2343  tcp_connection_timers_reset (tc0);
2344  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
2345  tcp_timer_set (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2346  tcp_cfg.timewait_time);
2347  session_transport_closed_notify (&tc0->connection);
2348  goto drop;
2349 
2350  break;
2351  case TCP_STATE_LAST_ACK:
2352  /* The only thing that [should] arrive in this state is an
2353  * acknowledgment of our FIN. If our FIN is now acknowledged,
2354  * delete the TCB, enter the CLOSED state, and return. */
2355 
2356  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2357  goto drop;
2358 
2359  /* Apparently our ACK for the peer's FIN was lost */
2360  if (is_fin0 && tc0->snd_una != tc0->snd_nxt)
2361  {
2362  tcp_send_fin (tc0);
2363  goto drop;
2364  }
2365 
2366  tcp_connection_set_state (tc0, TCP_STATE_CLOSED);
2367  session_transport_closed_notify (&tc0->connection);
2368 
2369  /* Don't free the connection from the data path since
2370  * we can't ensure that we have no packets already enqueued
2371  * to output. Rely instead on the waitclose timer */
2372  tcp_connection_timers_reset (tc0);
2373  tcp_program_cleanup (tcp_get_worker (tc0->c_thread_index), tc0);
2374 
2375  goto drop;
2376 
2377  break;
2378  case TCP_STATE_TIME_WAIT:
2379  /* The only thing that can arrive in this state is a
2380  * retransmission of the remote FIN. Acknowledge it, and restart
2381  * the 2 MSL timeout. */
2382 
2383  if (tcp_rcv_ack_no_cc (tc0, b0, &error0))
2384  goto drop;
2385 
2386  if (!is_fin0)
2387  goto drop;
2388 
2389  tcp_program_ack (tc0);
2390  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2391  tcp_cfg.timewait_time);
2392  goto drop;
2393 
2394  break;
2395  default:
2396  ASSERT (0);
2397  }
2398 
2399  /* 6: check the URG bit TODO */
2400 
2401  /* 7: process the segment text */
2402  switch (tc0->state)
2403  {
2404  case TCP_STATE_ESTABLISHED:
2405  case TCP_STATE_FIN_WAIT_1:
2406  case TCP_STATE_FIN_WAIT_2:
2407  if (vnet_buffer (b0)->tcp.data_len)
2408  error0 = tcp_segment_rcv (wrk, tc0, b0);
2409  break;
2410  case TCP_STATE_CLOSE_WAIT:
2411  case TCP_STATE_CLOSING:
2412  case TCP_STATE_LAST_ACK:
2413  case TCP_STATE_TIME_WAIT:
2414  /* This should not occur, since a FIN has been received from the
2415  * remote side. Ignore the segment text. */
2416  break;
2417  }
2418 
2419  /* 8: check the FIN bit */
2420  if (!is_fin0)
2421  goto drop;
2422 
2423  TCP_EVT (TCP_EVT_FIN_RCVD, tc0);
2424 
2425  switch (tc0->state)
2426  {
2427  case TCP_STATE_ESTABLISHED:
2428  /* Account for the FIN and send ack */
2429  tc0->rcv_nxt += 1;
2430  tcp_program_ack (tc0);
2431  tcp_connection_set_state (tc0, TCP_STATE_CLOSE_WAIT);
2432  tcp_program_disconnect (wrk, tc0);
2433  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2434  tcp_cfg.closewait_time);
2435  break;
2436  case TCP_STATE_SYN_RCVD:
2437  /* Send FIN-ACK, enter LAST-ACK and because the app was not
2438  * notified yet, set a cleanup timer instead of relying on
2439  * disconnect notify and the implicit close call. */
2440  tcp_connection_timers_reset (tc0);
2441  tc0->rcv_nxt += 1;
2442  tcp_send_fin (tc0);
2443  tcp_connection_set_state (tc0, TCP_STATE_LAST_ACK);
2444  tcp_timer_set (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2445  tcp_cfg.lastack_time);
2446  break;
2447  case TCP_STATE_CLOSE_WAIT:
2448  case TCP_STATE_CLOSING:
2449  case TCP_STATE_LAST_ACK:
2450  /* move along .. */
2451  break;
2452  case TCP_STATE_FIN_WAIT_1:
2453  tc0->rcv_nxt += 1;
2454 
2455  if (tc0->flags & TCP_CONN_FINPNDG)
2456  {
2457  /* If data is outstanding, stay in FIN_WAIT_1 and try to finish
2458  * sending it. Since we already received a fin, do not wait
2459  * for too long. */
2460  tc0->flags |= TCP_CONN_FINRCVD;
2461  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2462  tcp_cfg.closewait_time);
2463  }
2464  else
2465  {
2466  tcp_connection_set_state (tc0, TCP_STATE_CLOSING);
2467  tcp_program_ack (tc0);
2468  /* Wait for ACK for our FIN but not forever */
2469  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2470  tcp_cfg.closing_time);
2471  }
2472  break;
2473  case TCP_STATE_FIN_WAIT_2:
2474  /* Got FIN, send ACK! Be more aggressive with resource cleanup */
2475  tc0->rcv_nxt += 1;
2476  tcp_connection_set_state (tc0, TCP_STATE_TIME_WAIT);
2477  tcp_connection_timers_reset (tc0);
2478  tcp_timer_set (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2479  tcp_cfg.timewait_time);
2480  tcp_program_ack (tc0);
2481  session_transport_closed_notify (&tc0->connection);
2482  break;
2483  case TCP_STATE_TIME_WAIT:
2484  /* Remain in the TIME-WAIT state. Restart the time-wait
2485  * timeout.
2486  */
2487  tcp_timer_update (&wrk->timer_wheel, tc0, TCP_TIMER_WAITCLOSE,
2488  tcp_cfg.timewait_time);
2489  break;
2490  }
2491  error0 = TCP_ERROR_FIN_RCVD;
2492 
2493  drop:
2494 
2495  tcp_inc_counter (rcv_process, error0, 1);
2496  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
2497  {
2498  tcp_rx_trace_t *t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2499  tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
2500  }
2501  }
2502 
2503  errors = session_main_flush_enqueue_events (TRANSPORT_PROTO_TCP,
2504  thread_index);
2505  tcp_inc_counter (rcv_process, TCP_ERROR_MSG_QUEUE_FULL, errors);
2506  tcp_handle_postponed_dequeues (wrk);
2507  tcp_handle_disconnects (wrk);
2508  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
2509 
2510  return from_frame->n_vectors;
2511 }
2512 
2513 VLIB_NODE_FN (tcp4_rcv_process_node) (vlib_main_t * vm,
2514  vlib_node_runtime_t * node,
2515  vlib_frame_t * from_frame)
2516 {
2517  return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2518 }
2519 
2520 VLIB_NODE_FN (tcp6_rcv_process_node) (vlib_main_t * vm,
2521  vlib_node_runtime_t * node,
2522  vlib_frame_t * from_frame)
2523 {
2524  return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2525 }
2526 
2527 /* *INDENT-OFF* */
2528 VLIB_REGISTER_NODE (tcp4_rcv_process_node) =
2529 {
2530  .name = "tcp4-rcv-process",
2531  /* Takes a vector of packets. */
2532  .vector_size = sizeof (u32),
2533  .n_errors = TCP_N_ERROR,
2534  .error_strings = tcp_error_strings,
2535  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2536  .next_nodes =
2537  {
2538 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2539  foreach_tcp_state_next
2540 #undef _
2541  },
2542  .format_trace = format_tcp_rx_trace_short,
2543 };
2544 /* *INDENT-ON* */
2545 
2546 /* *INDENT-OFF* */
2547 VLIB_REGISTER_NODE (tcp6_rcv_process_node) =
2548 {
2549  .name = "tcp6-rcv-process",
2550  /* Takes a vector of packets. */
2551  .vector_size = sizeof (u32),
2552  .n_errors = TCP_N_ERROR,
2553  .error_strings = tcp_error_strings,
2554  .n_next_nodes = TCP_RCV_PROCESS_N_NEXT,
2555  .next_nodes =
2556  {
2557 #define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n,
2558  foreach_tcp_state_next
2559 #undef _
2560  },
2561  .format_trace = format_tcp_rx_trace_short,
2562 };
2563 /* *INDENT-ON* */
2564 
2565 /**
2566  * LISTEN state processing as per RFC 793 p. 65
2567  */
2568 always_inline uword
2569 tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2570  vlib_frame_t * from_frame, int is_ip4)
2571 {
2572  u32 n_left_from, *from, n_syns = 0, *first_buffer;
2573  u32 thread_index = vm->thread_index;
2574 
2575  from = first_buffer = vlib_frame_vector_args (from_frame);
2576  n_left_from = from_frame->n_vectors;
2577 
2578  while (n_left_from > 0)
2579  {
2580  u32 bi, error = TCP_ERROR_NONE;
2581  tcp_connection_t *lc, *child;
2582  vlib_buffer_t *b;
2583 
2584  bi = from[0];
2585  from += 1;
2586  n_left_from -= 1;
2587 
2588  b = vlib_get_buffer (vm, bi);
2589 
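 /* The connection index normally identifies a listener. If it instead
  * resolves to a fully established connection, the only acceptable case
  * is an old session in TIME-WAIT whose 4-tuple is being reused: the
  * stale connection is deleted and the segment is matched against the
  * real listener (see below). */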
2590  lc = tcp_listener_get (vnet_buffer (b)->tcp.connection_index);
2591  if (PREDICT_FALSE (lc == 0))
2592  {
2593  tcp_connection_t *tc;
2594  tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index,
2595  thread_index);
2596  if (tc->state != TCP_STATE_TIME_WAIT)
2597  {
2598  error = TCP_ERROR_CREATE_EXISTS;
2599  goto done;
2600  }
2601  lc = tcp_lookup_listener (b, tc->c_fib_index, is_ip4);
2602  /* clean up the old session */
2603  tcp_connection_del (tc);
2604  }
2605 
2606  /* Make sure connection wasn't just created */
2607  child = tcp_lookup_connection (lc->c_fib_index, b, thread_index,
2608  is_ip4);
2609  if (PREDICT_FALSE (child->state != TCP_STATE_LISTEN))
2610  {
2611  error = TCP_ERROR_CREATE_EXISTS;
2612  goto done;
2613  }
2614 
2615  /* Create child session. For syn-flood protection use filter */
2616 
2617  /* 1. first check for an RST: handled in dispatch */
2618  /* if (tcp_rst (th0))
2619  goto drop;
2620  */
2621 
2622  /* 2. second check for an ACK: handled in dispatch */
2623  /* if (tcp_ack (th0))
2624  {
2625  tcp_send_reset (b0, is_ip4);
2626  goto drop;
2627  }
2628  */
2629 
2630  /* 3. check for a SYN (did that already) */
2631 
2632  /* Create child session and send SYN-ACK */
2633  child = tcp_connection_alloc (thread_index);
2634 
2635  if (tcp_options_parse (tcp_buffer_hdr (b), &child->rcv_opts, 1))
2636  {
2637  error = TCP_ERROR_OPTIONS;
2638  tcp_connection_free (child);
2639  goto done;
2640  }
2641 
2642  tcp_init_w_buffer (child, b, is_ip4);
2643 
2644  child->state = TCP_STATE_SYN_RCVD;
2645  child->c_fib_index = lc->c_fib_index;
2646  child->cc_algo = lc->cc_algo;
2647  tcp_connection_init_vars (child);
2648  child->rto = TCP_RTO_MIN;
2649 
2650  if (session_stream_accept (&child->connection, lc->c_s_index,
2651  lc->c_thread_index, 0 /* notify */ ))
2652  {
2653  tcp_connection_cleanup (child);
2654  error = TCP_ERROR_CREATE_SESSION_FAIL;
2655  goto done;
2656  }
2657 
2658  child->tx_fifo_size = transport_tx_fifo_size (&child->connection);
2659 
2660  tcp_send_synack (child);
2661 
2662  TCP_EVT (TCP_EVT_SYN_RCVD, child, 1);
2663 
2664  done:
2665 
2666  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
2667  {
2668  tcp_rx_trace_t *t;
2669  t = vlib_add_trace (vm, node, b, sizeof (*t));
2670  clib_memcpy_fast (&t->tcp_header, tcp_buffer_hdr (b),
2671  sizeof (t->tcp_header));
2672  clib_memcpy_fast (&t->tcp_connection, lc,
2673  sizeof (t->tcp_connection));
2674  }
2675 
2676  n_syns += (error == TCP_ERROR_NONE);
2677  }
2678 
2679  tcp_inc_counter (listen, TCP_ERROR_SYNS_RCVD, n_syns);
2680  vlib_buffer_free (vm, first_buffer, from_frame->n_vectors);
2681 
2682  return from_frame->n_vectors;
2683 }
2684 
2685 VLIB_NODE_FN (tcp4_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2686  vlib_frame_t * from_frame)
2687 {
2688  return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2689 }
2690 
2691 VLIB_NODE_FN (tcp6_listen_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
2692  vlib_frame_t * from_frame)
2693 {
2694  return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2695 }
2696 
2697 /* *INDENT-OFF* */
2698 VLIB_REGISTER_NODE (tcp4_listen_node) =
2699 {
2700  .name = "tcp4-listen",
2701  /* Takes a vector of packets. */
2702  .vector_size = sizeof (u32),
2703  .n_errors = TCP_N_ERROR,
2704  .error_strings = tcp_error_strings,
2705  .n_next_nodes = TCP_LISTEN_N_NEXT,
2706  .next_nodes =
2707  {
2708 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2709  foreach_tcp_state_next
2710 #undef _
2711  },
2712  .format_trace = format_tcp_rx_trace_short,
2713 };
2714 /* *INDENT-ON* */
2715 
2716 /* *INDENT-OFF* */
2717 VLIB_REGISTER_NODE (tcp6_listen_node) =
2718 {
2719  .name = "tcp6-listen",
2720  /* Takes a vector of packets. */
2721  .vector_size = sizeof (u32),
2722  .n_errors = TCP_N_ERROR,
2723  .error_strings = tcp_error_strings,
2724  .n_next_nodes = TCP_LISTEN_N_NEXT,
2725  .next_nodes =
2726  {
2727 #define _(s,n) [TCP_LISTEN_NEXT_##s] = n,
2728  foreach_tcp_state_next
2729 #undef _
2730  },
2731  .format_trace = format_tcp_rx_trace_short,
2732 };
2733 /* *INDENT-ON* */
2734 
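/* tcp4/6-input(-nolookup) below are the common entry nodes: they resolve
 * the tcp connection for each buffer and then use the per-state dispatch
 * table to hand the packet to one of the state-specific nodes registered
 * above (listen, syn-sent, established, rcv-process), or to drop, reset
 * or punt it. */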
2735 typedef enum _tcp_input_next
2736 {
2737  TCP_INPUT_NEXT_DROP,
2738  TCP_INPUT_NEXT_LISTEN,
2739  TCP_INPUT_NEXT_RCV_PROCESS,
2740  TCP_INPUT_NEXT_SYN_SENT,
2741  TCP_INPUT_NEXT_ESTABLISHED,
2742  TCP_INPUT_NEXT_RESET,
2743  TCP_INPUT_NEXT_PUNT,
2744  TCP_INPUT_N_NEXT
2745 } tcp_input_next_t;
2746 
2747 #define foreach_tcp4_input_next \
2748  _ (DROP, "ip4-drop") \
2749  _ (LISTEN, "tcp4-listen") \
2750  _ (RCV_PROCESS, "tcp4-rcv-process") \
2751  _ (SYN_SENT, "tcp4-syn-sent") \
2752  _ (ESTABLISHED, "tcp4-established") \
2753  _ (RESET, "tcp4-reset") \
2754  _ (PUNT, "ip4-punt")
2755 
2756 #define foreach_tcp6_input_next \
2757  _ (DROP, "ip6-drop") \
2758  _ (LISTEN, "tcp6-listen") \
2759  _ (RCV_PROCESS, "tcp6-rcv-process") \
2760  _ (SYN_SENT, "tcp6-syn-sent") \
2761  _ (ESTABLISHED, "tcp6-established") \
2762  _ (RESET, "tcp6-reset") \
2763  _ (PUNT, "ip6-punt")
2764 
2765 #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN)
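/* Dispatch is driven by a [state][flags] table: the SYN, ACK, RST and FIN
 * bits of the incoming header are masked with filter_flags and, together
 * with the connection state, select both the next node and the error
 * counter for the buffer (see tcp_dispatch_table_init below). */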
2766 
2767 static void
2768 tcp_input_trace_frame (vlib_main_t * vm, vlib_node_runtime_t * node,
2769  vlib_buffer_t ** bs, u32 n_bufs, u8 is_ip4)
2770 {
2771  tcp_connection_t *tc;
2772  tcp_header_t *tcp;
2773  tcp_rx_trace_t *t;
2774  int i;
2775 
2776  for (i = 0; i < n_bufs; i++)
2777  {
2778  if (bs[i]->flags & VLIB_BUFFER_IS_TRACED)
2779  {
2780  t = vlib_add_trace (vm, node, bs[i], sizeof (*t));
2781  tc = tcp_connection_get (vnet_buffer (bs[i])->tcp.connection_index,
2782  vm->thread_index);
2783  tcp = vlib_buffer_get_current (bs[i]);
2784  tcp_set_rx_trace_data (t, tc, tcp, bs[i], is_ip4);
2785  }
2786  }
2787 }
2788 
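/* Pick a next node when no connection was found for the buffer: filtered
 * or wrong-thread segments are dropped, unknown ports are punted to
 * ip4/ip6-punt when punting is enabled, and everything else is answered
 * with a reset and counted as having no listener. */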
2789 static void
2790 tcp_input_set_error_next (tcp_main_t * tm, u16 * next, u32 * error, u8 is_ip4)
2791 {
2792  if (*error == TCP_ERROR_FILTERED || *error == TCP_ERROR_WRONG_THREAD)
2793  {
2794  *next = TCP_INPUT_NEXT_DROP;
2795  }
2796  else if ((is_ip4 && tm->punt_unknown4) || (!is_ip4 && tm->punt_unknown6))
2797  {
2798  *next = TCP_INPUT_NEXT_PUNT;
2799  *error = TCP_ERROR_PUNT;
2800  }
2801  else
2802  {
2803  *next = TCP_INPUT_NEXT_RESET;
2804  *error = TCP_ERROR_NO_LISTENER;
2805  }
2806 }
2807 
2808 static inline void
2809 tcp_input_dispatch_buffer (tcp_main_t * tm, tcp_connection_t * tc,
2810  vlib_buffer_t * b, u16 * next,
2811  vlib_node_runtime_t * error_node)
2812 {
2813  tcp_header_t *tcp;
2814  u32 error;
2815  u8 flags;
2816 
2817  tcp = tcp_buffer_hdr (b);
2818  flags = tcp->flags & filter_flags;
2819  *next = tm->dispatch_table[tc->state][flags].next;
2820  error = tm->dispatch_table[tc->state][flags].error;
2821  tc->segs_in += 1;
2822 
2823  if (PREDICT_FALSE (error != TCP_ERROR_NONE))
2824  {
2825  b->error = error_node->errors[error];
2826  if (error == TCP_ERROR_DISPATCH)
2827  clib_warning ("tcp conn %u disp error state %U flags %U",
2828  tc->c_c_index, format_tcp_state, tc->state,
2829  format_tcp_flags, (int) flags);
2830  }
2831 }
2832 
2833 always_inline uword
2834 tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
2835  vlib_frame_t * frame, int is_ip4, u8 is_nolookup)
2836 {
2837  u32 n_left_from, *from, thread_index = vm->thread_index;
2838  tcp_main_t *tm = vnet_get_tcp_main ();
2839  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2840  u16 nexts[VLIB_FRAME_SIZE], *next;
2841 
2842  tcp_set_time_now (tcp_get_worker (thread_index));
2843 
2844  from = vlib_frame_vector_args (frame);
2845  n_left_from = frame->n_vectors;
2846  vlib_get_buffers (vm, from, bufs, n_left_from);
2847 
2848  b = bufs;
2849  next = nexts;
2850 
2851  while (n_left_from >= 4)
2852  {
2853  u32 error0 = TCP_ERROR_NO_LISTENER, error1 = TCP_ERROR_NO_LISTENER;
2854  tcp_connection_t *tc0, *tc1;
2855 
2856  {
2857  vlib_prefetch_buffer_header (b[2], STORE);
2858  CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
2859 
2860  vlib_prefetch_buffer_header (b[3], STORE);
2861  CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
2862  }
2863 
2864  next[0] = next[1] = TCP_INPUT_NEXT_DROP;
2865 
2866  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
2867  is_nolookup);
2868  tc1 = tcp_input_lookup_buffer (b[1], thread_index, &error1, is_ip4,
2869  is_nolookup);
2870 
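 /* !tc0 + !tc1 == 0 only when both lookups succeeded; take the fast
  * path that dispatches both buffers, otherwise fall back to handling
  * each one individually below. */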
2871  if (PREDICT_TRUE (!tc0 + !tc1 == 0))
2872  {
2873  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
2874  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
2875 
2876  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
2877  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
2878 
2879  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node);
2880  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], node);
2881  }
2882  else
2883  {
2884  if (PREDICT_TRUE (tc0 != 0))
2885  {
2886  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
2887  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
2888  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node);
2889  }
2890  else
2891  {
2892  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
2893  b[0]->error = node->errors[error0];
2894  }
2895 
2896  if (PREDICT_TRUE (tc1 != 0))
2897  {
2898  ASSERT (tcp_lookup_is_valid (tc1, b[1], tcp_buffer_hdr (b[1])));
2899  vnet_buffer (b[1])->tcp.connection_index = tc1->c_c_index;
2900  tcp_input_dispatch_buffer (tm, tc1, b[1], &next[1], node);
2901  }
2902  else
2903  {
2904  tcp_input_set_error_next (tm, &next[1], &error1, is_ip4);
2905  b[1]->error = node->errors[error1];
2906  }
2907  }
2908 
2909  b += 2;
2910  next += 2;
2911  n_left_from -= 2;
2912  }
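 /* Handle any remaining buffers (and frames smaller than four packets)
  * one at a time, prefetching the next buffer when there is one. */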
2913  while (n_left_from > 0)
2914  {
2915  tcp_connection_t *tc0;
2916  u32 error0 = TCP_ERROR_NO_LISTENER;
2917 
2918  if (n_left_from > 1)
2919  {
2920  vlib_prefetch_buffer_header (b[1], STORE);
2921  CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD);
2922  }
2923 
2924  next[0] = TCP_INPUT_NEXT_DROP;
2925  tc0 = tcp_input_lookup_buffer (b[0], thread_index, &error0, is_ip4,
2926  is_nolookup);
2927  if (PREDICT_TRUE (tc0 != 0))
2928  {
2929  ASSERT (tcp_lookup_is_valid (tc0, b[0], tcp_buffer_hdr (b[0])));
2930  vnet_buffer (b[0])->tcp.connection_index = tc0->c_c_index;
2931  tcp_input_dispatch_buffer (tm, tc0, b[0], &next[0], node);
2932  }
2933  else
2934  {
2935  tcp_input_set_error_next (tm, &next[0], &error0, is_ip4);
2936  b[0]->error = node->errors[error0];
2937  }
2938 
2939  b += 1;
2940  next += 1;
2941  n_left_from -= 1;
2942  }
2943 
2944  if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE))
2945  tcp_input_trace_frame (vm, node, bufs, frame->n_vectors, is_ip4);
2946 
2947  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2948  return frame->n_vectors;
2949 }
2950 
2951 VLIB_NODE_FN (tcp4_input_nolookup_node) (vlib_main_t * vm,
2952  vlib_node_runtime_t * node,
2953  vlib_frame_t * from_frame)
2954 {
2955  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
2956  1 /* is_nolookup */ );
2957 }
2958 
2959 VLIB_NODE_FN (tcp6_input_nolookup_node) (vlib_main_t * vm,
2960  vlib_node_runtime_t * node,
2961  vlib_frame_t * from_frame)
2962 {
2963  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
2964  1 /* is_nolookup */ );
2965 }
2966 
2967 /* *INDENT-OFF* */
2968 VLIB_REGISTER_NODE (tcp4_input_nolookup_node) =
2969 {
2970  .name = "tcp4-input-nolookup",
2971  /* Takes a vector of packets. */
2972  .vector_size = sizeof (u32),
2973  .n_errors = TCP_N_ERROR,
2974  .error_strings = tcp_error_strings,
2975  .n_next_nodes = TCP_INPUT_N_NEXT,
2976  .next_nodes =
2977  {
2978 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2979  foreach_tcp4_input_next
2980 #undef _
2981  },
2982  .format_buffer = format_tcp_header,
2983  .format_trace = format_tcp_rx_trace,
2984 };
2985 /* *INDENT-ON* */
2986 
2987 /* *INDENT-OFF* */
2988 VLIB_REGISTER_NODE (tcp6_input_nolookup_node) =
2989 {
2990  .name = "tcp6-input-nolookup",
2991  /* Takes a vector of packets. */
2992  .vector_size = sizeof (u32),
2993  .n_errors = TCP_N_ERROR,
2994  .error_strings = tcp_error_strings,
2995  .n_next_nodes = TCP_INPUT_N_NEXT,
2996  .next_nodes =
2997  {
2998 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
2999  foreach_tcp6_input_next
3000 #undef _
3001  },
3002  .format_buffer = format_tcp_header,
3003  .format_trace = format_tcp_rx_trace,
3004 };
3005 /* *INDENT-ON* */
3006 
3007 VLIB_NODE_FN (tcp4_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3008  vlib_frame_t * from_frame)
3009 {
3010  return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ,
3011  0 /* is_nolookup */ );
3012 }
3013 
3014 VLIB_NODE_FN (tcp6_input_node) (vlib_main_t * vm, vlib_node_runtime_t * node,
3015  vlib_frame_t * from_frame)
3016 {
3017  return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ,
3018  0 /* is_nolookup */ );
3019 }
3020 
3021 /* *INDENT-OFF* */
3022 VLIB_REGISTER_NODE (tcp4_input_node) =
3023 {
3024  .name = "tcp4-input",
3025  /* Takes a vector of packets. */
3026  .vector_size = sizeof (u32),
3027  .n_errors = TCP_N_ERROR,
3028  .error_strings = tcp_error_strings,
3029  .n_next_nodes = TCP_INPUT_N_NEXT,
3030  .next_nodes =
3031  {
3032 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3034 #undef _
3035  },
3036  .format_buffer = format_tcp_header,
3037  .format_trace = format_tcp_rx_trace,
3038 };
3039 /* *INDENT-ON* */
3040 
3041 /* *INDENT-OFF* */
3042 VLIB_REGISTER_NODE (tcp6_input_node) =
3043 {
3044  .name = "tcp6-input",
3045  /* Takes a vector of packets. */
3046  .vector_size = sizeof (u32),
3047  .n_errors = TCP_N_ERROR,
3048  .error_strings = tcp_error_strings,
3049  .n_next_nodes = TCP_INPUT_N_NEXT,
3050  .next_nodes =
3051  {
3052 #define _(s,n) [TCP_INPUT_NEXT_##s] = n,
3053  foreach_tcp6_input_next
3054 #undef _
3055  },
3056  .format_buffer = format_tcp_header,
3057  .format_trace = format_tcp_rx_trace,
3058 };
3059 /* *INDENT-ON* */
3060 
3061 #ifndef CLIB_MARCH_VARIANT
3062 static void
3063 tcp_dispatch_table_init (tcp_main_t * tm)
3064 {
3065  int i, j;
3066  for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++)
3067  for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++)
3068  {
3069  tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP;
3070  tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH;
3071  }
3072 
3073 #define _(t,f,n,e) \
3074 do { \
3075  tm->dispatch_table[TCP_STATE_##t][f].next = (n); \
3076  tm->dispatch_table[TCP_STATE_##t][f].error = (e); \
3077 } while (0)
3078 
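 /* Any (state, flags) pair not explicitly listed below keeps the default
  * set above: drop the buffer and count it as a dispatch error. */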
3079  /* RFC 793: In LISTEN if RST drop and if ACK return RST */
3080  _(LISTEN, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3081  _(LISTEN, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_ACK_INVALID);
3082  _(LISTEN, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_INVALID_CONNECTION);
3083  _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
3085  TCP_ERROR_ACK_INVALID);
3087  TCP_ERROR_SEGMENT_INVALID);
3089  TCP_ERROR_SEGMENT_INVALID);
3091  TCP_ERROR_INVALID_CONNECTION);
3092  _(LISTEN, TCP_FLAG_FIN, TCP_INPUT_NEXT_RESET, TCP_ERROR_SEGMENT_INVALID);
3094  TCP_ERROR_SEGMENT_INVALID);
3096  TCP_ERROR_SEGMENT_INVALID);
3098  TCP_ERROR_SEGMENT_INVALID);
3100  TCP_ERROR_SEGMENT_INVALID);
3102  TCP_ERROR_SEGMENT_INVALID);
3104  TCP_ERROR_SEGMENT_INVALID);
3106  TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3107  /* ACK for a SYN-ACK -> tcp-rcv-process. */
3108  _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3109  _(SYN_RCVD, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3111  TCP_ERROR_NONE);
3112  _(SYN_RCVD, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3114  TCP_ERROR_NONE);
3116  TCP_ERROR_NONE);
3117  _(SYN_RCVD, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3118  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3119  _(SYN_RCVD, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3121  TCP_ERROR_NONE);
3123  TCP_ERROR_NONE);
3124  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3125  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3127  TCP_ERROR_NONE);
3128  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3129  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3130  _(SYN_RCVD, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3131  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3133  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3134  _(SYN_RCVD, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3135  /* SYN-ACK for a SYN */
3137  TCP_ERROR_NONE);
3138  _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3139  _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3141  TCP_ERROR_NONE);
3142  _(SYN_SENT, TCP_FLAG_FIN, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE);
3144  TCP_ERROR_NONE);
3145  /* ACK for an established connection -> tcp-established. */
3146  _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3147  /* FIN for an established connection -> tcp-established. */
3148  _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3150  TCP_ERROR_NONE);
3152  TCP_ERROR_NONE);
3153  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3154  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3156  TCP_ERROR_NONE);
3157  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3158  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3159  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3160  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3161  _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3162  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3163  _(ESTABLISHED, TCP_FLAG_RST, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3165  TCP_ERROR_NONE);
3166  _(ESTABLISHED, TCP_FLAG_SYN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3168  TCP_ERROR_NONE);
3170  TCP_ERROR_NONE);
3171  _(ESTABLISHED, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3172  TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE);
3173  _(ESTABLISHED, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3174  /* ACK or FIN-ACK to our FIN */
3175  _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3177  TCP_ERROR_NONE);
3178  /* FIN in reply to our FIN from the other side */
3179  _(FIN_WAIT_1, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3180  _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3182  TCP_ERROR_NONE);
3183  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3184  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3185  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3186  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3187  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3188  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3190  TCP_ERROR_NONE);
3191  _(FIN_WAIT_1, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3192  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3193  _(FIN_WAIT_1, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3195  TCP_ERROR_NONE);
3197  TCP_ERROR_NONE);
3198  _(FIN_WAIT_1, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3199  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3200  _(FIN_WAIT_1, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3202  TCP_ERROR_NONE);
3203  _(CLOSING, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3204  _(CLOSING, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3205  _(CLOSING, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3207  TCP_ERROR_NONE);
3209  TCP_ERROR_NONE);
3210  _(CLOSING, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3211  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3212  _(CLOSING, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3214  TCP_ERROR_NONE);
3215  _(CLOSING, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3217  TCP_ERROR_NONE);
3219  TCP_ERROR_NONE);
3220  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3221  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3223  TCP_ERROR_NONE);
3224  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3225  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3226  _(CLOSING, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3227  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3229  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3230  /* FIN confirming that the peer (app) has closed */
3231  _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3232  _(FIN_WAIT_2, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3234  TCP_ERROR_NONE);
3235  _(FIN_WAIT_2, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3237  TCP_ERROR_NONE);
3238  _(CLOSE_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3240  TCP_ERROR_NONE);
3241  _(CLOSE_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3243  TCP_ERROR_NONE);
3244  _(LAST_ACK, 0, TCP_INPUT_NEXT_DROP, TCP_ERROR_SEGMENT_INVALID);
3245  _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3246  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3248  TCP_ERROR_NONE);
3250  TCP_ERROR_NONE);
3251  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_ACK,
3252  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3254  TCP_ERROR_NONE);
3255  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_RST | TCP_FLAG_ACK,
3256  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3257  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_SYN | TCP_FLAG_RST,
3258  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3260  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3261  _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3263  TCP_ERROR_NONE);
3264  _(LAST_ACK, TCP_FLAG_SYN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3266  TCP_ERROR_NONE);
3268  TCP_ERROR_NONE);
3269  _(LAST_ACK, TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_ACK,
3270  TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3271  _(TIME_WAIT, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE);
3272  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3274  TCP_ERROR_NONE);
3275  _(TIME_WAIT, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3277  TCP_ERROR_NONE);
3278  _(TIME_WAIT, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
3279  /* RFC793 CLOSED: An incoming segment containing a RST is discarded. An
3280  * incoming segment not containing a RST causes a RST to be sent in
3281  * response.*/
3282  _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
3284  TCP_ERROR_CONNECTION_CLOSED);
3285  _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
3286  _(CLOSED, TCP_FLAG_SYN, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
3288  TCP_ERROR_CONNECTION_CLOSED);
3289 #undef _
3290 }
3291 
3292 static clib_error_t *
3293 tcp_input_init (vlib_main_t * vm)
3294 {
3295  clib_error_t *error = 0;
3296  tcp_main_t *tm = vnet_get_tcp_main ();
3297 
3298  if ((error = vlib_call_init_function (vm, tcp_init)))
3299  return error;
3300 
3301  /* Initialize dispatch table. */
3302  tcp_dispatch_table_init (tm);
3303 
3304  return error;
3305 }
3306 
3307 VLIB_INIT_FUNCTION (tcp_input_init);
3308 
3309 #endif /* CLIB_MARCH_VARIANT */
3310 
3311 /*
3312  * fd.io coding-style-patch-verification: ON
3313  *
3314  * Local Variables:
3315  * eval: (c-set-style "gnu")
3316  * End:
3317  */
static void tcp_program_disconnect(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:1087
static int tcp_session_enqueue_ooo(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue out-of-order data.
Definition: tcp_input.c:1205
static void tcp_update_timestamp(tcp_connection_t *tc, u32 seq, u32 seq_end)
Update tsval recent.
Definition: tcp_input.c:138
u16 lb_n_buckets
number of buckets in the load-balance.
Definition: load_balance.h:116
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:124
u32 connection_index
Index of the transport connection associated to the session.
void tcp_program_retransmit(tcp_connection_t *tc)
Definition: tcp_output.c:1052
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp_types.h:78
#define clib_min(x, y)
Definition: clib.h:319
#define CLIB_UNUSED(x)
Definition: clib.h:86
u32 * pending_disconnects
vector of pending disconnect notifications
Definition: tcp.h:86
vlib_node_registration_t tcp6_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp6_rcv_process_node)
Definition: tcp_input.c:2547
static u32 ip6_fib_table_fwding_lookup(u32 fib_index, const ip6_address_t *dst)
Definition: ip6_fib.h:67
static void tcp_persist_timer_set(tcp_timer_wheel_t *tw, tcp_connection_t *tc)
Definition: tcp_timer.h:76
static void tcp_rcv_fin(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b, u32 *error)
Definition: tcp_input.c:1131
static u32 tcp_time_now(void)
Definition: tcp_inlines.h:191
static void vlib_buffer_free(vlib_main_t *vm, u32 *buffers, u32 n_buffers)
Free buffers Frees the entire buffer chain for each buffer.
Definition: buffer_funcs.h:937
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp_inlines.h:30
ip4_address_t src_address
Definition: ip4_packet.h:170
static u8 tcp_cc_is_spurious_retransmit(tcp_connection_t *tc)
Definition: tcp_input.c:708
transport_connection_t * session_lookup_connection_wt6(u32 fib_index, ip6_address_t *lcl, ip6_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index, u8 *result)
Lookup connection with ip6 and transport layer information.
vnet_main_t * vnet_get_main(void)
Definition: misc.c:46
enum _tcp_state_next tcp_state_next_t
static vnet_hw_interface_t * vnet_get_sup_hw_interface(vnet_main_t *vnm, u32 sw_if_index)
#define tcp_rst(_th)
Definition: tcp_packet.h:81
#define TCP_FLAG_SYN
Definition: fa_node.h:13
#define THZ
TCP tick frequency.
Definition: tcp_types.h:26
#define tcp_opts_tstamp(_to)
Definition: tcp_packet.h:156
#define PREDICT_TRUE(x)
Definition: clib.h:119
#define tcp_inc_err_counter(cnts, err, val)
Definition: tcp_input.c:1489
unsigned long u64
Definition: types.h:89
#define tcp_store_err_counters(node_id, cnts)
Definition: tcp_input.c:1493
static void tcp_dispatch_table_init(tcp_main_t *tm)
Definition: tcp_input.c:3063
#define clib_memcpy_fast(a, b, c)
Definition: string.h:81
static u8 * format_tcp_rx_trace_short(u8 *s, va_list *args)
Definition: tcp_input.c:1400
static int tcp_segment_rcv(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b)
Receive buffer for connection and handle acks.
Definition: tcp_input.c:1309
void session_transport_delete_notify(transport_connection_t *tc)
Notification from transport that connection is being deleted.
Definition: session.c:967
static uword tcp46_established_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4)
Definition: tcp_input.c:1503
svm_fifo_t * rx_fifo
Pointers to rx/tx buffers.
#define tcp_fastrecovery_first_off(tc)
Definition: tcp_types.h:421
static void tcp_input_dispatch_buffer(tcp_main_t *tm, tcp_connection_t *tc, vlib_buffer_t *b, u16 *next, vlib_node_runtime_t *error_node)
Definition: tcp_input.c:2809
struct _tcp_main tcp_main_t
u32 thread_index
Definition: main.h:218
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
Definition: tcp.c:493
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:113
int session_main_flush_enqueue_events(u8 transport_proto, u32 thread_index)
Flushes queue of sessions that are to be notified of new data enqueued events.
Definition: session.c:715
struct _tcp_connection tcp_connection_t
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:590
static u32 tcp_set_time_now(tcp_worker_ctx_t *wrk)
Definition: tcp_inlines.h:219
#define clib_abs(x)
Definition: clib.h:326
void session_transport_reset_notify(transport_connection_t *tc)
Notify application that connection has been reset.
Definition: session.c:1068
u32 dpo_get_urpf(const dpo_id_t *dpo)
Get a uRPF interface for the DPO.
Definition: dpo.c:382
u32 * pending_resets
vector of pending reset notifications
Definition: tcp.h:89
#define tcp_disconnect_pending_on(tc)
Definition: tcp_types.h:417
static u32 format_get_indent(u8 *s)
Definition: format.h:72
vlib_node_registration_t tcp4_rcv_process_node
(constructor) VLIB_REGISTER_NODE (tcp4_rcv_process_node)
Definition: tcp_input.c:2528
static void tcp_cc_congestion(tcp_connection_t *tc)
Definition: tcp_cc.h:36
static u32 tcp_time_now_w_thread(u32 thread_index)
Definition: tcp_inlines.h:197
#define timestamp_lt(_t1, _t2)
Definition: tcp_packet.h:184
static session_t * session_get(u32 si, u32 thread_index)
Definition: session.h:301
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:424
#define TCP_TICK
TCP tick period (s)
Definition: tcp_types.h:25
#define tcp_disconnect_pending_off(tc)
Definition: tcp_types.h:418
tcp_connection_t tcp_connection
Definition: tcp_input.c:1380
#define VLIB_NODE_FN(node)
Definition: node.h:202
static void tcp_cc_congestion_undo(tcp_connection_t *tc)
Definition: tcp_input.c:689
int session_enqueue_stream_connection(transport_connection_t *tc, vlib_buffer_t *b, u32 offset, u8 queue_event, u8 is_in_order)
Definition: session.c:460
u64 session_lookup_half_open_handle(transport_connection_t *tc)
vlib_error_t * errors
Vector of errors for this node.
Definition: node.h:472
format_function_t format_tcp_flags
Definition: tcp.h:350
static u8 tcp_is_descheduled(tcp_connection_t *tc)
Definition: tcp_inlines.h:382
struct _tcp_header tcp_header_t
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:209
ip6_address_t src_address
Definition: ip6_packet.h:310
#define tcp_in_cong_recovery(tc)
Definition: tcp_types.h:423
u32 * pending_deq_acked
vector of pending ack dequeues
Definition: tcp.h:83
unsigned char u8
Definition: types.h:56
#define tcp_inc_counter(node_id, err, count)
Definition: tcp_input.c:1481
vlib_node_registration_t tcp6_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp6_syn_sent_node)
Definition: tcp_input.c:2104
static tcp_connection_t * tcp_lookup_connection(u32 fib_index, vlib_buffer_t *b, u8 thread_index, u8 is_ip4)
Lookup transport connection.
Definition: tcp_input.c:1707
double f64
Definition: types.h:142
void session_transport_closing_notify(transport_connection_t *tc)
Notification from transport that connection is being closed.
Definition: session.c:945
#define tcp_is_fin(_th)
Definition: tcp_packet.h:90
static u8 * format_tcp_rx_trace(u8 *s, va_list *args)
Definition: tcp_input.c:1384
#define timestamp_leq(_t1, _t2)
Definition: tcp_packet.h:185
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
Definition: tcp.c:669
#define tcp_cfg
Definition: tcp.h:273
static void tcp_persist_timer_reset(tcp_timer_wheel_t *tw, tcp_connection_t *tc)
Definition: tcp_timer.h:97
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:173
vlib_node_registration_t tcp4_established_node
(constructor) VLIB_REGISTER_NODE (tcp4_established_node)
Definition: tcp_input.c:1601
static int tcp_options_parse(tcp_header_t *th, tcp_options_t *to, u8 is_syn)
Parse TCP header options.
Definition: tcp_packet.h:196
void tcp_bt_sample_delivery_rate(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Generate a delivery rate sample from recently acked bytes.
Definition: tcp_bt.c:592
vl_api_ip6_address_t ip6
Definition: one.api:424
ip4_address_t dst_address
Definition: ip4_packet.h:170
#define seq_leq(_s1, _s2)
Definition: tcp_packet.h:178
#define TCP_FLAG_ACK
Definition: fa_node.h:16
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:129
transport_connection_t * session_lookup_connection_wt4(u32 fib_index, ip4_address_t *lcl, ip4_address_t *rmt, u16 lcl_port, u16 rmt_port, u8 proto, u32 thread_index, u8 *result)
Lookup connection with ip4 and transport layer information.
static void tcp_handle_rst(tcp_connection_t *tc)
Definition: tcp_input.c:157
vnet_hw_interface_flags_t flags
Definition: interface.h:526
#define vlib_prefetch_buffer_header(b, type)
Prefetch buffer metadata.
Definition: buffer.h:203
static int tcp_segment_validate(tcp_worker_ctx_t *wrk, tcp_connection_t *tc0, vlib_buffer_t *b0, tcp_header_t *th0, u32 *error0)
Validate incoming segment as per RFC793 p.
Definition: tcp_input.c:258
#define tcp_fastrecovery_off(tc)
Definition: tcp_types.h:410
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
Definition: tcp_input.c:3042
static u8 tcp_ack_is_dupack(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una)
Check if duplicate ack as per RFC5681 Sec.
Definition: tcp_input.c:959
static u32 ooo_segment_length(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:658
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:241
unsigned int u32
Definition: types.h:88
static sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
Definition: tcp_sack.h:59
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp_inlines.h:22
#define vlib_call_init_function(vm, x)
Definition: init.h:270
static void tcp_node_inc_counter_i(vlib_main_t *vm, u32 tcp4_node, u32 tcp6_node, u8 is_ip4, u32 evt, u32 val)
Definition: tcp_input.c:1465
#define VLIB_FRAME_SIZE
Definition: node.h:380
static void tcp_cc_init_congestion(tcp_connection_t *tc)
Init loss recovery/fast recovery.
Definition: tcp_input.c:663
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:356
#define tcp_fastrecovery_on(tc)
Definition: tcp_types.h:409
static void tcp_retransmit_timer_update(tcp_timer_wheel_t *tw, tcp_connection_t *tc)
Definition: tcp_timer.h:103
static void tcp_timer_set(tcp_timer_wheel_t *tw, tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp_timer.h:21
static void tcp_cc_recovered(tcp_connection_t *tc)
Definition: tcp_cc.h:48
static void svm_fifo_newest_ooo_segment_reset(svm_fifo_t *f)
Definition: svm_fifo.h:642
static heap_elt_t * first(heap_header_t *h)
Definition: heap.c:59
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:136
static void tcp_retransmit_timer_reset(tcp_timer_wheel_t *tw, tcp_connection_t *tc)
Definition: tcp_timer.h:62
The identity of a DPO is a combination of its type and its instance number/index of objects of that t...
Definition: dpo.h:170
static u8 tcp_should_fastrecover(tcp_connection_t *tc, u8 has_sack)
Definition: tcp_input.c:722
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
Definition: tcp_sack.c:544
vlib_main_t * vm
convenience pointer to this thread&#39;s vlib main
Definition: tcp.h:92
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp_inlines.h:67
static void tcp_program_dequeue(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:597
void tcp_send_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1011
static void tcp_handle_disconnects(tcp_worker_ctx_t *wrk)
Definition: tcp_input.c:1097
static uword tcp46_listen_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
LISTEN state processing as per RFC 793 p.
Definition: tcp_input.c:2569
void tcp_connection_tx_pacer_reset(tcp_connection_t *tc, u32 window, u32 start_bucket)
Definition: tcp.c:1196
static void tcp_input_set_error_next(tcp_main_t *tm, u16 *next, u32 *error, u8 is_ip4)
Definition: tcp_input.c:2790
tcp_connection_t * tcp_connection_alloc_w_base(u8 thread_index, tcp_connection_t *base)
Definition: tcp.c:309
static const dpo_id_t * load_balance_get_bucket_i(const load_balance_t *lb, u32 bucket)
Definition: load_balance.h:229
format_function_t format_tcp_connection_id
Definition: tcp.h:354
vlib_node_registration_t tcp4_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_nolookup_node)
Definition: tcp_input.c:2968
unsigned short u16
Definition: types.h:57
#define TCP_DUPACK_THRESHOLD
Definition: tcp_types.h:35
#define foreach_tcp4_input_next
Definition: tcp_input.c:2747
tcp_connection_t * tcp_connection_alloc(u8 thread_index)
Definition: tcp.c:296
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:229
#define filter_flags
Definition: tcp_input.c:2765
void tcp_connection_tx_pacer_update(tcp_connection_t *tc)
Definition: tcp.c:1183
static int tcp_buffer_discard_bytes(vlib_buffer_t *b, u32 n_bytes_to_drop)
Definition: tcp_input.c:1276
#define TCP_PAWS_IDLE
24 days
Definition: tcp_types.h:28
static void tcp_check_tx_offload(tcp_connection_t *tc, int is_ipv4)
Definition: tcp_input.c:1778
#define foreach_tcp6_input_next
Definition: tcp_input.c:2756
The FIB DPO provieds;.
Definition: load_balance.h:106
tcp_timer_wheel_t timer_wheel
worker timer wheel
Definition: tcp.h:118
static void tcp_input_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t **bs, u32 n_bufs, u8 is_ip4)
Definition: tcp_input.c:2768
int ip6_address_compare(ip6_address_t *a1, ip6_address_t *a2)
Definition: ip46_cli.c:60
#define PREDICT_FALSE(x)
Definition: clib.h:118
#define always_inline
Definition: ipsec.h:28
static int tcp_rcv_ack_no_cc(tcp_connection_t *tc, vlib_buffer_t *b, u32 *error)
Definition: tcp_input.c:400
vl_api_ip4_address_t ip4
Definition: one.api:376
#define TCP_FLAG_FIN
Definition: fa_node.h:12
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
Definition: tcp_inlines.h:183
static void tcp_cc_handle_event(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 is_dack)
One function to rule them all ...
Definition: tcp_input.c:826
vlib_node_registration_t tcp4_listen_node
(constructor) VLIB_REGISTER_NODE (tcp4_listen_node)
Definition: tcp_input.c:2698
vlib_main_t * vm
Definition: in2out_ed.c:1599
static ooo_segment_t * svm_fifo_newest_ooo_segment(svm_fifo_t *f)
Definition: svm_fifo.h:634
static void tcp_cc_rcv_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_cc.h:22
vlib_node_registration_t tcp6_established_node
(constructor) VLIB_REGISTER_NODE (tcp6_established_node)
Definition: tcp_input.c:1620
static int tcp_can_delack(tcp_connection_t *tc)
Check if ACK could be delayed.
Definition: tcp_input.c:1260
static void vlib_node_increment_counter(vlib_main_t *vm, u32 node_index, u32 counter_index, u64 increment)
Definition: node_funcs.h:1150
static int tcp_cc_recover(tcp_connection_t *tc)
Definition: tcp_input.c:750
#define TCP_FLAG_RST
Definition: fa_node.h:14
#define TCP_DBG(_fmt, _args...)
Definition: tcp_debug.h:146
#define tcp_recovery_off(tc)
Definition: tcp_types.h:412
static int tcp_rcv_ack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *th, u32 *error)
Process incoming ACK.
Definition: tcp_input.c:987
void tcp_program_cleanup(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp.c:335
void tcp_connection_free(tcp_connection_t *tc)
Definition: tcp.c:322
static void tcp_program_reset_ntf(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:190
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:169
vlib_node_registration_t tcp4_syn_sent_node
(constructor) VLIB_REGISTER_NODE (tcp4_syn_sent_node)
Definition: tcp_input.c:2085
u32 flags
Definition: vhost_user.h:248
u16 n_vectors
Definition: node.h:399
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:80
int ip4_address_compare(ip4_address_t *a1, ip4_address_t *a2)
Definition: ip46_cli.c:53
static_always_inline void vlib_buffer_enqueue_to_next(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts, uword count)
Definition: buffer_node.h:339
#define tcp_disconnect_pending(tc)
Definition: tcp_types.h:416
static void tcp_set_rx_trace_data(tcp_rx_trace_t *t0, tcp_connection_t *tc0, tcp_header_t *th0, vlib_buffer_t *b0, u8 is_ip4)
Definition: tcp_input.c:1415
void tcp_program_dupack(tcp_connection_t *tc)
Definition: tcp_output.c:1040
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:317
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
Definition: tcp_output.c:742
format_function_t format_tcp_state
Definition: tcp.h:349
static void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_inlines.h:375
#define clib_warning(format, args...)
Definition: error.h:59
u8 data[]
Packet data.
Definition: buffer.h:181
#define TCP_RTO_MIN
Definition: tcp_types.h:86
#define tcp_in_recovery(tc)
Definition: tcp_types.h:414
Don&#39;t register connection in lookup.
tcp_header_t tcp_header
Definition: tcp_input.c:1379
format_function_t format_tcp_header
Definition: format.h:100
struct _transport_connection transport_connection_t
f64 rtt_time
RTT for sample.
Definition: tcp_types.h:225
static void tcp_cc_undo_recovery(tcp_connection_t *tc)
Definition: tcp_cc.h:54
#define TCP_RTT_MAX
Definition: tcp_types.h:87
#define ARRAY_LEN(x)
Definition: clib.h:66
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp_inlines.h:171
vlib_main_t vlib_node_runtime_t * node
Definition: in2out_ed.c:1599
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:371
static u32 transport_max_tx_dequeue(transport_connection_t *tc)
Definition: session.h:503
void tcp_send_synack(tcp_connection_t *tc)
Definition: tcp_output.c:833
#define seq_geq(_s1, _s2)
Definition: tcp_packet.h:180
#define ASSERT(truth)
#define tcp_syn(_th)
Definition: tcp_packet.h:80
static clib_error_t * tcp_input_init(vlib_main_t *vm)
Definition: tcp_input.c:3293
static void tcp_estimate_rtt(tcp_connection_t *tc, u32 mrtt)
Compute smoothed RTT as per VJ&#39;s &#39;88 SIGCOMM and RFC6298.
Definition: tcp_input.c:433
u8 data[128]
Definition: ipsec_types.api:89
static int tcp_update_rtt(tcp_connection_t *tc, tcp_rate_sample_t *rs, u32 ack)
Update RTT estimate and RTO timer.
Definition: tcp_input.c:467
enum _tcp_rcv_process_next tcp_rcv_process_next_t
static load_balance_t * load_balance_get(index_t lbi)
Definition: load_balance.h:220
static void tcp_cc_update(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_input.c:802
static void tcp_handle_postponed_dequeues(tcp_worker_ctx_t *wrk)
Dequeue bytes for connections that have received acks in last burst.
Definition: tcp_input.c:552
static void tcp_cong_recovery_off(tcp_connection_t *tc)
Definition: tcp_types.h:427
static index_t ip4_fib_forwarding_lookup(u32 fib_index, const ip4_address_t *addr)
Definition: ip4_fib.h:160
static void tcp_estimate_initial_rtt(tcp_connection_t *tc)
Definition: tcp_input.c:521
static void vlib_buffer_advance(vlib_buffer_t *b, word l)
Advance current data pointer by the supplied (signed!) amount.
Definition: buffer.h:248
#define seq_gt(_s1, _s2)
Definition: tcp_packet.h:179
static int tcp_segment_check_paws(tcp_connection_t *tc)
RFC1323: Check against wrapped sequence numbers (PAWS).
Definition: tcp_input.c:128
static u8 tcp_cc_is_spurious_timeout_rxt(tcp_connection_t *tc)
Definition: tcp_input.c:699
static void tcp_established_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, u8 is_ip4)
Definition: tcp_input.c:1431
#define tcp_fastrecovery_first_on(tc)
Definition: tcp_types.h:420
enum _tcp_input_next tcp_input_next_t
int session_stream_accept_notify(transport_connection_t *tc)
Definition: session.c:1083
struct _sack_scoreboard_hole sack_scoreboard_hole_t
static u8 tcp_segment_in_rcv_wnd(tcp_connection_t *tc, u32 seq, u32 end_seq)
Validate segment sequence number.
Definition: tcp_input.c:112
#define clib_max(x, y)
Definition: clib.h:312
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
static clib_error_t * tcp_init(vlib_main_t *vm)
Definition: tcp.c:1440
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
u8 ip_is_zero(ip46_address_t *ip46_address, u8 is_ip4)
Definition: ip.c:20
u8 tcp_scoreboard_is_sane_post_recovery(tcp_connection_t *tc)
Test that scoreboard is sane after recovery.
Definition: tcp_sack.c:296
#define tcp_is_syn(_th)
Definition: tcp_packet.h:89
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:157
enum _tcp_syn_sent_next tcp_syn_sent_next_t
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u32 thread_index, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:657
static void tcp_update_snd_wnd(tcp_connection_t *tc, u32 seq, u32 ack, u32 snd_wnd)
Try to update snd_wnd based on feedback received from peer.
Definition: tcp_input.c:614
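RFC793 only lets a segment update the send window if it is not older than the segment that last updated it: the window is taken when SEG.SEQ > SND.WL1, or SEG.SEQ == SND.WL1 and SEG.ACK >= SND.WL2. A sketch of that rule using the seq_gt/seq_geq comparison macros (field names illustrative):

/* seq/ack/snd_wnd come from the incoming segment; wl1/wl2 record the
 * segment that last updated the send window. */
if (seq_gt (seq, tc->snd_wl1)
    || (seq == tc->snd_wl1 && seq_geq (ack, tc->snd_wl2)))
  {
    tc->snd_wnd = snd_wnd;
    tc->snd_wl1 = seq;
    tc->snd_wl2 = ack;
  }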
enum _tcp_established_next tcp_established_next_t
static void tcp_timer_update(tcp_timer_wheel_t *tw, tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp_timer.h:42
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
Definition: tcp_input.c:3022
void scoreboard_clear(sack_scoreboard_t *sb)
Definition: tcp_sack.c:257
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:860
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
enum _tcp_listen_next tcp_listen_next_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp_types.h:37
#define foreach_tcp_state_next
Definition: tcp_input.c:31
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:140
static tcp_connection_t * tcp_listener_get(u32 tli)
Definition: tcp_inlines.h:58
static tcp_worker_ctx_t * tcp_get_worker(u32 thread_index)
Definition: tcp.h:284
void session_transport_closed_notify(transport_connection_t *tc)
Notification from transport that it is closed.
Definition: session.c:1033
VLIB buffer representation.
Definition: buffer.h:102
static int tcp_session_enqueue_data(tcp_connection_t *tc, vlib_buffer_t *b, u16 data_len)
Enqueue data for delivery to application.
Definition: tcp_input.c:1154
static u8 tcp_should_fastrecover_sack(tcp_connection_t *tc)
Definition: tcp_input.c:714
u64 uword
Definition: types.h:112
int session_stream_connect_notify(transport_connection_t *tc, session_error_t err)
Definition: session.c:757
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:244
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
Definition: tcp.c:704
static void tcp_init_w_buffer(tcp_connection_t *tc, vlib_buffer_t *b, u8 is_ip4)
Initialize connection by gleaning network and rcv params from buffer.
Definition: tcp_inlines.h:330
session_t * session_lookup_listener6(u32 fib_index, ip6_address_t *lcl, u16 lcl_port, u8 proto, u8 use_wildcard)
static f64 tcp_time_now_us(u32 thread_index)
Definition: tcp_inlines.h:213
void scoreboard_init_rxt(sack_scoreboard_t *sb, u32 snd_una)
Definition: tcp_sack.c:235
static void tcp_connection_set_state(tcp_connection_t *tc, tcp_state_t state)
Definition: tcp_inlines.h:51
static tcp_connection_t * tcp_lookup_listener(vlib_buffer_t *b, u32 fib_index, int is_ip4)
Definition: tcp_input.c:1748
static u32 ooo_segment_offset_prod(svm_fifo_t *f, ooo_segment_t *s)
Definition: svm_fifo.h:648
struct clib_bihash_value offset
template key/value backing page structure
#define vnet_buffer(b)
Definition: buffer.h:417
static u8 tcp_lookup_is_valid(tcp_connection_t *tc, vlib_buffer_t *b, tcp_header_t *hdr)
Definition: tcp_input.c:1640
static u32 vlib_num_workers()
Definition: threads.h:376
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
Definition: tcp.c:242
void tcp_connection_del(tcp_connection_t *tc)
Connection removal.
Definition: tcp.c:289
f64 end
end of the time range
Definition: mactime.api:44
vlib_main_t vlib_node_runtime_t vlib_frame_t * frame
Definition: in2out_ed.c:1600
void tcp_reschedule(tcp_connection_t *tc)
Definition: tcp.c:1207
u16 flags
Copy of main node flags.
Definition: node.h:511
u32 session_tx_fifo_dequeue_drop(transport_connection_t *tc, u32 max_bytes)
Definition: session.c:561
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp_timer.h:117
void tcp_program_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1030
vlib_node_registration_t tcp6_listen_node
(constructor) VLIB_REGISTER_NODE (tcp6_listen_node)
Definition: tcp_input.c:2717
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:159
static u32 tcp_tstamp(tcp_connection_t *tc)
Generate timestamp for tcp connection.
Definition: tcp_inlines.h:206
static void tcp_cc_rcv_cong_ack(tcp_connection_t *tc, tcp_cc_ack_t ack_type, tcp_rate_sample_t *rs)
Definition: tcp_cc.h:29
int session_stream_accept(transport_connection_t *tc, u32 listener_index, u32 thread_index, u8 notify)
Accept a stream session.
Definition: session.c:1107
static_always_inline void vlib_get_buffers(vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, int count)
Translate array of buffer indices into buffer pointers.
Definition: buffer_funcs.h:280
#define VLIB_NODE_FLAG_TRACE
Definition: node.h:304
tcp_bts_flags_t flags
Rate sample flags from bt sample.
Definition: tcp_types.h:232
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:59
static transport_connection_t * transport_get_listener(transport_proto_t tp, u32 conn_index)
Definition: transport.h:156
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:167
static uword tcp46_input_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4, u8 is_nolookup)
Definition: tcp_input.c:2834
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
Definition: tcp_types.h:440
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:278
static uword tcp46_syn_sent_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Definition: tcp_input.c:1812
static uword tcp46_rcv_process_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, int is_ip4)
Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED as per RFC793 p...
Definition: tcp_input.c:2127
session_t * session_lookup_listener4(u32 fib_index, ip4_address_t *lcl, u16 lcl_port, u8 proto, u8 use_wildcard)
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:85
vlib_node_registration_t tcp6_input_nolookup_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_nolookup_node)
Definition: tcp_input.c:2988
static tcp_connection_t * tcp_input_lookup_buffer(vlib_buffer_t *b, u8 thread_index, u32 *error, u8 is_ip4, u8 is_nolookup)
Definition: tcp_inlines.h:227
static void tcp_handle_old_ack(tcp_connection_t *tc, tcp_rate_sample_t *rs)
Definition: tcp_input.c:939
#define tcp_ack(_th)
Definition: tcp_packet.h:83
#define seq_lt(_s1, _s2)
Definition: tcp_packet.h:177
static u32 transport_tx_fifo_size(transport_connection_t *tc)
Definition: session.h:524
transport_connection_t * session_lookup_half_open_connection(u64 handle, u8 proto, u8 is_ip4)
Definition: defs.h:46
ip6_address_t dst_address
Definition: ip6_packet.h:310
static u8 tcp_ack_is_cc_event(tcp_connection_t *tc, vlib_buffer_t *b, u32 prev_snd_wnd, u32 prev_snd_una, u8 *is_dack)
Checks if ack is a congestion control event.
Definition: tcp_input.c:972
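A duplicate ACK, in the classic sense, acknowledges nothing new, carries no payload, and leaves the advertised window unchanged; congestion control cares about those as well as about ACKs that cover retransmitted data. A minimal sketch of the duplicate-ACK part of that test (the prev_* values are snapshots taken before the ACK was processed; names are illustrative):

static inline int
is_dupack_sketch (u32 ack, u32 prev_snd_una, u32 seg_payload_len,
                  u32 snd_wnd, u32 prev_snd_wnd)
{
  /* Acks nothing new, carries no data, and leaves the window unchanged */
  return ack == prev_snd_una && seg_payload_len == 0
    && snd_wnd == prev_snd_wnd;
}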
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
Definition: tcp_sack.c:305
static char * tcp_error_strings[]
Definition: tcp_input.c:24
#define TCP_EVT(_evt, _args...)
Definition: tcp_debug.h:145
static void tcp_rcv_rst(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Handle reset packet.
Definition: tcp_input.c:207