FD.io VPP  v19.04.4-rc0-5-ge88582fac
Vector Packet Processing
tcp_output.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2019 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <vnet/tcp/tcp.h>
17 #include <math.h>
18 
19 typedef enum _tcp_output_next
20 {
27 
28 #define foreach_tcp4_output_next \
29  _ (DROP, "error-drop") \
30  _ (IP_LOOKUP, "ip4-lookup") \
31  _ (IP_REWRITE, "ip4-rewrite") \
32  _ (IP_ARP, "ip4-arp")
33 
34 #define foreach_tcp6_output_next \
35  _ (DROP, "error-drop") \
36  _ (IP_LOOKUP, "ip6-lookup") \
37  _ (IP_REWRITE, "ip6-rewrite") \
38  _ (IP_ARP, "ip6-discover-neighbor")
39 
40 static char *tcp_error_strings[] = {
41 #define tcp_error(n,s) s,
42 #include <vnet/tcp/tcp_error.def>
43 #undef tcp_error
44 };
45 
46 typedef struct
47 {
51 
52 static u8 *
53 format_tcp_tx_trace (u8 * s, va_list * args)
54 {
55  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
56  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
57  tcp_tx_trace_t *t = va_arg (*args, tcp_tx_trace_t *);
58  u32 indent = format_get_indent (s);
59 
60  s = format (s, "%U\n%U%U",
61  format_tcp_header, &t->tcp_header, 128,
62  format_white_space, indent,
64 
65  return s;
66 }
67 
68 #ifndef CLIB_MARCH_VARIANT
69 static u8
71 {
72  u8 wnd_scale = 0;
73  while (wnd_scale < TCP_MAX_WND_SCALE && (window >> wnd_scale) > TCP_WND_MAX)
74  wnd_scale++;
75  return wnd_scale;
76 }
77 
78 /**
79  * Update max segment size we're able to process.
80  *
81  * The value is constrained by our interface's MTU and IP options. It is
82  * also what we advertise to our peer.
83  */
84 void
86 {
87  /* TODO find our iface MTU */
88  tc->mss = tcp_main.default_mtu - sizeof (tcp_header_t);
89 }
90 
91 /**
92  * TCP's initial window
93  */
96 {
97  /* RFC 6928 recommends the value lower. However at the time our connections
98  * are initialized, fifos may not be allocated. Therefore, advertise the
99  * smallest possible unscaled window size and update once fifos are
100  * assigned to the session.
101  */
102  /*
103  tcp_update_rcv_mss (tc);
104  TCP_IW_N_SEGMENTS * tc->mss;
105  */
106  return TCP_MIN_RX_FIFO_SIZE;
107 }
108 
109 /**
110  * Compute initial window and scale factor. As per RFC1323, window field in
111  * SYN and SYN-ACK segments is never scaled.
112  */
113 u32
115 {
116  tcp_main_t *tm = &tcp_main;
117  u32 max_fifo;
118 
119  /* Initial wnd for SYN. Fifos are not allocated yet.
120  * Use some predefined value. For SYN-ACK we still want the
121  * scale to be computed in the same way */
122  max_fifo = tm->max_rx_fifo ? tm->max_rx_fifo : TCP_MAX_RX_FIFO_SIZE;
123 
124  /* Compute rcv wscale only if peer advertised support for it */
125  if (tc->state != TCP_STATE_SYN_RCVD || tcp_opts_wscale (&tc->rcv_opts))
126  tc->rcv_wscale = tcp_window_compute_scale (max_fifo);
127 
128  tc->rcv_wnd = tcp_initial_wnd_unscaled (tc);
129 
130  return clib_min (tc->rcv_wnd, TCP_WND_MAX);
131 }
132 
133 static inline void
135 {
136  u32 available_space, wnd;
137  i32 observed_wnd;
138 
139  ASSERT (tc->rcv_opts.mss < transport_rx_fifo_size (&tc->connection));
140 
141  /*
142  * Figure out how much space we have available
143  */
144  available_space = transport_max_rx_enqueue (&tc->connection);
145  if (PREDICT_FALSE (available_space < tc->rcv_opts.mss))
146  available_space = 0;
147 
148  /*
149  * Use the above and what we know about what we've previously advertised
150  * to compute the new window
151  */
152  observed_wnd = (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
153 
154  /* Bad. Thou shalt not shrink */
155  if (PREDICT_FALSE ((i32) available_space < observed_wnd))
156  {
157  wnd = clib_max (observed_wnd, 0);
158  TCP_EVT_DBG (TCP_EVT_RCV_WND_SHRUNK, tc, observed_wnd, available_space);
159  }
160  else
161  {
162  wnd = available_space;
163  }
164 
165  /* Make sure we have a multiple of rcv_wscale */
166  if (wnd && tc->rcv_wscale)
167  {
168  wnd &= ~((1 << tc->rcv_wscale) - 1);
169  if (wnd == 0)
170  wnd = 1 << tc->rcv_wscale;
171  }
172 
173  tc->rcv_wnd = clib_min (wnd, TCP_WND_MAX << tc->rcv_wscale);
174 }
175 
176 /**
177  * Compute and return window to advertise, scaled as per RFC1323
178  */
179 static inline u32
181 {
182  if (state < TCP_STATE_ESTABLISHED)
184 
185  tcp_update_rcv_wnd (tc);
186  return tc->rcv_wnd >> tc->rcv_wscale;
187 }
188 
189 /**
190  * Write TCP options to segment.
191  */
192 static u32
194 {
195  u32 opts_len = 0;
196  u32 buf, seq_len = 4;
197 
198  if (tcp_opts_mss (opts))
199  {
200  *data++ = TCP_OPTION_MSS;
201  *data++ = TCP_OPTION_LEN_MSS;
202  buf = clib_host_to_net_u16 (opts->mss);
203  clib_memcpy_fast (data, &buf, sizeof (opts->mss));
204  data += sizeof (opts->mss);
205  opts_len += TCP_OPTION_LEN_MSS;
206  }
207 
208  if (tcp_opts_wscale (opts))
209  {
210  *data++ = TCP_OPTION_WINDOW_SCALE;
211  *data++ = TCP_OPTION_LEN_WINDOW_SCALE;
212  *data++ = opts->wscale;
213  opts_len += TCP_OPTION_LEN_WINDOW_SCALE;
214  }
215 
216  if (tcp_opts_sack_permitted (opts))
217  {
218  *data++ = TCP_OPTION_SACK_PERMITTED;
220  opts_len += TCP_OPTION_LEN_SACK_PERMITTED;
221  }
222 
223  if (tcp_opts_tstamp (opts))
224  {
225  *data++ = TCP_OPTION_TIMESTAMP;
226  *data++ = TCP_OPTION_LEN_TIMESTAMP;
227  buf = clib_host_to_net_u32 (opts->tsval);
228  clib_memcpy_fast (data, &buf, sizeof (opts->tsval));
229  data += sizeof (opts->tsval);
230  buf = clib_host_to_net_u32 (opts->tsecr);
231  clib_memcpy_fast (data, &buf, sizeof (opts->tsecr));
232  data += sizeof (opts->tsecr);
233  opts_len += TCP_OPTION_LEN_TIMESTAMP;
234  }
235 
236  if (tcp_opts_sack (opts))
237  {
238  int i;
239 
240  if (opts->n_sack_blocks != 0)
241  {
242  *data++ = TCP_OPTION_SACK_BLOCK;
243  *data++ = 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
244  for (i = 0; i < opts->n_sack_blocks; i++)
245  {
246  buf = clib_host_to_net_u32 (opts->sacks[i].start);
247  clib_memcpy_fast (data, &buf, seq_len);
248  data += seq_len;
249  buf = clib_host_to_net_u32 (opts->sacks[i].end);
250  clib_memcpy_fast (data, &buf, seq_len);
251  data += seq_len;
252  }
253  opts_len += 2 + opts->n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK;
254  }
255  }
256 
257  /* Terminate TCP options */
258  if (opts_len % 4)
259  {
260  *data++ = TCP_OPTION_EOL;
261  opts_len += TCP_OPTION_LEN_EOL;
262  }
263 
264  /* Pad with zeroes to a u32 boundary */
265  while (opts_len % 4)
266  {
267  *data++ = TCP_OPTION_NOOP;
268  opts_len += TCP_OPTION_LEN_NOOP;
269  }
270  return opts_len;
271 }
272 
273 static int
275 {
276  u8 len = 0;
277 
278  opts->flags |= TCP_OPTS_FLAG_MSS;
279  opts->mss = tcp_main.default_mtu; /*XXX discover that */
280  len += TCP_OPTION_LEN_MSS;
281 
282  opts->flags |= TCP_OPTS_FLAG_WSCALE;
283  opts->wscale = wnd_scale;
285 
286  opts->flags |= TCP_OPTS_FLAG_TSTAMP;
287  opts->tsval = tcp_time_now ();
288  opts->tsecr = 0;
290 
291  if (TCP_USE_SACKS)
292  {
293  opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
295  }
296 
297  /* Align to needed boundary */
298  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
299  return len;
300 }
301 
302 static int
304 {
305  u8 len = 0;
306 
307  opts->flags |= TCP_OPTS_FLAG_MSS;
308  opts->mss = tc->mss;
309  len += TCP_OPTION_LEN_MSS;
310 
311  if (tcp_opts_wscale (&tc->rcv_opts))
312  {
313  opts->flags |= TCP_OPTS_FLAG_WSCALE;
314  opts->wscale = tc->rcv_wscale;
316  }
317 
318  if (tcp_opts_tstamp (&tc->rcv_opts))
319  {
320  opts->flags |= TCP_OPTS_FLAG_TSTAMP;
321  opts->tsval = tcp_time_now ();
322  opts->tsecr = tc->tsval_recent;
324  }
325 
326  if (tcp_opts_sack_permitted (&tc->rcv_opts))
327  {
328  opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
330  }
331 
332  /* Align to needed boundary */
333  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
334  return len;
335 }
336 
337 static int
339 {
340  u8 len = 0;
341 
342  opts->flags = 0;
343 
344  if (tcp_opts_tstamp (&tc->rcv_opts))
345  {
346  opts->flags |= TCP_OPTS_FLAG_TSTAMP;
347  opts->tsval = tcp_time_now_w_thread (tc->c_thread_index);
348  opts->tsecr = tc->tsval_recent;
350  }
351  if (tcp_opts_sack_permitted (&tc->rcv_opts))
352  {
353  if (vec_len (tc->snd_sacks))
354  {
355  opts->flags |= TCP_OPTS_FLAG_SACK;
356  if (tc->snd_sack_pos >= vec_len (tc->snd_sacks))
357  tc->snd_sack_pos = 0;
358  opts->sacks = &tc->snd_sacks[tc->snd_sack_pos];
359  opts->n_sack_blocks = vec_len (tc->snd_sacks) - tc->snd_sack_pos;
360  opts->n_sack_blocks = clib_min (opts->n_sack_blocks,
362  tc->snd_sack_pos += opts->n_sack_blocks;
363  len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks;
364  }
365  }
366 
367  /* Align to needed boundary */
368  len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
369  return len;
370 }
371 
372 always_inline int
375 {
376  switch (state)
377  {
378  case TCP_STATE_ESTABLISHED:
379  case TCP_STATE_CLOSE_WAIT:
380  case TCP_STATE_FIN_WAIT_1:
381  case TCP_STATE_LAST_ACK:
382  case TCP_STATE_CLOSING:
383  case TCP_STATE_FIN_WAIT_2:
384  case TCP_STATE_TIME_WAIT:
385  case TCP_STATE_CLOSED:
386  return tcp_make_established_options (tc, opts);
387  case TCP_STATE_SYN_RCVD:
388  return tcp_make_synack_options (tc, opts);
389  case TCP_STATE_SYN_SENT:
390  return tcp_make_syn_options (opts, tc->rcv_wscale);
391  default:
392  clib_warning ("State not handled! %d", state);
393  return 0;
394  }
395 }
396 
397 /**
398  * Update burst send vars
399  *
400  * - Updates snd_mss to reflect the effective segment size that we can send
401  * by taking into account all TCP options, including SACKs.
402  * - Cache 'on the wire' options for reuse
403  * - Updates receive window which can be reused for a burst.
404  *
405  * This should *only* be called when doing bursts
406  */
407 void
409 {
410  tcp_main_t *tm = &tcp_main;
411 
412  /* Compute options to be used for connection. These may be reused when
413  * sending data or to compute the effective mss (snd_mss) */
414  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts,
415  TCP_STATE_ESTABLISHED);
416 
417  /* XXX check if MTU has been updated */
418  tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len;
419  ASSERT (tc->snd_mss > 0);
420 
421  tcp_options_write (tm->wrk_ctx[tc->c_thread_index].cached_opts,
422  &tc->snd_opts);
423 
424  tcp_update_rcv_wnd (tc);
425 }
426 
427 void
429 {
430  u16 default_min_mss = 536;
431  tcp_update_rcv_mss (tc);
432 
433  /* TODO cache mss and consider PMTU discovery */
434  tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss);
435 
436  if (tc->snd_mss < 45)
437  {
438  /* Assume that at least the min default mss works */
439  tc->snd_mss = default_min_mss;
440  tc->rcv_opts.mss = default_min_mss;
441  }
442 
443  /* We should have enough space for 40 bytes of options */
444  ASSERT (tc->snd_mss > 45);
445 
446  /* If we use timestamp option, account for it */
447  if (tcp_opts_tstamp (&tc->rcv_opts))
448  tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP;
449 }
450 #endif /* CLIB_MARCH_VARIANT */
451 
452 static void *
454 {
455  if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
457  /* Zero all flags but free list index and trace flag */
458  b->flags &= VLIB_BUFFER_NEXT_PRESENT - 1;
459  b->current_data = 0;
460  b->current_length = 0;
462  vnet_buffer (b)->tcp.flags = 0;
463 
464  /* Leave enough space for headers */
466 }
467 
468 #ifndef CLIB_MARCH_VARIANT
469 static void *
471 {
472  ASSERT ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
473  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
475  b->current_data = 0;
476  vnet_buffer (b)->tcp.flags = 0;
478  /* Leave enough space for headers */
480 }
481 
482 /**
483  * Prepare ACK
484  */
485 static inline void
487  u8 flags)
488 {
489  tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
490  u8 tcp_opts_len, tcp_hdr_opts_len;
491  tcp_header_t *th;
492  u16 wnd;
493 
494  wnd = tcp_window_to_advertise (tc, state);
495 
496  /* Make and write options */
497  tcp_opts_len = tcp_make_established_options (tc, snd_opts);
498  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
499 
500  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
501  tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd);
502 
503  tcp_options_write ((u8 *) (th + 1), snd_opts);
504  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
505 }
506 
507 /**
508  * Convert buffer to ACK
509  */
510 static inline void
512 {
513  tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK);
514  TCP_EVT_DBG (TCP_EVT_ACK_SENT, tc);
515  tc->rcv_las = tc->rcv_nxt;
516 }
517 
518 /**
519  * Convert buffer to FIN-ACK
520  */
521 void
523 {
524  tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK);
525 }
526 
527 /**
528  * Convert buffer to SYN
529  */
530 void
532 {
533  u8 tcp_hdr_opts_len, tcp_opts_len;
534  tcp_header_t *th;
535  u16 initial_wnd;
536  tcp_options_t snd_opts;
537 
538  initial_wnd = tcp_initial_window_to_advertise (tc);
539 
540  /* Make and write options */
541  clib_memset (&snd_opts, 0, sizeof (snd_opts));
542  tcp_opts_len = tcp_make_syn_options (&snd_opts, tc->rcv_wscale);
543  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
544 
545  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
546  tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN,
547  initial_wnd);
548  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
549  tcp_options_write ((u8 *) (th + 1), &snd_opts);
550 }
551 
552 /**
553  * Convert buffer to SYN-ACK
554  */
555 void
557 {
558  tcp_options_t _snd_opts, *snd_opts = &_snd_opts;
559  u8 tcp_opts_len, tcp_hdr_opts_len;
560  tcp_header_t *th;
561  u16 initial_wnd;
562 
563  clib_memset (snd_opts, 0, sizeof (*snd_opts));
564  initial_wnd = tcp_initial_window_to_advertise (tc);
565  tcp_opts_len = tcp_make_synack_options (tc, snd_opts);
566  tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t);
567 
568  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss,
569  tc->rcv_nxt, tcp_hdr_opts_len,
570  TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd);
571  tcp_options_write ((u8 *) (th + 1), snd_opts);
572 
573  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
574 }
575 
576 always_inline void
578  u8 is_ip4, u32 fib_index, u8 flush)
579 {
580  vlib_main_t *vm = wrk->vm;
581  u32 *to_next, next_index;
582  vlib_frame_t *f;
583 
584  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
585  b->error = 0;
586 
587  vnet_buffer (b)->sw_if_index[VLIB_TX] = fib_index;
588  vnet_buffer (b)->sw_if_index[VLIB_RX] = 0;
589 
590  /* Send to IP lookup */
591  next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
593 
594  f = wrk->ip_lookup_tx_frames[!is_ip4];
595  if (!f)
596  {
597  f = vlib_get_frame_to_node (vm, next_index);
598  ASSERT (f);
599  wrk->ip_lookup_tx_frames[!is_ip4] = f;
600  }
601 
602  to_next = vlib_frame_vector_args (f);
603  to_next[f->n_vectors] = bi;
604  f->n_vectors += 1;
605  if (flush || f->n_vectors == VLIB_FRAME_SIZE)
606  {
607  vlib_put_frame_to_node (vm, next_index, f);
608  wrk->ip_lookup_tx_frames[!is_ip4] = 0;
609  }
610 }
611 
612 static void
614  u32 bi, u8 is_ip4, u32 fib_index)
615 {
616  tcp_enqueue_to_ip_lookup_i (wrk, b, bi, is_ip4, fib_index, 1);
617 }
618 
619 static void
621  u8 is_ip4, u32 fib_index)
622 {
623  tcp_enqueue_to_ip_lookup_i (wrk, b, bi, is_ip4, fib_index, 0);
624  if (wrk->vm->thread_index == 0 && vlib_num_workers ())
626 }
627 
628 always_inline void
630  u8 is_ip4, u8 flush)
631 {
632  u32 *to_next, next_index;
633  vlib_frame_t *f;
634 
635  b->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
636  b->error = 0;
637 
638  /* Decide where to send the packet */
639  next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
641 
642  /* Get frame to v4/6 output node */
643  f = wrk->tx_frames[!is_ip4];
644  if (!f)
645  {
646  f = vlib_get_frame_to_node (wrk->vm, next_index);
647  ASSERT (f);
648  wrk->tx_frames[!is_ip4] = f;
649  }
650  to_next = vlib_frame_vector_args (f);
651  to_next[f->n_vectors] = bi;
652  f->n_vectors += 1;
653  if (flush || f->n_vectors == VLIB_FRAME_SIZE)
654  {
655  vlib_put_frame_to_node (wrk->vm, next_index, f);
656  wrk->tx_frames[!is_ip4] = 0;
657  }
658 }
659 
660 static void
662  u8 is_ip4)
663 {
664  tcp_enqueue_to_output_i (wrk, b, bi, is_ip4, 0);
665 }
666 
667 static void
669  u8 is_ip4)
670 {
671  tcp_enqueue_to_output_i (wrk, b, bi, is_ip4, 1);
672 }
673 #endif /* CLIB_MARCH_VARIANT */
674 
675 static int
677  tcp_state_t state, u8 thread_index, u8 is_ip4)
678 {
679  ip4_header_t *ih4;
680  ip6_header_t *ih6;
681  tcp_header_t *th0;
682  ip4_address_t src_ip40, dst_ip40;
683  ip6_address_t src_ip60, dst_ip60;
685  u32 tmp;
686  u32 seq, ack;
687  u8 flags;
688 
689  /* Find IP and TCP headers */
690  th0 = tcp_buffer_hdr (b0);
691 
692  /* Save src and dst ip */
693  if (is_ip4)
694  {
695  ih4 = vlib_buffer_get_current (b0);
696  ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40);
697  src_ip40.as_u32 = ih4->src_address.as_u32;
698  dst_ip40.as_u32 = ih4->dst_address.as_u32;
699  }
700  else
701  {
702  ih6 = vlib_buffer_get_current (b0);
703  ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60);
704  clib_memcpy_fast (&src_ip60, &ih6->src_address, sizeof (ip6_address_t));
705  clib_memcpy_fast (&dst_ip60, &ih6->dst_address, sizeof (ip6_address_t));
706  }
707 
708  src_port = th0->src_port;
709  dst_port = th0->dst_port;
710 
711  /* Try to determine what/why we're actually resetting */
712  if (state == TCP_STATE_CLOSED)
713  {
714  if (!tcp_syn (th0))
715  return -1;
716 
717  tmp = clib_net_to_host_u32 (th0->seq_number);
718 
719  /* Got a SYN for no listener. */
720  flags = TCP_FLAG_RST | TCP_FLAG_ACK;
721  ack = clib_host_to_net_u32 (tmp + 1);
722  seq = 0;
723  }
724  else
725  {
726  flags = TCP_FLAG_RST;
727  seq = th0->ack_number;
728  ack = 0;
729  }
730 
731  tcp_reuse_buffer (vm, b0);
732  tcp_trajectory_add_start (b0, 4);
733  th0 = vlib_buffer_push_tcp_net_order (b0, dst_port, src_port, seq, ack,
734  sizeof (tcp_header_t), flags, 0);
735 
736  if (is_ip4)
737  {
738  ih4 = vlib_buffer_push_ip4 (vm, b0, &dst_ip40, &src_ip40,
739  IP_PROTOCOL_TCP, 1);
740  th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4);
741  }
742  else
743  {
744  int bogus = ~0;
745  ih6 = vlib_buffer_push_ip6 (vm, b0, &dst_ip60, &src_ip60,
746  IP_PROTOCOL_TCP);
747  th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus);
748  ASSERT (!bogus);
749  }
750 
751  return 0;
752 }
753 
754 #ifndef CLIB_MARCH_VARIANT
755 /**
756  * Send reset without reusing existing buffer
757  *
758  * It extracts connection info out of original packet
759  */
760 void
762  u32 thread_index, u8 is_ip4)
763 {
764  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
765  vlib_main_t *vm = wrk->vm;
766  vlib_buffer_t *b;
767  u32 bi, sw_if_index, fib_index;
768  u8 tcp_hdr_len, flags = 0;
769  tcp_header_t *th, *pkt_th;
770  u32 seq, ack;
771  ip4_header_t *ih4, *pkt_ih4;
772  ip6_header_t *ih6, *pkt_ih6;
773  fib_protocol_t fib_proto;
774 
775  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
776  return;
777 
778  b = vlib_get_buffer (vm, bi);
779  sw_if_index = vnet_buffer (pkt)->sw_if_index[VLIB_RX];
780  fib_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
781  fib_index = fib_table_get_index_for_sw_if_index (fib_proto, sw_if_index);
782  tcp_init_buffer (vm, b);
783 
784  /* Make and write options */
785  tcp_hdr_len = sizeof (tcp_header_t);
786 
787  if (is_ip4)
788  {
789  pkt_ih4 = vlib_buffer_get_current (pkt);
790  pkt_th = ip4_next_header (pkt_ih4);
791  }
792  else
793  {
794  pkt_ih6 = vlib_buffer_get_current (pkt);
795  pkt_th = ip6_next_header (pkt_ih6);
796  }
797 
798  if (tcp_ack (pkt_th))
799  {
800  flags = TCP_FLAG_RST;
801  seq = pkt_th->ack_number;
802  ack = (tc->state >= TCP_STATE_SYN_RCVD) ? tc->rcv_nxt : 0;
803  }
804  else
805  {
806  flags = TCP_FLAG_RST | TCP_FLAG_ACK;
807  seq = 0;
808  ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end);
809  }
810 
811  th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port,
812  seq, ack, tcp_hdr_len, flags, 0);
813 
814  /* Swap src and dst ip */
815  if (is_ip4)
816  {
817  ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40);
818  ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address,
819  &pkt_ih4->src_address, IP_PROTOCOL_TCP, 1);
820  th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
821  }
822  else
823  {
824  int bogus = ~0;
825  ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) ==
826  0x60);
827  ih6 = vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address,
828  &pkt_ih6->src_address, IP_PROTOCOL_TCP);
829  th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
830  ASSERT (!bogus);
831  }
832 
833  tcp_enqueue_to_ip_lookup_now (wrk, b, bi, is_ip4, fib_index);
834  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
835 }
836 
837 /**
838  * Build and set reset packet for connection
839  */
840 void
842 {
843  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
844  vlib_main_t *vm = wrk->vm;
845  vlib_buffer_t *b;
846  u32 bi;
847  tcp_header_t *th;
848  u16 tcp_hdr_opts_len, advertise_wnd, opts_write_len;
849  u8 flags;
850 
851  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
852  return;
853  b = vlib_get_buffer (vm, bi);
854  tcp_init_buffer (vm, b);
855 
856  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
857  tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
858  advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED);
859  flags = TCP_FLAG_RST;
860  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
861  tc->rcv_nxt, tcp_hdr_opts_len, flags,
862  advertise_wnd);
863  opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
864  ASSERT (opts_write_len == tc->snd_opts_len);
865  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
866  if (tc->c_is_ip4)
867  {
868  ip4_header_t *ih4;
869  ih4 = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip.ip4,
870  &tc->c_rmt_ip.ip4, IP_PROTOCOL_TCP, 0);
871  th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4);
872  }
873  else
874  {
875  int bogus = ~0;
876  ip6_header_t *ih6;
877  ih6 = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip.ip6,
878  &tc->c_rmt_ip.ip6, IP_PROTOCOL_TCP);
879  th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus);
880  ASSERT (!bogus);
881  }
882  tcp_enqueue_to_ip_lookup_now (wrk, b, bi, tc->c_is_ip4, tc->c_fib_index);
883  TCP_EVT_DBG (TCP_EVT_RST_SENT, tc);
884 }
885 
886 static void
888  vlib_buffer_t * b)
889 {
891  vlib_main_t *vm = wrk->vm;
892  if (tc->c_is_ip4)
893  {
894  ip4_header_t *ih;
895  ih = vlib_buffer_push_ip4 (vm, b, &tc->c_lcl_ip4,
896  &tc->c_rmt_ip4, IP_PROTOCOL_TCP, 1);
897  th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih);
898  }
899  else
900  {
901  ip6_header_t *ih;
902  int bogus = ~0;
903 
904  ih = vlib_buffer_push_ip6 (vm, b, &tc->c_lcl_ip6,
905  &tc->c_rmt_ip6, IP_PROTOCOL_TCP);
906  th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih, &bogus);
907  ASSERT (!bogus);
908  }
909 }
910 
911 /**
912  * Send SYN
913  *
914  * Builds a SYN packet for a half-open connection and sends it to ipx_lookup.
915  * The packet is not forwarded through tcpx_output to avoid doing lookups
916  * in the half_open pool.
917  */
918 void
920 {
921  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
922  vlib_main_t *vm = wrk->vm;
923  vlib_buffer_t *b;
924  u32 bi;
925 
926  /*
927  * Setup retransmit and establish timers before requesting buffer
928  * such that we can return if we've ran out.
929  */
930  tcp_timer_set (tc, TCP_TIMER_ESTABLISH_AO, TCP_ESTABLISH_TIME);
931  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
932  tc->rto * TCP_TO_TIMER_TICK);
933 
934  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
935  {
936  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1);
937  return;
938  }
939 
940  b = vlib_get_buffer (vm, bi);
941  tcp_init_buffer (vm, b);
942  tcp_make_syn (tc, b);
943 
944  /* Measure RTT with this */
945  tc->rtt_ts = tcp_time_now_us (vlib_num_workers ()? 1 : 0);
946  tc->rtt_seq = tc->snd_nxt;
947  tc->rto_boff = 0;
948 
949  tcp_push_ip_hdr (wrk, tc, b);
950  tcp_enqueue_to_ip_lookup (wrk, b, bi, tc->c_is_ip4, tc->c_fib_index);
951  TCP_EVT_DBG (TCP_EVT_SYN_SENT, tc);
952 }
953 
954 void
956 {
957  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
958  vlib_main_t *vm = wrk->vm;
959  vlib_buffer_t *b;
960  u32 bi;
961 
963 
964  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
965  {
966  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
967  return;
968  }
969 
970  tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
971  b = vlib_get_buffer (vm, bi);
972  tcp_init_buffer (vm, b);
973  tcp_make_synack (tc, b);
974  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
975  TCP_EVT_DBG (TCP_EVT_SYNACK_SENT, tc);
976 }
977 
978 /**
979  * Flush tx frame populated by retransmits and timer pops
980  */
981 void
983 {
984  if (wrk->tx_frames[!is_ip4])
985  {
986  u32 next_index;
987  next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
988  vlib_put_frame_to_node (wrk->vm, next_index, wrk->tx_frames[!is_ip4]);
989  wrk->tx_frames[!is_ip4] = 0;
990  }
991 }
992 
993 /**
994  * Flush ip lookup tx frames populated by timer pops
995  */
996 static void
998 {
999  if (wrk->ip_lookup_tx_frames[!is_ip4])
1000  {
1001  u32 next_index;
1002  next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index;
1003  vlib_put_frame_to_node (wrk->vm, next_index,
1004  wrk->ip_lookup_tx_frames[!is_ip4]);
1005  wrk->ip_lookup_tx_frames[!is_ip4] = 0;
1006  }
1007 }
1008 
1009 /**
1010  * Flush v4 and v6 tcp and ip-lookup tx frames for thread index
1011  */
1012 void
1014 {
1015  tcp_flush_frame_to_output (wrk, 1);
1016  tcp_flush_frame_to_output (wrk, 0);
1019 }
1020 
1021 /**
1022  * Send FIN
1023  */
1024 void
1026 {
1027  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
1028  vlib_main_t *vm = wrk->vm;
1029  vlib_buffer_t *b;
1030  u32 bi;
1031  u8 fin_snt = 0;
1032 
1033  fin_snt = tc->flags & TCP_CONN_FINSNT;
1034  if (fin_snt)
1035  tc->snd_nxt -= 1;
1036 
1037  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1038  {
1039  /* Out of buffers so program fin retransmit ASAP */
1040  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
1041  if (fin_snt)
1042  tc->snd_nxt += 1;
1043  else
1044  /* Make sure retransmit retries a fin not data */
1045  tc->flags |= TCP_CONN_FINSNT;
1046  return;
1047  }
1048 
1050  b = vlib_get_buffer (vm, bi);
1051  tcp_init_buffer (vm, b);
1052  tcp_make_fin (tc, b);
1053  tcp_enqueue_to_output_now (wrk, b, bi, tc->c_is_ip4);
1054  TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
1055  /* Account for the FIN */
1056  tc->snd_nxt += 1;
1057  if (!fin_snt)
1058  {
1059  tc->flags |= TCP_CONN_FINSNT;
1060  tc->flags &= ~TCP_CONN_FINPNDG;
1061  tc->snd_una_max = seq_max (tc->snd_una_max, tc->snd_nxt);
1062  }
1063 }
1064 
1065 /**
1066  * Push TCP header and update connection variables. Should only be called
1067  * for segments with data, not for 'control' packets.
1068  */
1069 always_inline void
1071  u8 compute_opts, u8 maybe_burst, u8 update_snd_nxt)
1072 {
1073  u8 tcp_hdr_opts_len, flags = TCP_FLAG_ACK;
1074  u32 advertise_wnd, data_len;
1075  tcp_main_t *tm = &tcp_main;
1076  tcp_header_t *th;
1077 
1078  data_len = b->current_length;
1079  if (PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT))
1081 
1082  vnet_buffer (b)->tcp.flags = 0;
1083  vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
1084 
1085  if (compute_opts)
1086  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
1087 
1088  tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
1089 
1090  if (maybe_burst)
1091  advertise_wnd = tc->rcv_wnd >> tc->rcv_wscale;
1092  else
1093  advertise_wnd = tcp_window_to_advertise (tc, TCP_STATE_ESTABLISHED);
1094 
1095  if (PREDICT_FALSE (tc->flags & TCP_CONN_PSH_PENDING))
1096  {
1097  if (seq_geq (tc->psh_seq, snd_nxt)
1098  && seq_lt (tc->psh_seq, snd_nxt + data_len))
1099  flags |= TCP_FLAG_PSH;
1100  }
1101  th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, snd_nxt,
1102  tc->rcv_nxt, tcp_hdr_opts_len, flags,
1103  advertise_wnd);
1104 
1105  if (maybe_burst)
1106  {
1107  clib_memcpy_fast ((u8 *) (th + 1),
1108  tm->wrk_ctx[tc->c_thread_index].cached_opts,
1109  tc->snd_opts_len);
1110  }
1111  else
1112  {
1113  u8 len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
1114  ASSERT (len == tc->snd_opts_len);
1115  }
1116 
1117  /*
1118  * Update connection variables
1119  */
1120 
1121  if (update_snd_nxt)
1122  tc->snd_nxt += data_len;
1123  tc->rcv_las = tc->rcv_nxt;
1124 
1125  TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
1126 }
1127 
1128 u32
1130 {
1131  tcp_connection_t *tc = (tcp_connection_t *) tconn;
1132  tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0, /* burst */ 1,
1133  /* update_snd_nxt */ 1);
1134  tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
1135  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
1136  /* If not tracking an ACK, start tracking */
1137  if (tc->rtt_ts == 0 && !tcp_in_cong_recovery (tc))
1138  {
1139  tc->rtt_ts = tcp_time_now_us (tc->c_thread_index);
1140  tc->rtt_seq = tc->snd_nxt;
1141  }
1142  if (PREDICT_FALSE (!tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)))
1143  {
1145  tc->rto_boff = 0;
1146  }
1147  tcp_trajectory_add_start (b, 3);
1148  return 0;
1149 }
1150 
1151 void
1153 {
1154  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
1155  vlib_main_t *vm = wrk->vm;
1156  vlib_buffer_t *b;
1157  u32 bi;
1158 
1159  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1160  {
1161  tcp_update_rcv_wnd (tc);
1162  return;
1163  }
1164  b = vlib_get_buffer (vm, bi);
1165  tcp_init_buffer (vm, b);
1166  tcp_make_ack (tc, b);
1167  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1168 }
1169 
1170 void
1172 {
1173  if (!(tc->flags & TCP_CONN_SNDACK))
1174  {
1175  vec_add1 (wrk->pending_acks, tc->c_c_index);
1176  tc->flags |= TCP_CONN_SNDACK;
1177  }
1178 }
1179 
1180 void
1182 {
1183  if (!(tc->flags & TCP_CONN_SNDACK))
1184  {
1185  vec_add1 (wrk->pending_acks, tc->c_c_index);
1186  tc->flags |= TCP_CONN_SNDACK;
1187  }
1188  if (tc->pending_dupacks < 255)
1189  tc->pending_dupacks += 1;
1190 }
1191 
/**
 * Flush all ACKs queued on this worker via tcp_program_(dup)ack().
 *
 * NOTE(review): signature line (upstream 1193) lost in extraction —
 * presumably `tcp_send_acks (tcp_worker_ctx_t * wrk)`.
 *
 * For connections with pending dupacks and SACK blocks, enough dupacks are
 * generated to advertise every sack block, capped by the number of packets
 * received but at least 3 (the fast-retransmit trigger threshold).
 */
1192 void
1194 {
1195  u32 thread_index, *pending_acks;
1196  tcp_connection_t *tc;
1197  int i, j, n_acks;
1198 
1199  if (!vec_len (wrk->pending_acks))
1200  return;
1201 
1202  thread_index = wrk->vm->thread_index;
1203  pending_acks = wrk->pending_acks;
1204  for (i = 0; i < vec_len (pending_acks); i++)
1205  {
1206  tc = tcp_connection_get (pending_acks[i], thread_index);
/* Clear the flag so the connection can be re-queued next dispatch. */
1207  tc->flags &= ~TCP_CONN_SNDACK;
1208  if (!tc->pending_dupacks)
1209  {
1210  tcp_send_ack (tc);
1211  continue;
1212  }
1213 
1214  /* If we're supposed to send dupacks but have no ooo data
1215  * send only one ack */
1216  if (!vec_len (tc->snd_sacks))
1217  {
1218  tcp_send_ack (tc);
1219  continue;
1220  }
1221 
1222  /* Start with first sack block */
1223  tc->snd_sack_pos = 0;
1224 
1225  /* Generate enough dupacks to cover all sack blocks. Do not generate
1226  * more sacks than the number of packets received. But do generate at
1227  * least 3, i.e., the number needed to signal congestion, if needed. */
1228  n_acks = vec_len (tc->snd_sacks) / TCP_OPTS_MAX_SACK_BLOCKS;
1229  n_acks = clib_min (n_acks, tc->pending_dupacks);
1230  n_acks = clib_max (n_acks, clib_min (tc->pending_dupacks, 3));
1231  for (j = 0; j < n_acks; j++)
1232  tcp_send_ack (tc);
1233 
1234  tc->pending_dupacks = 0;
1235  tc->snd_sack_pos = 0;
1236  }
/* Reset the vector length in place; storage is reused next round. */
1237  _vec_len (wrk->pending_acks) = 0;
1238 }
1239 
1240 /**
1241  * Delayed ack timer handler
1242  *
1243  * Sends delayed ACK when timer expires
1244  */
/* NOTE(review): signature line (upstream 1246) lost in extraction —
 * presumably `tcp_timer_delack_handler (u32 index)`. */
1245 void
1247 {
1248  u32 thread_index = vlib_get_thread_index ();
1249  tcp_connection_t *tc;
1250 
1251  tc = tcp_connection_get (index, thread_index);
/* Timer already popped; invalidate the handle before sending. */
1252  tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID;
1253  tcp_send_ack (tc);
1254 }
1255 
1256 /**
1257  * Allocate a new buffer and build a new tcp segment
1258  *
1259  * @param wrk tcp worker
1260  * @param tc connection for which the segment will be allocated
1261  * @param offset offset of the first byte in the tx fifo
1262  * @param max_deq_byte segment size
1263  * @param[out] b pointer to buffer allocated
1264  *
1265  * @return the number of bytes in the segment or 0 if buffer cannot be
1266  * allocated or no data available
1267  */
/* NOTE(review): extraction artifact — the line carrying the function name
 * (upstream 1269) is missing; presumably
 * `tcp_prepare_segment (tcp_worker_ctx_t * wrk, tcp_connection_t * tc,`.
 * Doc comment above (lines 1256-1267) describes params/return. */
1268 static int
1270  u32 offset, u32 max_deq_bytes, vlib_buffer_t ** b)
1271 {
1272  u32 bytes_per_buffer = vnet_get_tcp_main ()->bytes_per_buffer;
1273  vlib_main_t *vm = wrk->vm;
1274  u32 bi, seg_size;
1275  int n_bytes = 0;
1276  u8 *data;
1277 
/* Reserve headroom for the largest possible transport header stack. */
1278  seg_size = max_deq_bytes + TRANSPORT_MAX_HDRS_LEN;
1279 
1280  /*
1281  * Prepare options
1282  */
1283  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
1284 
1285  /*
1286  * Allocate and fill in buffer(s)
1287  */
1288 
1289  /* Easy case, buffer size greater than mss */
1290  if (PREDICT_TRUE (seg_size <= bytes_per_buffer))
1291  {
1292  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1293  return 0;
1294  *b = vlib_get_buffer (vm, bi);
1295  data = tcp_init_buffer (vm, *b);
/* Peek (not dequeue): bytes stay in the fifo until acked. */
1296  n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
1297  max_deq_bytes);
1298  ASSERT (n_bytes == max_deq_bytes);
1299  b[0]->current_length = n_bytes;
1300  tcp_push_hdr_i (tc, *b, tc->snd_una + offset, /* compute opts */ 0,
1301  /* burst */ 0, /* update_snd_nxt */ 0);
1302  }
1303  /* Split mss into multiple buffers */
1304  else
1305  {
1306  u32 chain_bi = ~0, n_bufs_per_seg, n_bufs;
1307  u16 n_peeked, len_to_deq;
1308  vlib_buffer_t *chain_b, *prev_b;
1309  int i;
1310 
1311  /* Make sure we have enough buffers */
1312  n_bufs_per_seg = ceil ((double) seg_size / bytes_per_buffer);
/* NOTE(review): continuation line of vec_validate_aligned (upstream 1314,
 * presumably `CLIB_CACHE_LINE_BYTES);`) lost in extraction. */
1313  vec_validate_aligned (wrk->tx_buffers, n_bufs_per_seg - 1,
1315  n_bufs = vlib_buffer_alloc (vm, wrk->tx_buffers, n_bufs_per_seg);
1316  if (PREDICT_FALSE (n_bufs != n_bufs_per_seg))
1317  {
/* Partial allocation: free what we got and bail; caller retries later. */
1318  if (n_bufs)
1319  vlib_buffer_free (vm, wrk->tx_buffers, n_bufs)
1320  return 0;
1321  }
1322 
1323  *b = vlib_get_buffer (vm, wrk->tx_buffers[--n_bufs]);
1324  data = tcp_init_buffer (vm, *b);
1325  n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
1326  bytes_per_buffer -
1327  TRANSPORT_MAX_HDRS_LEN);
1328  b[0]->current_length = n_bytes;
1329  b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
/* NOTE(review): upstream line 1330 (presumably
 * `b[0]->total_length_not_including_first_buffer = 0;`) lost in extraction;
 * the += at line 1355 below relies on it being zeroed. */
1331  max_deq_bytes -= n_bytes;
1332 
1333  chain_b = *b;
1334  for (i = 1; i < n_bufs_per_seg; i++)
1335  {
1336  prev_b = chain_b;
1337  len_to_deq = clib_min (max_deq_bytes, bytes_per_buffer);
1338  chain_bi = wrk->tx_buffers[--n_bufs];
1339  chain_b = vlib_get_buffer (vm, chain_bi);
1340  chain_b->current_data = 0;
1341  data = vlib_buffer_get_current (chain_b);
1342  n_peeked = session_tx_fifo_peek_bytes (&tc->connection, data,
1343  offset + n_bytes,
1344  len_to_deq);
1345  ASSERT (n_peeked == len_to_deq);
1346  n_bytes += n_peeked;
1347  chain_b->current_length = n_peeked;
1348  chain_b->next_buffer = 0;
1349 
1350  /* update previous buffer */
1351  prev_b->next_buffer = chain_bi;
1352  prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT;
1353 
1354  max_deq_bytes -= n_peeked;
1355  b[0]->total_length_not_including_first_buffer += n_peeked;
1356  }
1357 
1358  tcp_push_hdr_i (tc, *b, tc->snd_una + offset, /* compute opts */ 0,
1359  /* burst */ 0, /* update_snd_nxt */ 0);
1360 
1361  if (PREDICT_FALSE (n_bufs))
1362  {
1363  clib_warning ("not all buffers consumed");
1364  vlib_buffer_free (vm, wrk->tx_buffers, n_bufs);
1365  }
1366  }
1367 
1368  ASSERT (n_bytes > 0);
1369  ASSERT (((*b)->current_data + (*b)->current_length) <= bytes_per_buffer);
1370 
1371  return n_bytes;
1372 }
1373 
1374 /**
1375  * Build a retransmit segment
1376  *
1377  * @return the number of bytes in the segment or 0 if there's nothing to
1378  * retransmit
1379  */
/* NOTE(review): line carrying the function name (upstream 1381) lost in
 * extraction — presumably
 * `tcp_prepare_retransmit_segment (tcp_worker_ctx_t * wrk,`. */
1380 static u32
1382  tcp_connection_t * tc, u32 offset,
1383  u32 max_deq_bytes, vlib_buffer_t ** b)
1384 {
1385  u32 start, available_bytes;
1386  int n_bytes = 0;
1387 
1388  ASSERT (tc->state >= TCP_STATE_ESTABLISHED);
1389  ASSERT (max_deq_bytes != 0);
1390 
1391  /*
1392  * Make sure we can retransmit something
1393  */
1394  available_bytes = transport_max_tx_dequeue (&tc->connection);
1395  ASSERT (available_bytes >= offset);
1396  available_bytes -= offset;
1397  if (!available_bytes)
1398  return 0;
1399 
/* Clamp to one MSS and to what the fifo actually holds. */
1400  max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes);
1401  max_deq_bytes = clib_min (available_bytes, max_deq_bytes);
1402 
1403  /* Start is beyond snd_congestion */
1404  start = tc->snd_una + offset;
1405  if (seq_geq (start, tc->snd_congestion))
1406  goto done;
1407 
1408  /* Don't overshoot snd_congestion */
1409  if (seq_gt (start + max_deq_bytes, tc->snd_congestion))
1410  {
1411  max_deq_bytes = tc->snd_congestion - start;
1412  if (max_deq_bytes == 0)
1413  goto done;
1414  }
1415 
1416  n_bytes = tcp_prepare_segment (wrk, tc, offset, max_deq_bytes, b);
1417  if (!n_bytes)
1418  return 0;
1419 
/* Account retransmitted bytes for fast-recovery bookkeeping. */
1420  if (tcp_in_fastrecovery (tc))
1421  tc->snd_rxt_bytes += n_bytes;
1422 
1423 done:
1424  TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes);
1425  return n_bytes;
1426 }
1427 
1428 /**
1429  * Reset congestion control, switch cwnd to loss window and try again.
1430  */
/* NOTE(review): signature line (upstream 1432) lost in extraction —
 * presumably `tcp_rxt_timeout_cc (tcp_connection_t * tc)`. */
1431 static void
1433 {
1434  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 6);
/* Remember pre-timeout cc state for possible undo (Eifel-style). */
1435  tc->prev_ssthresh = tc->ssthresh;
1436  tc->prev_cwnd = tc->cwnd;
1437 
1438  /* Cleanly recover cc (also clears up fast retransmit) */
1439  if (tcp_in_fastrecovery (tc))
1440  {
1441  /* TODO be less aggressive about this */
1442  scoreboard_clear (&tc->sack_sb);
/* NOTE(review): upstream line 1443 lost in extraction — presumably the
 * fast-recovery exit call in this branch; restore from upstream. */
1444  }
1445  else
1446  tc->rcv_dupacks = 0;
1447 
1448  /* Start again from the beginning */
1449  tc->cc_algo->congestion (tc);
1450  tc->cwnd = tcp_loss_wnd (tc);
1451  tc->snd_congestion = tc->snd_nxt;
1452  tc->rtt_ts = 0;
1453  tc->cwnd_acc_bytes = 0;
1454  tcp_connection_tx_pacer_reset (tc, tc->cwnd, 2 * tc->snd_mss);
1455  tcp_recovery_on (tc);
1456 }
1457 
/**
 * Common retransmit-timer handler, dispatched on connection state:
 * data/FIN retransmit (>= ESTABLISHED), SYN retransmit (SYN_SENT) and
 * SYN-ACK retransmit (SYN_RCVD).
 *
 * NOTE(review): signature line (upstream 1459) lost in extraction —
 * presumably `tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)`.
 */
1458 static inline void
1460 {
1461  u32 thread_index = vlib_get_thread_index ();
1462  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
1463  vlib_main_t *vm = wrk->vm;
1464  tcp_connection_t *tc;
1465  vlib_buffer_t *b = 0;
1466  u32 bi, n_bytes;
1467 
1468  if (is_syn)
1469  {
1470  tc = tcp_half_open_connection_get (index);
1471  /* Note: the connection may have transitioned to ESTABLISHED... */
1472  if (PREDICT_FALSE (tc == 0 || tc->state != TCP_STATE_SYN_SENT))
1473  return;
1474  tc->timers[TCP_TIMER_RETRANSMIT_SYN] = TCP_TIMER_HANDLE_INVALID;
1475  }
1476  else
1477  {
1478  tc = tcp_connection_get (index, thread_index);
1479  /* Note: the connection may have been closed and pool_put */
1480  if (PREDICT_FALSE (tc == 0 || tc->state == TCP_STATE_SYN_SENT))
1481  return;
1482  tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
1483  /* Wait-close and retransmit could pop at the same time */
1484  if (tc->state == TCP_STATE_CLOSED)
1485  return;
1486  }
1487 
1488  if (tc->state >= TCP_STATE_ESTABLISHED)
1489  {
1490  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
1491 
1492  /* Lost FIN, retransmit and return */
1493  if (tc->flags & TCP_CONN_FINSNT)
1494  {
1495  tcp_send_fin (tc);
/* Exponential backoff, capped at TCP_RTO_MAX. */
1496  tc->rto_boff += 1;
1497  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
1498  return;
1499  }
1500 
1501  /* Shouldn't be here. This condition is tricky because it has to take
1502  * into account boff > 0 due to persist timeout. */
1503  if ((tc->rto_boff == 0 && tc->snd_una == tc->snd_nxt)
1504  || (tc->rto_boff > 0 && seq_geq (tc->snd_una, tc->snd_congestion)
1505  && !tcp_flight_size (tc)))
1506  {
1507  ASSERT (!tcp_in_recovery (tc));
1508  tc->rto_boff = 0;
1509  return;
1510  }
1511 
1512  /* We're not in recovery so make sure rto_boff is 0. Can be non 0 due
1513  * to persist timer timeout */
1514  if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
1515  {
1516  tc->rto_boff = 0;
1517  tcp_update_rto (tc);
1518  }
1519 
1520  /* Increment RTO backoff (also equal to number of retries) and go back
1521  * to first un-acked byte */
1522  tc->rto_boff += 1;
1523 
1524  /* First retransmit timeout */
1525  if (tc->rto_boff == 1)
1526  tcp_rxt_timeout_cc (tc);
1527  else
1528  scoreboard_clear (&tc->sack_sb);
1529 
1530  /* If we've sent beyond snd_congestion, update it */
1531  tc->snd_congestion = seq_max (tc->snd_nxt, tc->snd_congestion);
1532 
/* Go-back-N: restart from the first unacked byte. */
1533  tc->snd_nxt = tc->snd_una;
1534  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
1535 
1536  /* Send one segment. Note that n_bytes may be zero due to buffer
1537  * shortfall */
1538  n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b);
1539  if (!n_bytes)
1540  {
/* Buffer shortfall: retry on the next tick. */
1541  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
1542  return;
1543  }
1544 
1545  bi = vlib_get_buffer_index (vm, b);
1546 
1547  /* For first retransmit, record timestamp (Eifel detection RFC3522) */
1548  if (tc->rto_boff == 1)
1549  tc->snd_rxt_ts = tcp_time_now_w_thread (tc->c_thread_index);
1550 
1551  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
/* NOTE(review): upstream line 1552 lost in extraction — presumably the
 * forced retransmit-timer re-arm for this branch; restore from upstream. */
1553  }
1554  /* Retransmit for SYN */
1555  else if (tc->state == TCP_STATE_SYN_SENT)
1556  {
1557  /* Half-open connection actually moved to established but we were
1558  * waiting for syn retransmit to pop to call cleanup from the right
1559  * thread. */
1560  if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
1561  {
/* NOTE(review): upstream line 1562 lost in extraction — presumably the
 * half-open cleanup call whose failure the TCP_DBG below reports. */
1563  TCP_DBG ("could not remove half-open connection");
1564  return;
1565  }
1566 
1567  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
1568 
1569  /* Try without increasing RTO a number of times. If this fails,
1570  * start growing RTO exponentially */
1571  tc->rto_boff += 1;
1572  if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
1573  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
1574 
1575  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN,
1576  tc->rto * TCP_TO_TIMER_TICK);
1577 
1578  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1579  {
1580  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT_SYN, 1);
1581  return;
1582  }
1583 
1584  b = vlib_get_buffer (vm, bi);
1585  tcp_init_buffer (vm, b);
1586  tcp_make_syn (tc, b);
1587 
/* Invalidate rtt sample: a retransmitted SYN must not feed RTT. */
1588  tc->rtt_ts = 0;
1589  TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 0);
1590 
1591  /* This goes straight to ipx_lookup. Retransmit timer set already */
1592  tcp_push_ip_hdr (wrk, tc, b);
1593  tcp_enqueue_to_ip_lookup (wrk, b, bi, tc->c_is_ip4, tc->c_fib_index);
1594  }
1595  /* Retransmit SYN-ACK */
1596  else if (tc->state == TCP_STATE_SYN_RCVD)
1597  {
1598  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2);
1599 
1600  tc->rto_boff += 1;
1601  if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
1602  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
1603  tc->rtt_ts = 0;
1604 
/* NOTE(review): upstream line 1605 lost in extraction — presumably the
 * retransmit-timer re-arm referenced by the comment at 1618 below. */
1606 
1607  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1608  {
1609  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, 1);
1610  return;
1611  }
1612 
1613  b = vlib_get_buffer (vm, bi);
1614  tcp_init_buffer (vm, b);
1615  tcp_make_synack (tc, b);
1616  TCP_EVT_DBG (TCP_EVT_SYN_RXT, tc, 1);
1617 
1618  /* Retransmit timer already updated, just enqueue to output */
1619  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1620  }
1621  else
1622  {
1623  ASSERT (tc->state == TCP_STATE_CLOSED);
1624  return;
1625  }
1626 }
1627 
/* Data retransmit timer entry point (is_syn = 0).
 * NOTE(review): signature line (upstream 1629) lost in extraction —
 * presumably `tcp_timer_retransmit_handler (u32 index)`. */
1628 void
1630 {
1631  tcp_timer_retransmit_handler_i (index, 0);
1632 }
1633 
/* SYN retransmit timer entry point (is_syn = 1).
 * NOTE(review): signature line (upstream 1635) lost in extraction —
 * presumably `tcp_timer_retransmit_syn_handler (u32 index)`. */
1634 void
1636 {
1637  tcp_timer_retransmit_handler_i (index, 1);
1638 }
1639 
1640 /**
1641  * Got 0 snd_wnd from peer, try to do something about it.
1642  *
1643  */
/* NOTE(review): signature line (upstream 1645) lost in extraction —
 * presumably `tcp_timer_persist_handler (u32 index)`. Sends a zero-window
 * probe: forces out the first unsent segment (or less on buffer limits). */
1644 void
1646 {
1647  u32 thread_index = vlib_get_thread_index ();
1648  tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
1649  u32 bi, max_snd_bytes, available_bytes, offset;
1650  tcp_main_t *tm = vnet_get_tcp_main ();
1651  vlib_main_t *vm = wrk->vm;
1652  tcp_connection_t *tc;
1653  vlib_buffer_t *b;
1654  int n_bytes = 0;
1655  u8 *data;
1656 
1657  tc = tcp_connection_get_if_valid (index, thread_index);
1658  if (!tc)
1659  return;
1660 
1661  /* Make sure timer handle is set to invalid */
1662  tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID;
1663 
1664  /* Problem already solved or worse */
1665  if (tc->state == TCP_STATE_CLOSED || tc->snd_wnd > tc->snd_mss
1666  || (tc->flags & TCP_CONN_FINSNT))
1667  return;
1668 
1669  available_bytes = transport_max_tx_dequeue (&tc->connection);
1670  offset = tc->snd_nxt - tc->snd_una;
1671 
1672  /* Reprogram persist if no new bytes available to send. We may have data
1673  * next time */
1674  if (!available_bytes)
1675  {
1676  tcp_persist_timer_set (tc);
1677  return;
1678  }
1679 
1680  if (available_bytes <= offset)
1681  {
/* Everything in the fifo is already in flight; rtx timer owns it. */
1682  ASSERT (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT));
1683  return;
1684  }
1685 
1686  /* Increment RTO backoff */
1687  tc->rto_boff += 1;
1688  tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX);
1689 
1690  /*
1691  * Try to force the first unsent segment (or buffer)
1692  */
1693  if (PREDICT_FALSE (!vlib_buffer_alloc (vm, &bi, 1)))
1694  {
1695  tcp_persist_timer_set (tc);
1696  return;
1697  }
1698  b = vlib_get_buffer (vm, bi);
1699  data = tcp_init_buffer (vm, b);
1700 
1701  tcp_validate_txf_size (tc, offset);
1702  tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
/* Single-buffer segment: cap at mss and at buffer payload space. */
1703  max_snd_bytes = clib_min (tc->snd_mss,
1704  tm->bytes_per_buffer - TRANSPORT_MAX_HDRS_LEN);
1705  n_bytes = session_tx_fifo_peek_bytes (&tc->connection, data, offset,
1706  max_snd_bytes);
1707  b->current_length = n_bytes;
1708  ASSERT (n_bytes != 0 && (tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)
1709  || tc->snd_nxt == tc->snd_una_max
1710  || tc->rto_boff > 1));
1711 
1712  tcp_push_hdr_i (tc, b, tc->snd_nxt, /* compute opts */ 0,
1713  /* burst */ 0, /* update_snd_nxt */ 1);
1714  tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
1715  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
1716  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1717 
1718  /* Just sent new data, enable retransmit */
/* NOTE(review): upstream line 1719 lost in extraction — presumably the
 * retransmit-timer update announced by the comment above. */
1720 }
1721 
1722 /**
1723  * Retransmit first unacked segment
1724  */
/* NOTE(review): signature line (upstream 1726) lost in extraction —
 * presumably `tcp_retransmit_first_unacked (tcp_worker_ctx_t * wrk,
 * tcp_connection_t * tc)`. Returns 0 on success, -1 if nothing was sent. */
1725 int
1727 {
1728  vlib_main_t *vm = wrk->vm;
1729  vlib_buffer_t *b;
1730  u32 bi, n_bytes;
1731 
1732  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
1733 
/* offset 0 == first unacked byte (snd_una). */
1734  n_bytes = tcp_prepare_retransmit_segment (wrk, tc, 0, tc->snd_mss, &b);
1735  if (!n_bytes)
1736  return -1;
1737 
1738  bi = vlib_get_buffer_index (vm, b);
1739  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1740 
1741  return 0;
1742 }
1743 
/* Transmit up to burst_size previously-unsent segments starting at snd_nxt;
 * returns the number of segments sent.
 * NOTE(review): line carrying the function name (upstream 1745) lost in
 * extraction — presumably `tcp_fast_retransmit_unsent (tcp_worker_ctx_t *
 * wrk, tcp_connection_t * tc,`. */
1744 static int
1746  u32 burst_size)
1747 {
1748  u32 offset, n_segs = 0, n_written, bi;
1749  vlib_main_t *vm = wrk->vm;
1750  vlib_buffer_t *b = 0;
1751 
1752  offset = tc->snd_nxt - tc->snd_una;
1753  while (n_segs < burst_size)
1754  {
1755  n_written = tcp_prepare_segment (wrk, tc, offset, tc->snd_mss, &b);
1756  if (!n_written)
1757  goto done;
1758 
1759  bi = vlib_get_buffer_index (vm, b);
1760  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1761  offset += n_written;
1762  n_segs += 1;
1763 
/* These are new bytes, so advance snd_nxt/snd_una_max ourselves. */
1764  tc->snd_nxt += n_written;
1765  tc->snd_una_max = seq_max (tc->snd_nxt, tc->snd_una_max);
1766  }
1767 
1768 done:
1769  return n_segs;
1770 }
1771 
/* True when the scoreboard's rescue retransmit point still lies inside the
 * current recovery window [snd_una, snd_congestion] (RFC 6675 rescue rxt). */
1772 #define scoreboard_rescue_rxt_valid(_sb, _tc) \
1773  (seq_geq (_sb->rescue_rxt, _tc->snd_una) \
1774  && seq_leq (_sb->rescue_rxt, _tc->snd_congestion))
1775 
1776 /**
1777  * Do fast retransmit with SACKs
1778  */
/* NOTE(review): line carrying the function name (upstream 1780) lost in
 * extraction — presumably `tcp_fast_retransmit_sack (tcp_worker_ctx_t * wrk,
 * tcp_connection_t * tc,`. SACK-based loss recovery in the style of
 * RFC 6675: walk scoreboard holes, then new data, then rescue rxt. */
1779 int
1781  u32 burst_size)
1782 {
1783  u32 n_written = 0, offset, max_bytes, n_segs = 0, n_segs_now;
1784  sack_scoreboard_hole_t *hole;
1785  vlib_main_t *vm = wrk->vm;
1786  vlib_buffer_t *b = 0;
1787  sack_scoreboard_t *sb;
1788  u32 bi, max_deq;
1789  int snd_space;
1790  u8 snd_limited = 0, can_rescue = 0;
1791 
1792  ASSERT (tcp_in_fastrecovery (tc));
1793 
1794  snd_space = tcp_available_cc_snd_space (tc);
1795  if (snd_space < tc->snd_mss)
1796  {
/* Not even one mss of window; defer and try again later. */
1797  tcp_program_fastretransmit (wrk, tc);
1798  return 0;
1799  }
1800 
1801  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
1802  sb = &tc->sack_sb;
1803  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
1804 
/* Bytes in the fifo that have never been sent. */
1805  max_deq = transport_max_tx_dequeue (&tc->connection);
1806  max_deq -= tc->snd_nxt - tc->snd_una;
1807 
1808  while (snd_space > 0 && n_segs < burst_size)
1809  {
1810  hole = scoreboard_next_rxt_hole (sb, hole, max_deq, &can_rescue,
1811  &snd_limited);
1812  if (!hole)
1813  {
/* No more holes: prefer sending new data if any is queued. */
1814  if (max_deq)
1815  {
1816  snd_space = clib_min (max_deq, snd_space);
1817  burst_size = clib_min (burst_size - n_segs,
1818  snd_space / tc->snd_mss);
1819  n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size);
1820  if (max_deq > n_segs_now * tc->snd_mss)
1821  tcp_program_fastretransmit (wrk, tc);
1822  n_segs += n_segs_now;
1823  goto done;
1824  }
1825 
1826  if (!can_rescue || scoreboard_rescue_rxt_valid (sb, tc))
1827  break;
1828 
1829  /* If rescue rxt undefined or less than snd_una then one segment of
1830  * up to SMSS octets that MUST include the highest outstanding
1831  * unSACKed sequence number SHOULD be returned, and RescueRxt set to
1832  * RecoveryPoint. HighRxt MUST NOT be updated.
1833  */
1834  max_bytes = clib_min (tc->snd_mss,
1835  tc->snd_congestion - tc->snd_una);
1836  max_bytes = clib_min (max_bytes, snd_space);
1837  offset = tc->snd_congestion - tc->snd_una - max_bytes;
1838  sb->rescue_rxt = tc->snd_congestion;
1839  n_written = tcp_prepare_retransmit_segment (wrk, tc, offset,
1840  max_bytes, &b);
1841  if (!n_written)
1842  goto done;
1843 
1844  bi = vlib_get_buffer_index (vm, b);
1845  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1846  n_segs += 1;
1847  break;
1848  }
1849 
1850  max_bytes = clib_min (hole->end - sb->high_rxt, snd_space);
1851  max_bytes = snd_limited ? clib_min (max_bytes, tc->snd_mss) : max_bytes;
1852  if (max_bytes == 0)
1853  break;
1854 
1855  offset = sb->high_rxt - tc->snd_una;
1856  n_written = tcp_prepare_retransmit_segment (wrk, tc, offset, max_bytes,
1857  &b);
1858  ASSERT (n_written <= snd_space);
1859 
1860  /* Nothing left to retransmit */
1861  if (n_written == 0)
1862  break;
1863 
1864  bi = vlib_get_buffer_index (vm, b);
1865  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1866 
1867  sb->high_rxt += n_written;
1868  snd_space -= n_written;
1869  n_segs += 1;
1870  }
1871 
/* Ran out of burst/space with a hole left: reschedule ourselves. */
1872  if (hole)
1873  tcp_program_fastretransmit (wrk, tc);
1874 
1875 done:
1876  return n_segs;
1877 }
1878 
1879 /**
1880  * Fast retransmit without SACK info
1881  */
/* NOTE(review): line carrying the function name (upstream 1883) lost in
 * extraction — presumably `tcp_fast_retransmit_no_sack (tcp_worker_ctx_t *
 * wrk, tcp_connection_t * tc,`. NewReno-style (RFC 6582) recovery. */
1882 int
1884  u32 burst_size)
1885 {
1886  u32 n_written = 0, offset = 0, bi, max_deq, n_segs_now;
1887  vlib_main_t *vm = wrk->vm;
1888  int snd_space, n_segs = 0;
1889  vlib_buffer_t *b;
1890 
1891  ASSERT (tcp_in_fastrecovery (tc));
1892  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
1893 
1894  if (!tcp_fastrecovery_first (tc))
1895  goto send_unsent;
1896 
1897  /* RFC 6582: [If a partial ack], retransmit the first unacknowledged
1898  * segment. */
/* Budget the retransmit by bytes newly delivered per the scoreboard. */
1899  snd_space = tc->sack_sb.last_bytes_delivered;
1900  while (snd_space > 0 && n_segs < burst_size)
1901  {
1902  n_written = tcp_prepare_retransmit_segment (wrk, tc, offset,
1903  tc->snd_mss, &b);
1904 
1905  /* Nothing left to retransmit */
1906  if (n_written == 0)
1907  break;
1908 
1909  bi = vlib_get_buffer_index (vm, b);
1910  tcp_enqueue_to_output (wrk, b, bi, tc->c_is_ip4);
1911  snd_space -= n_written;
1912  offset += n_written;
1913  n_segs += 1;
1914  }
1915 
1916  if (n_segs == burst_size)
1917  goto done;
1918 
1919 send_unsent:
1920 
1921  /* RFC 6582: Send a new segment if permitted by the new value of cwnd. */
1922  snd_space = tcp_available_cc_snd_space (tc);
1923  if (snd_space < tc->snd_mss || tc->snd_mss == 0)
1924  goto done;
1925 
1926  max_deq = transport_max_tx_dequeue (&tc->connection);
1927  max_deq -= tc->snd_nxt - tc->snd_una;
1928  if (max_deq)
1929  {
1930  snd_space = clib_min (max_deq, snd_space);
1931  burst_size = clib_min (burst_size - n_segs, snd_space / tc->snd_mss);
1932  n_segs_now = tcp_fast_retransmit_unsent (wrk, tc, burst_size);
1933  if (max_deq > n_segs_now * tc->snd_mss)
1934  tcp_program_fastretransmit (wrk, tc);
1935  n_segs += n_segs_now;
1936  }
1937 
1938 done:
/* NOTE(review): upstream line 1939 lost in extraction — presumably clears
 * the fast-recovery "first" flag checked at the top of this function. */
1940  return n_segs;
1941 }
1942 
1943 /**
1944  * Do fast retransmit
1945  */
/* Dispatch fast retransmit to the SACK or NewReno variant depending on
 * whether the peer negotiated SACK.
 * NOTE(review): line carrying the function name (upstream 1947) lost in
 * extraction — presumably `tcp_fast_retransmit (tcp_worker_ctx_t * wrk,
 * tcp_connection_t * tc,`. */
1946 int
1948  u32 burst_size)
1949 {
1950  if (tcp_opts_sack_permitted (&tc->rcv_opts))
1951  return tcp_fast_retransmit_sack (wrk, tc, burst_size);
1952  else
1953  return tcp_fast_retransmit_no_sack (wrk, tc, burst_size);
1954 }
1955 #endif /* CLIB_MARCH_VARIANT */
1956 
/* Resolve the adjacency for an IPv6 link-local peer and pick the tx next
 * node (rewrite / neighbor discovery / drop).
 * NOTE(review): signature line (upstream 1958) lost in extraction —
 * presumably `tcp_output_handle_link_local (tcp_connection_t * tc0,
 * vlib_buffer_t * b0,`. */
1957 static void
1959  u16 * next0, u32 * error0)
1960 {
1961  ip_adjacency_t *adj;
1962  adj_index_t ai;
1963 
1964  /* Not thread safe but as long as the connection exists the adj should
1965  * not be removed */
1966  ai = adj_nbr_find (FIB_PROTOCOL_IP6, VNET_LINK_IP6, &tc0->c_rmt_ip,
1967  tc0->sw_if_index);
1968  if (ai == ADJ_INDEX_INVALID)
1969  {
1970  vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
1971  *next0 = TCP_OUTPUT_NEXT_DROP;
1972  *error0 = TCP_ERROR_LINK_LOCAL_RW;
1973  return;
1974  }
1975 
1976  adj = adj_get (ai);
/* NOTE(review): upstream line 1977 lost in extraction — presumably the
 * `if` on adj->lookup_next_index that guards the REWRITE branch below;
 * without it this code is syntactically broken. Restore from upstream. */
1978  *next0 = TCP_OUTPUT_NEXT_IP_REWRITE;
1979  else if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP)
1980  *next0 = TCP_OUTPUT_NEXT_IP_ARP;
1981  else
1982  {
1983  *next0 = TCP_OUTPUT_NEXT_DROP;
1984  *error0 = TCP_ERROR_LINK_LOCAL_RW;
1985  }
1986  vnet_buffer (b0)->ip.adj_index[VLIB_TX] = ai;
1987 }
1988 
/* Add packet traces for up to n_trace buffers of the output frame.
 * NOTE(review): signature line (upstream 1990) lost in extraction —
 * presumably `tcp46_output_trace_frame (vlib_main_t * vm,
 * vlib_node_runtime_t * node,`. */
1989 static void
1991  u32 * to_next, u32 n_bufs)
1992 {
1993  u32 n_trace = vlib_get_trace_count (vm, node);
1994  tcp_connection_t *tc;
1995  tcp_tx_trace_t *t;
1996  vlib_buffer_t *b;
1997  tcp_header_t *th;
1998  int i;
1999 
2000  for (i = 0; i < clib_min (n_trace, n_bufs); i++)
2001  {
2002  b = vlib_get_buffer (vm, to_next[i]);
2003  th = vlib_buffer_get_current (b);
2004  tc = tcp_connection_get (vnet_buffer (b)->tcp.connection_index,
2005  vm->thread_index);
2006  t = vlib_add_trace (vm, node, b, sizeof (*t));
/* Snapshot both header and connection so the trace survives changes. */
2007  clib_memcpy_fast (&t->tcp_header, th, sizeof (t->tcp_header));
2008  clib_memcpy_fast (&t->tcp_connection, tc, sizeof (t->tcp_connection));
2009  }
2010 }
2011 
/* Push the IP4/IP6 header in front of the TCP header and set up checksum
 * offload metadata.
 * NOTE(review): signature line (upstream 2013) lost in extraction —
 * presumably `tcp_output_push_ip (vlib_main_t * vm, vlib_buffer_t * b0,`. */
2012 always_inline void
2014  tcp_connection_t * tc0, u8 is_ip4)
2015 {
2016  tcp_header_t *th0 = 0;
2017 
2018  th0 = vlib_buffer_get_current (b0);
2019  TCP_EVT_DBG (TCP_EVT_OUTPUT, tc0, th0->flags, b0->current_length);
2020  if (is_ip4)
2021  {
2022  vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4,
2023  IP_PROTOCOL_TCP, 1);
/* Checksum computed by hw/offload path; zero the field now. */
2024  b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
2025  vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
2026  th0->checksum = 0;
2027  }
2028  else
2029  {
2030  ip6_header_t *ih0;
2031  ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6,
2032  &tc0->c_rmt_ip6, IP_PROTOCOL_TCP);
2033  b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
2034  vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data;
2035  vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data;
2036  th0->checksum = 0;
2037  }
2038 }
2039 
/* Per-packet output fixups: drop on closed connections, set fib/tx
 * interface metadata, handle ip6 link-local peers, reset delack.
 * NOTE(review): signature line (upstream 2041) lost in extraction —
 * presumably `tcp_output_handle_packet (tcp_connection_t * tc0,
 * vlib_buffer_t * b0,`. */
2040 always_inline void
2042  u32 * error0, u16 * next0, u8 is_ip4)
2043 {
2044 
2045  if (PREDICT_FALSE (tc0->state == TCP_STATE_CLOSED))
2046  {
2047  *error0 = TCP_ERROR_INVALID_CONNECTION;
2048  *next0 = TCP_OUTPUT_NEXT_DROP;
2049  return;
2050  }
2051 
2052  vnet_buffer (b0)->sw_if_index[VLIB_TX] = tc0->c_fib_index;
2053  vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0;
2054 
2055  if (!is_ip4)
2056  {
2057  if (PREDICT_FALSE (ip6_address_is_link_local_unicast (&tc0->c_rmt_ip6)))
2058  tcp_output_handle_link_local (tc0, b0, next0, error0);
2059  }
2060 
/* Outgoing segment carries an ack, so the pending delack is moot. */
2061  if (!TCP_ALWAYS_ACK)
2062  tcp_timer_reset (tc0, TCP_TIMER_DELACK);
2063 }
2064 
/* Shared ip4/ip6 tcp output node body: push IP headers, pick next node,
 * enqueue the whole frame.
 * NOTE(review): the first two signature lines (upstream 2065-2066,
 * presumably `always_inline uword` and `tcp46_output_inline (vlib_main_t *
 * vm, vlib_node_runtime_t * node,`) were lost in extraction. */
2067  vlib_frame_t * frame, int is_ip4)
2068 {
2069  u32 n_left_from, *from, thread_index = vm->thread_index;
2070  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b;
2071  u16 nexts[VLIB_FRAME_SIZE], *next;
2072 
2073  from = vlib_frame_vector_args (frame);
2074  n_left_from = frame->n_vectors;
2075  tcp_set_time_now (tcp_get_worker (thread_index));
2076 
/* NOTE(review): upstream line 2077 lost in extraction — presumably the
 * trace-enabled check guarding the call below. */
2078  tcp46_output_trace_frame (vm, node, from, n_left_from);
2079 
2080  vlib_get_buffers (vm, from, bufs, n_left_from);
2081  b = bufs;
2082  next = nexts;
2083 
/* Dual-loop: handle 2 packets per iteration, prefetch 2 ahead. */
2084  while (n_left_from >= 4)
2085  {
2086  u32 error0 = TCP_ERROR_PKTS_SENT, error1 = TCP_ERROR_PKTS_SENT;
2087  tcp_connection_t *tc0, *tc1;
2088 
2089  {
2090  vlib_prefetch_buffer_header (b[2], STORE);
2091  CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
2092 
2093  vlib_prefetch_buffer_header (b[3], STORE);
2094  CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
2095  }
2096 
2097  next[0] = next[1] = TCP_OUTPUT_NEXT_IP_LOOKUP;
2098 
2099  tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
2100  thread_index);
2101  tc1 = tcp_connection_get (vnet_buffer (b[1])->tcp.connection_index,
2102  thread_index);
2103 
2104  tcp_output_push_ip (vm, b[0], tc0, is_ip4);
2105  tcp_output_push_ip (vm, b[1], tc1, is_ip4);
2106 
2107  tcp_output_handle_packet (tc0, b[0], &error0, &next[0], is_ip4);
2108  tcp_output_handle_packet (tc1, b[1], &error1, &next[1], is_ip4);
2109 
2110  b += 2;
2111  next += 2;
2112  n_left_from -= 2;
2113  }
2114  while (n_left_from > 0)
2115  {
2116  u32 error0 = TCP_ERROR_PKTS_SENT;
2117  tcp_connection_t *tc0;
2118 
2119  if (n_left_from > 1)
2120  {
2121  vlib_prefetch_buffer_header (b[1], STORE);
2122  CLIB_PREFETCH (b[1]->data, 2 * CLIB_CACHE_LINE_BYTES, STORE);
2123  }
2124 
2125  next[0] = TCP_OUTPUT_NEXT_IP_LOOKUP;
2126  tc0 = tcp_connection_get (vnet_buffer (b[0])->tcp.connection_index,
2127  thread_index);
2128 
2129  tcp_output_push_ip (vm, b[0], tc0, is_ip4);
2130  tcp_output_handle_packet (tc0, b[0], &error0, &next[0], is_ip4);
2131 
2132  b += 1;
2133  next += 1;
2134  n_left_from -= 1;
2135  }
2136 
2137  vlib_buffer_enqueue_to_next (vm, node, from, nexts, frame->n_vectors);
2138  return frame->n_vectors;
2139 }
2140 
/* NOTE(review): the VLIB_NODE_FN signature line (upstream 2141) lost in
 * extraction — presumably `VLIB_NODE_FN (tcp4_output_node) (vlib_main_t *
 * vm, vlib_node_runtime_t * node,`. */
2142  vlib_frame_t * from_frame)
2143 {
2144  return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ );
2145 }
2146 
/* NOTE(review): the VLIB_NODE_FN signature line (upstream 2147) lost in
 * extraction — presumably `VLIB_NODE_FN (tcp6_output_node) (vlib_main_t *
 * vm, vlib_node_runtime_t * node,`. */
2148  vlib_frame_t * from_frame)
2149 {
2150  return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ );
2151 }
2152 
/* *INDENT-OFF* */
/* NOTE(review): registration header (upstream 2154, presumably
 * `VLIB_REGISTER_NODE (tcp4_output_node) =`) and the foreach macro line
 * (upstream 2165, presumably `foreach_tcp4_output_next`) lost in
 * extraction. */
2155 {
2156  .name = "tcp4-output",
2157  /* Takes a vector of packets. */
2158  .vector_size = sizeof (u32),
2159  .n_errors = TCP_N_ERROR,
2160  .protocol_hint = VLIB_NODE_PROTO_HINT_TCP,
2161  .error_strings = tcp_error_strings,
2162  .n_next_nodes = TCP_OUTPUT_N_NEXT,
2163  .next_nodes = {
2164 #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
2166 #undef _
2167  },
2168  .format_buffer = format_tcp_header,
2169  .format_trace = format_tcp_tx_trace,
2170 };
2171 /* *INDENT-ON* */
2172 
/* *INDENT-OFF* */
/* NOTE(review): registration header (upstream 2174, presumably
 * `VLIB_REGISTER_NODE (tcp6_output_node) =`) and the foreach macro line
 * (upstream 2185, presumably `foreach_tcp6_output_next`) lost in
 * extraction. */
2175 {
2176  .name = "tcp6-output",
2177  /* Takes a vector of packets. */
2178  .vector_size = sizeof (u32),
2179  .n_errors = TCP_N_ERROR,
2180  .protocol_hint = VLIB_NODE_PROTO_HINT_TCP,
2181  .error_strings = tcp_error_strings,
2182  .n_next_nodes = TCP_OUTPUT_N_NEXT,
2183  .next_nodes = {
2184 #define _(s,n) [TCP_OUTPUT_NEXT_##s] = n,
2186 #undef _
2187  },
2188  .format_buffer = format_tcp_header,
2189  .format_trace = format_tcp_tx_trace,
2190 };
2191 /* *INDENT-ON* */
2192 
/* Next-node indices for the tcp4/6-reset nodes.
 * NOTE(review): the enum members and closing typedef (upstream 2195-2198,
 * presumably TCP_RESET_NEXT_DROP, TCP_RESET_NEXT_IP_LOOKUP,
 * TCP_RESET_N_NEXT and `} tcp_reset_next_t;`) were lost in extraction. */
2193 typedef enum _tcp_reset_next
2194 {
2199 
2200 #define foreach_tcp4_reset_next \
2201  _(DROP, "error-drop") \
2202  _(IP_LOOKUP, "ip4-lookup")
2203 
2204 #define foreach_tcp6_reset_next \
2205  _(DROP, "error-drop") \
2206  _(IP_LOOKUP, "ip6-lookup")
2207 
/* Shared ip4/ip6 reset node body: rewrite each buffer in place into a RST
 * segment and hand it to ip lookup (or drop on failure).
 * NOTE(review): line carrying the function name (upstream 2209) lost in
 * extraction — presumably `tcp46_send_reset_inline (vlib_main_t * vm,
 * vlib_node_runtime_t * node,`. */
2208 static uword
2210  vlib_frame_t * from_frame, u8 is_ip4)
2211 {
2212  u32 n_left_from, next_index, *from, *to_next;
2213  u32 my_thread_index = vm->thread_index;
2214 
2215  from = vlib_frame_vector_args (from_frame);
2216  n_left_from = from_frame->n_vectors;
2217 
2218  next_index = node->cached_next_index;
2219 
2220  while (n_left_from > 0)
2221  {
2222  u32 n_left_to_next;
2223 
2224  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
2225 
2226  while (n_left_from > 0 && n_left_to_next > 0)
2227  {
2228  u32 bi0;
2229  vlib_buffer_t *b0;
2230  tcp_tx_trace_t *t0;
2231  tcp_header_t *th0;
2232  u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP;
2233 
2234  bi0 = from[0];
2235  to_next[0] = bi0;
2236  from += 1;
2237  to_next += 1;
2238  n_left_from -= 1;
2239  n_left_to_next -= 1;
2240 
2241  b0 = vlib_get_buffer (vm, bi0);
2242 
/* Nonzero return means the RST could not be built; drop instead. */
2243  if (tcp_make_reset_in_place (vm, b0, vnet_buffer (b0)->tcp.flags,
2244  my_thread_index, is_ip4))
2245  {
2246  error0 = TCP_ERROR_LOOKUP_DROPS;
2247  next0 = TCP_RESET_NEXT_DROP;
2248  goto done;
2249  }
2250 
2251  /* Prepare to send to IP lookup */
2252  vnet_buffer (b0)->sw_if_index[VLIB_TX] = ~0;
2253  next0 = TCP_RESET_NEXT_IP_LOOKUP;
2254 
2255  done:
2256  b0->error = node->errors[error0];
2257  b0->flags |= VNET_BUFFER_F_LOCALLY_ORIGINATED;
2258  if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
2259  {
/* Skip past the freshly written IP header to trace the TCP header. */
2260  th0 = vlib_buffer_get_current (b0);
2261  if (is_ip4)
2262  th0 = ip4_next_header ((ip4_header_t *) th0);
2263  else
2264  th0 = ip6_next_header ((ip6_header_t *) th0);
2265  t0 = vlib_add_trace (vm, node, b0, sizeof (*t0));
2266  clib_memcpy_fast (&t0->tcp_header, th0,
2267  sizeof (t0->tcp_header));
2268  }
2269 
2270  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
2271  n_left_to_next, bi0, next0);
2272  }
2273  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
2274  }
2275  return from_frame->n_vectors;
2276 }
2277 
/* NOTE(review): VLIB_NODE_FN signature line (upstream 2278) lost in
 * extraction — presumably `VLIB_NODE_FN (tcp4_reset_node) (vlib_main_t *
 * vm, vlib_node_runtime_t * node,`. */
2279  vlib_frame_t * from_frame)
2280 {
2281  return tcp46_send_reset_inline (vm, node, from_frame, 1);
2282 }
2283 
/* NOTE(review): VLIB_NODE_FN signature line (upstream 2284) lost in
 * extraction — presumably `VLIB_NODE_FN (tcp6_reset_node) (vlib_main_t *
 * vm, vlib_node_runtime_t * node,`. */
2285  vlib_frame_t * from_frame)
2286 {
2287  return tcp46_send_reset_inline (vm, node, from_frame, 0);
2288 }
2289 
/* *INDENT-OFF* */
/* NOTE(review): registration header (upstream 2291, presumably
 * `VLIB_REGISTER_NODE (tcp4_reset_node) = {`) and the foreach macro line
 * (upstream 2299, presumably `foreach_tcp4_reset_next`) lost in
 * extraction. */
2292  .name = "tcp4-reset",
2293  .vector_size = sizeof (u32),
2294  .n_errors = TCP_N_ERROR,
2295  .error_strings = tcp_error_strings,
2296  .n_next_nodes = TCP_RESET_N_NEXT,
2297  .next_nodes = {
2298 #define _(s,n) [TCP_RESET_NEXT_##s] = n,
2300 #undef _
2301  },
2302  .format_trace = format_tcp_tx_trace,
2303 };
2304 /* *INDENT-ON* */
2305 
/* *INDENT-OFF* */
/* NOTE(review): registration header (upstream 2307, presumably
 * `VLIB_REGISTER_NODE (tcp6_reset_node) = {`) and the foreach macro line
 * (upstream 2315, presumably `foreach_tcp6_reset_next`) lost in
 * extraction. */
2308  .name = "tcp6-reset",
2309  .vector_size = sizeof (u32),
2310  .n_errors = TCP_N_ERROR,
2311  .error_strings = tcp_error_strings,
2312  .n_next_nodes = TCP_RESET_N_NEXT,
2313  .next_nodes = {
2314 #define _(s,n) [TCP_RESET_NEXT_##s] = n,
2316 #undef _
2317  },
2318  .format_trace = format_tcp_tx_trace,
2319 };
2320 /* *INDENT-ON* */
2321 
2322 /*
2323  * fd.io coding-style-patch-verification: ON
2324  *
2325  * Local Variables:
2326  * eval: (c-set-style "gnu")
2327  * End:
2328  */
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
Definition: tcp_output.c:522
#define tcp_in_cong_recovery(tc)
Definition: tcp.h:353
u32 sw_if_index
Definition: ipsec_gre.api:37
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:124
void session_flush_frames_main_thread(vlib_main_t *vm)
Definition: session.c:1301
End of options.
Definition: tcp_packet.h:104
static u32 tcp_options_write(u8 *data, tcp_options_t *opts)
Write TCP options to segment.
Definition: tcp_output.c:193
static void tcp_rxt_timeout_cc(tcp_connection_t *tc)
Reset congestion control, switch cwnd to loss window and try again.
Definition: tcp_output.c:1432
u32 flags
Definition: vhost_user.h:115
#define clib_min(x, y)
Definition: clib.h:295
#define TCP_OPTION_LEN_EOL
Definition: tcp_packet.h:163
#define CLIB_UNUSED(x)
Definition: clib.h:82
u32 * pending_acks
vector of pending acks
Definition: tcp.h:407
#define tcp_in_recovery(tc)
Definition: tcp.h:344
void scoreboard_clear(sack_scoreboard_t *sb)
Definition: tcp_input.c:900
static f64 tcp_time_now_us(u32 thread_index)
Definition: tcp.h:797
static void tcp_retransmit_timer_set(tcp_connection_t *tc)
Definition: tcp.h:867
static u32 transport_rx_fifo_size(transport_connection_t *tc)
Definition: session.h:399
#define TCP_OPTION_LEN_SACK_PERMITTED
Definition: tcp_packet.h:167
static u32 vlib_get_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt)
Definition: trace_funcs.h:156
void tcp_timer_retransmit_handler(u32 index)
Definition: tcp_output.c:1629
static void vlib_buffer_free(vlib_main_t *vm, u32 *buffers, u32 n_buffers)
Free buffers Frees the entire buffer chain for each buffer.
Definition: buffer_funcs.h:865
ip4_address_t src_address
Definition: ip4_packet.h:170
int session_tx_fifo_peek_bytes(transport_connection_t *tc, u8 *buffer, u32 offset, u32 max_bytes)
Definition: session.c:445
#define TCP_TO_TIMER_TICK
Definition: tcp.h:98
Selective Ack permitted.
Definition: tcp_packet.h:108
#define TCP_FLAG_SYN
Definition: fa_node.h:13
#define TCP_MIN_RX_FIFO_SIZE
Definition: tcp.h:37
#define tcp_opts_tstamp(_to)
Definition: tcp_packet.h:157
void tcp_make_synack(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
Definition: tcp_output.c:556
#define PREDICT_TRUE(x)
Definition: clib.h:112
i16 current_data
signed offset in data[], pre_data[] that we are currently processing.
Definition: buffer.h:110
static void tcp_flush_frame_to_ip_lookup(tcp_worker_ctx_t *wrk, u8 is_ip4)
Flush ip lookup tx frames populated by timer pops.
Definition: tcp_output.c:997
static tcp_connection_t * tcp_connection_get_if_valid(u32 conn_index, u32 thread_index)
Definition: tcp.h:556
static int tcp_make_syn_options(tcp_options_t *opts, u8 wnd_scale)
Definition: tcp_output.c:274
#define clib_memcpy_fast(a, b, c)
Definition: string.h:81
clib_memset(h->entries, 0, sizeof(h->entries[0]) *entries)
struct _sack_scoreboard sack_scoreboard_t
IP unicast adjacency.
Definition: adj.h:221
u32 fib_table_get_index_for_sw_if_index(fib_protocol_t proto, u32 sw_if_index)
Get the index of the FIB bound to the interface.
Definition: fib_table.c:956
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
Definition: tcp.h:601
void tcp_update_rcv_mss(tcp_connection_t *tc)
Update max segment size we&#39;re able to process.
Definition: tcp_output.c:85
vlib_frame_t * tx_frames[2]
tx frames for tcp 4/6 output nodes
Definition: tcp.h:389
void tcp_send_acks(tcp_worker_ctx_t *wrk)
Definition: tcp_output.c:1193
struct _tcp_main tcp_main_t
u32 thread_index
Definition: main.h:197
This packet is to be rewritten and forwarded to the next processing node.
Definition: adj.h:73
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:113
u8 data[0]
Packet data.
Definition: buffer.h:181
#define TCP_OPTS_ALIGN
Definition: tcp_packet.h:174
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:522
static u32 tcp_initial_wnd_unscaled(tcp_connection_t *tc)
TCP&#39;s initial window.
Definition: tcp_output.c:95
enum _tcp_output_next tcp_output_next_t
int i
static u32 format_get_indent(u8 *s)
Definition: format.h:72
struct _tcp_connection tcp_connection_t
u8 * format(u8 *s, const char *fmt,...)
Definition: format.c:424
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
Definition: tcp.h:747
#define tcp_opts_sack(_to)
Definition: tcp_packet.h:159
u8 data[128]
Definition: ipsec.api:248
#define VLIB_NODE_FN(node)
Definition: node.h:201
static void tcp_push_ip_hdr(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, vlib_buffer_t *b)
Definition: tcp_output.c:887
#define vec_validate_aligned(V, I, A)
Make sure vector is long enough for given index (no header, specified alignment)
Definition: vec.h:450
static uword tcp46_send_reset_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *from_frame, u8 is_ip4)
Definition: tcp_output.c:2209
vlib_error_t * errors
Vector of errors for this node.
Definition: node.h:468
No operation.
Definition: tcp_packet.h:105
int tcp_fast_retransmit_no_sack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Fast retransmit without SACK info.
Definition: tcp_output.c:1883
u8 n_sack_blocks
Number of SACKs blocks.
Definition: tcp_packet.h:152
static void tcp_enqueue_to_output_i(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4, u8 flush)
Definition: tcp_output.c:629
struct _tcp_header tcp_header_t
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
Definition: tcp.c:170
#define scoreboard_rescue_rxt_valid(_sb, _tc)
Definition: tcp_output.c:1772
ip6_address_t src_address
Definition: ip6_packet.h:385
unsigned char u8
Definition: types.h:56
struct _sack_scoreboard_hole sack_scoreboard_hole_t
u8 wscale
Window scale advertised.
Definition: tcp_packet.h:148
enum fib_protocol_t_ fib_protocol_t
Protocol Type.
#define TCP_OPTS_MAX_SACK_BLOCKS
Definition: tcp_packet.h:175
#define TCP_MAX_RX_FIFO_SIZE
Definition: tcp.h:36
vlib_node_registration_t ip4_lookup_node
(constructor) VLIB_REGISTER_NODE (ip4_lookup_node)
Definition: ip4_forward.c:104
static void tcp_timer_retransmit_handler_i(u32 index, u8 is_syn)
Definition: tcp_output.c:1459
#define foreach_tcp4_reset_next
Definition: tcp_output.c:2200
static u32 tcp_prepare_retransmit_segment(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 offset, u32 max_deq_bytes, vlib_buffer_t **b)
Build a retransmit segment.
Definition: tcp_output.c:1381
u16 src_port
Definition: udp.api:41
Limit MSS.
Definition: tcp_packet.h:106
static uword tcp46_output_inline(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame, int is_ip4)
Definition: tcp_output.c:2066
static void * tcp_init_buffer(vlib_main_t *vm, vlib_buffer_t *b)
Definition: tcp_output.c:470
static ip_adjacency_t * adj_get(adj_index_t adj_index)
Get a pointer to an adjacency object from its index.
Definition: adj.h:433
void tcp_make_syn(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to SYN.
Definition: tcp_output.c:531
static int tcp_prepare_segment(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 offset, u32 max_deq_bytes, vlib_buffer_t **b)
Allocate a new buffer and build a new tcp segment.
Definition: tcp_output.c:1269
#define seq_gt(_s1, _s2)
Definition: tcp.h:641
sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
Definition: tcp_input.c:658
#define always_inline
Definition: clib.h:98
#define TCP_OPTION_LEN_SACK_BLOCK
Definition: tcp_packet.h:169
ip4_address_t dst_address
Definition: ip4_packet.h:170
#define TCP_FLAG_ACK
Definition: fa_node.h:16
u8 * format_white_space(u8 *s, va_list *va)
Definition: std-formats.c:113
tcp_main_t tcp_main
Definition: tcp.c:29
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
Definition: tcp.h:526
#define vlib_prefetch_buffer_header(b, type)
Prefetch buffer metadata.
Definition: buffer.h:203
vlib_frame_t * vlib_get_frame_to_node(vlib_main_t *vm, u32 to_node_index)
Definition: main.c:187
enum _tcp_state tcp_state_t
#define TCP_ALWAYS_ACK
On/off delayed acks.
Definition: tcp.h:39
#define TCP_RTO_MAX
Definition: tcp.h:110
vhost_vring_state_t state
Definition: vhost_user.h:120
static void * ip4_next_header(ip4_header_t *i)
Definition: ip4_packet.h:241
static u32 tcp_time_now(void)
Definition: tcp.h:785
sack_block_t * sacks
SACK blocks.
Definition: tcp_packet.h:151
unsigned int u32
Definition: types.h:88
static void tcp46_output_trace_frame(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *to_next, u32 n_bufs)
Definition: tcp_output.c:1990
#define TCP_ESTABLISH_TIME
Definition: tcp.h:101
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
Definition: tcp_input.c:822
#define tcp_validate_txf_size(_tc, _a)
Definition: tcp.h:930
#define VLIB_FRAME_SIZE
Definition: node.h:376
static void tcp_enqueue_to_ip_lookup_now(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4, u32 fib_index)
Definition: tcp_output.c:613
static void tcp_push_hdr_i(tcp_connection_t *tc, vlib_buffer_t *b, u32 snd_nxt, u8 compute_opts, u8 maybe_burst, u8 update_snd_nxt)
Push TCP header and update connection variables.
Definition: tcp_output.c:1070
#define TCP_EVT_DBG(_evt, _args...)
Definition: tcp_debug.h:241
static u32 vlib_get_buffer_index(vlib_main_t *vm, void *p)
Translate buffer pointer into buffer index.
Definition: buffer_funcs.h:257
u32 tcp_session_push_header(transport_connection_t *tconn, vlib_buffer_t *b)
Definition: tcp_output.c:1129
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:828
#define TCP_OPTION_LEN_WINDOW_SCALE
Definition: tcp_packet.h:166
vlib_node_registration_t tcp6_reset_node
(constructor) VLIB_REGISTER_NODE (tcp6_reset_node)
Definition: tcp_output.c:2307
#define TCP_RTO_SYN_RETRIES
Definition: tcp.h:113
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:136
int tcp_fast_retransmit_sack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Do fast retransmit with SACKs.
Definition: tcp_output.c:1780
#define tcp_trajectory_add_start(b, start)
Definition: tcp.h:539
#define TRANSPORT_MAX_HDRS_LEN
vlib_main_t * vm
convenience pointer to this thread&#39;s vlib main
Definition: tcp.h:413
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
Definition: tcp_output.c:841
void tcp_send_synack(tcp_connection_t *tc)
Definition: tcp_output.c:955
#define ADJ_INDEX_INVALID
Invalid ADJ index - used when no adj is known likewise blazoned capitals INVALID speak volumes where ...
Definition: adj_types.h:36
static int tcp_make_synack_options(tcp_connection_t *tc, tcp_options_t *opts)
Definition: tcp_output.c:303
static void * vlib_buffer_make_headroom(vlib_buffer_t *b, u8 size)
Make head room, typically for packet headers.
Definition: buffer.h:350
#define tcp_in_fastrecovery(tc)
Definition: tcp.h:343
void tcp_connection_tx_pacer_reset(tcp_connection_t *tc, u32 window, u32 start_bucket)
Definition: tcp.c:1229
static void * vlib_buffer_push_tcp_net_order(vlib_buffer_t *b, u16 sp, u16 dp, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
Definition: tcp.h:966
#define tcp_opts_mss(_to)
Definition: tcp_packet.h:156
unsigned short u16
Definition: types.h:57
void tcp_flush_frames_to_output(tcp_worker_ctx_t *wrk)
Flush v4 and v6 tcp and ip-lookup tx frames for thread index.
Definition: tcp_output.c:1013
void vlib_put_frame_to_node(vlib_main_t *vm, u32 to_node_index, vlib_frame_t *f)
Definition: main.c:196
static void * vlib_buffer_get_current(vlib_buffer_t *b)
Get pointer to current data to process.
Definition: buffer.h:229
#define TCP_TIMER_HANDLE_INVALID
Definition: tcp.h:95
static void tcp_output_handle_link_local(tcp_connection_t *tc0, vlib_buffer_t *b0, u16 *next0, u32 *error0)
Definition: tcp_output.c:1958
#define foreach_tcp6_output_next
Definition: tcp_output.c:34
static u32 tcp_flight_size(const tcp_connection_t *tc)
Our estimate of the number of bytes in flight (pipe size)
Definition: tcp.h:665
#define PREDICT_FALSE(x)
Definition: clib.h:111
static int tcp_make_reset_in_place(vlib_main_t *vm, vlib_buffer_t *b0, tcp_state_t state, u8 thread_index, u8 is_ip4)
Definition: tcp_output.c:676
#define TCP_FLAG_FIN
Definition: fa_node.h:12
static u8 tcp_window_compute_scale(u32 window)
Definition: tcp_output.c:70
void tcp_timer_retransmit_syn_handler(u32 index)
Definition: tcp_output.c:1635
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Finish enqueueing one buffer forward in the graph.
Definition: buffer_node.h:218
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Get pointer to next frame vector data by (vlib_node_runtime_t, next_index).
Definition: node_funcs.h:338
#define TCP_OPTION_LEN_TIMESTAMP
Definition: tcp_packet.h:168
#define foreach_tcp4_output_next
Definition: tcp_output.c:28
#define TCP_WND_MAX
Definition: tcp_packet.h:172
static void tcp_enqueue_to_ip_lookup(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4, u32 fib_index)
Definition: tcp_output.c:620
Selective Ack block.
Definition: tcp_packet.h:109
#define TCP_FLAG_RST
Definition: fa_node.h:14
#define TCP_DBG(_fmt, _args...)
Definition: tcp_debug.h:89
u8 len
Definition: ip_types.api:49
#define TCP_MAX_WND_SCALE
Definition: tcp_packet.h:173
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
Definition: tcp.h:839
This packet matches an "incomplete adjacency" and packets need to be passed to ARP to find rewrite st...
Definition: adj.h:63
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:169
static void * vlib_buffer_push_tcp(vlib_buffer_t *b, u16 sp_net, u16 dp_net, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
Definition: tcp.h:1001
tcp_header_t tcp_header
Definition: tcp_output.c:48
u16 n_vectors
Definition: node.h:395
static_always_inline uword vlib_get_thread_index(void)
Definition: threads.h:212
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:80
vlib_main_t * vm
Definition: buffer.c:312
static_always_inline void vlib_buffer_enqueue_to_next(vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts, uword count)
Definition: buffer_node.h:332
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u32 thread_index, u8 is_ip4)
Send reset without reusing existing buffer.
Definition: tcp_output.c:761
#define clib_warning(format, args...)
Definition: error.h:59
format_function_t format_tcp_header
Definition: format.h:101
struct _transport_connection transport_connection_t
#define TCP_USE_SACKS
Disable only for testing.
Definition: tcp.h:40
#define tcp_recovery_on(tc)
Definition: tcp.h:341
static u32 tcp_window_to_advertise(tcp_connection_t *tc, tcp_state_t state)
Compute and return window to advertise, scaled as per RFC1323.
Definition: tcp_output.c:180
#define tcp_fastrecovery_first(tc)
Definition: tcp.h:349
u32 adj_index_t
An index for adjacencies.
Definition: adj_types.h:30
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Release pointer to next frame vector data.
Definition: main.c:458
u16 mss
Option flags, see above.
Definition: tcp_packet.h:147
static void tcp_output_handle_packet(tcp_connection_t *tc0, vlib_buffer_t *b0, u32 *error0, u16 *next0, u8 is_ip4)
Definition: tcp_output.c:2041
static void * ip6_next_header(ip6_header_t *i)
Definition: ip6_packet.h:412
static void tcp_make_ack(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to ACK.
Definition: tcp_output.c:511
void tcp_program_fastretransmit(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_input.c:1301
static u32 transport_max_tx_dequeue(transport_connection_t *tc)
Definition: session.h:385
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
Definition: tcp.h:852
u16 ip6_tcp_udp_icmp_compute_checksum(vlib_main_t *vm, vlib_buffer_t *p0, ip6_header_t *ip0, int *bogus_lengthp)
Definition: ip6_forward.c:947
signed int i32
Definition: types.h:77
vlib_node_registration_t ip6_lookup_node
(constructor) VLIB_REGISTER_NODE (ip6_lookup_node)
Definition: ip6_forward.c:552
static int tcp_make_established_options(tcp_connection_t *tc, tcp_options_t *opts)
Definition: tcp_output.c:338
u16 cached_next_index
Next frame index that vector arguments were last enqueued to last time this node ran.
Definition: node.h:513
#define ASSERT(truth)
static void tcp_output_push_ip(vlib_main_t *vm, vlib_buffer_t *b0, tcp_connection_t *tc0, u8 is_ip4)
Definition: tcp_output.c:2013
#define tcp_syn(_th)
Definition: tcp_packet.h:80
static u8 * format_tcp_tx_trace(u8 *s, va_list *args)
Definition: tcp_output.c:53
u16 ip4_tcp_udp_compute_checksum(vlib_main_t *vm, vlib_buffer_t *p0, ip4_header_t *ip0)
Definition: ip4_forward.c:1128
void tcp_update_burst_snd_vars(tcp_connection_t *tc)
Update burst send vars.
Definition: tcp_output.c:408
#define seq_geq(_s1, _s2)
Definition: tcp.h:642
void tcp_init_mss(tcp_connection_t *tc)
Definition: tcp_output.c:428
static uword ip6_address_is_link_local_unicast(const ip6_address_t *a)
Definition: ip6_packet.h:328
#define tcp_fastrecovery_first_off(tc)
Definition: tcp.h:351
static void tcp_update_rcv_wnd(tcp_connection_t *tc)
Definition: tcp_output.c:134
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
Definition: tcp_output.c:1025
#define clib_max(x, y)
Definition: clib.h:288
static u32 tcp_time_now_w_thread(u32 thread_index)
Definition: tcp.h:791
void tcp_send_ack(tcp_connection_t *tc)
Definition: tcp_output.c:1152
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:57
#define seq_lt(_s1, _s2)
Definition: tcp.h:639
int tcp_retransmit_first_unacked(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Retransmit first unacked segment.
Definition: tcp_output.c:1726
template key/value backing page structure
Definition: bihash_doc.h:44
u32 ip_version_traffic_class_and_flow_label
Definition: ip6_packet.h:372
#define tcp_opts_wscale(_to)
Definition: tcp_packet.h:158
Definition: defs.h:47
u32 tsval
Timestamp value.
Definition: tcp_packet.h:149
u32 tsecr
Echoed/reflected time stamp.
Definition: tcp_packet.h:150
static void * vlib_buffer_push_ip6(vlib_main_t *vm, vlib_buffer_t *b, ip6_address_t *src, ip6_address_t *dst, int proto)
Push IPv6 header to buffer.
Definition: ip6.h:658
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
ip_lookup_next_t lookup_next_index
Next hop after ip4-lookup.
Definition: adj.h:236
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:140
#define foreach_tcp6_reset_next
Definition: tcp_output.c:2204
static int tcp_fast_retransmit_unsent(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Definition: tcp_output.c:1745
#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b)
Definition: buffer.h:451
static tcp_worker_ctx_t * tcp_get_worker(u32 thread_index)
Definition: tcp.h:520
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
Definition: tcp.h:911
VLIB buffer representation.
Definition: buffer.h:102
u64 uword
Definition: types.h:112
#define seq_max(_s1, _s2)
Definition: tcp.h:643
static void tcp_enqueue_to_ip_lookup_i(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4, u32 fib_index, u8 flush)
Definition: tcp_output.c:577
static void * vlib_frame_vector_args(vlib_frame_t *f)
Get pointer to frame vector data.
Definition: node_funcs.h:244
static void tcp_make_ack_i(tcp_connection_t *tc, vlib_buffer_t *b, tcp_state_t state, u8 flags)
Prepare ACK.
Definition: tcp_output.c:486
void tcp_timer_delack_handler(u32 index)
Delayed ack timer handler.
Definition: tcp_output.c:1246
#define TCP_OPTION_LEN_MSS
Definition: tcp_packet.h:165
void tcp_flush_frame_to_output(tcp_worker_ctx_t *wrk, u8 is_ip4)
Flush tx frame populated by retransmits and timer pops.
Definition: tcp_output.c:982
struct clib_bihash_value offset
template key/value backing page structure
static void tcp_retransmit_timer_force_update(tcp_connection_t *tc)
Definition: tcp.h:881
int tcp_fast_retransmit(tcp_worker_ctx_t *wrk, tcp_connection_t *tc, u32 burst_size)
Do fast retransmit.
Definition: tcp_output.c:1947
u8 * format_tcp_connection(u8 *s, va_list *args)
Definition: tcp.c:906
u32 tcp_initial_window_to_advertise(tcp_connection_t *tc)
Compute initial window and scale factor.
Definition: tcp_output.c:114
#define vnet_buffer(b)
Definition: buffer.h:369
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
Definition: tcp.h:547
void tcp_update_rto(tcp_connection_t *tc)
Definition: tcp_input.c:465
static u32 vlib_num_workers()
Definition: threads.h:366
#define TCP_OPTION_LEN_NOOP
Definition: tcp_packet.h:164
void tcp_send_syn(tcp_connection_t *tc)
Send SYN.
Definition: tcp_output.c:919
vlib_node_registration_t tcp6_output_node
(constructor) VLIB_REGISTER_NODE (tcp6_output_node)
Definition: tcp_output.c:2174
u16 flags
Copy of main node flags.
Definition: node.h:507
Window scale.
Definition: tcp_packet.h:107
void tcp_program_ack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_output.c:1171
enum _tcp_reset_next tcp_reset_next_t
static u32 transport_max_rx_enqueue(transport_connection_t *tc)
Definition: session.h:378
#define tcp_opts_sack_permitted(_to)
Definition: tcp_packet.h:160
static void vlib_buffer_free_one(vlib_main_t *vm, u32 buffer_index)
Free one buffer Shorthand to free a single buffer chain.
Definition: buffer_funcs.h:898
tcp_connection_t tcp_connection
Definition: tcp_output.c:49
static u32 tcp_loss_wnd(const tcp_connection_t *tc)
Definition: tcp.h:720
u16 dst_port
Definition: udp.api:42
vlib_frame_t * ip_lookup_tx_frames[2]
tx frames for ip 4/6 lookup nodes
Definition: tcp.h:392
static void * tcp_reuse_buffer(vlib_main_t *vm, vlib_buffer_t *b)
Definition: tcp_output.c:453
u8 ip_version_and_header_length
Definition: ip4_packet.h:138
Timestamps.
Definition: tcp_packet.h:110
static_always_inline void vlib_get_buffers(vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, int count)
Translate array of buffer indices into buffer pointers.
Definition: buffer_funcs.h:244
static void tcp_enqueue_to_output_now(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4)
Definition: tcp_output.c:668
vlib_node_registration_t tcp4_reset_node
(constructor) VLIB_REGISTER_NODE (tcp4_reset_node)
Definition: tcp_output.c:2291
#define VLIB_NODE_FLAG_TRACE
Definition: node.h:301
vlib_node_registration_t tcp4_output_node
(constructor) VLIB_REGISTER_NODE (tcp4_output_node)
Definition: tcp_output.c:2154
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:59
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:167
static void tcp_enqueue_to_output(tcp_worker_ctx_t *wrk, vlib_buffer_t *b, u32 bi, u8 is_ip4)
Definition: tcp_output.c:661
static u32 vlib_buffer_alloc(vlib_main_t *vm, u32 *buffers, u32 n_buffers)
Allocate buffers into supplied array.
Definition: buffer_funcs.h:612
static void tcp_persist_timer_set(tcp_connection_t *tc)
Definition: tcp.h:888
static tcp_main_t * vnet_get_tcp_main()
Definition: tcp.h:514
static char * tcp_error_strings[]
Definition: tcp_output.c:40
static void * vlib_buffer_push_ip4(vlib_main_t *vm, vlib_buffer_t *b, ip4_address_t *src, ip4_address_t *dst, int proto, u8 csum_offload)
Push IPv4 header to buffer.
Definition: ip4.h:370
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:85
void tcp_cc_fastrecovery_exit(tcp_connection_t *tc)
Definition: tcp_input.c:1187
static u32 tcp_set_time_now(tcp_worker_ctx_t *wrk)
Definition: tcp.h:803
void tcp_timer_persist_handler(u32 index)
Got 0 snd_wnd from peer, try to do something about it.
Definition: tcp_output.c:1645
#define tcp_ack(_th)
Definition: tcp_packet.h:83
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
Definition: tcp.h:925
Definition: defs.h:46
ip6_address_t dst_address
Definition: ip6_packet.h:385
u32 * tx_buffers
tx buffer free list
Definition: tcp.h:386
adj_index_t adj_nbr_find(fib_protocol_t nh_proto, vnet_link_t link_type, const ip46_address_t *nh_addr, u32 sw_if_index)
Lookup neighbor adjancency.
Definition: adj_nbr.c:99
static int tcp_make_options(tcp_connection_t *tc, tcp_options_t *opts, tcp_state_t state)
Definition: tcp_output.c:373
void tcp_program_dupack(tcp_worker_ctx_t *wrk, tcp_connection_t *tc)
Definition: tcp_output.c:1181