27 #define TCP_TICK 0.001 28 #define THZ (u32) (1/TCP_TICK) 29 #define TCP_TSTAMP_RESOLUTION TCP_TICK 30 #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ 31 #define TCP_FIB_RECHECK_PERIOD 1 * THZ 32 #define TCP_MAX_OPTION_SPACE 40 34 #define TCP_DUPACK_THRESHOLD 3 35 #define TCP_MAX_RX_FIFO_SIZE 32 << 20 36 #define TCP_MIN_RX_FIFO_SIZE 4 << 10 37 #define TCP_IW_N_SEGMENTS 10 38 #define TCP_ALWAYS_ACK 1 39 #define TCP_USE_SACKS 1 42 #define foreach_tcp_fsm_state \ 45 _(SYN_SENT, "SYN_SENT") \ 46 _(SYN_RCVD, "SYN_RCVD") \ 47 _(ESTABLISHED, "ESTABLISHED") \ 48 _(CLOSE_WAIT, "CLOSE_WAIT") \ 49 _(FIN_WAIT_1, "FIN_WAIT_1") \ 50 _(LAST_ACK, "LAST_ACK") \ 51 _(CLOSING, "CLOSING") \ 52 _(FIN_WAIT_2, "FIN_WAIT_2") \ 53 _(TIME_WAIT, "TIME_WAIT") 55 typedef enum _tcp_state
57 #define _(sym, str) TCP_STATE_##sym, 69 #define foreach_tcp_timer \ 70 _(RETRANSMIT, "RETRANSMIT") \ 71 _(DELACK, "DELAYED ACK") \ 72 _(PERSIST, "PERSIST") \ 74 _(WAITCLOSE, "WAIT CLOSE") \ 75 _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ 76 _(ESTABLISH, "ESTABLISH") 78 typedef enum _tcp_timers
80 #define _(sym, str) TCP_TIMER_##sym, 93 #define TCP_TIMER_HANDLE_INVALID ((u32) ~0) 96 #define TCP_TO_TIMER_TICK TCP_TICK*10 98 #define TCP_DELACK_TIME 1 99 #define TCP_ESTABLISH_TIME 750 100 #define TCP_SYN_RCVD_TIME 600 101 #define TCP_2MSL_TIME 300 102 #define TCP_CLOSEWAIT_TIME 20 103 #define TCP_TIMEWAIT_TIME 100 104 #define TCP_CLEANUP_TIME 10 105 #define TCP_TIMER_PERSIST_MIN 2 107 #define TCP_RTO_MAX 60 * THZ 108 #define TCP_RTO_MIN 0.2 * THZ 109 #define TCP_RTT_MAX 30 * THZ 110 #define TCP_RTO_SYN_RETRIES 3 111 #define TCP_RTO_INIT 1 * THZ 114 #define foreach_tcp_connection_flag \ 115 _(SNDACK, "Send ACK") \ 116 _(FINSNT, "FIN sent") \ 117 _(SENT_RCV_WND0, "Sent 0 rcv_wnd") \ 118 _(RECOVERY, "Recovery") \ 119 _(FAST_RECOVERY, "Fast Recovery") \ 120 _(FR_1_SMSS, "Sent 1 SMSS") \ 121 _(HALF_OPEN_DONE, "Half-open completed") \ 122 _(FINPNDG, "FIN pending") \ 124 typedef enum _tcp_connection_flag_bits
126 #define _(sym, str) TCP_CONN_##sym##_BIT, 132 typedef enum _tcp_connection_flag
134 #define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, 141 #define foreach_tcp_buf_flag \ 147 #define _(f) TCP_BUF_BIT_##f, 155 #define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f, 160 #define TCP_SCOREBOARD_TRACE (0) 161 #define TCP_MAX_SACK_BLOCKS 15 162 #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) 164 typedef struct _scoreboard_trace_elt
173 typedef struct _sack_scoreboard_hole
182 typedef struct _sack_scoreboard
188 u32 last_sacked_bytes;
189 u32 last_bytes_delivered;
197 #if TCP_SCOREBOARD_TRACE 203 #if TCP_SCOREBOARD_TRACE 204 #define tcp_scoreboard_trace_add(_tc, _ack) \ 206 static u64 _group = 0; \ 207 sack_scoreboard_t *_sb = &_tc->sack_sb; \ 208 sack_block_t *_sack, *_sacks; \ 209 scoreboard_trace_elt_t *_elt; \ 212 _sacks = _tc->rcv_opts.sacks; \ 213 for (i = 0; i < vec_len (_sacks); i++) \ 215 _sack = &_sacks[i]; \ 216 vec_add2 (_sb->trace, _elt, 1); \ 217 _elt->start = _sack->start; \ 218 _elt->end = _sack->end; \ 219 _elt->ack = _elt->end == _ack ? _ack : 0; \ 220 _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \ 221 _elt->group = _group; \ 225 #define tcp_scoreboard_trace_add(_tc, _ack) 230 start,
u8 have_sent_1_smss,
246 typedef enum _tcp_cc_algorithm_type
253 typedef enum _tcp_cc_ack_t
260 typedef struct _tcp_connection
292 u32 tsval_recent_age;
324 u32 limited_transmit;
329 struct _tcp_cc_algorithm
338 #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY 339 #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY 340 #define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY 341 #define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY 342 #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) 343 #define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) 344 #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) 345 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) 346 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) 347 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) 349 #define tcp_in_cong_recovery(tc) ((tc)->flags & \ 350 (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) 355 tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
359 typedef enum _tcp_error
361 #define tcp_error(n,s) TCP_ERROR_##n, 367 typedef struct _tcp_lookup_dispatch
376 tw_timer_wheel_16t_2w_512sl_t timer_wheel;
387 typedef struct _tcp_main
398 u8 log2_tstamp_clocks_per_tick;
399 f64 tstamp_ticks_per_clock;
412 u32 bytes_per_buffer;
426 u32 preallocated_connections;
427 u32 preallocated_half_open_connections;
431 u32 last_v4_address_rotor;
432 u32 last_v6_address_rotor;
439 f64 buffer_fail_fraction;
462 #if (VLIB_BUFFER_TRACE_TRAJECTORY) 463 #define tcp_trajectory_add_start(b, start) \ 465 (*vlib_buffer_trace_trajectory_cb) (b, start); \ 468 #define tcp_trajectory_add_start(b, start) 487 if (
tcp_main.connections[thread_index] == 0)
553 #define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) 554 #define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) 555 #define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) 556 #define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) 557 #define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) 560 #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) 561 #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) 570 return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
572 return tc->rcv_dupacks * tc->snd_mss;
583 flight_size = (int) (tc->snd_una_max - tc->snd_una) -
tcp_bytes_out (tc)
590 (
"Negative: %u %u %u dupacks %u sacked bytes %u flags %d",
592 tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes,
606 if (tc->snd_mss > 2190)
607 return 2 * tc->snd_mss;
608 else if (tc->snd_mss > 1095)
609 return 3 * tc->snd_mss;
611 return 4 * tc->snd_mss;
623 return clib_min (tc->cwnd, tc->snd_wnd);
630 int flight_size = (int) (tc->snd_nxt - tc->snd_una);
632 if (available_wnd <= flight_size)
635 return available_wnd - flight_size;
647 if (available_wnd <= flight_size)
650 return available_wnd - flight_size;
656 if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
686 return tcp_main.wrk_ctx[thread_index].time_now;
700 tc->flags = TCP_CONN_SNDACK;
701 vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
709 tc->timers[timer_id] =
710 tw_timer_start_16t_2w_512sl (&
tcp_main.
711 wrk_ctx[tc->c_thread_index].timer_wheel,
712 tc->c_c_index, timer_id, interval);
722 tw_timer_stop_16t_2w_512sl (&
tcp_main.
723 wrk_ctx[tc->c_thread_index].timer_wheel,
724 tc->timers[timer_id]);
733 tw_timer_update_16t_2w_512sl (&
tcp_main.
734 wrk_ctx[tc->c_thread_index].timer_wheel,
735 tc->timers[timer_id], interval);
737 tc->timers[timer_id] =
738 tw_timer_start_16t_2w_512sl (&
tcp_main.
739 wrk_ctx[tc->c_thread_index].timer_wheel,
740 tc->c_c_index, timer_id, interval);
746 ASSERT (tc->snd_una != tc->snd_una_max);
790 if (tc->snd_una == tc->snd_una_max)
793 if (tc->snd_wnd < tc->snd_mss)
807 #define tcp_validate_txf_size(_tc, _a) \ 808 ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ 809 || session_tx_fifo_max_dequeue (&_tc->connection) >= _a) 845 th->seq_number = seq;
846 th->ack_number = ack;
847 th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
851 th->urgent_pointer = 0;
874 clib_host_to_net_u32 (seq),
875 clib_host_to_net_u32 (ack),
876 tcp_hdr_opts_len,
flags,
877 clib_host_to_net_u16 (wnd));
static vlib_cli_command_t trace
(constructor) VLIB_CLI_COMMAND (trace)
#define CLIB_CACHE_LINE_ALIGN_MARK(mark)
#define tcp_fastrecovery_1_smss_off(tc)
void scoreboard_clear(sack_scoreboard_t *sb)
static void tcp_retransmit_timer_set(tcp_connection_t *tc)
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
struct _sack_block sack_block_t
void tcp_cc_init_congestion(tcp_connection_t *tc)
Init loss recovery/fast recovery.
struct _scoreboard_trace_elt scoreboard_trace_elt_t
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
struct _transport_connection transport_connection_t
#define TCP_TO_TIMER_TICK
vlib_node_registration_t tcp4_output_node
(constructor) VLIB_REGISTER_NODE (tcp4_output_node)
void scoreboard_init(sack_scoreboard_t *sb)
tcp_connection_t * tcp_connection_new(u8 thread_index)
void tcp_fast_retransmit(tcp_connection_t *tc)
Do fast retransmit.
static u32 tcp_bytes_out(const tcp_connection_t *tc)
Our estimate of the number of bytes that have left the network.
static tcp_connection_t * tcp_connection_get_if_valid(u32 conn_index, u32 thread_index)
void tcp_connection_del(tcp_connection_t *tc)
Connection removal.
struct _sack_scoreboard sack_scoreboard_t
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
u32 tcp_snd_space(tcp_connection_t *tc)
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
#define VLIB_BUFFER_PRE_DATA_SIZE
static_always_inline void clib_spinlock_unlock_if_init(clib_spinlock_t *p)
struct _tcp_main tcp_main_t
vlib_node_registration_t tcp6_output_node
(constructor) VLIB_REGISTER_NODE (tcp6_output_node)
static u64 clib_cpu_time_now(void)
timer_expiration_handler tcp_timer_retransmit_handler
u8 * format_tcp_scoreboard(u8 *s, va_list *args)
struct _tcp_lookup_dispatch tcp_lookup_dispatch_t
void tcp_update_burst_snd_vars(tcp_connection_t *tc)
Update burst send vars.
struct _tcp_connection tcp_connection_t
static u32 tcp_available_cc_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
static u32 tcp_available_snd_wnd(const tcp_connection_t *tc)
void tcp_fast_retransmit_sack(tcp_connection_t *tc)
Do fast retransmit with SACKs.
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u8 is_ip4)
Send reset without reusing existing buffer.
format_function_t format_tcp_flags
struct _tcp_header tcp_header_t
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
struct _sack_scoreboard_hole sack_scoreboard_hole_t
void tcp_flush_frame_to_output(vlib_main_t *vm, u8 thread_index, u8 is_ip4)
Flush tx frame populated by retransmits and timer pops.
u8 * tcp_scoreboard_replay(u8 *s, tcp_connection_t *tc, u8 verbose)
i16 current_data
Signed offset in data[], pre_data[] that we are currently processing.
sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
static u32 tcp_available_output_snd_space(const tcp_connection_t *tc)
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
static timer_callback_t * timers
enum _tcp_state tcp_state_t
vhost_vring_state_t state
timer_expiration_handler tcp_timer_retransmit_syn_handler
static u32 tcp_time_now(void)
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
void tcp_api_reference(void)
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
struct tcp_worker_ctx_ tcp_worker_ctx_t
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
timer_expiration_handler tcp_timer_persist_handler
u32 tcp_push_header(tcp_connection_t *tconn, vlib_buffer_t *b)
u32 tcp_sack_list_bytes(tcp_connection_t *tc)
static void * vlib_buffer_push_tcp_net_order(vlib_buffer_t *b, u16 sp, u16 dp, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
void tcp_retransmit_first_unacked(tcp_connection_t *tc)
Retransmit first unacked segment.
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
clib_error_t * vnet_tcp_enable_disable(vlib_main_t *vm, u8 is_en)
void tcp_send_syn(tcp_connection_t *tc)
Send SYN.
sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
#define TCP_TIMER_HANDLE_INVALID
void tcp_fast_retransmit_no_sack(tcp_connection_t *tc)
Fast retransmit without SACK info.
static u32 tcp_flight_size(const tcp_connection_t *tc)
Our estimate of the number of bytes in flight (pipe size).
void tcp_cc_algo_register(tcp_cc_algorithm_type_e type, const tcp_cc_algorithm_t *vft)
void( timer_expiration_handler)(u32 index)
enum _tcp_cc_ack_t tcp_cc_ack_t
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
enum _tcp_error tcp_error_t
static void * vlib_buffer_push_tcp(vlib_buffer_t *b, u16 sp_net, u16 dp_net, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
int tcp_configure_v4_source_address_range(vlib_main_t *vm, ip4_address_t *start, ip4_address_t *end, u32 table_id)
Configure an ipv4 source address range.
static_always_inline uword vlib_get_thread_index(void)
void tcp_send_reset(tcp_connection_t *tc)
Build and send reset packet for connection.
void tcp_punt_unknown(vlib_main_t *vm, u8 is_ip4, u8 is_add)
format_function_t format_tcp_state
#define clib_warning(format, args...)
enum _tcp_timers tcp_timers_e
static void tcp_connection_force_ack(tcp_connection_t *tc, vlib_buffer_t *b)
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
u32 fib_node_index_t
A typedef of a node index.
void tcp_connection_timers_init(tcp_connection_t *tc)
Initialize all connection timers as invalid.
void tcp_make_synack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
void tcp_flush_frames_to_output(u8 thread_index)
Flush v4 and v6 tcp and ip-lookup tx frames for thread index.
#define pool_is_free_index(P, I)
Use free bitmap to query whether given index is free.
format_function_t format_tcp_rcv_sacks
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
fib_node_index_t tcp_lookup_rmt_in_fib(tcp_connection_t *tc)
void tcp_make_ack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to ACK.
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
enum _tcp_cc_algorithm_type tcp_cc_algorithm_type_e
enum _tcp_connection_flag_bits tcp_connection_flag_bits_e
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
u8 * format_tcp_connection(u8 *s, va_list *args)
static u32 tcp_end_seq(tcp_header_t *th, u32 len)
struct _tcp_cc_algorithm tcp_cc_algorithm_t
struct _vlib_node_registration vlib_node_registration_t
static void tcp_persist_timer_update(tcp_connection_t *tc)
static void * vlib_buffer_push_uninit(vlib_buffer_t *b, u8 size)
Prepend uninitialized data to buffer.
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
tcp_cc_algorithm_t * tcp_cc_algo_get(tcp_cc_algorithm_type_e type)
static u32 tcp_initial_cwnd(const tcp_connection_t *tc)
Initial cwnd as per RFC5681.
#define foreach_tcp_fsm_state
TCP FSM state definitions as per RFC793.
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
#define foreach_tcp_connection_flag
TCP connection flags.
#define foreach_tcp_timer
TCP timers.
static u32 tcp_set_time_now(u32 thread_index)
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
void tcp_connection_close(tcp_connection_t *tc)
Begin connection closing procedure.
static void tcp_retransmit_timer_force_update(tcp_connection_t *tc)
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
void tcp_update_rto(tcp_connection_t *tc)
void tcp_init_mss(tcp_connection_t *tc)
#define foreach_tcp_buf_flag
TCP buffer flags.
enum _tcp_connection_flag tcp_connection_flags_e
format_function_t format_tcp_sacks
#define TCP_TIMER_PERSIST_MIN
#define tcp_opts_sack_permitted(_to)
static u32 tcp_loss_wnd(const tcp_connection_t *tc)
int tcp_configure_v6_source_address_range(vlib_main_t *vm, ip6_address_t *start, ip6_address_t *end, u32 table_id)
Configure an ipv6 source address range.
static void tcp_persist_timer_set(tcp_connection_t *tc)
static tcp_main_t * vnet_get_tcp_main()
static void tcp_cong_recovery_off(tcp_connection_t *tc)
timer_expiration_handler tcp_timer_delack_handler
static_always_inline void clib_spinlock_lock_if_init(clib_spinlock_t *p)
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
void tcp_cc_fastrecovery_exit(tcp_connection_t *tc)
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
static tcp_connection_t * tcp_listener_get(u32 tli)
static void tcp_persist_timer_reset(tcp_connection_t *tc)