27 #define TCP_TICK 0.001 28 #define THZ (u32) (1/TCP_TICK) 29 #define TCP_TSTAMP_RESOLUTION TCP_TICK 30 #define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ 31 #define TCP_FIB_RECHECK_PERIOD 1 * THZ 32 #define TCP_MAX_OPTION_SPACE 40 34 #define TCP_DUPACK_THRESHOLD 3 35 #define TCP_MAX_RX_FIFO_SIZE 4 << 20 36 #define TCP_MIN_RX_FIFO_SIZE 4 << 10 37 #define TCP_IW_N_SEGMENTS 10 38 #define TCP_ALWAYS_ACK 1 39 #define TCP_USE_SACKS 1 42 #define foreach_tcp_fsm_state \ 45 _(SYN_SENT, "SYN_SENT") \ 46 _(SYN_RCVD, "SYN_RCVD") \ 47 _(ESTABLISHED, "ESTABLISHED") \ 48 _(CLOSE_WAIT, "CLOSE_WAIT") \ 49 _(FIN_WAIT_1, "FIN_WAIT_1") \ 50 _(LAST_ACK, "LAST_ACK") \ 51 _(CLOSING, "CLOSING") \ 52 _(FIN_WAIT_2, "FIN_WAIT_2") \ 53 _(TIME_WAIT, "TIME_WAIT") 55 typedef enum _tcp_state
57 #define _(sym, str) TCP_STATE_##sym, 69 #define foreach_tcp_timer \ 70 _(RETRANSMIT, "RETRANSMIT") \ 71 _(DELACK, "DELAYED ACK") \ 72 _(PERSIST, "PERSIST") \ 74 _(WAITCLOSE, "WAIT CLOSE") \ 75 _(RETRANSMIT_SYN, "RETRANSMIT SYN") \ 76 _(ESTABLISH, "ESTABLISH") 78 typedef enum _tcp_timers
80 #define _(sym, str) TCP_TIMER_##sym, 93 #define TCP_TIMER_HANDLE_INVALID ((u32) ~0) 96 #define TCP_TO_TIMER_TICK TCP_TICK*10 98 #define TCP_DELACK_TIME 1 99 #define TCP_ESTABLISH_TIME 750 100 #define TCP_SYN_RCVD_TIME 600 101 #define TCP_2MSL_TIME 300 102 #define TCP_CLOSEWAIT_TIME 20 103 #define TCP_TIMEWAIT_TIME 100 104 #define TCP_CLEANUP_TIME 10 105 #define TCP_TIMER_PERSIST_MIN 2 107 #define TCP_RTO_MAX 60 * THZ 108 #define TCP_RTO_MIN 0.2 * THZ 109 #define TCP_RTT_MAX 30 * THZ 110 #define TCP_RTO_SYN_RETRIES 3 111 #define TCP_RTO_INIT 1 * THZ 114 #define foreach_tcp_connection_flag \ 115 _(SNDACK, "Send ACK") \ 116 _(FINSNT, "FIN sent") \ 117 _(SENT_RCV_WND0, "Sent 0 rcv_wnd") \ 118 _(RECOVERY, "Recovery") \ 119 _(FAST_RECOVERY, "Fast Recovery") \ 120 _(FR_1_SMSS, "Sent 1 SMSS") \ 121 _(HALF_OPEN_DONE, "Half-open completed") \ 122 _(FINPNDG, "FIN pending") 124 typedef enum _tcp_connection_flag_bits
126 #define _(sym, str) TCP_CONN_##sym##_BIT, 132 typedef enum _tcp_connection_flag
134 #define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, 141 #define foreach_tcp_buf_flag \ 147 #define _(f) TCP_BUF_BIT_##f, 155 #define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f, 160 #define TCP_SCOREBOARD_TRACE (0) 161 #define TCP_MAX_SACK_BLOCKS 15 162 #define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) 164 typedef struct _scoreboard_trace_elt
173 typedef struct _sack_scoreboard_hole
182 typedef struct _sack_scoreboard
188 u32 last_sacked_bytes;
189 u32 last_bytes_delivered;
197 #if TCP_SCOREBOARD_TRACE 203 #if TCP_SCOREBOARD_TRACE 204 #define tcp_scoreboard_trace_add(_tc, _ack) \ 206 static u64 _group = 0; \ 207 sack_scoreboard_t *_sb = &_tc->sack_sb; \ 208 sack_block_t *_sack, *_sacks; \ 209 scoreboard_trace_elt_t *_elt; \ 212 _sacks = _tc->rcv_opts.sacks; \ 213 for (i = 0; i < vec_len (_sacks); i++) \ 215 _sack = &_sacks[i]; \ 216 vec_add2 (_sb->trace, _elt, 1); \ 217 _elt->start = _sack->start; \ 218 _elt->end = _sack->end; \ 219 _elt->ack = _elt->end == _ack ? _ack : 0; \ 220 _elt->snd_una_max = _elt->end == _ack ? _tc->snd_una_max : 0; \ 221 _elt->group = _group; \ 225 #define tcp_scoreboard_trace_add(_tc, _ack) 228 typedef enum _tcp_cc_algorithm_type
235 typedef enum _tcp_cc_ack_t
242 typedef struct _tcp_connection
276 u32 tsval_recent_age;
305 u32 limited_transmit;
309 struct _tcp_cc_algorithm
318 #define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY 319 #define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY 320 #define tcp_recovery_on(tc) (tc)->flags |= TCP_CONN_RECOVERY 321 #define tcp_recovery_off(tc) (tc)->flags &= ~TCP_CONN_RECOVERY 322 #define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) 323 #define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_RECOVERY)) 324 #define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) 325 #define tcp_fastrecovery_sent_1_smss(tc) ((tc)->flags & TCP_CONN_FR_1_SMSS) 326 #define tcp_fastrecovery_1_smss_on(tc) ((tc)->flags |= TCP_CONN_FR_1_SMSS) 327 #define tcp_fastrecovery_1_smss_off(tc) ((tc)->flags &= ~TCP_CONN_FR_1_SMSS) 329 #define tcp_in_cong_recovery(tc) ((tc)->flags & \ 330 (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) 335 tc->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY);
346 typedef enum _tcp_error
348 #define tcp_error(n,s) TCP_ERROR_##n, 354 typedef struct _tcp_lookup_dispatch
359 typedef struct _tcp_main
370 u8 log2_tstamp_clocks_per_tick;
371 f64 tstamp_ticks_per_clock;
382 tw_timer_wheel_16t_2w_512sl_t *timer_wheels;
395 u32 preallocated_connections;
396 u32 preallocated_half_open_connections;
400 u32 last_v4_address_rotor;
401 u32 last_v6_address_rotor;
405 u32 bytes_per_buffer;
411 f64 buffer_fail_fraction;
434 #if (VLIB_BUFFER_TRACE_TRAJECTORY) 435 #define tcp_trajectory_add_start(b, start) \ 437 (*vlib_buffer_trace_trajectory_cb) (b, start); \ 440 #define tcp_trajectory_add_start(b, start) 459 if (
tcp_main.connections[thread_index] == 0)
529 #define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) 530 #define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) 531 #define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) 532 #define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) 533 #define seq_max(_s1, _s2) (seq_gt((_s1), (_s2)) ? (_s1) : (_s2)) 536 #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) 537 #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) 546 return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
548 return tc->rcv_dupacks * tc->snd_mss;
559 flight_size = (int) (tc->snd_una_max - tc->snd_una) -
tcp_bytes_out (tc)
566 (
"Negative: %u %u %u dupacks %u sacked bytes %u flags %d",
568 tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes,
582 if (tc->snd_mss > 2190)
583 return 2 * tc->snd_mss;
584 else if (tc->snd_mss > 1095)
585 return 3 * tc->snd_mss;
587 return 4 * tc->snd_mss;
599 return clib_min (tc->cwnd, tc->snd_wnd);
606 int flight_size = (int) (tc->snd_nxt - tc->snd_una);
608 if (available_wnd <= flight_size)
611 return available_wnd - flight_size;
623 if (available_wnd <= flight_size)
626 return available_wnd - flight_size;
632 if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1)
665 return tcp_main.time_now[thread_index];
683 tc->flags = TCP_CONN_SNDACK;
684 vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK;
693 = tw_timer_start_16t_2w_512sl (&
tcp_main.timer_wheels[tc->c_thread_index],
694 tc->c_c_index, timer_id, interval);
704 tw_timer_stop_16t_2w_512sl (&
tcp_main.timer_wheels[tc->c_thread_index],
705 tc->timers[timer_id]);
714 tw_timer_stop_16t_2w_512sl (&
tcp_main.timer_wheels[tc->c_thread_index],
715 tc->timers[timer_id]);
716 tc->timers[timer_id] =
717 tw_timer_start_16t_2w_512sl (&
tcp_main.timer_wheels[tc->c_thread_index],
718 tc->c_c_index, timer_id, interval);
724 ASSERT (tc->snd_una != tc->snd_una_max);
768 if (tc->snd_una == tc->snd_una_max)
771 if (tc->snd_wnd < tc->snd_mss)
785 #define tcp_validate_txf_size(_tc, _a) \ 786 ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ 787 || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a) 794 u32 prev_index,
u32 start,
798 start,
u8 have_sent_1_smss,
853 sb->sacked_bytes = 0;
854 sb->last_sacked_bytes = 0;
855 sb->last_bytes_delivered = 0;
866 return hole->end - hole->start;
873 return hole - sb->holes;
893 tm->cc_algos[type] = *vft;
900 return &tm->cc_algos[type];
931 th->seq_number = seq;
932 th->ack_number = ack;
933 th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4;
937 th->urgent_pointer = 0;
960 clib_host_to_net_u32 (seq),
961 clib_host_to_net_u32 (ack),
962 tcp_hdr_opts_len,
flags,
963 clib_host_to_net_u16 (wnd));
#define vec_validate(V, I)
Make sure vector is long enough for given index (no header, unspecified alignment) ...
static vlib_cli_command_t trace
(constructor) VLIB_CLI_COMMAND (trace)
#define tcp_fastrecovery_1_smss_off(tc)
u32 tcp_prepare_retransmit_segment(tcp_connection_t *tc, u32 offset, u32 max_bytes, vlib_buffer_t **b)
Build a retransmit segment.
static void tcp_retransmit_timer_set(tcp_connection_t *tc)
void tcp_make_fin(tcp_connection_t *tc, vlib_buffer_t *b)
Convert buffer to FIN-ACK.
struct _sack_block sack_block_t
void tcp_cc_init_congestion(tcp_connection_t *tc)
struct _scoreboard_trace_elt scoreboard_trace_elt_t
void tcp_connection_timers_reset(tcp_connection_t *tc)
Stop all connection timers.
struct _transport_connection transport_connection_t
#define TCP_TO_TIMER_TICK
vlib_node_registration_t tcp4_output_node
(constructor) VLIB_REGISTER_NODE (tcp4_output_node)
tcp_connection_t * tcp_connection_new(u8 thread_index)
void tcp_fast_retransmit(tcp_connection_t *tc)
Do fast retransmit.
static u32 tcp_bytes_out(const tcp_connection_t *tc)
Our estimate of the number of bytes that have left the network.
sack_scoreboard_hole_t * scoreboard_insert_hole(sack_scoreboard_t *sb, u32 prev_index, u32 start, u32 end)
static tcp_connection_t * tcp_connection_get_if_valid(u32 conn_index, u32 thread_index)
void tcp_connection_del(tcp_connection_t *tc)
Connection removal.
struct _sack_scoreboard sack_scoreboard_t
void tcp_update_sack_list(tcp_connection_t *tc, u32 start, u32 end)
Build SACK list as per RFC2018.
u32 tcp_snd_space(tcp_connection_t *tc)
Compute tx window session is allowed to fill.
static tcp_connection_t * tcp_half_open_connection_get(u32 conn_index)
#define VLIB_BUFFER_PRE_DATA_SIZE
static_always_inline void clib_spinlock_unlock_if_init(clib_spinlock_t *p)
void scoreboard_update_lost(tcp_connection_t *tc, sack_scoreboard_t *sb)
struct _tcp_main tcp_main_t
vlib_node_registration_t tcp6_output_node
(constructor) VLIB_REGISTER_NODE (tcp6_output_node)
static u64 clib_cpu_time_now(void)
timer_expiration_handler tcp_timer_retransmit_handler
static void scoreboard_init(sack_scoreboard_t *sb)
u8 * format_tcp_scoreboard(u8 *s, va_list *args)
static sack_scoreboard_hole_t * scoreboard_prev_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
struct _tcp_lookup_dispatch tcp_lookup_dispatch_t
struct _tcp_connection tcp_connection_t
static u32 tcp_available_snd_wnd(const tcp_connection_t *tc)
void tcp_fast_retransmit_sack(tcp_connection_t *tc)
Do fast retransmit with SACKs.
static tcp_connection_t * tcp_get_connection_from_transport(transport_connection_t *tconn)
void tcp_connection_cleanup(tcp_connection_t *tc)
Cleans up connection state.
static void scoreboard_clear(sack_scoreboard_t *sb)
void tcp_send_reset_w_pkt(tcp_connection_t *tc, vlib_buffer_t *pkt, u8 is_ip4)
Send reset without reusing existing buffer.
format_function_t format_tcp_flags
struct _tcp_header tcp_header_t
static u32 tcp_available_snd_space(const tcp_connection_t *tc)
Estimate of how many bytes we can still push into the network.
void tcp_connection_reset(tcp_connection_t *tc)
Notify session that connection has been reset.
struct _sack_scoreboard_hole sack_scoreboard_hole_t
void tcp_flush_frame_to_output(vlib_main_t *vm, u8 thread_index, u8 is_ip4)
Flush tx frame populated by retransmits and timer pops.
u8 * tcp_scoreboard_replay(u8 *s, tcp_connection_t *tc, u8 verbose)
static u32 scoreboard_hole_index(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
i16 current_data
signed offset in data[], pre_data[] that we are currently processing.
i32 tcp_rcv_wnd_available(tcp_connection_t *tc)
static u32 tcp_available_output_snd_space(const tcp_connection_t *tc)
static tcp_header_t * tcp_buffer_hdr(vlib_buffer_t *b)
static timer_callback_t * timers
enum _tcp_state tcp_state_t
void scoreboard_init_high_rxt(sack_scoreboard_t *sb, u32 seq)
timer_expiration_handler tcp_timer_retransmit_syn_handler
static u32 tcp_time_now(void)
static tcp_cc_algorithm_t * tcp_cc_algo_get(tcp_cc_algorithm_type_e type)
static u32 scoreboard_hole_bytes(sack_scoreboard_hole_t *hole)
sack_scoreboard_hole_t * scoreboard_next_rxt_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *start, u8 have_sent_1_smss, u8 *can_rescue, u8 *snd_limited)
Figure out the next hole to retransmit.
void tcp_api_reference(void)
static void tcp_timer_set(tcp_connection_t *tc, u8 timer_id, u32 interval)
#define TCP_INVALID_SACK_HOLE_INDEX
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
timer_expiration_handler tcp_timer_persist_handler
static void * vlib_buffer_push_tcp_net_order(vlib_buffer_t *b, u16 sp, u16 dp, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
void tcp_retransmit_first_unacked(tcp_connection_t *tc)
Retransmit first unacked segment.
void tcp_rcv_sacks(tcp_connection_t *tc, u32 ack)
clib_error_t * vnet_tcp_enable_disable(vlib_main_t *vm, u8 is_en)
void tcp_send_syn(tcp_connection_t *tc)
Send SYN.
static sack_scoreboard_hole_t * scoreboard_next_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
#define TCP_TIMER_HANDLE_INVALID
void tcp_fast_retransmit_no_sack(tcp_connection_t *tc)
Fast retransmit without SACK info.
static u32 tcp_flight_size(const tcp_connection_t *tc)
Our estimate of the number of bytes in flight (pipe size)
u32 tcp_push_header(transport_connection_t *tconn, vlib_buffer_t *b)
void( timer_expiration_handler)(u32 index)
static sack_scoreboard_hole_t * scoreboard_last_hole(sack_scoreboard_t *sb)
enum _tcp_cc_ack_t tcp_cc_ack_t
static void tcp_timer_reset(tcp_connection_t *tc, u8 timer_id)
static sack_scoreboard_hole_t * scoreboard_first_hole(sack_scoreboard_t *sb)
enum _tcp_error tcp_error_t
static void * vlib_buffer_push_tcp(vlib_buffer_t *b, u16 sp_net, u16 dp_net, u32 seq, u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd)
Push TCP header to buffer.
int tcp_configure_v4_source_address_range(vlib_main_t *vm, ip4_address_t *start, ip4_address_t *end, u32 table_id)
Configure an ipv4 source address range.
static_always_inline uword vlib_get_thread_index(void)
void tcp_cc_init(tcp_connection_t *tc)
void tcp_send_reset(tcp_connection_t *tc)
Build and set reset packet for connection.
void tcp_punt_unknown(vlib_main_t *vm, u8 is_ip4, u8 is_add)
format_function_t format_tcp_state
#define clib_warning(format, args...)
enum _tcp_timers tcp_timers_e
static void tcp_connection_force_ack(tcp_connection_t *tc, vlib_buffer_t *b)
int tcp_half_open_connection_cleanup(tcp_connection_t *tc)
Try to cleanup half-open connection.
u32 fib_node_index_t
A typedef of a node index.
void tcp_connection_timers_init(tcp_connection_t *tc)
Initialize all connection timers as invalid.
void tcp_make_synack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to SYN-ACK.
void tcp_flush_frames_to_output(u8 thread_index)
Flush v4 and v6 tcp and ip-lookup tx frames for thread index.
#define pool_is_free_index(P, I)
Use free bitmap to query whether given index is free.
format_function_t format_tcp_rcv_sacks
u8 * format_tcp_connection_id(u8 *s, va_list *args)
vlib_node_registration_t tcp6_input_node
(constructor) VLIB_REGISTER_NODE (tcp6_input_node)
fib_node_index_t tcp_lookup_rmt_in_fib(tcp_connection_t *tc)
void tcp_make_ack(tcp_connection_t *ts, vlib_buffer_t *b)
Convert buffer to ACK.
static void tcp_timer_update(tcp_connection_t *tc, u8 timer_id, u32 interval)
enum _tcp_cc_algorithm_type tcp_cc_algorithm_type_e
enum _tcp_connection_flag_bits tcp_connection_flag_bits_e
vhost_vring_state_t state
void tcp_connection_init_vars(tcp_connection_t *tc)
Initialize tcp connection variables.
u8 * format_tcp_connection(u8 *s, va_list *args)
static u32 tcp_end_seq(tcp_header_t *th, u32 len)
struct _tcp_cc_algorithm tcp_cc_algorithm_t
void scoreboard_remove_hole(sack_scoreboard_t *sb, sack_scoreboard_hole_t *hole)
struct _vlib_node_registration vlib_node_registration_t
template key/value backing page structure
void tcp_update_snd_mss(tcp_connection_t *tc)
Update snd_mss to reflect the effective segment size that we can send by taking into account all TCP ...
static void tcp_persist_timer_update(tcp_connection_t *tc)
static sack_scoreboard_hole_t * scoreboard_get_hole(sack_scoreboard_t *sb, u32 index)
static void * vlib_buffer_push_uninit(vlib_buffer_t *b, u8 size)
Prepend uninitialized data to buffer.
vlib_node_registration_t tcp4_input_node
(constructor) VLIB_REGISTER_NODE (tcp4_input_node)
static u32 tcp_initial_cwnd(const tcp_connection_t *tc)
Initial cwnd as per RFC5681.
#define foreach_tcp_fsm_state
TCP FSM state definitions as per RFC793.
void tcp_send_fin(tcp_connection_t *tc)
Send FIN.
#define foreach_tcp_connection_flag
TCP connection flags.
#define foreach_tcp_timer
TCP timers.
static u32 tcp_set_time_now(u32 thread_index)
static u8 tcp_is_lost_fin(tcp_connection_t *tc)
static void tcp_retransmit_timer_update(tcp_connection_t *tc)
void tcp_init_snd_vars(tcp_connection_t *tc)
Initialize connection send variables.
void tcp_connection_close(tcp_connection_t *tc)
Begin connection closing procedure.
static void tcp_retransmit_timer_force_update(tcp_connection_t *tc)
static tcp_connection_t * tcp_connection_get(u32 conn_index, u32 thread_index)
void tcp_update_rto(tcp_connection_t *tc)
void tcp_init_mss(tcp_connection_t *tc)
static void tcp_cc_algo_register(tcp_cc_algorithm_type_e type, const tcp_cc_algorithm_t *vft)
#define foreach_tcp_buf_flag
TCP buffer flags.
enum _tcp_connection_flag tcp_connection_flags_e
format_function_t format_tcp_sacks
#define TCP_TIMER_PERSIST_MIN
#define tcp_opts_sack_permitted(_to)
static u32 tcp_loss_wnd(const tcp_connection_t *tc)
int tcp_configure_v6_source_address_range(vlib_main_t *vm, ip6_address_t *start, ip6_address_t *end, u32 table_id)
Configure an ipv6 source address range.
static void tcp_persist_timer_set(tcp_connection_t *tc)
static tcp_main_t * vnet_get_tcp_main()
static void tcp_cong_recovery_off(tcp_connection_t *tc)
timer_expiration_handler tcp_timer_delack_handler
static_always_inline void clib_spinlock_lock_if_init(clib_spinlock_t *p)
static void tcp_retransmit_timer_reset(tcp_connection_t *tc)
void tcp_cc_fastrecovery_exit(tcp_connection_t *tc)
void tcp_update_rcv_wnd(tcp_connection_t *tc)
static u8 tcp_timer_is_active(tcp_connection_t *tc, tcp_timers_e timer)
int tcp_cc_recover(tcp_connection_t *tc)
static tcp_connection_t * tcp_listener_get(u32 tli)
static void tcp_persist_timer_reset(tcp_connection_t *tc)
static uword pool_elts(void *v)
Number of active elements in a pool.