FD.io VPP  v18.07-rc0-415-g6c78436
Vector Packet Processing
lb.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 /**
17  * lb-plugin implements a MagLev-like load balancer.
18  * http://research.google.com/pubs/pub44824.html
19  *
20  * It hasn't been tested for interoperability with the original MagLev
21  * but intends to provide similar functionality.
22  * The load-balancer receives traffic destined to VIP (Virtual IP)
23  * addresses from one or multiple (ECMP) routers.
24  * The load-balancer tunnels the traffic toward many application servers
25  * ensuring session stickiness (i.e. that a single session is tunneled
26  * towards a single application server).
27  *
28  */
29 
30 #ifndef LB_PLUGIN_LB_LB_H_
31 #define LB_PLUGIN_LB_LB_H_
32 
33 #include <lb/util.h>
34 #include <vnet/util/refcount.h>
35 
36 #include <vnet/vnet.h>
37 #include <vnet/ip/ip.h>
38 #include <vnet/dpo/dpo.h>
39 #include <vnet/fib/fib_table.h>
40 #include <vppinfra/hash.h>
41 #include <vppinfra/bihash_8_8.h>
42 #include <vppinfra/bihash_24_8.h>
43 #include <lb/lbhash.h>
44 
/*
 * Default tunables and table sizing constants.
 *
 * Every expansion that contains an operator is wrapped in parentheses so the
 * value stays intact when the macro is used inside a larger expression
 * (for example `LB_DEFAULT_PER_CPU_STICKY_BUCKETS * 2` must be 2048, not
 * `1 << 20`).
 */

/* Default number of buckets in the per-cpu sticky hash table. */
#define LB_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10)

/* Default flow timeout, in seconds. */
#define LB_DEFAULT_FLOW_TIMEOUT 40

/* NOTE(review): presumably the bucket count for the mapping_by_as4/6
 * bihash tables -- confirm against the init code in lb.c. */
#define LB_MAPPING_BUCKETS 1024

/* NOTE(review): presumably the memory size for the same tables
 * (64 << 20 is 64 MiB if the unit is bytes) -- confirm in lb.c. */
#define LB_MAPPING_MEMORY_SIZE (64 << 20)
49 
50 typedef enum {
53 } lb_next_t;
54 
55 typedef enum {
60 
61 typedef enum {
66 
67 #define foreach_lb_nat_in2out_error \
68 _(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \
69 _(IN2OUT_PACKETS, "Good in2out packets processed") \
70 _(NO_TRANSLATION, "No translation")
71 
72 typedef enum {
73 #define _(sym,str) LB_NAT_IN2OUT_ERROR_##sym,
75 #undef _
78 
79 /**
80  * lb for kube-proxy supports three types of service
81  */
82 typedef enum {
87 
88 typedef enum {
93 
94 typedef enum {
99 
100 /**
101  * Each VIP is configured with a set of
102  * application servers.
103  */
104 typedef struct {
105  /**
106  * Registration to FIB event.
107  */
109 
110  /**
111  * Destination address used to tunnel traffic towards
112  * that application server.
113  * The address is also used as ID and pseudo-random
114  * seed for the load-balancing process.
115  */
116  ip46_address_t address;
117 
118  /**
119  * ASs are indexed by address and VIP Index.
120  * This means there will be duplicates if the same server
121  * address is used for multiple VIPs.
122  */
124 
125  /**
126  * Some per-AS flags.
127  * For now only LB_AS_FLAGS_USED is defined.
128  */
130 
131 #define LB_AS_FLAGS_USED 0x1
132 
133  /**
134  * Rotating timestamp of when LB_AS_FLAGS_USED flag was last set.
135  *
136  * AS removal is based on garbage collection and reference counting.
137  * When an AS is removed, there is a race between configuration core
138  * and worker cores which may still add a reference while it should not
139  * be used. This timestamp is used to not remove the AS while a race condition
140  * may happen.
141  */
143 
144  /**
145  * The FIB entry index for the next-hop
146  */
148 
149  /**
150  * The child index on the FIB entry
151  */
153 
154  /**
155  * The next DPO in the graph to follow.
156  */
158 
159 } lb_as_t;
160 
162 
163 typedef struct {
166 
167 #define lb_foreach_vip_counter \
168  _(NEXT_PACKET, "packet from existing sessions", 0) \
169  _(FIRST_PACKET, "first session packet", 1) \
170  _(UNTRACKED_PACKET, "untracked packet", 2) \
171  _(NO_SERVER, "no server configured", 3)
172 
173 typedef enum {
174 #define _(a,b,c) LB_VIP_COUNTER_##a = c,
176 #undef _
179 
180 typedef enum {
188 
189 /**
190  * The load balancer supports IPv4 and IPv6 traffic
191  * and GRE4, GRE6, L3DSR and NAT4, NAT6 encap.
192  */
193 typedef enum {
202 } lb_vip_type_t;
203 
206 
207 
208 /* args for different vip encap types */
209 typedef struct {
210  union
211  {
212  struct
213  {
214  /* Service type. clusterip or nodeport */
216 
217  /* Service port. network byte order */
219 
220  /* Pod's port corresponding to specific service. network byte order */
222 
223  /* Node's port, can access service via NodeIP:node_port. network byte order */
225  };
226  /* DSCP bits for L3DSR */
229  };
231 
232 /**
233  * Load balancing service is provided per VIP.
234  * In this data model, a VIP can be a whole prefix.
235  * But load balancing only
236  * occurs on a per-source-address/port basis. Meaning that if a given source
237  * reuses the same port for multiple destinations within the same VIP,
238  * they will be considered as a single flow.
239  */
240 typedef struct {
241 
242  //Runtime
243 
244  /**
245  * Vector mapping (flow-hash & new_flow_table_mask) to AS index.
246  * This is used for new flows.
247  */
249 
250  /**
251  * New flows table length - 1
252  * (length MUST be a power of 2)
253  */
255 
256  /**
257  * Last time garbage collection was run to free the ASs.
258  */
260 
261  //Not runtime
262 
263  /**
264  * A Virtual IP represents a given service delivered
265  * by a set of application servers. It can be a single
266  * address or a prefix.
267  * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address
268  * (i.e. ::/96 prefix).
269  */
270  ip46_address_t prefix;
271 
272  /**
273  * The VIP prefix length.
274  * In case of IPv4, plen = 96 + ip4_plen.
275  */
277 
278  /**
279  * The type of traffic for this.
280  * LB_TYPE_UNDEFINED if unknown.
281  */
283 
284  /* args for different vip encap types */
286 
287  /**
288  * Flags related to this VIP.
289  * LB_VIP_FLAGS_USED means the VIP is active.
290  * When it is not set, the VIP is in the process of being removed.
291  * We cannot immediately remove a VIP because the VIP index still may be stored
292  * in the adjacency index.
293  */
295 #define LB_VIP_FLAGS_USED 0x1
296 
297  /**
298  * Pool of AS indexes used for this VIP.
299  * This also includes ASs that have been removed (but are still referenced).
300  */
302 } lb_vip_t;
303 
304 #define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 \
305  || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \
306  || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \
307  || (vip)->type == LB_VIP_TYPE_IP4_NAT4 )
308 
309 #define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \
310  || (vip)->type == LB_VIP_TYPE_IP4_GRE4)
311 
312 #define lb_vip_is_gre6(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE6 \
313  || (vip)->type == LB_VIP_TYPE_IP4_GRE6)
314 
315 #define lb_encap_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \
316  || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \
317  || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \
318  || (vip)->type == LB_VIP_TYPE_IP4_NAT4 )
319 
320 always_inline bool
322 {
323  return vip->type == LB_VIP_TYPE_IP4_L3DSR;
324 }
325 always_inline bool
327 {
328  return vip->type == LB_VIP_TYPE_IP4_NAT4;
329 }
330 always_inline bool
332 {
333  return vip->type == LB_VIP_TYPE_IP6_NAT6;
334 }
335 
338 
339 #define foreach_lb_nat_protocol \
340  _(UDP, 0, udp, "udp") \
341  _(TCP, 1, tcp, "tcp")
342 
343 typedef enum {
344 #define _(N, i, n, s) LB_NAT_PROTOCOL_##N = i,
346 #undef _
348 
351 {
352  u32 nat_proto = ~0;
353 
354  nat_proto = (ip_proto == IP_PROTOCOL_UDP) ? LB_NAT_PROTOCOL_UDP : nat_proto;
355  nat_proto = (ip_proto == IP_PROTOCOL_TCP) ? LB_NAT_PROTOCOL_TCP : nat_proto;
356 
357  return nat_proto;
358 }
359 
360 /* Key for Pod's egress SNAT */
361 typedef struct {
362  union
363  {
364  struct
365  {
368  u16 protocol:3,
369  fib_index:13;
370  };
372  };
374 
375 typedef struct
376 {
377  union
378  {
379  struct
380  {
385  };
387  };
389 
390 typedef struct {
391  /**
392  * for vip + port case, src_ip = vip;
393  * for node ip + node_port, src_ip = node_ip
394  */
395  ip46_address_t src_ip;
396  ip46_address_t as_ip;
399  /**
400  * Network byte order
401  * for vip + port case, src_port = port;
402  * for node ip + node_port, src_port = node_port
403  */
405  u16 target_port; /* Network byte order */
409 
410 typedef struct {
411  /**
412  * Each CPU has its own sticky flow hash table.
413  * One single table is used for all VIPs.
414  */
416 } lb_per_cpu_t;
417 
418 typedef struct {
419  /**
420  * Pool of all Virtual IPs
421  */
423 
424  /**
425  * Pool of ASs.
426  * ASs are referenced by address and vip index.
427  * The first element (index 0) is special and used only to fill
428  * new_flow_tables when no AS has been configured.
429  */
431 
432  /**
433  * Each AS has an associated reference counter.
434  * As ass[0] has a special meaning, its associated counter
435  * starts at 0 and is decremented instead. i.e. do not use it.
436  */
438 
439  /* hash lookup vip_index by key: {u16: nodeport} */
441 
442  /**
443  * Some global data is per-cpu
444  */
446 
447  /**
448  * Node next index for IP adjacencies, for each of the traffic types.
449  */
450  u32 ip_lookup_next_index[LB_VIP_N_TYPES];
451 
452  /**
453  * Source address used in IPv6 encapsulated traffic
454  */
456 
457  /**
458  * Source address used for IPv4 encapsulated traffic
459  */
461 
462  /**
463  * Number of buckets in the per-cpu sticky hash table.
464  */
466 
467  /**
468  * Flow timeout in seconds.
469  */
471 
472  /**
473  * Per VIP counter
474  */
476 
477  /**
478  * DPO used to send packet from IP4/6 lookup to LB node.
479  */
485 
486  /**
487  * Node type for registering to fib changes.
488  */
490 
491  /* Find a static mapping by AS IP : target_port */
492  clib_bihash_8_8_t mapping_by_as4;
493  clib_bihash_24_8_t mapping_by_as6;
494 
495  /* Static mapping pool */
497 
498  /**
499  * API dynamically registered base ID.
500  */
502 
503  volatile u32 *writer_lock;
504 
505  /* convenience */
508 } lb_main_t;
509 
510 /* args for adding a VIP (passed to lb_vip_add) */
511 typedef struct {
512  ip46_address_t prefix;
518 
519 extern lb_main_t lb_main;
526 
527 /**
528  * Fix global load-balancer parameters.
529  * @param ip4_address IPv4 source address used for encapsulated traffic
530  * @param ip6_address IPv6 source address used for encapsulated traffic
531  * @return 0 on success. VNET_LB_ERR_XXX on error
532  */
534  u32 sticky_buckets, u32 flow_timeout);
535 
536 int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index);
537 
538 int lb_vip_del(u32 vip_index);
539 
540 int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index);
541 
542 #define lb_vip_get_by_index(index) (pool_is_free_index(lb_main.vips, index)?NULL:pool_elt_at_index(lb_main.vips, index))
543 
544 int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
545 int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n);
546 
548 
/**
 * Run the load balancer's garbage collection.
 *
 * NOTE(review): presumably this is the pass that frees ASs which are no
 * longer in use (see lb_vip_t.last_garbage_collection) -- confirm in lb.c.
 *
 * Declared with an explicit (void) parameter list: before C23 an empty
 * list `()` is not a prototype, so the compiler would silently accept
 * calls that pass arguments.
 */
void lb_garbage_collection(void);
550 
551 int lb_nat4_interface_add_del (u32 sw_if_index, int is_del);
552 int lb_nat6_interface_add_del (u32 sw_if_index, int is_del);
553 
555 
556 #endif /* LB_PLUGIN_LB_LB_H_ */
format_function_t format_lb_vip
Definition: lb.h:336
u64 as_u64
Definition: lb.h:371
int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
Definition: lb.c:527
static u32 lb_ip_proto_to_nat_proto(u8 ip_proto)
Definition: lb.h:350
uword( unformat_function_t)(unformat_input_t *input, va_list *args)
Definition: format.h:231
vnet_main_t * vnet_main
Definition: lb.h:507
Each VIP is configured with a set of application server.
Definition: lb.h:104
lb6_nodeport_next_t
Definition: lb.h:94
static bool lb_vip_is_l3dsr(const lb_vip_t *vip)
Definition: lb.h:321
u32 fib_index
Definition: lb.h:384
u32 per_cpu_sticky_buckets
Number of buckets in the per-cpu sticky hash table.
Definition: lb.h:465
u64 as_u64
Definition: bihash_doc.h:63
unsigned long u64
Definition: types.h:89
u16 msg_id_base
API dynamically registered base ID.
Definition: lb.h:501
Definition: lb.h:52
lb4_nodeport_next_t
Definition: lb.h:88
ip46_address_t prefix
A Virtual IP represents a given service delivered by a set of application servers.
Definition: lb.h:270
u16 port
Definition: lb.h:367
lb_vip_counter_t
Definition: lb.h:173
format_function_t format_lb_vip_type
Definition: lb.h:204
u8 *( format_function_t)(u8 *s, va_list *args)
Definition: format.h:48
u32 vip_index
ASs are indexed by address and VIP Index.
Definition: lb.h:123
lb_hash_t * sticky_ht
Each CPU has its own sticky flow hash table.
Definition: lb.h:415
unsigned char u8
Definition: types.h:56
ip46_address_t address
Destination address used to tunnel traffic towards that application server.
Definition: lb.h:116
lb_next_t
Definition: lb.h:50
int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, u32 sticky_buckets, u32 flow_timeout)
Fix global load-balancer parameters.
Definition: lb.c:464
enum dpo_type_t_ dpo_type_t
Common types of data-path objects New types can be dynamically added using dpo_register_new_type() ...
u8 as_ip_is_ipv6
Definition: lb.h:398
static bool lb_vip_is_nat4(const lb_vip_t *vip)
Definition: lb.h:326
#define always_inline
Definition: clib.h:92
unformat_function_t unformat_lb_vip_type
Definition: lb.h:205
u32 flow_timeout
Flow timeout in seconds.
Definition: lb.h:470
Definition: lb.h:418
fib_node_type_t fib_node_type
Node type for registering to fib changes.
Definition: lb.h:489
dpo_type_t dpo_gre4_type
DPO used to send packet from IP4/6 lookup to LB node.
Definition: lb.h:480
vlib_refcount_t as_refcount
Each AS has an associated reference counter.
Definition: lb.h:437
lb_vip_encap_args_t encap_args
Definition: lb.h:285
#define foreach_lb_nat_protocol
Definition: lb.h:339
unsigned int u32
Definition: types.h:88
format_function_t format_lb_as
Definition: lb.h:161
A collection of simple counters.
Definition: counter.h:58
lb_main_t lb_main
Definition: lb.c:28
vlib_node_registration_t lb6_nodeport_node
(constructor) VLIB_REGISTER_NODE (lb6_nodeport_node)
Definition: node.c:1069
lb_vip_t * vips
Pool of all Virtual IPs.
Definition: lb.h:422
u32 last_used
Rotating timestamp of when LB_AS_FLAGS_USED flag was last set.
Definition: lb.h:142
ip4_address_t ip4_src_address
Source address used for IPv4 encapsulated traffic.
Definition: lb.h:460
vlib_node_registration_t lb_nat6_in2out_node
(constructor) VLIB_REGISTER_NODE (lb_nat6_in2out_node)
Definition: node.c:1115
u8 plen
The VIP prefix length.
Definition: lb.h:276
The identity of a DPO is a combination of its type and its instance number/index of objects of that t...
Definition: dpo.h:168
static bool lb_vip_is_nat6(const lb_vip_t *vip)
Definition: lb.h:331
unsigned short u16
Definition: types.h:57
typedef ip6_address
Definition: ip_types.api:20
u16 src_port
Network byte order for vip + port case, src_port = port; for node ip + node_port, src_port = node_por...
Definition: lb.h:404
vlib_node_registration_t lb4_nodeport_node
(constructor) VLIB_REGISTER_NODE (lb4_nodeport_node)
Definition: node.c:1053
format_function_t format_lb_main
Definition: lb.h:554
lb_nat_protocol_t
Definition: lb.h:343
LB_nat4_in2out_next_t
Definition: lb.h:55
clib_bihash_8_8_t mapping_by_as4
Definition: lb.h:492
An node in the FIB graph.
Definition: fib_node.h:286
Definition: lb.h:163
fib_node_t fib_node
Registration to FIB event.
Definition: lb.h:108
lb_encap_type_t
Definition: lb.h:180
u8 src_ip_is_ipv6
Definition: lb.h:397
ip46_address_t src_ip
for vip + port case, src_ip = vip; for node ip + node_port, src_ip = node_ip
Definition: lb.h:395
u32 new_length
Definition: lb.h:515
vlib_node_registration_t lb6_node
vlib_main_t * vm
Definition: buffer.c:294
u32 vrf_id
Definition: lb.h:406
volatile u32 * writer_lock
Definition: lb.h:503
#define lb_foreach_vip_counter
Definition: lb.h:167
vlib_main_t * vlib_main
Definition: lb.h:506
u32 fib_node_index_t
A typedef of a node index.
Definition: fib_types.h:30
u32 as_index
Definition: lb.h:164
dpo_type_t dpo_gre6_type
Definition: lb.h:481
u32 last_garbage_collection
Last time garbage collection was run to free the ASs.
Definition: lb.h:259
lb_as_t * ass
Pool of ASs.
Definition: lb.h:430
uword * vip_index_by_nodeport
Definition: lb.h:440
vlib_node_registration_t lb4_node
ip6_address_t addr
Definition: lb.h:381
int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index)
Definition: lb.c:500
lb_vip_type_t type
The type of traffic for this.
Definition: lb.h:282
int lb_vip_del(u32 vip_index)
Definition: lb.c:936
lb_svr_type_t
lb for kube-proxy supports three types of service
Definition: lb.h:82
lb_vip_type_t
The load balancer supports IPv4 and IPv6 traffic and GRE4, GRE6, L3DSR and NAT4, NAT6 encap...
Definition: lb.h:193
u16 target_port
Definition: lb.h:405
void lb_garbage_collection()
Definition: lb.c:335
ip46_address_t prefix
Definition: lb.h:512
#define foreach_lb_nat_in2out_error
Definition: lb.h:67
u32 new_flow_table_mask
New flows table length - 1 (length MUST be a power of 2)
Definition: lb.h:254
format_function_t format_lb_vip_detailed
Definition: lb.h:337
int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index)
Definition: lb.c:823
lb_vip_encap_args_t encap_args
Definition: lb.h:516
vlib_node_registration_t lb_nat4_in2out_node
(constructor) VLIB_REGISTER_NODE (lb_nat4_in2out_node)
Definition: node.c:1092
lb_per_cpu_t * per_cpu
Some global data is per-cpu.
Definition: lb.h:445
u16 target_port
Definition: lb.h:221
int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n)
Definition: lb.c:755
struct _vlib_node_registration vlib_node_registration_t
ip6_address_t ip6_src_address
Source address used in IPv6 encapsulated traffic.
Definition: lb.h:455
u64 uword
Definition: types.h:112
lb_snat_mapping_t * snat_mappings
Definition: lb.h:496
u16 port
Definition: lb.h:382
u32 next_hop_child_index
The child index on the FIB entry.
Definition: lb.h:152
dpo_type_t dpo_l3dsr_type
Definition: lb.h:482
typedef ip4_address
Definition: ip_types.api:16
enum fib_node_type_t_ fib_node_type_t
The types of nodes in a FIB graph.
u32 fib_index
Definition: lb.h:407
u32 lb_hash_time_now(vlib_main_t *vm)
Definition: lb.c:70
u16 protocol
Definition: lb.h:383
dpo_id_t dpo
The next DPO in the graph to follow.
Definition: lb.h:157
u8 flags
Some per-AS flags.
Definition: lb.h:129
dpo_type_t dpo_nat4_type
Definition: lb.h:483
ip4_address_t addr
Definition: lb.h:366
clib_bihash_24_8_t mapping_by_as6
Definition: lb.h:493
lb_new_flow_entry_t * new_flow_table
Vector mapping (flow-hash & new_connect_table_mask) to AS index.
Definition: lb.h:248
u8 flags
Flags related to this VIP.
Definition: lb.h:294
LB_nat6_in2out_next_t
Definition: lb.h:61
lb_vip_type_t type
Definition: lb.h:514
ip46_address_t as_ip
Definition: lb.h:396
Load balancing service is provided per VIP.
Definition: lb.h:240
u32 * as_indexes
Pool of AS indexes used for this VIP.
Definition: lb.h:301
lb_nat_in2out_error_t
Definition: lb.h:72
int lb_nat4_interface_add_del(u32 sw_if_index, int is_del)
Definition: lb.c:1045
dpo_type_t dpo_nat6_type
Definition: lb.h:484
fib_node_index_t next_hop_fib_entry_index
The FIB entry index for the next-hop.
Definition: lb.h:147
int lb_nat6_interface_add_del(u32 sw_if_index, int is_del)
Definition: lb.c:1061